# Preprocessing 

Here the Web scraped Data of all episode-descriptions will be preprocessed

## Import modules

In [1]:
import pandas as pd

## Get RAW Data

In [2]:
PATH_TO_RAW_DATA = "/home/anton/Anton/Studium/DHBW /Semester_5/Text_Analysis/repo/Burg-NLP/data/episodes_descriptions.json"
PATH_TO_CLEAN_DATA = "/home/anton/Anton/Studium/DHBW /Semester_5/Text_Analysis/repo/Burg-NLP/topic_modeling/data/episodes_description_nouns.csv"

df = pd.read_json( PATH_TO_RAW_DATA , orient="index")
df


Unnamed: 0,title,text
0,Pilot,"In the middle of the night, an obviously drunk..."
1,Lawnmower Dog,"Jerry complains that the family dog, Snuffles..."
2,Anatomy Park (Episode),"It's Christmas, and Jerry tries to enforce the..."
3,M. Night Shaym-Aliens!,The episode opens with Rick dissecting a large...
4,Meeseeks and Destroy,"Mr. Meeseeks, existing to solve Beth's problem..."
...,...,...
76,Mercy Kill,"On the planet Dorian 5, Supernova and Vance ar..."
77,Girls Night,"At a bar, Calypso, Diablo Verde, Lady Katana, ..."
78,Kintsugi,"""On an unknown planet, a somber Alan Rails put..."
79,Little Trains,"Doom-Nomitron arrives to destroy Earth, only f..."


## Remove \n

In [3]:
df["text"] = pd.Series( x.replace("\n", "").replace("  ", "") for x in df["text"])
df

Unnamed: 0,title,text
0,Pilot,"In the middle of the night, an obviously drunk..."
1,Lawnmower Dog,"Jerry complains that the family dog, Snuffles..."
2,Anatomy Park (Episode),"It's Christmas, and Jerry tries to enforce the..."
3,M. Night Shaym-Aliens!,The episode opens with Rick dissecting a large...
4,Meeseeks and Destroy,"Mr. Meeseeks, existing to solve Beth's problem..."
...,...,...
76,Mercy Kill,"On the planet Dorian 5, Supernova and Vance ar..."
77,Girls Night,"At a bar, Calypso, Diablo Verde, Lady Katana, ..."
78,Kintsugi,"""On an unknown planet, a somber Alan Rails put..."
79,Little Trains,"Doom-Nomitron arrives to destroy Earth, only f..."


## Lowercase

In [4]:
df['text'] = df["text"].apply(lambda x: x.lower())
df

Unnamed: 0,title,text
0,Pilot,"in the middle of the night, an obviously drunk..."
1,Lawnmower Dog,"jerry complains that the family dog, snuffles..."
2,Anatomy Park (Episode),"it's christmas, and jerry tries to enforce the..."
3,M. Night Shaym-Aliens!,the episode opens with rick dissecting a large...
4,Meeseeks and Destroy,"mr. meeseeks, existing to solve beth's problem..."
...,...,...
76,Mercy Kill,"on the planet dorian 5, supernova and vance ar..."
77,Girls Night,"at a bar, calypso, diablo verde, lady katana, ..."
78,Kintsugi,"""on an unknown planet, a somber alan rails put..."
79,Little Trains,"doom-nomitron arrives to destroy earth, only f..."


## Abbreviation

Replace all ' with normal appostrophes

In [5]:
df["text"] = pd.Series( x.replace("'", "’")for x in df["text"])

Replacing Abbreviations to guaranty consistency

In [6]:
abbreviations_dict = {
    "don’t": "do not",
    "isn’t": "is not",
    "ain’t": "is not",
    "gotta": "got to",
    "can’t": "cannot",
    "won’t": "will not",
    "you’re": "you are",
    "they’re": "they are",
    "we’re": "we are",
    "I’m": "I am",
    "she’s": "she is",
    "he’s": "he is",
    "it’s": "it is",
    "there’s": "there is",
    "what’s": "what is",
    "here’s": "here is",
    "that’s": "that is",
    "who’s": "who is",
    "how’s": "how is",
    "aren’t": "are not",
    "wasn’t": "was not",
    "weren’t": "were not",
    "doesn’t": "does not",
    "didn’t": "did not",
    "hasn’t": "has not",
    "haven’t": "have not",
    "hadn’t": "had not",
    "shouldn’t": "should not",
    "wouldn’t": "would not",
    "couldn’t": "could not",
    "mustn’t": "must not",
    "mightn’t": "might not",
    "needn’t": "need not",
    "let’s": "let us",
    "y’all": "you all",
    "gonna": "going to",
    "wanna": "want to",
    "lemme": "let me",
    "gimme": "give me",
    "kinda": "kind of",
    "sorta": "sort of",
    "outta": "out of",
    "lotta": "lot of",
    "dunno": "do not know",
    "c’mon": "come on",
    "o’clock": "of the clock",
    "y’know": "you know",
    "ma’am": "madam",
    "could’ve": "could have",
    "should’ve": "should have",
    "would’ve": "would have",
    "might’ve": "might have",
    "must’ve": "must have",
    "there’d": "there would",
    "it’d": "it would",
    "he’d": "he would",
    "she’d": "she would",
    "they’d": "they would",
    "I’d": "I would",
    "you’d": "you would",
    "we’d": "we would", 
    "it’s" : "it is"
}


# Characters

List of All Characters in order to dont stem them

In [7]:
rick_and_morty_characters = [
    "rick",
    "morty",
    "summer",
    "beth",
    "jerry",
    "birdperson",
    "mr. poopybutthole",
    "squanchy",
    "tammy",
    "noob-noob",
    "gearhead",
    "meeseeks",
    "jessica",
    "abradolph lincoler",
    "unity",
    "principal gene vagina",
    "cromulons",
    "krombopulos michael",
    "scary terry",
    "pickle rick",
    "president",
    "snowball",
    "snuffles"
]


### Replace Abbreviations

In [8]:
corrected_row = []
for row in df["text"]:
    for entry in list(abbreviations_dict.keys()):
        if entry in row:
            row = row.replace(entry, abbreviations_dict[entry])
        continue 
    corrected_row.append(row)

df["text"] = pd.Series(corrected_row)

df

Unnamed: 0,title,text
0,Pilot,"in the middle of the night, an obviously drunk..."
1,Lawnmower Dog,"jerry complains that the family dog, snuffles..."
2,Anatomy Park (Episode),"it is christmas, and jerry tries to enforce th..."
3,M. Night Shaym-Aliens!,the episode opens with rick dissecting a large...
4,Meeseeks and Destroy,"mr. meeseeks, existing to solve beth’s problem..."
...,...,...
76,Mercy Kill,"on the planet dorian 5, supernova and vance ar..."
77,Girls Night,"at a bar, calypso, diablo verde, lady katana, ..."
78,Kintsugi,"""on an unknown planet, a somber alan rails put..."
79,Little Trains,"doom-nomitron arrives to destroy earth, only f..."


## Remove Stopwords

### Import nltk

In [9]:
import nltk
from nltk.corpus import stopwords

### Download Stopwords

In [10]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/anton/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Spacy Pipeline

### Import Modules

In [11]:
import spacy
from spacy.language import Language
from spacy.tokens import Doc
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from nltk.stem import PorterStemmer

### Load Small Spacy Model

In [12]:
nlp = spacy.load("en_core_web_sm")

### Test Spacy

In [13]:
doc = nlp("Rick and Morty")
print(type(doc[0].pos_))
print(doc)

<class 'str'>
Rick and Morty


## Update Spacy Pipeline

In [14]:
nlp.pipeline
nlp.remove_pipe("lemmatizer")
# To Do Pipeline Reihenfolge anpassen

('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7ff17e0c5000>)

### Custom Methods Pipeline

In [15]:
@Language.component("stopword_remover")
def custom_remover(doc):
    stop_words = set(stopwords.words('english'))
    stop_symbols = ["?", "..." ,"." , "!" , "&" , "," , "-"]
    valid_doc = [token.text for token in doc if token.text not in stop_words and token.text not in stop_symbols]
    valid_doc_pos = [token.pos_ for token in doc if token.text not in stop_words and token.text not in stop_symbols]

    return Doc(nlp.vocab , words=valid_doc , pos=valid_doc_pos)

In [16]:
nlp.add_pipe("stopword_remover" , after="ner")

<function __main__.custom_remover(doc)>

In [17]:
@Language.component("noun_getter")
def custom_stemmer(doc):
    ps = PorterStemmer()

    nouns = []
    nouns_tags = []

    for token in doc:
        if token.pos_ == "PROPN" or  token.pos_ == "NOUN":
            nouns.append(token.text)
            nouns_tags.append(token.tag_)
            
    return Doc(nlp.vocab , words=nouns, tags=nouns_tags)

In [18]:
nlp.add_pipe("noun_getter", after="stopword_remover" )

<function __main__.custom_stemmer(doc)>

In [19]:
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'ner', 'stopword_remover', 'noun_getter']


In [20]:
doc = nlp("On the planet")
doc.text

'planet '

## Applying Spacy Pipeline to Dataframe

In [21]:
df["text"] = list(nlp.pipe(df["text"]))
df

Unnamed: 0,title,text
0,Pilot,"(middle, night, rick, bursts, morty, room, sur..."
1,Lawnmower Dog,"(jerry, family, dog, snuffles, rick, jerry, de..."
2,Anatomy Park (Episode),"(christmas, jerry, idea, holiday, rest, family..."
3,M. Night Shaym-Aliens!,"(episode, rick, rat, garage, workmanship, mort..."
4,Meeseeks and Destroy,"(mr, meeseeks, beth, adventure, morty, adventu..."
...,...,...
76,Mercy Kill,"(planet, supernova, vance, aliens, deaths, inn..."
77,Girls Night,"(bar, calypso, diablo, verde, lady, katana, su..."
78,Kintsugi,"(planet, somber, alan, rails, vase, glue, crac..."
79,Little Trains,"(doom, nomitron, earth, satellite, supernova, ..."


### Converting Docs to Strings again

In [22]:
df["text"] = [ " ".join([t.text for t in row]) for row in df["text"]] # Comverting DOc in Text


### Test if if workes

In [23]:
pd.options.display.max_colwidth = 250
display(df["text"].head(10))

0    middle night rick bursts morty room surprise vehicle rick neutrino bomb earth start end humanity morty jessica adam eve morty control situation wheel vehicle rick vehicle test morty bomb morning morty breakfast mother beth morty sister summer ric...
1    jerry family dog snuffles rick jerry device dog intelligence morty dreams morty math teacher morty class morty homework morty math teacher dreams dreams tv character mrs pancakes centaur dreams results rick morty terry parody krueger villain ina ...
2    christmas jerry idea holiday rest family hisparents morty beth summer gadgets rick man santa outfit ruben holiday jerry idea family christmas introduction jacob mother lover way relationship parents rest family relationship jerry anxiety parents ...
3    episode rick rat garage workmanship morty day beth car work rick day events morty morty school front math class mr goldenfold class morty answer math question classmate morty recipe matter morty jessica rick bursts morty away.rick

# Save to CSV

In [24]:
df.to_csv(PATH_TO_CLEAN_DATA)