# Preprocessing 

Here the Web scraped Data of all episode-descriptions will be preprocessed

## Import modules

In [None]:
import pandas as pd

## Get RAW Data

In [1]:
PATH_TO_RAW_DATA = "/home/anton/Anton/Studium/DHBW /Semester_5/Text_Analysis/repo/Burg-NLP/data/episodes_descriptions.json"
PATH_TO_CLEAN_DATA = "/home/anton/Anton/Studium/DHBW /Semester_5/Text_Analysis/repo/Burg-NLP/topic_modeling/data/episodes_description.csv"

df = pd.read_json( PATH_TO_RAW_DATA , orient="index")
df


Unnamed: 0,title,text
0,Pilot,"In the middle of the night, an obviously drunk..."
1,Lawnmower Dog,"Jerry complains that the family dog, Snuffles..."
2,Anatomy Park (Episode),"It's Christmas, and Jerry tries to enforce the..."
3,M. Night Shaym-Aliens!,The episode opens with Rick dissecting a large...
4,Meeseeks and Destroy,"Mr. Meeseeks, existing to solve Beth's problem..."
...,...,...
76,Mercy Kill,"On the planet Dorian 5, Supernova and Vance ar..."
77,Girls Night,"At a bar, Calypso, Diablo Verde, Lady Katana, ..."
78,Kintsugi,"""On an unknown planet, a somber Alan Rails put..."
79,Little Trains,"Doom-Nomitron arrives to destroy Earth, only f..."


## Remove \n

In [1]:
df["text"] = pd.Series( x.replace("\n", "").replace("  ", "") for x in df["text"])
df

NameError: name 'pd' is not defined

## Lowercase

In [3]:
df['text'] = df["text"].apply(lambda x: x.lower())
df

Unnamed: 0,title,text
0,Pilot,"in the middle of the night, an obviously drunk..."
1,Lawnmower Dog,"jerry complains that the family dog, snuffles..."
2,Anatomy Park (Episode),"it's christmas, and jerry tries to enforce the..."
3,M. Night Shaym-Aliens!,the episode opens with rick dissecting a large...
4,Meeseeks and Destroy,"mr. meeseeks, existing to solve beth's problem..."
...,...,...
76,Mercy Kill,"on the planet dorian 5, supernova and vance ar..."
77,Girls Night,"at a bar, calypso, diablo verde, lady katana, ..."
78,Kintsugi,"""on an unknown planet, a somber alan rails put..."
79,Little Trains,"doom-nomitron arrives to destroy earth, only f..."


## Abbreviation

Replace all ' with normal appostrophes

In [4]:
df["text"] = pd.Series( x.replace("'", "’")for x in df["text"])

Replacing Abbreviations to guaranty consistency

In [5]:
abbreviations_dict = {
    "don’t": "do not",
    "isn’t": "is not",
    "ain’t": "is not",
    "gotta": "got to",
    "can’t": "cannot",
    "won’t": "will not",
    "you’re": "you are",
    "they’re": "they are",
    "we’re": "we are",
    "I’m": "I am",
    "she’s": "she is",
    "he’s": "he is",
    "it’s": "it is",
    "there’s": "there is",
    "what’s": "what is",
    "here’s": "here is",
    "that’s": "that is",
    "who’s": "who is",
    "how’s": "how is",
    "aren’t": "are not",
    "wasn’t": "was not",
    "weren’t": "were not",
    "doesn’t": "does not",
    "didn’t": "did not",
    "hasn’t": "has not",
    "haven’t": "have not",
    "hadn’t": "had not",
    "shouldn’t": "should not",
    "wouldn’t": "would not",
    "couldn’t": "could not",
    "mustn’t": "must not",
    "mightn’t": "might not",
    "needn’t": "need not",
    "let’s": "let us",
    "y’all": "you all",
    "gonna": "going to",
    "wanna": "want to",
    "lemme": "let me",
    "gimme": "give me",
    "kinda": "kind of",
    "sorta": "sort of",
    "outta": "out of",
    "lotta": "lot of",
    "dunno": "do not know",
    "c’mon": "come on",
    "o’clock": "of the clock",
    "y’know": "you know",
    "ma’am": "madam",
    "could’ve": "could have",
    "should’ve": "should have",
    "would’ve": "would have",
    "might’ve": "might have",
    "must’ve": "must have",
    "there’d": "there would",
    "it’d": "it would",
    "he’d": "he would",
    "she’d": "she would",
    "they’d": "they would",
    "I’d": "I would",
    "you’d": "you would",
    "we’d": "we would", 
    "it’s" : "it is"
}


# Characters

List of All Characters in order to dont stem them

In [6]:
rick_and_morty_characters = [
    "rick",
    "morty",
    "summer",
    "beth",
    "jerry",
    "birdperson",
    "mr. poopybutthole",
    "squanchy",
    "tammy",
    "noob-noob",
    "gearhead",
    "meeseeks",
    "jessica",
    "abradolph lincoler",
    "unity",
    "principal gene vagina",
    "cromulons",
    "krombopulos michael",
    "scary terry",
    "pickle rick",
    "president",
    "snowball",
    "snuffles"
]


### Replace Abbreviations

In [None]:
corrected_row = []
for row in df["text"]:
    for entry in list(abbreviations_dict.keys()):
        if entry in row:
            row = row.replace(entry, abbreviations_dict[entry])
        continue 
    corrected_row.append(row)

df["text"] = pd.Series(corrected_row)

df

## Remove Stopwords

### Import nltk

In [2]:
import nltk
from nltk.corpus import stopwords

### Download Stopwords

In [8]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/anton/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Spacy Pipeline

### Import Modules

In [9]:
import spacy
from spacy.language import Language
from spacy.tokens import Doc
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from nltk.stem import PorterStemmer

### Load Small Spacy Model

In [None]:
nlp = spacy.load("en_core_web_sm")

### Test Spacy

In [11]:
doc = nlp("Rick and Morty")
print(type(doc[0].pos_))

<class 'str'>


## Update Spacy Pipeline

In [12]:
nlp.pipeline
nlp.remove_pipe("lemmatizer")
# To Do Pipeline Reihenfolge anpassen

('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7fc51830eac0>)

### Custom Methods Pipeline

In [14]:
@Language.component("stopword_remover")
def custom_remover(doc):
    stop_words = set(stopwords.words('english'))
    stop_symbols = ["?", "..." ,"." , "!" , "&" , "," , "-"]
    valid_doc = [token.text for token in doc if token.text not in stop_words and token.text not in stop_symbols]
    valid_doc_pos = [token.pos_ for token in doc if token.text not in stop_words and token.text not in stop_symbols]

    return Doc(nlp.vocab , words=valid_doc , pos=valid_doc_pos)

In [15]:
nlp.add_pipe("stopword_remover" , after="ner")

<function __main__.custom_remover(doc)>

In [16]:
@Language.component("porter_stemmer")
def custom_stemmer(doc):
    ps = PorterStemmer()

    stemmed_words = [ps.stem(token.text) if token.pos_ != "PROPN" and token.text not in rick_and_morty_characters  else token.text for token in doc ]
    stemmed_words_tags = [str(token.tag_) if token.pos_ != "PROPN" and token.text not in rick_and_morty_characters else token.text for token in doc ]
    
    return Doc(nlp.vocab , words=stemmed_words, tags=stemmed_words_tags)

In [17]:
#nlp.add_pipe("porter_stemmer", after="stopword_remover" )

In [18]:
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'ner', 'stopword_remover']


## Applying Spacy Pipeline to Dataframe

In [20]:
df["text"] = list(nlp.pipe(df["text"]))
df

Unnamed: 0,title,text
0,Pilot,"(middle, night, obviously, drunk, rick, bursts..."
1,Lawnmower Dog,"( , jerry, complains, family, dog, snuffles, s..."
2,Anatomy Park (Episode),"(christmas, jerry, tries, enforce, idea, "", hu..."
3,M. Night Shaym-Aliens!,"(episode, opens, rick, dissecting, large, rat,..."
4,Meeseeks and Destroy,"(mr, meeseeks, existing, solve, beth, ’s, prob..."
...,...,...
76,Mercy Kill,"(planet, dorian, 5, supernova, vance, slaughte..."
77,Girls Night,"(bar, calypso, diablo, verde, lady, katana, su..."
78,Kintsugi,"("", unknown, planet, somber, alan, rails, puts..."
79,Little Trains,"(doom, nomitron, arrives, destroy, earth, sate..."


### Converting Docs to Strings again

In [1]:
df["text"] = [ " ".join([t.text for t in row]) for row in df["text"]] # Comverting DOc in Text


NameError: name 'df' is not defined

### Test if if workes

In [22]:
pd.options.display.max_colwidth = 250
display(df["text"].head(10))

0    middle night obviously drunk rick bursts morty ’s room tells " surprise " show driving newly built flying vehicle rick explains made neutrino bomb : decided earth needs " fresh start " end wants destroy humanity planning start morty jessica new "...
1      jerry complains family dog snuffles stupid rick gives jerry device enhances dog ’s intelligence morty go dreams morty ’s math teacher persuade give morty " " class morty assigned less homework go morty ’s math teacher ’s dreams dreams strong bl...
2    christmas jerry tries enforce idea " human holiday " onto rest family hisparents visiting confiscating morty beth summer ’s electronic gadgets rick shows homeless man santa outfit briefly introduces ruben heading garage.as holiday progresses jerr...
3    episode opens rick dissecting large rat garage complaining sloppy workmanship morty stops comment beautiful day robotic beth gets car leaves work rick shown acting suspicious day ’s events well morty morty leaves school called fro

# Save to CSV

In [23]:
df.to_csv(PATH_TO_CLEAN_DATA)