# Введение в обработку естественного языка
## Урок 1. Предобработка текста

Осуществим предобработку данных с Твиттера, чтобы очищенные данные в дальнейшем использовать для задачи классификации. Данный датасет содержит негативные (label = 1) и нейтральные (label = 0) высказывания. Для работы объединим train_df и test_df.

In [25]:
import pandas as pd, numpy as np
import warnings
import re
import string
import nltk
import pickle

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [3]:
!unzip "/content/hw1_actual.zip" -d "/content/"

Archive:  /content/hw1_actual.zip
  inflating: /content/train_tweets.csv  
  inflating: /content/test_tweets.csv  
  inflating: /content/hw_lesson_1.ipynb  


In [6]:
df_train = pd.read_csv('/content/train_tweets.csv')
df_test = pd.read_csv('/content/test_tweets.csv')
df = pd.concat((df_train, df_test), axis=0).set_index('id')

In [5]:
apostrophe_dict = {
"ain't": "am not / are not",
"aren't": "are not / am not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he had / he would",
"he'd've": "he would have",
"he'll": "he shall / he will",
"he'll've": "he shall have / he will have",
"he's": "he has / he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how has / how is",
"i'd": "I had / I would",
"i'd've": "I would have",
"i'll": "I shall / I will",
"i'll've": "I shall have / I will have",
"i'm": "I am",
"i've": "I have",
"isn't": "is not",
"it'd": "it had / it would",
"it'd've": "it would have",
"it'll": "it shall / it will",
"it'll've": "it shall have / it will have",
"it's": "it has / it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she had / she would",
"she'd've": "she would have",
"she'll": "she shall / she will",
"she'll've": "she shall have / she will have",
"she's": "she has / she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as / so is",
"that'd": "that would / that had",
"that'd've": "that would have",
"that's": "that has / that is",
"there'd": "there had / there would",
"there'd've": "there would have",
"there's": "there has / there is",
"they'd": "they had / they would",
"they'd've": "they would have",
"they'll": "they shall / they will",
"they'll've": "they shall have / they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we had / we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what shall / what will",
"what'll've": "what shall have / what will have",
"what're": "what are",
"what's": "what has / what is",
"what've": "what have",
"when's": "when has / when is",
"when've": "when have",
"where'd": "where did",
"where's": "where has / where is",
"where've": "where have",
"who'll": "who shall / who will",
"who'll've": "who shall have / who will have",
"who's": "who has / who is",
"who've": "who have",
"why's": "why has / why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you had / you would",
"you'd've": "you would have",
"you'll": "you shall / you will",
"you'll've": "you shall have / you will have",
"you're": "you are",
"you've": "you have"
}



short_word_dict = {
"121": "one to one",
"a/s/l": "age, sex, location",
"adn": "any day now",
"afaik": "as far as I know",
"afk": "away from keyboard",
"aight": "alright",
"alol": "actually laughing out loud",
"b4": "before",
"b4n": "bye for now",
"bak": "back at the keyboard",
"bf": "boyfriend",
"bff": "best friends forever",
"bfn": "bye for now",
"bg": "big grin",
"bta": "but then again",
"btw": "by the way",
"cid": "crying in disgrace",
"cnp": "continued in my next post",
"cp": "chat post",
"cu": "see you",
"cul": "see you later",
"cul8r": "see you later",
"cya": "bye",
"cyo": "see you online",
"dbau": "doing business as usual",
"fud": "fear, uncertainty, and doubt",
"fwiw": "for what it's worth",
"fyi": "for your information",
"g": "grin",
"g2g": "got to go",
"ga": "go ahead",
"gal": "get a life",
"gf": "girlfriend",
"gfn": "gone for now",
"gmbo": "giggling my butt off",
"gmta": "great minds think alike",
"h8": "hate",
"hagn": "have a good night",
"hdop": "help delete online predators",
"hhis": "hanging head in shame",
"iac": "in any case",
"ianal": "I am not a lawyer",
"ic": "I see",
"idk": "I don't know",
"imao": "in my arrogant opinion",
"imnsho": "in my not so humble opinion",
"imo": "in my opinion",
"iow": "in other words",
"ipn": "I’m posting naked",
"irl": "in real life",
"jk": "just kidding",
"l8r": "later",
"ld": "later, dude",
"ldr": "long distance relationship",
"llta": "lots and lots of thunderous applause",
"lmao": "laugh my ass off",
"lmirl": "let's meet in real life",
"lol": "laugh out loud",
"ltr": "longterm relationship",
"lulab": "love you like a brother",
"lulas": "love you like a sister",
"luv": "love",
"m/f": "male or female",
"m8": "mate",
"milf": "mother I would like to fuck",
"oll": "online love",
"omg": "oh my god",
"otoh": "on the other hand",
"pir": "parent in room",
"ppl": "people",
"r": "are",
"rofl": "roll on the floor laughing",
"rpg": "role playing games",
"ru": "are you",
"shid": "slaps head in disgust",
"somy": "sick of me yet",
"sot": "short of time",
"thanx": "thanks",
"thx": "thanks",
"ttyl": "talk to you later",
"u": "you",
"ur": "you are",
"uw": "you’re welcome",
"wb": "welcome back",
"wfm": "works for me",
"wibni": "wouldn't it be nice if",
"wtf": "what the fuck",
"wtg": "way to go",
"wtgp": "want to go private",
"ym": "young man",
"gr8": "great"
}


emoticon_dict = {
":)": "happy",
":‑)": "happy",
":-]": "happy",
":-3": "happy",
":->": "happy",
"8-)": "happy",
":-}": "happy",
":o)": "happy",
":c)": "happy",
":^)": "happy",
"=]": "happy",
"=)": "happy",
"<3": "happy",
":-(": "sad",
":(": "sad",
":c": "sad",
":<": "sad",
":[": "sad",
">:[": "sad",
":{": "sad",
">:(": "sad",
":-c": "sad",
":-< ": "sad",
":-[": "sad",
":-||": "sad"
}

In [9]:
class TPprocessor:
    
    stop_words = nltk.corpus.stopwords.words('english')
    stemmer = nltk.stem.PorterStemmer()
    lemmatizer = nltk.stem.WordNetLemmatizer()
    
    def __init__(self, df: pd.DataFrame, column: str):
        
        self.df = df
        self.column = column
        
    def resub_patt(self, pattern: str, repl: str = '') -> np.array:
        
        pattern = re.compile(pattern=pattern) # init series pattern
        
        vector = np.vectorize(pattern.sub) # init resub series vector
        result = vector(repl=repl, string=self.df[self.column]) # resub series
        
        self.df[self.column] = result
        
        return result
    
    def reg_series(self, register: str = 'lower') -> np.array:
        
        vector = np.vectorize(str.lower if register == 'lower' else str.upper)
        result = vector(self.df[self.column])
        
        self.df[self.column] = result
        
        return result
    
    def repl_abb(self, repl_dict: dict) -> list:
    
        result = [text.split() for text in self.df[self.column]]

        for id_wl, words in enumerate(result): # перебор списков со словами
            for id_w, word in enumerate(words): # перебор слов
                result[id_wl][id_w] = repl_dict[word] if word in repl_dict.keys() else word

            result[id_wl] = ' '.join(result[id_wl])
            
        self.df[self.column] = result

        return result
    
    def clean_by_len(self, len_word: int = 1) -> list:
        
        result = [text.split() for text in self.df[self.column]]

        for id_wl, words in enumerate(result): # перебор списков со словами
            result[id_wl] = ' '.join([word for word in words if len(word) > len_word])
        
        self.df[self.column] = result
        
        return result
    
    def tokenize(self, method: str = 'split', new_column: str = None) -> list:
        
        if method == 'split':
            result = [text.split() for text in self.df[self.column]]
        elif method == 'tokenize':
            result = [nltk.tokenize.word_tokenize(text) for text in self.df[self.column]]
        
        if new_column:
            self.df[new_column] = result
        
        return result
    
    def clean_stop_words(self, target: str = None , new_column: str = None):
        
        result = [words for words in self.df[target]]

        for id_wl, words in enumerate(result): # перебор списков со словами
            result[id_wl] = [word for word in words if word not in self.stop_words]

        if new_column:
            self.df[new_column] = result
        
        return result
    
    def stemming(self, target: str = None, new_column: str = None):
        
        result = [words for words in self.df[target]]
        
        for id_wl, words in enumerate(result): # перебор списков со словами
            result[id_wl] = [self.stemmer.stem(word) for word in words]

        if new_column:
            self.df[new_column] = result
        
        return result
    
    def lemmating(self, target: str = None, new_column: str = None):
        
        result = [words for words in self.df[target]]
        
        for id_wl, words in enumerate(result): # перебор списков со словами
            result[id_wl] = [self.lemmatizer.lemmatize(word, nltk.corpus.wordnet.VERB) for word in words]

        if new_column:
            self.df[new_column] = result
        
        return result

text_prepr = TPprocessor(df=df, column='tweet')

### Удалим @user из всех твитов с помощью паттерна "@[\\w]*". Для этого создадим функцию:
- для того, чтобы найти все вхождения паттерна в тексте, необходимо использовать re.findall(pattern, input_txt)  
- для для замены @user на пробел, необходимо использовать re.sub()


In [10]:
text_prepr.resub_patt(pattern="@[\w]*")
text_prepr.df

Unnamed: 0_level_0,label,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.0,when a father is dysfunctional and is so sel...
2,0.0,thanks for #lyft credit i can't use cause th...
3,0.0,bihday your majesty
4,0.0,#model i love u take with u all the time in ...
5,0.0,factsguide: society now #motivation
...,...,...
49155,,thought factory: left-right polarisation! #tru...
49156,,feeling like a mermaid ð #hairflip #neverre...
49157,,#hillary #campaigned today in #ohio((omg)) &am...
49158,,"happy, at work conference: right mindset leads..."


### Изменим регистр твитов на нижний с помощью .lower()

In [11]:
text_prepr.reg_series()
text_prepr.df

Unnamed: 0_level_0,label,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.0,when a father is dysfunctional and is so sel...
2,0.0,thanks for #lyft credit i can't use cause th...
3,0.0,bihday your majesty
4,0.0,#model i love u take with u all the time in ...
5,0.0,factsguide: society now #motivation
...,...,...
49155,,thought factory: left-right polarisation! #tru...
49156,,feeling like a mermaid ð #hairflip #neverre...
49157,,#hillary #campaigned today in #ohio((omg)) &am...
49158,,"happy, at work conference: right mindset leads..."


### Заменим сокращения с апострофами (пример: ain't, can't) на пробел, используя apostrophe_dict. Для этого необходимо сделать функцию: для каждого слова в тексте проверить (for word in text.split()), если слово есть в словаре apostrophe_dict в качестве ключа (сокращенного слова), то заменить ключ на значение (полную версию слова)

In [12]:
text_prepr.repl_abb(repl_dict=apostrophe_dict)
text_prepr.df

Unnamed: 0_level_0,label,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.0,when a father is dysfunctional and is so selfi...
2,0.0,thanks for #lyft credit i cannot use cause the...
3,0.0,bihday your majesty
4,0.0,#model i love u take with u all the time in ur...
5,0.0,factsguide: society now #motivation
...,...,...
49155,,thought factory: left-right polarisation! #tru...
49156,,feeling like a mermaid ð #hairflip #neverre...
49157,,#hillary #campaigned today in #ohio((omg)) &am...
49158,,"happy, at work conference: right mindset leads..."


### Заменим сокращения на их полные формы, используя short_word_dict. Для этого воспользуемся функцией, используемой в предыдущем пункте

In [13]:
text_prepr.repl_abb(repl_dict=short_word_dict)
text_prepr.df

Unnamed: 0_level_0,label,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.0,when a father is dysfunctional and is so selfi...
2,0.0,thanks for #lyft credit i cannot use cause the...
3,0.0,bihday your majesty
4,0.0,#model i love you take with you all the time i...
5,0.0,factsguide: society now #motivation
...,...,...
49155,,thought factory: left-right polarisation! #tru...
49156,,feeling like a mermaid ð #hairflip #neverre...
49157,,#hillary #campaigned today in #ohio((omg)) &am...
49158,,"happy, at work conference: right mindset leads..."


### Заменим эмотиконы (пример: ":)" = "happy") на пробелы, используя emoticon_dict. Для этого воспользуемся функцией, используемой в предыдущем пункте

In [14]:
text_prepr.repl_abb(repl_dict=emoticon_dict)
text_prepr.df

Unnamed: 0_level_0,label,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.0,when a father is dysfunctional and is so selfi...
2,0.0,thanks for #lyft credit i cannot use cause the...
3,0.0,bihday your majesty
4,0.0,#model i love you take with you all the time i...
5,0.0,factsguide: society now #motivation
...,...,...
49155,,thought factory: left-right polarisation! #tru...
49156,,feeling like a mermaid ð #hairflip #neverre...
49157,,#hillary #campaigned today in #ohio((omg)) &am...
49158,,"happy, at work conference: right mindset leads..."


### Заменим пунктуацию на пробелы, используя re.sub() и паттерн r'[^\w\s]'

In [15]:
text_prepr.resub_patt(pattern=r'[^\w\s]')
text_prepr.df

Unnamed: 0_level_0,label,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.0,when a father is dysfunctional and is so selfi...
2,0.0,thanks for lyft credit i cannot use cause they...
3,0.0,bihday your majesty
4,0.0,model i love you take with you all the time in...
5,0.0,factsguide society now motivation
...,...,...
49155,,thought factory leftright polarisation trump u...
49156,,feeling like a mermaid ð hairflip neverready f...
49157,,hillary campaigned today in ohioomg amp used w...
49158,,happy at work conference right mindset leads t...


### Заменим спец. символы на пробелы, используя re.sub() и паттерн r'[^a-zA-Z0-9]

In [16]:
text_prepr.resub_patt(pattern=r'[^a-zA-Z0-9]', repl=' ')
text_prepr.df

Unnamed: 0_level_0,label,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.0,when a father is dysfunctional and is so selfi...
2,0.0,thanks for lyft credit i cannot use cause they...
3,0.0,bihday your majesty
4,0.0,model i love you take with you all the time in...
5,0.0,factsguide society now motivation
...,...,...
49155,,thought factory leftright polarisation trump u...
49156,,feeling like a mermaid hairflip neverready f...
49157,,hillary campaigned today in ohioomg amp used w...
49158,,happy at work conference right mindset leads t...


### Заменим числа на пробелы, используя re.sub() и паттерн r'[^a-zA-Z]'

In [17]:
text_prepr.resub_patt(pattern=r'[^a-zA-Z]', repl=' ')
text_prepr.df

Unnamed: 0_level_0,label,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.0,when a father is dysfunctional and is so selfi...
2,0.0,thanks for lyft credit i cannot use cause they...
3,0.0,bihday your majesty
4,0.0,model i love you take with you all the time in...
5,0.0,factsguide society now motivation
...,...,...
49155,,thought factory leftright polarisation trump u...
49156,,feeling like a mermaid hairflip neverready f...
49157,,hillary campaigned today in ohioomg amp used w...
49158,,happy at work conference right mindset leads t...


### Удалим из текста слова длиной в 1 символ, используя ' '.join([w for w in x.split() if len(w)>1])

In [18]:
text_prepr.clean_by_len()
text_prepr.df

Unnamed: 0_level_0,label,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.0,when father is dysfunctional and is so selfish...
2,0.0,thanks for lyft credit cannot use cause they d...
3,0.0,bihday your majesty
4,0.0,model love you take with you all the time in ur
5,0.0,factsguide society now motivation
...,...,...
49155,,thought factory leftright polarisation trump u...
49156,,feeling like mermaid hairflip neverready forma...
49157,,hillary campaigned today in ohioomg amp used w...
49158,,happy at work conference right mindset leads t...


### Поделим твиты на токены с помощью nltk.tokenize.word_tokenize, создав новый столбец 'tweet_token'

In [21]:
text_prepr.tokenize(method='tokenize', new_column='tweet_token')
text_prepr.df

Unnamed: 0_level_0,label,tweet,tweet_token
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.0,when father is dysfunctional and is so selfish...,"[when, father, is, dysfunctional, and, is, so,..."
2,0.0,thanks for lyft credit cannot use cause they d...,"[thanks, for, lyft, credit, can, not, use, cau..."
3,0.0,bihday your majesty,"[bihday, your, majesty]"
4,0.0,model love you take with you all the time in ur,"[model, love, you, take, with, you, all, the, ..."
5,0.0,factsguide society now motivation,"[factsguide, society, now, motivation]"
...,...,...,...
49155,,thought factory leftright polarisation trump u...,"[thought, factory, leftright, polarisation, tr..."
49156,,feeling like mermaid hairflip neverready forma...,"[feeling, like, mermaid, hairflip, neverready,..."
49157,,hillary campaigned today in ohioomg amp used w...,"[hillary, campaigned, today, in, ohioomg, amp,..."
49158,,happy at work conference right mindset leads t...,"[happy, at, work, conference, right, mindset, ..."


### Удалим стоп-слова из токенов, используя nltk.corpus.stopwords. Создадим столбец 'tweet_token_filtered' без стоп-слов

In [22]:
text_prepr.clean_stop_words(target='tweet_token', new_column='tweet_token_filtered')
text_prepr.df

Unnamed: 0_level_0,label,tweet,tweet_token,tweet_token_filtered
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.0,when father is dysfunctional and is so selfish...,"[when, father, is, dysfunctional, and, is, so,...","[father, dysfunctional, selfish, drags, kids, ..."
2,0.0,thanks for lyft credit cannot use cause they d...,"[thanks, for, lyft, credit, can, not, use, cau...","[thanks, lyft, credit, use, cause, offer, whee..."
3,0.0,bihday your majesty,"[bihday, your, majesty]","[bihday, majesty]"
4,0.0,model love you take with you all the time in ur,"[model, love, you, take, with, you, all, the, ...","[model, love, take, time, ur]"
5,0.0,factsguide society now motivation,"[factsguide, society, now, motivation]","[factsguide, society, motivation]"
...,...,...,...,...
49155,,thought factory leftright polarisation trump u...,"[thought, factory, leftright, polarisation, tr...","[thought, factory, leftright, polarisation, tr..."
49156,,feeling like mermaid hairflip neverready forma...,"[feeling, like, mermaid, hairflip, neverready,...","[feeling, like, mermaid, hairflip, neverready,..."
49157,,hillary campaigned today in ohioomg amp used w...,"[hillary, campaigned, today, in, ohioomg, amp,...","[hillary, campaigned, today, ohioomg, amp, use..."
49158,,happy at work conference right mindset leads t...,"[happy, at, work, conference, right, mindset, ...","[happy, work, conference, right, mindset, lead..."


### Применим стемминг к токенам с помощью nltk.stem.PorterStemmer. Создадим столбец 'tweet_stemmed' после применения стемминга

In [23]:
text_prepr.stemming(target='tweet_token', new_column='tweet_stemmed')
text_prepr.df

Unnamed: 0_level_0,label,tweet,tweet_token,tweet_token_filtered,tweet_stemmed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.0,when father is dysfunctional and is so selfish...,"[when, father, is, dysfunctional, and, is, so,...","[father, dysfunctional, selfish, drags, kids, ...","[when, father, is, dysfunct, and, is, so, self..."
2,0.0,thanks for lyft credit cannot use cause they d...,"[thanks, for, lyft, credit, can, not, use, cau...","[thanks, lyft, credit, use, cause, offer, whee...","[thank, for, lyft, credit, can, not, use, caus..."
3,0.0,bihday your majesty,"[bihday, your, majesty]","[bihday, majesty]","[bihday, your, majesti]"
4,0.0,model love you take with you all the time in ur,"[model, love, you, take, with, you, all, the, ...","[model, love, take, time, ur]","[model, love, you, take, with, you, all, the, ..."
5,0.0,factsguide society now motivation,"[factsguide, society, now, motivation]","[factsguide, society, motivation]","[factsguid, societi, now, motiv]"
...,...,...,...,...,...
49155,,thought factory leftright polarisation trump u...,"[thought, factory, leftright, polarisation, tr...","[thought, factory, leftright, polarisation, tr...","[thought, factori, leftright, polaris, trump, ..."
49156,,feeling like mermaid hairflip neverready forma...,"[feeling, like, mermaid, hairflip, neverready,...","[feeling, like, mermaid, hairflip, neverready,...","[feel, like, mermaid, hairflip, neverreadi, fo..."
49157,,hillary campaigned today in ohioomg amp used w...,"[hillary, campaigned, today, in, ohioomg, amp,...","[hillary, campaigned, today, ohioomg, amp, use...","[hillari, campaign, today, in, ohioomg, amp, u..."
49158,,happy at work conference right mindset leads t...,"[happy, at, work, conference, right, mindset, ...","[happy, work, conference, right, mindset, lead...","[happi, at, work, confer, right, mindset, lead..."


### Применим лемматизацию к токенам с помощью nltk.stem.wordnet.WordNetLemmatizer. Создадим столбец 'tweet_lemmatized' после применения лемматизации

In [26]:
text_prepr.lemmating(target='tweet_token', new_column='tweet_lemmatized')
text_prepr.df

Unnamed: 0_level_0,label,tweet,tweet_token,tweet_token_filtered,tweet_stemmed,tweet_lemmatized
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.0,when father is dysfunctional and is so selfish...,"[when, father, is, dysfunctional, and, is, so,...","[father, dysfunctional, selfish, drags, kids, ...","[when, father, is, dysfunct, and, is, so, self...","[when, father, be, dysfunctional, and, be, so,..."
2,0.0,thanks for lyft credit cannot use cause they d...,"[thanks, for, lyft, credit, can, not, use, cau...","[thanks, lyft, credit, use, cause, offer, whee...","[thank, for, lyft, credit, can, not, use, caus...","[thank, for, lyft, credit, can, not, use, caus..."
3,0.0,bihday your majesty,"[bihday, your, majesty]","[bihday, majesty]","[bihday, your, majesti]","[bihday, your, majesty]"
4,0.0,model love you take with you all the time in ur,"[model, love, you, take, with, you, all, the, ...","[model, love, take, time, ur]","[model, love, you, take, with, you, all, the, ...","[model, love, you, take, with, you, all, the, ..."
5,0.0,factsguide society now motivation,"[factsguide, society, now, motivation]","[factsguide, society, motivation]","[factsguid, societi, now, motiv]","[factsguide, society, now, motivation]"
...,...,...,...,...,...,...
49155,,thought factory leftright polarisation trump u...,"[thought, factory, leftright, polarisation, tr...","[thought, factory, leftright, polarisation, tr...","[thought, factori, leftright, polaris, trump, ...","[think, factory, leftright, polarisation, trum..."
49156,,feeling like mermaid hairflip neverready forma...,"[feeling, like, mermaid, hairflip, neverready,...","[feeling, like, mermaid, hairflip, neverready,...","[feel, like, mermaid, hairflip, neverreadi, fo...","[feel, like, mermaid, hairflip, neverready, fo..."
49157,,hillary campaigned today in ohioomg amp used w...,"[hillary, campaigned, today, in, ohioomg, amp,...","[hillary, campaigned, today, ohioomg, amp, use...","[hillari, campaign, today, in, ohioomg, amp, u...","[hillary, campaign, today, in, ohioomg, amp, u..."
49158,,happy at work conference right mindset leads t...,"[happy, at, work, conference, right, mindset, ...","[happy, work, conference, right, mindset, lead...","[happi, at, work, confer, right, mindset, lead...","[happy, at, work, conference, right, mindset, ..."


### Сохраним результат предобработки в pickle-файл

In [27]:
text_prepr.df.to_pickle(path='/content/Lesson_1_done.pkl')