In [6]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)
import os

pd.set_option('max_colwidth', 900)

In [7]:
train_df = pd.read_csv('../../data/nlp/train_tweets.csv')
train_df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run
1,2,0,@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦
4,5,0,factsguide: society now #motivation


In [8]:
test_df = pd.read_csv('../../data/nlp/test_tweets.csv')
test_df.head()

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedication #willpower to find #newmaterialsâ¦
1,31964,@user #white #supremacists want everyone to see the new â #birdsâ #movie â and hereâs why
2,31965,safe ways to heal your #acne!! #altwaystoheal #healthy #healing!!
3,31966,"is the hp and the cursed child book up for reservations already? if yes, where? if no, when? ððð #harrypotter #pottermore #favorite"
4,31967,"3rd #bihday to my amazing, hilarious #nephew eli ahmir! uncle dave loves you and missesâ¦"


In [9]:
combine_df = train_df.append(test_df, ignore_index = True, sort = False)
combine_df.head()

Unnamed: 0,id,label,tweet
0,1,0.0,@user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run
1,2,0.0,@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked
2,3,0.0,bihday your majesty
3,4,0.0,#model i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦
4,5,0.0,factsguide: society now #motivation


In [10]:
print(combine_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49159 entries, 0 to 49158
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      49159 non-null  int64  
 1   label   31962 non-null  float64
 2   tweet   49159 non-null  object 
dtypes: float64(1), int64(1), object(1)
memory usage: 1.1+ MB
None


In [11]:
def user_remove(df):
    return df.replace(regex=r'@[\w]*', value=' ', inplace=True)

user_remove(combine_df)

combine_df.head()

Unnamed: 0,id,label,tweet
0,1,0.0,when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run
1,2,0.0,thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked
2,3,0.0,bihday your majesty
3,4,0.0,#model i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦
4,5,0.0,factsguide: society now #motivation


In [12]:
combine_df['tweet'].str.lower()

0                                                          when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run
1                                           thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked
2                                                                                                                                      bihday your majesty
3                                                                   #model   i love u take with u all the time in urð±!!! ðððð
ð¦ð¦ð¦  
4                                                                                                                   factsguide: society now    #motivation
                                                                               ...                                                                        
49154                                         thought factory: left-ri

In [13]:
apostrophe_dict = {
"ain't": "am not / are not",
"aren't": "are not / am not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he had / he would",
"he'd've": "he would have",
"he'll": "he shall / he will",
"he'll've": "he shall have / he will have",
"he's": "he has / he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how has / how is",
"i'd": "I had / I would",
"i'd've": "I would have",
"i'll": "I shall / I will",
"i'll've": "I shall have / I will have",
"i'm": "I am",
"i've": "I have",
"isn't": "is not",
"it'd": "it had / it would",
"it'd've": "it would have",
"it'll": "it shall / it will",
"it'll've": "it shall have / it will have",
"it's": "it has / it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she had / she would",
"she'd've": "she would have",
"she'll": "she shall / she will",
"she'll've": "she shall have / she will have",
"she's": "she has / she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as / so is",
"that'd": "that would / that had",
"that'd've": "that would have",
"that's": "that has / that is",
"there'd": "there had / there would",
"there'd've": "there would have",
"there's": "there has / there is",
"they'd": "they had / they would",
"they'd've": "they would have",
"they'll": "they shall / they will",
"they'll've": "they shall have / they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we had / we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what shall / what will",
"what'll've": "what shall have / what will have",
"what're": "what are",
"what's": "what has / what is",
"what've": "what have",
"when's": "when has / when is",
"when've": "when have",
"where'd": "where did",
"where's": "where has / where is",
"where've": "where have",
"who'll": "who shall / who will",
"who'll've": "who shall have / who will have",
"who's": "who has / who is",
"who've": "who have",
"why's": "why has / why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you had / you would",
"you'd've": "you would have",
"you'll": "you shall / you will",
"you'll've": "you shall have / you will have",
"you're": "you are",
"you've": "you have"
}

In [14]:
short_word_dict = {
"121": "one to one",
"a/s/l": "age, sex, location",
"adn": "any day now",
"afaik": "as far as I know",
"afk": "away from keyboard",
"aight": "alright",
"alol": "actually laughing out loud",
"b4": "before",
"b4n": "bye for now",
"bak": "back at the keyboard",
"bf": "boyfriend",
"bff": "best friends forever",
"bfn": "bye for now",
"bg": "big grin",
"bta": "but then again",
"btw": "by the way",
"cid": "crying in disgrace",
"cnp": "continued in my next post",
"cp": "chat post",
"cu": "see you",
"cul": "see you later",
"cul8r": "see you later",
"cya": "bye",
"cyo": "see you online",
"dbau": "doing business as usual",
"fud": "fear, uncertainty, and doubt",
"fwiw": "for what it's worth",
"fyi": "for your information",
"g": "grin",
"g2g": "got to go",
"ga": "go ahead",
"gal": "get a life",
"gf": "girlfriend",
"gfn": "gone for now",
"gmbo": "giggling my butt off",
"gmta": "great minds think alike",
"h8": "hate",
"hagn": "have a good night",
"hdop": "help delete online predators",
"hhis": "hanging head in shame",
"iac": "in any case",
"ianal": "I am not a lawyer",
"ic": "I see",
"idk": "I don't know",
"imao": "in my arrogant opinion",
"imnsho": "in my not so humble opinion",
"imo": "in my opinion",
"iow": "in other words",
"ipn": "I’m posting naked",
"irl": "in real life",
"jk": "just kidding",
"l8r": "later",
"ld": "later, dude",
"ldr": "long distance relationship",
"llta": "lots and lots of thunderous applause",
"lmao": "laugh my ass off",
"lmirl": "let's meet in real life",
"lol": "laugh out loud",
"ltr": "longterm relationship",
"lulab": "love you like a brother",
"lulas": "love you like a sister",
"luv": "love",
"m/f": "male or female",
"m8": "mate",
"milf": "mother I would like to fuck",
"oll": "online love",
"omg": "oh my god",
"otoh": "on the other hand",
"pir": "parent in room",
"ppl": "people",
"r": "are",
"rofl": "roll on the floor laughing",
"rpg": "role playing games",
"ru": "are you",
"shid": "slaps head in disgust",
"somy": "sick of me yet",
"sot": "short of time",
"thanx": "thanks",
"thx": "thanks",
"ttyl": "talk to you later",
"u": "you",
"ur": "you are",
"uw": "you’re welcome",
"wb": "welcome back",
"wfm": "works for me",
"wibni": "wouldn't it be nice if",
"wtf": "what the fuck",
"wtg": "way to go",
"wtgp": "want to go private",
"ym": "young man",
"gr8": "great"
}

In [15]:
def apostrophe_replace(df):
    for index, row in df.iterrows():
        text = ''
        for word in row['tweet'].split():
            for key, value in short_word_dict.items():
                if word == key:
                    word = value
            text = ' '.join((text, word))
        df.iloc[index, 2] = text
            
apostrophe_replace(combine_df)

combine_df.head()

Unnamed: 0,id,label,tweet
0,1,0.0,when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run
1,2,0.0,thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked
2,3,0.0,bihday your majesty
3,4,0.0,#model i love you take with you all the time in urð±!!! ðððð ð¦ð¦ð¦
4,5,0.0,factsguide: society now #motivation


In [16]:
combine_df[127:130]

Unnamed: 0,id,label,tweet
127,128,0.0,sad in the branches itâs just rainy day writing tears are flying birds #haiku #3lines #micropoetry
128,129,0.0,yeah! new buttons in the mail for me ð they are so pretty! :) #jewelrymaking #buttons
129,130,0.0,driver hit female moose on river rd #weston. moose was killed. driver is ok. crews removing animal now


In [17]:
emoticon_dict = {
":)": "happy",
":‑)": "happy",
":-]": "happy",
":-3": "happy",
":->": "happy",
"8-)": "happy",
":-}": "happy",
":o)": "happy",
":c)": "happy",
":^)": "happy",
"=]": "happy",
"=)": "happy",
"<3": "happy",
":-(": "sad",
":(": "sad",
":c": "sad",
":<": "sad",
":[": "sad",
">:[": "sad",
":{": "sad",
">:(": "sad",
":-c": "sad",
":-< ": "sad",
":-[": "sad",
":-||": "sad"
}

In [18]:
def emoticon_replace(df):
    for index, row in df.iterrows():
        text = ''
        for word in row['tweet'].split():
            for key, value in emoticon_dict.items():
                if word == key:
                    word = value
            text = ' '.join((text, word))
        df.iloc[index, 2] = text
            
emoticon_replace(combine_df)

combine_df.head()

Unnamed: 0,id,label,tweet
0,1,0.0,when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run
1,2,0.0,thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked
2,3,0.0,bihday your majesty
3,4,0.0,#model i love you take with you all the time in urð±!!! ðððð ð¦ð¦ð¦
4,5,0.0,factsguide: society now #motivation


Проверяем твит с эмтоиконом

In [19]:
combine_df[127:130]

Unnamed: 0,id,label,tweet
127,128,0.0,sad in the branches itâs just rainy day writing tears are flying birds #haiku #3lines #micropoetry
128,129,0.0,yeah! new buttons in the mail for me ð they are so pretty! happy #jewelrymaking #buttons
129,130,0.0,driver hit female moose on river rd #weston. moose was killed. driver is ok. crews removing animal now


7. Заменим пунктуацию на пробелы, используя re.sub() и паттерн r'[^\w\s]'

In [20]:
combine_df[8:9]

Unnamed: 0,id,label,tweet
8,9,0.0,we won!!! love the land!!! #allin #cavs #champions #cleveland #clevelandcavaliers â¦


In [21]:
def punctuation_replace(df):
    return df.replace(regex=r'[^\w\s]', value=' ', inplace=True)

punctuation_replace(combine_df)

combine_df.head(10)

Unnamed: 0,id,label,tweet
0,1,0.0,when a father is dysfunctional and is so selfish he drags his kids into his dysfunction run
1,2,0.0,thanks for lyft credit i can t use cause they don t offer wheelchair vans in pdx disapointed getthanked
2,3,0.0,bihday your majesty
3,4,0.0,model i love you take with you all the time in urð ð ð ð ð ð ð ð
4,5,0.0,factsguide society now motivation
5,6,0.0,2 2 huge fan fare and big talking before they leave chaos and pay disputes when they get there allshowandnogo
6,7,0.0,camping tomorrow dannyâ
7,8,0.0,the next school year is the year for exams ð can t think about that ð school exams hate imagine actorslife revolutionschool girl
8,9,0.0,we won love the land allin cavs champions cleveland clevelandcavaliers â
9,10,0.0,welcome here i m it s so gr8


In [22]:
def special_symbols_replace(df):
    return df.replace(regex=r'[^a-zA-Z0-9]', value=' ', inplace=True)

special_symbols_replace(combine_df)

combine_df.head(10)

Unnamed: 0,id,label,tweet
0,1,0.0,when a father is dysfunctional and is so selfish he drags his kids into his dysfunction run
1,2,0.0,thanks for lyft credit i can t use cause they don t offer wheelchair vans in pdx disapointed getthanked
2,3,0.0,bihday your majesty
3,4,0.0,model i love you take with you all the time in ur
4,5,0.0,factsguide society now motivation
5,6,0.0,2 2 huge fan fare and big talking before they leave chaos and pay disputes when they get there allshowandnogo
6,7,0.0,camping tomorrow danny
7,8,0.0,the next school year is the year for exams can t think about that school exams hate imagine actorslife revolutionschool girl
8,9,0.0,we won love the land allin cavs champions cleveland clevelandcavaliers
9,10,0.0,welcome here i m it s so gr8


In [23]:
def numbers_replace(df):
    return df.replace(regex=r'[^a-zA-Z]', value=' ', inplace=True)

numbers_replace(combine_df)

combine_df.head(10)

Unnamed: 0,id,label,tweet
0,1,0.0,when a father is dysfunctional and is so selfish he drags his kids into his dysfunction run
1,2,0.0,thanks for lyft credit i can t use cause they don t offer wheelchair vans in pdx disapointed getthanked
2,3,0.0,bihday your majesty
3,4,0.0,model i love you take with you all the time in ur
4,5,0.0,factsguide society now motivation
5,6,0.0,huge fan fare and big talking before they leave chaos and pay disputes when they get there allshowandnogo
6,7,0.0,camping tomorrow danny
7,8,0.0,the next school year is the year for exams can t think about that school exams hate imagine actorslife revolutionschool girl
8,9,0.0,we won love the land allin cavs champions cleveland clevelandcavaliers
9,10,0.0,welcome here i m it s so gr


In [24]:
combine_df['tweet'] = combine_df['tweet'].apply(lambda x: ' '.join(w for w in x.split() if len(w) > 1))

combine_df.head(10)

Unnamed: 0,id,label,tweet
0,1,0.0,when father is dysfunctional and is so selfish he drags his kids into his dysfunction run
1,2,0.0,thanks for lyft credit can use cause they don offer wheelchair vans in pdx disapointed getthanked
2,3,0.0,bihday your majesty
3,4,0.0,model love you take with you all the time in ur
4,5,0.0,factsguide society now motivation
5,6,0.0,huge fan fare and big talking before they leave chaos and pay disputes when they get there allshowandnogo
6,7,0.0,camping tomorrow danny
7,8,0.0,the next school year is the year for exams can think about that school exams hate imagine actorslife revolutionschool girl
8,9,0.0,we won love the land allin cavs champions cleveland clevelandcavaliers
9,10,0.0,welcome here it so gr


In [25]:
combine_df['tweet_token'] = combine_df['tweet'].apply(lambda x: nltk.word_tokenize(x))

combine_df

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Alex\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,id,label,tweet,tweet_token
0,1,0.0,when father is dysfunctional and is so selfish he drags his kids into his dysfunction run,"[when, father, is, dysfunctional, and, is, so, selfish, he, drags, his, kids, into, his, dysfunction, run]"
1,2,0.0,thanks for lyft credit can use cause they don offer wheelchair vans in pdx disapointed getthanked,"[thanks, for, lyft, credit, can, use, cause, they, don, offer, wheelchair, vans, in, pdx, disapointed, getthanked]"
2,3,0.0,bihday your majesty,"[bihday, your, majesty]"
3,4,0.0,model love you take with you all the time in ur,"[model, love, you, take, with, you, all, the, time, in, ur]"
4,5,0.0,factsguide society now motivation,"[factsguide, society, now, motivation]"
...,...,...,...,...
49154,49155,,thought factory left right polarisation trump uselections leadership politics brexit blm gt,"[thought, factory, left, right, polarisation, trump, uselections, leadership, politics, brexit, blm, gt]"
49155,49156,,feeling like mermaid hairflip neverready formal wedding gown dresses mermaid,"[feeling, like, mermaid, hairflip, neverready, formal, wedding, gown, dresses, mermaid]"
49156,49157,,hillary campaigned today in ohio omg amp used words like assets amp liability never once did clinton say thee word radicalization,"[hillary, campaigned, today, in, ohio, omg, amp, used, words, like, assets, amp, liability, never, once, did, clinton, say, thee, word, radicalization]"
49157,49158,,happy at work conference right mindset leads to culture of development organizations work mindset,"[happy, at, work, conference, right, mindset, leads, to, culture, of, development, organizations, work, mindset]"


In [26]:
stop_words = nltk.corpus.stopwords.words('english')
combine_df['tweet_token_filtered' ] = combine_df.tweet_token.apply(lambda x: [w for w in x if w not in stop_words])

In [27]:
combine_df

Unnamed: 0,id,label,tweet,tweet_token,tweet_token_filtered
0,1,0.0,when father is dysfunctional and is so selfish he drags his kids into his dysfunction run,"[when, father, is, dysfunctional, and, is, so, selfish, he, drags, his, kids, into, his, dysfunction, run]","[father, dysfunctional, selfish, drags, kids, dysfunction, run]"
1,2,0.0,thanks for lyft credit can use cause they don offer wheelchair vans in pdx disapointed getthanked,"[thanks, for, lyft, credit, can, use, cause, they, don, offer, wheelchair, vans, in, pdx, disapointed, getthanked]","[thanks, lyft, credit, use, cause, offer, wheelchair, vans, pdx, disapointed, getthanked]"
2,3,0.0,bihday your majesty,"[bihday, your, majesty]","[bihday, majesty]"
3,4,0.0,model love you take with you all the time in ur,"[model, love, you, take, with, you, all, the, time, in, ur]","[model, love, take, time, ur]"
4,5,0.0,factsguide society now motivation,"[factsguide, society, now, motivation]","[factsguide, society, motivation]"
...,...,...,...,...,...
49154,49155,,thought factory left right polarisation trump uselections leadership politics brexit blm gt,"[thought, factory, left, right, polarisation, trump, uselections, leadership, politics, brexit, blm, gt]","[thought, factory, left, right, polarisation, trump, uselections, leadership, politics, brexit, blm, gt]"
49155,49156,,feeling like mermaid hairflip neverready formal wedding gown dresses mermaid,"[feeling, like, mermaid, hairflip, neverready, formal, wedding, gown, dresses, mermaid]","[feeling, like, mermaid, hairflip, neverready, formal, wedding, gown, dresses, mermaid]"
49156,49157,,hillary campaigned today in ohio omg amp used words like assets amp liability never once did clinton say thee word radicalization,"[hillary, campaigned, today, in, ohio, omg, amp, used, words, like, assets, amp, liability, never, once, did, clinton, say, thee, word, radicalization]","[hillary, campaigned, today, ohio, omg, amp, used, words, like, assets, amp, liability, never, clinton, say, thee, word, radicalization]"
49157,49158,,happy at work conference right mindset leads to culture of development organizations work mindset,"[happy, at, work, conference, right, mindset, leads, to, culture, of, development, organizations, work, mindset]","[happy, work, conference, right, mindset, leads, culture, development, organizations, work, mindset]"


In [28]:
stemmer = nltk.stem.PorterStemmer()
combine_df['tweet_stemmed'] = combine_df.tweet_token_filtered.apply(lambda x: [stemmer.stem(w) for w in x])

combine_df

Unnamed: 0,id,label,tweet,tweet_token,tweet_token_filtered,tweet_stemmed
0,1,0.0,when father is dysfunctional and is so selfish he drags his kids into his dysfunction run,"[when, father, is, dysfunctional, and, is, so, selfish, he, drags, his, kids, into, his, dysfunction, run]","[father, dysfunctional, selfish, drags, kids, dysfunction, run]","[father, dysfunct, selfish, drag, kid, dysfunct, run]"
1,2,0.0,thanks for lyft credit can use cause they don offer wheelchair vans in pdx disapointed getthanked,"[thanks, for, lyft, credit, can, use, cause, they, don, offer, wheelchair, vans, in, pdx, disapointed, getthanked]","[thanks, lyft, credit, use, cause, offer, wheelchair, vans, pdx, disapointed, getthanked]","[thank, lyft, credit, use, caus, offer, wheelchair, van, pdx, disapoint, getthank]"
2,3,0.0,bihday your majesty,"[bihday, your, majesty]","[bihday, majesty]","[bihday, majesti]"
3,4,0.0,model love you take with you all the time in ur,"[model, love, you, take, with, you, all, the, time, in, ur]","[model, love, take, time, ur]","[model, love, take, time, ur]"
4,5,0.0,factsguide society now motivation,"[factsguide, society, now, motivation]","[factsguide, society, motivation]","[factsguid, societi, motiv]"
...,...,...,...,...,...,...
49154,49155,,thought factory left right polarisation trump uselections leadership politics brexit blm gt,"[thought, factory, left, right, polarisation, trump, uselections, leadership, politics, brexit, blm, gt]","[thought, factory, left, right, polarisation, trump, uselections, leadership, politics, brexit, blm, gt]","[thought, factori, left, right, polaris, trump, uselect, leadership, polit, brexit, blm, gt]"
49155,49156,,feeling like mermaid hairflip neverready formal wedding gown dresses mermaid,"[feeling, like, mermaid, hairflip, neverready, formal, wedding, gown, dresses, mermaid]","[feeling, like, mermaid, hairflip, neverready, formal, wedding, gown, dresses, mermaid]","[feel, like, mermaid, hairflip, neverreadi, formal, wed, gown, dress, mermaid]"
49156,49157,,hillary campaigned today in ohio omg amp used words like assets amp liability never once did clinton say thee word radicalization,"[hillary, campaigned, today, in, ohio, omg, amp, used, words, like, assets, amp, liability, never, once, did, clinton, say, thee, word, radicalization]","[hillary, campaigned, today, ohio, omg, amp, used, words, like, assets, amp, liability, never, clinton, say, thee, word, radicalization]","[hillari, campaign, today, ohio, omg, amp, use, word, like, asset, amp, liabil, never, clinton, say, thee, word, radic]"
49157,49158,,happy at work conference right mindset leads to culture of development organizations work mindset,"[happy, at, work, conference, right, mindset, leads, to, culture, of, development, organizations, work, mindset]","[happy, work, conference, right, mindset, leads, culture, development, organizations, work, mindset]","[happi, work, confer, right, mindset, lead, cultur, develop, organ, work, mindset]"


In [29]:
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
combine_df['tweet_lemmatized'] = combine_df.tweet_token_filtered.apply(lambda x: [lemmatizer.lemmatize(w) for w in x])

combine_df

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Alex\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,id,label,tweet,tweet_token,tweet_token_filtered,tweet_stemmed,tweet_lemmatized
0,1,0.0,when father is dysfunctional and is so selfish he drags his kids into his dysfunction run,"[when, father, is, dysfunctional, and, is, so, selfish, he, drags, his, kids, into, his, dysfunction, run]","[father, dysfunctional, selfish, drags, kids, dysfunction, run]","[father, dysfunct, selfish, drag, kid, dysfunct, run]","[father, dysfunctional, selfish, drag, kid, dysfunction, run]"
1,2,0.0,thanks for lyft credit can use cause they don offer wheelchair vans in pdx disapointed getthanked,"[thanks, for, lyft, credit, can, use, cause, they, don, offer, wheelchair, vans, in, pdx, disapointed, getthanked]","[thanks, lyft, credit, use, cause, offer, wheelchair, vans, pdx, disapointed, getthanked]","[thank, lyft, credit, use, caus, offer, wheelchair, van, pdx, disapoint, getthank]","[thanks, lyft, credit, use, cause, offer, wheelchair, van, pdx, disapointed, getthanked]"
2,3,0.0,bihday your majesty,"[bihday, your, majesty]","[bihday, majesty]","[bihday, majesti]","[bihday, majesty]"
3,4,0.0,model love you take with you all the time in ur,"[model, love, you, take, with, you, all, the, time, in, ur]","[model, love, take, time, ur]","[model, love, take, time, ur]","[model, love, take, time, ur]"
4,5,0.0,factsguide society now motivation,"[factsguide, society, now, motivation]","[factsguide, society, motivation]","[factsguid, societi, motiv]","[factsguide, society, motivation]"
...,...,...,...,...,...,...,...
49154,49155,,thought factory left right polarisation trump uselections leadership politics brexit blm gt,"[thought, factory, left, right, polarisation, trump, uselections, leadership, politics, brexit, blm, gt]","[thought, factory, left, right, polarisation, trump, uselections, leadership, politics, brexit, blm, gt]","[thought, factori, left, right, polaris, trump, uselect, leadership, polit, brexit, blm, gt]","[thought, factory, left, right, polarisation, trump, uselections, leadership, politics, brexit, blm, gt]"
49155,49156,,feeling like mermaid hairflip neverready formal wedding gown dresses mermaid,"[feeling, like, mermaid, hairflip, neverready, formal, wedding, gown, dresses, mermaid]","[feeling, like, mermaid, hairflip, neverready, formal, wedding, gown, dresses, mermaid]","[feel, like, mermaid, hairflip, neverreadi, formal, wed, gown, dress, mermaid]","[feeling, like, mermaid, hairflip, neverready, formal, wedding, gown, dress, mermaid]"
49156,49157,,hillary campaigned today in ohio omg amp used words like assets amp liability never once did clinton say thee word radicalization,"[hillary, campaigned, today, in, ohio, omg, amp, used, words, like, assets, amp, liability, never, once, did, clinton, say, thee, word, radicalization]","[hillary, campaigned, today, ohio, omg, amp, used, words, like, assets, amp, liability, never, clinton, say, thee, word, radicalization]","[hillari, campaign, today, ohio, omg, amp, use, word, like, asset, amp, liabil, never, clinton, say, thee, word, radic]","[hillary, campaigned, today, ohio, omg, amp, used, word, like, asset, amp, liability, never, clinton, say, thee, word, radicalization]"
49157,49158,,happy at work conference right mindset leads to culture of development organizations work mindset,"[happy, at, work, conference, right, mindset, leads, to, culture, of, development, organizations, work, mindset]","[happy, work, conference, right, mindset, leads, culture, development, organizations, work, mindset]","[happi, work, confer, right, mindset, lead, cultur, develop, organ, work, mindset]","[happy, work, conference, right, mindset, lead, culture, development, organization, work, mindset]"


In [30]:
combine_df.to_pickle('../../data/nlp/tweets.pkl')