## Social Media Sentiment Analysis

In [62]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
import nltk
import seaborn as sns
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

%matplotlib inline

In [83]:
Trainset=pd.read_csv('traindata.csv')
Testset=pd.read_csv('testdata.csv')

In [82]:
Trainset.head()

Unnamed: 0,id,label,tweet
0,1,1,@JohnDoe I just got a promotion at work! Hard ...
1,2,0,@JaneDoe I can't believe I lost my wallet at t...
2,3,1,@BobSmith Had the best birthday party ever! Su...
3,4,0,@SarahJones I had a huge argument with my frie...
4,5,1,@MikeWilson Just adopted a kitten! She's the c...


In [81]:
Testset.head()

Unnamed: 0,id,label,tweet
0,1,1,@JohnDoe I just got a promotion at work! Feeli...
1,2,0,@JaneDoe I can't believe I lost my wallet at t...
2,3,1,@BobSmith Just finished an incredible book! It...
3,4,0,@SarahJones I had a terrible day at work. Ever...
4,5,1,@MikeWilson I finally learned how to bake a ca...


In [80]:
combined = pd.concat([Trainset,Testset])
combined.sort_values(by='id')

Unnamed: 0,id,label,tweet
0,1,1,@JohnDoe I just got a promotion at work! Hard ...
0,1,1,@JohnDoe I just got a promotion at work! Feeli...
1,2,0,@JaneDoe I can't believe I lost my wallet at t...
1,2,0,@JaneDoe I can't believe I lost my wallet at t...
2,3,1,@BobSmith Just finished an incredible book! It...
...,...,...,...
1395,1396,0,@SophiaDavis I spilled coffee all over my shir...
1396,1397,0,@AvaGreen I had a bad experience at the dentis...
1397,1398,0,@EthanBrown I can't believe I lost my keys aga...
1398,1399,0,@JacobMiller I had a terrible headache all day...


In [79]:
combined.shape

(1801, 3)

In [86]:
# Strip potential whitespace from column names
combined.columns = combined.columns.str.strip()

# Check for and replace any invisible unicode characters
import unicodedata
def clean_column_name(name):
    return unicodedata.normalize('NFKC', name)

combined.columns = [clean_column_name(col) for col in combined.columns]

# Now try the string replacement again
combined['tidy_tweet'] = combined['tweet'].str.replace("@[\w]*", " ")
combined['tidy_tweet'] = combined['tidy_tweet'].str.replace("[^a-zA-Z#]", " ")

In [87]:
combined.head(10)

Unnamed: 0,id,label,tweet,tidy_tweet
0,1,1,@JohnDoe I just got a promotion at work! Hard ...,@JohnDoe I just got a promotion at work! Hard ...
1,2,0,@JaneDoe I can't believe I lost my wallet at t...,@JaneDoe I can't believe I lost my wallet at t...
2,3,1,@BobSmith Had the best birthday party ever! Su...,@BobSmith Had the best birthday party ever! Su...
3,4,0,@SarahJones I had a huge argument with my frie...,@SarahJones I had a huge argument with my frie...
4,5,1,@MikeWilson Just adopted a kitten! She's the c...,@MikeWilson Just adopted a kitten! She's the c...
5,6,0,"@EmilyDavis My computer crashed, and I lost al...","@EmilyDavis My computer crashed, and I lost al..."
6,7,1,@DavidBrown I finally crossed off skydiving fr...,@DavidBrown I finally crossed off skydiving fr...
7,8,0,@LisaMiller I got a flat tire on my way to wor...,@LisaMiller I got a flat tire on my way to wor...
8,9,1,@JohnnyRockets Just booked a trip to Hawaii! C...,@JohnnyRockets Just booked a trip to Hawaii! C...
9,10,0,@SophiaTaylor I missed the bus and now I'm lat...,@SophiaTaylor I missed the bus and now I'm lat...


In [88]:
combined['tidy_tweet']=combined['tidy_tweet'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
combined

Unnamed: 0,id,label,tweet,tidy_tweet
0,1,1,@JohnDoe I just got a promotion at work! Hard ...,@JohnDoe just promotion work! Hard work really...
1,2,0,@JaneDoe I can't believe I lost my wallet at t...,@JaneDoe can't believe lost wallet mall. What ...
2,3,1,@BobSmith Had the best birthday party ever! Su...,@BobSmith best birthday party ever! Surrounded...
3,4,0,@SarahJones I had a huge argument with my frie...,@SarahJones huge argument with friend. hate wh...
4,5,1,@MikeWilson Just adopted a kitten! She's the c...,@MikeWilson Just adopted kitten! She's cutest ...
...,...,...,...,...
396,396,0,@SophiaDavis I lost my favorite jacket. I can'...,@SophiaDavis lost favorite jacket. can't belie...
397,397,1,@EthanGreen I had a wonderful day at the beach...,@EthanGreen wonderful beach! weather perfect.
398,398,0,@AvaJohnson I got a call from a telemarketer d...,@AvaJohnson call from telemarketer during dinn...
399,399,1,@JacobMiller I just discovered a new hobby! Pa...,@JacobMiller just discovered hobby! Painting r...


### Tokenization

In [89]:
tokenize_tweet = combined['tidy_tweet'].apply(lambda x: x.split(' '))
tokenize_tweet

0      [@JohnDoe, just, promotion, work!, Hard, work,...
1      [@JaneDoe, can't, believe, lost, wallet, mall....
2      [@BobSmith, best, birthday, party, ever!, Surr...
3      [@SarahJones, huge, argument, with, friend., h...
4      [@MikeWilson, Just, adopted, kitten!, She's, c...
                             ...                        
396    [@SophiaDavis, lost, favorite, jacket., can't,...
397    [@EthanGreen, wonderful, beach!, weather, perf...
398    [@AvaJohnson, call, from, telemarketer, during...
399    [@JacobMiller, just, discovered, hobby!, Paint...
400    [@EmmaDavis, fight, with, best, friend., reall...
Name: tidy_tweet, Length: 1801, dtype: object

In [91]:
import nltk
nltk.download('wordnet')
wnlemma = nltk.WordNetLemmatizer()
tokenize_tweet = combined['tidy_tweet'].apply(lambda x: [wnlemma.lemmatize(t) for t in x.split(' ')])

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Lemmatization

In [92]:
tokenize_tweet

0      [@JohnDoe, just, promotion, work!, Hard, work,...
1      [@JaneDoe, can't, believe, lost, wallet, mall....
2      [@BobSmith, best, birthday, party, ever!, Surr...
3      [@SarahJones, huge, argument, with, friend., h...
4      [@MikeWilson, Just, adopted, kitten!, She's, c...
                             ...                        
396    [@SophiaDavis, lost, favorite, jacket., can't,...
397    [@EthanGreen, wonderful, beach!, weather, perf...
398    [@AvaJohnson, call, from, telemarketer, during...
399    [@JacobMiller, just, discovered, hobby!, Paint...
400    [@EmmaDavis, fight, with, best, friend., reall...
Name: tidy_tweet, Length: 1801, dtype: object

In [93]:
combined['tidy_tweet']= tokenize_tweet.apply(lambda x: ' '.join(t for t in x))
combined['tidy_tweet']

0      @JohnDoe just promotion work! Hard work really...
1      @JaneDoe can't believe lost wallet mall. What ...
2      @BobSmith best birthday party ever! Surrounded...
3      @SarahJones huge argument with friend. hate wh...
4      @MikeWilson Just adopted kitten! She's cutest ...
                             ...                        
396    @SophiaDavis lost favorite jacket. can't belie...
397        @EthanGreen wonderful beach! weather perfect.
398    @AvaJohnson call from telemarketer during dinn...
399    @JacobMiller just discovered hobby! Painting r...
400    @EmmaDavis fight with best friend. really upse...
Name: tidy_tweet, Length: 1801, dtype: object

In [94]:
combined

Unnamed: 0,id,label,tweet,tidy_tweet
0,1,1,@JohnDoe I just got a promotion at work! Hard ...,@JohnDoe just promotion work! Hard work really...
1,2,0,@JaneDoe I can't believe I lost my wallet at t...,@JaneDoe can't believe lost wallet mall. What ...
2,3,1,@BobSmith Had the best birthday party ever! Su...,@BobSmith best birthday party ever! Surrounded...
3,4,0,@SarahJones I had a huge argument with my frie...,@SarahJones huge argument with friend. hate wh...
4,5,1,@MikeWilson Just adopted a kitten! She's the c...,@MikeWilson Just adopted kitten! She's cutest ...
...,...,...,...,...
396,396,0,@SophiaDavis I lost my favorite jacket. I can'...,@SophiaDavis lost favorite jacket. can't belie...
397,397,1,@EthanGreen I had a wonderful day at the beach...,@EthanGreen wonderful beach! weather perfect.
398,398,0,@AvaJohnson I got a call from a telemarketer d...,@AvaJohnson call from telemarketer during dinn...
399,399,1,@JacobMiller I just discovered a new hobby! Pa...,@JacobMiller just discovered hobby! Painting r...
