## Social Media Sentiment Analysis

In [10]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
import nltk
import seaborn as sns
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

%matplotlib inline

In [11]:
Trainset = pd.read_csv('traindata.csv', on_bad_lines='skip')
Testset = pd.read_csv('testdata1.csv', on_bad_lines='skip')

In [12]:
Trainset.head()

Unnamed: 0,id,label,tweet
0,1,1,#Promotion #Excited @JohnDoe I just got a prom...
1,2,0,#LostWallet #Nightmare @JaneDoe I can't believ...
2,3,1,#GreatBook #PageTurner @BobSmith Just finished...
3,4,0,#TerribleDay #WorkFail @SarahJones I had a ter...
4,5,1,#BakingSuccess #Delicious @MikeWilson I finall...


In [13]:
Testset.head()

Unnamed: 0,id,label,tweet
0,1,1,#ExcitingPromotion #CareerGrowth @JohnDoe I ju...
1,2,0,#LostWallet #Frustration @JaneDoe I can't beli...
2,3,1,#BirthdayFun #Grateful @BobSmith Had the best ...
3,4,0,#Argument #Misunderstanding @SarahJones I had ...
4,5,1,#AdoptedKitten #Cute @MikeWilson Just adopted ...


In [14]:
combined = pd.concat([Trainset,Testset])
combined.sort_values(by='id')

Unnamed: 0,id,label,tweet
0,1,1,#Promotion #Excited @JohnDoe I just got a prom...
0,1,1,#ExcitingPromotion #CareerGrowth @JohnDoe I ju...
1,2,0,#LostWallet #Nightmare @JaneDoe I can't believ...
1,2,0,#LostWallet #Frustration @JaneDoe I can't beli...
2,3,1,#GreatBook #PageTurner @BobSmith Just finished...
...,...,...,...
1329,1396,0,#TrafficJam #Frustrated @EmilyDavis I got stuc...
1330,1397,1,#HikeSuccess #AmazingViews @DavidBrown I went ...
1331,1398,0,#CoffeeSpill #Clumsy @LisaMiller I spilled cof...
1332,1399,1,#AuntUncleNews #Thrilled @JohnnyRockets I just...


In [15]:
combined.shape

(1901, 3)

## Data Pre-Processing

### Removing Twitter Handles,Punctuations, Numbers, and Special Characters

In [17]:
combined.columns = combined.columns.str.strip()

# Check for and replace any invisible unicode characters
import unicodedata
def clean_column_name(name):
    return unicodedata.normalize('NFKC', name)

combined.columns = [clean_column_name(col) for col in combined.columns]

# Now try the string replacement again
combined['tidy_tweet'] = combined['tweet'].str.replace("@[\w]*", " ")
combined['tidy_tweet'] = combined['tidy_tweet'].str.replace("[^a-zA-Z#]", " ")

In [None]:
combined.head(10)

Unnamed: 0,id,label,tweet,tidy_tweet
0,1,0.0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so se...
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for #lyft credit i can t use cause ...
2,3,0.0,bihday your majesty,bihday your majesty
3,4,0.0,#model i love u take with u all the time in ...,#model i love u take with u all the time in ...
4,5,0.0,factsguide: society now #motivation,factsguide society now #motivation
5,6,0.0,[2/2] huge fan fare and big talking before the...,huge fan fare and big talking before the...
6,7,0.0,@user camping tomorrow @user @user @user @use...,camping tomorrow danny
7,8,0.0,the next school year is the year for exams.ð...,the next school year is the year for exams ...
8,9,0.0,we won!!! love the land!!! #allin #cavs #champ...,we won love the land #allin #cavs #champ...
9,10,0.0,@user @user welcome here ! i'm it's so #gr...,welcome here i m it s so #gr


In [18]:
combined['tidy_tweet']=combined['tidy_tweet'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
combined

Unnamed: 0,id,label,tweet,tidy_tweet
0,1,1,#Promotion #Excited @JohnDoe I just got a prom...,#Promotion #Excited @JohnDoe just promotion wo...
1,2,0,#LostWallet #Nightmare @JaneDoe I can't believ...,#LostWallet #Nightmare @JaneDoe can't believe ...
2,3,1,#GreatBook #PageTurner @BobSmith Just finished...,#GreatBook #PageTurner @BobSmith Just finished...
3,4,0,#TerribleDay #WorkFail @SarahJones I had a ter...,#TerribleDay #WorkFail @SarahJones terrible wo...
4,5,1,#BakingSuccess #Delicious @MikeWilson I finall...,#BakingSuccess #Delicious @MikeWilson finally ...
...,...,...,...,...
562,596,0,#VacationPlansCanceled #Upset @EmmaDavis I had...,#VacationPlansCanceled #Upset @EmmaDavis cance...
563,597,1,#FamilyDinner #QualityTime @MiaSmith I just ha...,#FamilyDinner #QualityTime @MiaSmith just wond...
564,598,0,#Anxiety #BadDream @NoahBrown I had a terrible...,#Anxiety #BadDream @NoahBrown terrible dream l...
565,599,1,#OnlineCourse #Learning @LiamThompson I just f...,#OnlineCourse #Learning @LiamThompson just fin...


### Tokenization

In [19]:
tokenize_tweet = combined['tidy_tweet'].apply(lambda x: x.split(' '))
tokenize_tweet

0      [#Promotion, #Excited, @JohnDoe, just, promoti...
1      [#LostWallet, #Nightmare, @JaneDoe, can't, bel...
2      [#GreatBook, #PageTurner, @BobSmith, Just, fin...
3      [#TerribleDay, #WorkFail, @SarahJones, terribl...
4      [#BakingSuccess, #Delicious, @MikeWilson, fina...
                             ...                        
562    [#VacationPlansCanceled, #Upset, @EmmaDavis, c...
563    [#FamilyDinner, #QualityTime, @MiaSmith, just,...
564    [#Anxiety, #BadDream, @NoahBrown, terrible, dr...
565    [#OnlineCourse, #Learning, @LiamThompson, just...
566    [#BadReview, #Discouraged, @SophiaDavis, revie...
Name: tidy_tweet, Length: 1901, dtype: object

### Lemmatization

In [21]:
import nltk
nltk.download('wordnet')
wnlemma = nltk.WordNetLemmatizer()
tokenize_tweet = combined['tidy_tweet'].apply(lambda x: [wnlemma.lemmatize(t) for t in x.split(' ')])

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [23]:
tokenize_tweet

0      [#Promotion, #Excited, @JohnDoe, just, promoti...
1      [#LostWallet, #Nightmare, @JaneDoe, can't, bel...
2      [#GreatBook, #PageTurner, @BobSmith, Just, fin...
3      [#TerribleDay, #WorkFail, @SarahJones, terribl...
4      [#BakingSuccess, #Delicious, @MikeWilson, fina...
                             ...                        
562    [#VacationPlansCanceled, #Upset, @EmmaDavis, c...
563    [#FamilyDinner, #QualityTime, @MiaSmith, just,...
564    [#Anxiety, #BadDream, @NoahBrown, terrible, dr...
565    [#OnlineCourse, #Learning, @LiamThompson, just...
566    [#BadReview, #Discouraged, @SophiaDavis, revie...
Name: tidy_tweet, Length: 1901, dtype: object

In [24]:
combined['tidy_tweet']= tokenize_tweet.apply(lambda x: ' '.join(t for t in x))
combined['tidy_tweet']

0      #Promotion #Excited @JohnDoe just promotion wo...
1      #LostWallet #Nightmare @JaneDoe can't believe ...
2      #GreatBook #PageTurner @BobSmith Just finished...
3      #TerribleDay #WorkFail @SarahJones terrible wo...
4      #BakingSuccess #Delicious @MikeWilson finally ...
                             ...                        
562    #VacationPlansCanceled #Upset @EmmaDavis cance...
563    #FamilyDinner #QualityTime @MiaSmith just wond...
564    #Anxiety #BadDream @NoahBrown terrible dream l...
565    #OnlineCourse #Learning @LiamThompson just fin...
566    #BadReview #Discouraged @SophiaDavis review la...
Name: tidy_tweet, Length: 1901, dtype: object

In [25]:
combined

Unnamed: 0,id,label,tweet,tidy_tweet
0,1,1,#Promotion #Excited @JohnDoe I just got a prom...,#Promotion #Excited @JohnDoe just promotion wo...
1,2,0,#LostWallet #Nightmare @JaneDoe I can't believ...,#LostWallet #Nightmare @JaneDoe can't believe ...
2,3,1,#GreatBook #PageTurner @BobSmith Just finished...,#GreatBook #PageTurner @BobSmith Just finished...
3,4,0,#TerribleDay #WorkFail @SarahJones I had a ter...,#TerribleDay #WorkFail @SarahJones terrible wo...
4,5,1,#BakingSuccess #Delicious @MikeWilson I finall...,#BakingSuccess #Delicious @MikeWilson finally ...
...,...,...,...,...
562,596,0,#VacationPlansCanceled #Upset @EmmaDavis I had...,#VacationPlansCanceled #Upset @EmmaDavis cance...
563,597,1,#FamilyDinner #QualityTime @MiaSmith I just ha...,#FamilyDinner #QualityTime @MiaSmith just wond...
564,598,0,#Anxiety #BadDream @NoahBrown I had a terrible...,#Anxiety #BadDream @NoahBrown terrible dream l...
565,599,1,#OnlineCourse #Learning @LiamThompson I just f...,#OnlineCourse #Learning @LiamThompson just fin...
