In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [54]:
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from nltk.corpus import stopwords
from textblob import Word

stop = stopwords.words('english')
%matplotlib inline

In [4]:
df = pd.read_csv('data/sentiment_analysis/train_E6oV3lV.csv')
df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [6]:
df.drop('id', inplace=True, axis=1)
df.head()

Unnamed: 0,label,tweet
0,0,@user when a father is dysfunctional and is s...
1,0,@user @user thanks for #lyft credit i can't us...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation


In [7]:
df['word_count'] = df['tweet'].apply(lambda x: len(str(x).split(" ")))
df[['tweet','word_count']].head()

Unnamed: 0,tweet,word_count
0,@user when a father is dysfunctional and is s...,21
1,@user @user thanks for #lyft credit i can't us...,22
2,bihday your majesty,5
3,#model i love u take with u all the time in ...,17
4,factsguide: society now #motivation,8


In [8]:
df['char_count'] = df['tweet'].str.len() ## this also includes spaces
df[['tweet','char_count']].head()

Unnamed: 0,tweet,char_count
0,@user when a father is dysfunctional and is s...,102
1,@user @user thanks for #lyft credit i can't us...,122
2,bihday your majesty,21
3,#model i love u take with u all the time in ...,86
4,factsguide: society now #motivation,39


In [9]:
def avg_word(sentence):
    words = sentence.split()
    return (sum(len(word) for word in words)/len(words))

df['avg_word'] = df['tweet'].apply(lambda x: avg_word(x))
df[['tweet','avg_word']].head()

Unnamed: 0,tweet,avg_word
0,@user when a father is dysfunctional and is s...,4.555556
1,@user @user thanks for #lyft credit i can't us...,5.315789
2,bihday your majesty,5.666667
3,#model i love u take with u all the time in ...,4.928571
4,factsguide: society now #motivation,8.0


In [11]:
df['stopwords'] = df['tweet'].apply(lambda x: len([x for x in x.split() if x in stop]))
df[['tweet','stopwords']].head()

Unnamed: 0,tweet,stopwords
0,@user when a father is dysfunctional and is s...,10
1,@user @user thanks for #lyft credit i can't us...,5
2,bihday your majesty,1
3,#model i love u take with u all the time in ...,5
4,factsguide: society now #motivation,1


In [12]:
# Number of special characters
df['hastags'] = df['tweet'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
df[['tweet','hastags']].head()

Unnamed: 0,tweet,hastags
0,@user when a father is dysfunctional and is s...,1
1,@user @user thanks for #lyft credit i can't us...,3
2,bihday your majesty,0
3,#model i love u take with u all the time in ...,1
4,factsguide: society now #motivation,1


In [13]:
# Number of numeric values
df['numerics'] = df['tweet'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
df[['tweet','numerics']].head()

Unnamed: 0,tweet,numerics
0,@user when a father is dysfunctional and is s...,0
1,@user @user thanks for #lyft credit i can't us...,0
2,bihday your majesty,0
3,#model i love u take with u all the time in ...,0
4,factsguide: society now #motivation,0


In [14]:
# Number of upper case
df['upper'] = df['tweet'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
df[['tweet','upper']].head()

Unnamed: 0,tweet,upper
0,@user when a father is dysfunctional and is s...,0
1,@user @user thanks for #lyft credit i can't us...,0
2,bihday your majesty,0
3,#model i love u take with u all the time in ...,0
4,factsguide: society now #motivation,0


In [15]:
# Lower case
df['tweet'] = df['tweet'].apply(lambda x: " ".join(x.lower() for x in x.split()))
df['tweet'].head()

0    @user when a father is dysfunctional and is so...
1    @user @user thanks for #lyft credit i can't us...
2                                  bihday your majesty
3    #model i love u take with u all the time in ur...
4                  factsguide: society now #motivation
Name: tweet, dtype: object

In [16]:
# Remove punctuation
df['tweet'] = df['tweet'].str.replace('[^\w\s]','')
df['tweet'].head()

0    user when a father is dysfunctional and is so ...
1    user user thanks for lyft credit i cant use ca...
2                                  bihday your majesty
3    model i love u take with u all the time in urð...
4                    factsguide society now motivation
Name: tweet, dtype: object

In [17]:
df['tweet'] = df['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
df['tweet'].head()

0    user father dysfunctional selfish drags kids d...
1    user user thanks lyft credit cant use cause do...
2                                       bihday majesty
3                model love u take u time urð ðððð ððð
4                        factsguide society motivation
Name: tweet, dtype: object

In [18]:
# Word count
word_count = pd.Series(' '.join(df['tweet']).split()).value_counts()
word_count

user                  17473
love                   2647
ð                      2511
day                    2199
â                      1797
                      ...  
yofelizytufustrada        1
noww                      1
mylifeâ                   1
photowall                 1
goodluckinnovember        1
Length: 45044, dtype: int64

In [19]:
# Remove most common words
df['tweet'] = df['tweet'].apply(lambda x: " ".join(x for x in x.split() if word_count.get(x) < 1000))
df['tweet'].head()

0    father dysfunctional selfish drags kids dysfun...
1    thanks lyft credit cant use cause dont offer w...
2                                       bihday majesty
3                              model take urð ðððð ððð
4                        factsguide society motivation
Name: tweet, dtype: object

In [20]:
pd.Series(' '.join(df['tweet']).split()).value_counts()

today                  991
new                    983
positive               928
thankful               919
get                    917
                      ... 
muslm                    1
cmn                      1
businessbootcamp         1
felizgelp22triperos      1
goodluckinnovember       1
Length: 45032, dtype: int64

In [21]:
# Remove rare words
df['tweet'] = df['tweet'].apply(lambda x: " ".join(x for x in x.split() if word_count.get(x) > 5))
df['tweet'].head()

0                    father selfish kids run
1    thanks credit cant use cause dont offer
2                             bihday majesty
3                    model take urð ðððð ððð
4              factsguide society motivation
Name: tweet, dtype: object

In [22]:
pd.Series(' '.join(df['tweet']).split()).value_counts()

today         991
new           983
positive      928
thankful      919
get           917
             ... 
smash           6
wealth          6
adrenaline      6
antiaging       6
extremists      6
Length: 5343, dtype: int64

In [26]:
df['tweet'] = df['tweet'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
df['tweet'].head()

0                     father selfish kid run
1    thanks credit cant use cause dont offer
2                             bihday majesty
3                    model take urð ðððð ððð
4              factsguide society motivation
Name: tweet, dtype: object

In [29]:
X_train, X_test, y_train, y_test = train_test_split(df['tweet'], df['label'], test_size = 0.2, random_state = 4)

In [49]:
cv = CountVectorizer(ngram_range=(1,3), max_features=25000)

X_traincv = cv.fit_transform(X_train)
X_testcv = cv.transform(X_test)

In [50]:
tf = TfidfVectorizer(ngram_range=(1,3), max_features=25000)

X_traintf = tf.fit_transform(X_train)
X_testtf = tf.transform(X_test)

In [51]:
y_train = y_train.astype('int')
y_test = y_test.astype('int')

In [52]:
X_traincv.shape, X_testcv.shape, X_traintf.shape, X_testtf.shape, y_train.shape, y_test.shape

((25569, 25000),
 (6393, 25000),
 (25569, 25000),
 (6393, 25000),
 (25569,),
 (6393,))

In [53]:
naive_bayes = MultinomialNB()

naive_bayes.fit(X_traincv, y_train)
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [55]:
confusion_matrix(naive_bayes.predict(X_testcv), y_test)

array([[5891,  201],
       [  48,  253]])

In [56]:
confusion_matrix(naive_bayes.predict(X_testtf), y_test)

array([[5931,  292],
       [   8,  162]])

True Positives (TP) - These are the correctly predicted positive values which means that the value of actual class is yes and the value of predicted class is also yes.

True Negatives (TN) - These are the correctly predicted negative values which means that the value of actual class is no and value of predicted class is also no.

False Positives (FP) – When actual class is no and predicted class is yes.

False Negatives (FN) – When actual class is yes but predicted class in no.

Precision = TP/TP+FP

Recall = TP/TP+FN

F1 Score = 2*(Recall * Precision) / (Recall + Precision)

In [57]:
precision_score(naive_bayes.predict(X_testcv), y_test)

0.5572687224669604

In [58]:
precision_score(naive_bayes.predict(X_testtf), y_test)

0.3568281938325991

In [59]:
recall_score(naive_bayes.predict(X_testcv), y_test)

0.840531561461794

In [60]:
recall_score(naive_bayes.predict(X_testtf), y_test)

0.9529411764705882

In [61]:
f1_score(naive_bayes.predict(X_testcv), y_test)

0.6701986754966887

In [62]:
f1_score(naive_bayes.predict(X_testtf), y_test)

0.5192307692307692