# Sentiment Analysis & Modelling

In [2]:

from textblob import TextBlob
from sklearn import model_selection, preprocessing, linear_model, naive_bayes
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas, xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

In [3]:
import pandas as pd

In [4]:
data = pd.read_csv('train.tsv', sep='\t')

In [5]:
data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [6]:
data['Sentiment'].replace(0, value = 'negative', inplace = True)
data['Sentiment'].replace(1, value = 'negative', inplace = True)

In [7]:
data['Sentiment'].replace(3, value = 'positive', inplace = True)
data['Sentiment'].replace(4, value = 'positive', inplace = True)

In [8]:
data.head(5)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,negative
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [9]:
data = data[(data.Sentiment == 'negative') | (data.Sentiment == 'positive')]

In [10]:
data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,negative
21,22,1,good for the goose,positive
22,23,1,good,positive
33,34,1,"the gander , some of which occasionally amuses...",negative
46,47,1,amuses,positive


In [11]:
data.groupby("Sentiment").count()

Unnamed: 0_level_0,PhraseId,SentenceId,Phrase
Sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
negative,34345,34345,34345
positive,42133,42133,42133


In [12]:
df = pd.DataFrame()
df["text"] = data["Phrase"]
df["label"] = data["Sentiment"]

In [13]:
df.head()

Unnamed: 0,text,label
0,A series of escapades demonstrating the adage ...,negative
21,good for the goose,positive
22,good,positive
33,"the gander , some of which occasionally amuses...",negative
46,amuses,positive


lower case 

In [14]:
df['text'] = df['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))

punct

In [15]:
df['text'] = df['text'].str.replace('[^\w\s]','')

numbers

In [16]:
df['text'] = df['text'].str.replace('\d','')

stopwords

In [17]:
import nltk
from nltk.corpus import stopwords
sw = stopwords.words('english')
df['text'] = df['text'].apply(lambda x: " ".join(x for x in x.split() if x not in sw))

In [18]:
delete = pd.Series(' '.join(df['text']).split()).value_counts()[-1000:]
df['text'] = df['text'].apply(lambda x: " ".join(x for x in x.split() if x not in delete))

lemmi

In [19]:
from textblob import Word
df['text'] = df['text'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()])) 

## Feature Engineering
   * Count Vectors
   * TF - IDF Vectors (words, characters, n-grams)
   * Word Embeddings

- TF(t) = (The term frequency of a word in a document) 
- IDF(t) = (The inverse document frequency of the word across a set of documents.)

In [20]:
df.head()

Unnamed: 0,text,label
0,series demonstrating adage good goose also goo...,negative
21,good goose,positive
22,good,positive
33,gander occasionally amuses none amount much story,negative
46,amuses,positive


## Train - Test

In [21]:
train_x, test_x, train_y, test_y = model_selection.train_test_split(df['text'], df['label'],random_state = 1)

In [22]:
encoder = preprocessing.LabelEncoder()

In [23]:
train_y = encoder.fit_transform(train_y)
test_y = encoder.fit_transform(test_y)

In [24]:
train_y[0:5]

array([1, 0, 1, 0, 1])

In [25]:
test_y[0:5]

array([1, 0, 1, 0, 0])

## Count Vectors

In [26]:
vectorizer = CountVectorizer()
vectorizer.fit(train_x)

CountVectorizer()

In [27]:
x_train_count = vectorizer.transform(train_x)
x_test_count = vectorizer.transform(test_x)

In [28]:
vectorizer.get_feature_names()[0:5]

['aaliyah', 'abagnale', 'abandon', 'abandoned', 'abbass']

## TF-IDF

Word Level

In [29]:
tf_idf_word_vectorizer = TfidfVectorizer()
tf_idf_word_vectorizer.fit(train_x)

TfidfVectorizer()

In [30]:
x_train_tf_idf_word = tf_idf_word_vectorizer.transform(train_x)
x_test_tf_idf_word = tf_idf_word_vectorizer.transform(test_x)

In [31]:
tf_idf_word_vectorizer.get_feature_names()[0:5]

['aaliyah', 'abagnale', 'abandon', 'abandoned', 'abbass']

ngram level tf-idf

In [32]:
tf_idf_ngram_vectorizer = TfidfVectorizer(ngram_range=(2,3))
tf_idf_ngram_vectorizer.fit(train_x)

TfidfVectorizer(ngram_range=(2, 3))

In [33]:
x_train_tf_idf_ngram = tf_idf_ngram_vectorizer.transform(train_x)
x_test_tf_idf_ngram = tf_idf_ngram_vectorizer.transform(test_x)

characters level tf-idf

In [34]:
tf_idf_chars_vectorizer = TfidfVectorizer(analyzer="char", ngram_range=(2,3))
tf_idf_chars_vectorizer.fit(train_x)

TfidfVectorizer(analyzer='char', ngram_range=(2, 3))

In [35]:
x_train_tf_idf_chars = tf_idf_chars_vectorizer.transform(train_x)
x_test_tf_idf_chars = tf_idf_chars_vectorizer.transform(test_x)

# ML Models

## LogisticRegression()

In [36]:
loj = linear_model.LogisticRegression()
loj_model = loj.fit(x_train_count, train_y)
accuracy = model_selection.cross_val_score(loj_model, x_test_count, test_y, cv=10).mean()

print("Count Vectors Accuracy:", accuracy)

Count Vectors Accuracy: 0.8365062761506274


In [37]:
loj = linear_model.LogisticRegression()
loj_model = loj.fit(x_train_tf_idf_word, train_y)
accuracy = model_selection.cross_val_score(loj_model, x_test_tf_idf_word, test_y, cv=10).mean()

print("Word-Level TF-IDF Accuracy:", accuracy)

Word-Level TF-IDF Accuracy: 0.8331066945606695


In [38]:
loj = linear_model.LogisticRegression()
loj_model = loj.fit(x_train_tf_idf_ngram, train_y)
accuracy = model_selection.cross_val_score(loj_model, x_test_tf_idf_ngram, test_y, cv=10).mean()

print("N-GRAM TF-IDF Accuracy:", accuracy)

N-GRAM TF-IDF Accuracy: 0.7481694560669456


In [39]:
loj = linear_model.LogisticRegression()
loj_model = loj.fit(x_train_tf_idf_chars,train_y)
accuracy = model_selection.cross_val_score(loj_model,x_test_tf_idf_chars,test_y, cv = 10).mean()
                                
print("CHARLEVEL Accuracy:", accuracy)

CHARLEVEL Accuracy: 0.7809100418410042


## Naive Bayes()

In [40]:
nb = naive_bayes.MultinomialNB()
nb_model = nb.fit(x_train_count,train_y)
accuracy = model_selection.cross_val_score(nb_model, x_test_count, test_y, cv=10).mean()

print("Count Vectors Accuracy:", accuracy)

Count Vectors Accuracy: 0.8330020920502091


In [41]:
nb = naive_bayes.MultinomialNB()
nb_model = nb.fit(x_train_tf_idf_word,train_y)
accuracy = model_selection.cross_val_score(nb_model, x_test_tf_idf_word, test_y, cv=10).mean()

print("Word-Level TF-IDF Accuracy:", accuracy)

Word-Level TF-IDF Accuracy: 0.8346234309623431


In [42]:
nb = naive_bayes.MultinomialNB()
nb_model = nb.fit(x_train_tf_idf_ngram,train_y)
accuracy = model_selection.cross_val_score(nb_model, x_test_tf_idf_ngram, test_y, cv=10).mean()

print("N-GRAM TF-IDF Accuracy:", accuracy)


N-GRAM TF-IDF Accuracy: 0.7686192468619246


In [45]:
nb = naive_bayes.MultinomialNB()
nb_model = nb.fit(x_train_tf_idf_chars,train_y)
accuracy = model_selection.cross_val_score(nb_model, x_test_tf_idf_chars, test_y, cv=10).mean()

print("CHARLEVEL Accuracy:", accuracy)

CHARLEVEL Accuracy: 0.7559100418410042


## Random Forest()

In [44]:
rf = ensemble.RandomForestClassifier()
rf_model = rf.fit(x_train_count,train_y)
accuracy = model_selection.cross_val_score(rf_model, x_test_count, test_y, cv=10).mean()

print("Count Vectors Accuracy:", accuracy)

Count Vectors Accuracy: 0.8186715481171548


In [46]:
rf = ensemble.RandomForestClassifier()
rf_model = rf.fit(x_train_tf_idf_word,train_y)
accuracy = model_selection.cross_val_score(rf_model, x_test_tf_idf_word, test_y, cv=10).mean()

print("Word-Level TF-IDF Accuracy:", accuracy)

Word-Level TF-IDF Accuracy: 0.8227510460251046


In [47]:
rf = ensemble.RandomForestClassifier()
rf_model = rf.fit(x_train_tf_idf_chars,train_y)
accuracy = model_selection.cross_val_score(rf_model, x_test_tf_idf_chars, test_y, cv=10).mean()

print("N-GRAM TF-IDF Accuracy:", accuracy)


N-GRAM TF-IDF Accuracy: 0.8139121338912133


In [48]:
rf = ensemble.RandomForestClassifier()
rf_model = rf.fit(x_train_tf_idf_chars,train_y)
accuracy = model_selection.cross_val_score(rf_model, x_test_tf_idf_chars, test_y, cv=10).mean()

print("CHARLEVEL Accuracy:", accuracy)

CHARLEVEL Accuracy: 0.813284518828452


## XGBoost

In [49]:
xgb = xgboost.XGBClassifier()
xgb_model = xgb.fit( x_train_count, train_y)
accuracy = model_selection.cross_val_score(xgb_model, x_test_count , test_y, cv=10).mean()

print(" Count Vectors Accuracy:", accuracy)

Count Vectors Accuracy: 0.7153242677824267


In [50]:
xgb = xgboost.XGBClassifier()
xgb_model = xgb.fit( x_train_tf_idf_word, train_y)
accuracy = model_selection.cross_val_score(xgb_model, x_test_tf_idf_word, test_y, cv=10).mean()

print(" Word-Level TF-IDF Accuracy:", accuracy)

Word-Level TF-IDF Accuracy: 0.7060669456066945


In [51]:
xgb = xgboost.XGBClassifier()
xgb_model = xgb.fit( x_train_tf_idf_ngram, train_y)
accuracy = model_selection.cross_val_score(xgb_model, x_test_tf_idf_ngram , test_y, cv=10).mean()

print("N-GRAM TF_IDF Accuracy:", accuracy)

N-GRAM TF_IDF Accuracy: 0.5828451882845188


In [52]:
xgb = xgboost.XGBClassifier()
xgb_model = xgb.fit( x_train_tf_idf_chars, train_y)
accuracy = model_selection.cross_val_score(xgb_model, x_test_tf_idf_chars , test_y, cv=10).mean()

print(" CHARLEVEL Accuracy:", accuracy)

CHARLEVEL Accuracy: 0.7777719665271967
