# Sentiment analysis based on movie/book ratings


In [35]:
import pandas as pd
import numpy as np
# Natural Language Toolkit: provides the English stop-word corpus used below.
import nltk
# Stop words are very common words (e.g. "the", "is", "at") that are usually
# filtered out before or after processing natural-language text because they
# carry little information on their own.
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import naive_bayes
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `train_test_split` now lives in `sklearn.model_selection`. The old
# location is kept only as a fallback for very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score

In [36]:
# Load the tab-separated training file. Explicit column names are supplied
# via `names`: the first field is the label, the second is the text.
df = pd.read_csv(
    'sentiment_analysis_train_data.txt',
    names=['liked', 'comment'],
    sep='\t',
)
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,liked,comment
0,0,In the beginning God created the heavens and t...
1,1,"And God said, Let there be light, and there wa..."
2,1,"And God said, Let there be a vault between the..."
3,1,"And God said, Let the water under the sky be g..."
4,1,"Then God said, Let the land produce vegetation..."


In [37]:
# Build the set of English stop words that the TF-IDF vectorizer will ignore.
# The NLTK stop-word corpus is not bundled with the package: on a fresh
# environment `stopwords.words` raises LookupError, so download it once and
# retry instead of failing the whole notebook run.
# NOTE(review): the printed set contains negations such as 'not' and "don't";
# dropping them may matter for sentiment -- confirm this is intended.
try:
    stopset=set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stopset=set(stopwords.words('english'))
print(stopset)

{'am', 'those', "you'd", 'each', 'same', "she's", 'm', "mustn't", 'is', "you've", 'his', 'if', 've', 'had', 'didn', 'ourselves', 'more', 'haven', 'it', 'having', "hasn't", 'do', 'too', 'this', 'the', 'our', 'by', 'again', "didn't", 'for', "won't", 'shouldn', 'can', 'hadn', 'him', 'above', 'most', "it's", 'up', 'such', 'herself', 'these', 'as', 'have', 'are', 'wasn', "shan't", 'once', 'will', 'out', 'ma', 'from', 'of', 'in', 'on', "wouldn't", 'after', 'very', 'don', 'being', "that'll", 'but', 'not', 'why', 'all', 'before', 'only', 'won', 'her', 're', 'doesn', 'other', 'further', "hadn't", "don't", 'over', 'them', 'there', 'yourselves', 'myself', 'aren', 'd', "you're", 'because', 'doing', 'theirs', 'were', 'just', 'll', 'o', 'during', 'hasn', 'then', "doesn't", 's', 'at', 'under', 'mightn', 'couldn', 'yourself', 'who', 'yours', 'into', 'few', 'be', 'y', 'or', "should've", "wasn't", 'me', 'how', 'and', "isn't", 'whom', 'between', 'does', 'we', 'weren', 'some', 'wouldn', 'needn', 'down', '

In [38]:
# Build the vectorizer that turns each comment into a sparse vector of
# TF-IDF weights (real-valued term scores, not 0/1 indicators).
# `stop_words` is documented as 'english', a list, or None; recent
# scikit-learn releases validate parameter types and reject a set, so pass a
# list. Sorting also makes the argument deterministic across runs.
vectorizer=TfidfVectorizer(use_idf=True,lowercase=True,strip_accents='ascii',stop_words=sorted(stopset))


In [39]:
# Target labels: the `liked` column of the data frame.
y = df['liked']
# Learn the vocabulary and IDF statistics from the comments and encode them
# as a TF-IDF matrix in a single step.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split, so the IDF statistics also see the comments that end up in the
# test set.
X = vectorizer.fit_transform(df['comment'])
print(X.shape, y.shape)


(30459, 7121) (30459,)


After vectorizing the comments we get a matrix of shape (30459, 7121): 30459 row vectors, one per comment, each with 7121 entries, one per term in the vocabulary. Each entry is the TF-IDF weight of that term in the comment: 0 if the term does not occur in the comment, and a positive real-valued weight (not simply 1) if it does.

In [40]:
# Split features and labels into train and test sets (scikit-learn's default
# holds out 25% of the rows for testing).
# A fixed random_state makes the shuffle, and therefore every score computed
# from this split, reproducible across kernel restarts.
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=42)

In [41]:
# Instantiate a multinomial Naive Bayes model with its default settings.
classifier = naive_bayes.MultinomialNB()
# Fit on the training split only. `fit` hands back the estimator, so leaving
# the call as the last expression displays the fitted model.
classifier.fit(X=X_train, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [42]:
# Score the model on the held-out split with ROC AUC (a ranking metric, not
# plain accuracy). Column 1 of `predict_proba` is used as the score --
# presumably the probability of label 1; confirm against `classifier.classes_`.
print(
    roc_auc_score(
        y_true=y_test,
        y_score=classifier.predict_proba(X_test)[:, 1],
    )
)

0.9988635795781062


In [46]:
# Try the trained model on a hand-written review.
movie_review = np.array(
    ['The pursuit of happines is one of the best film i have ever seen']
)
# Encode with the already-fitted vectorizer (transform only, no refitting).
movie_review_vector = vectorizer.transform(movie_review)
# The empty first argument makes print emit a leading space before the label.
print('', classifier.predict(movie_review_vector))


 [1]


In [48]:
# Second hand-written example; it reuses the same variable names as the
# previous example cell.
movie_review = np.array(["it's horrible"])
# Encode with the already-fitted vectorizer, then predict the label.
movie_review_vector = vectorizer.transform(movie_review)
print('', classifier.predict(movie_review_vector))

 [0]
