## Sentiment Analysis

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
# Load the IMDB review dataset into a DataFrame (path is relative to the working directory).
# NOTE(review): the rendered frame shows an 'Unnamed: 0' column, which looks like a saved
# index column — confirm; if so, index_col=0 would keep it out of the data columns.
data = pd.read_csv('datasets/Lezione_5-sentiment_Analysis/IMDB Dataset.csv')
# Bare last expression so the notebook renders the frame.
data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [3]:
# Carve out a smaller working subset: `reviews` / `sentiments` are the 10% *test* side of
# the split, while the 90% side (`s`, `s1`) is not used in the cells shown here.
# random_state=42 makes the subset reproducible.
s, reviews, s1, sentiments = train_test_split(data['review'],data['sentiment'], test_size=0.10, random_state=42)

In [5]:
# Sanity check: number of reviews in the working subset.
len(reviews)

5000

In [18]:
# Class balance of the working subset: number of 'positive' labels, then 'negative' labels.
print(len(sentiments[sentiments == 'positive']))
print(len(sentiments[sentiments == 'negative']))

2519
2481


In [7]:
import string
import spacy
from nltk.corpus import stopwords
import re

# English stop words from NLTK's stopwords corpus; data_cleaner filters tokens against it.
english_stopwords = stopwords.words('english')
# spaCy 'en_core_web_sm' pipeline; data_cleaner uses it to obtain token lemmas.
nlp = spacy.load('en_core_web_sm')
# Set of the characters in string.punctuation.
# NOTE(review): not referenced in the cells shown here — data_cleaner reads
# string.punctuation directly; confirm whether this is still needed.
punctuation = set(string.punctuation)

def data_cleaner(sentence):
    """Normalize a raw review string for bag-of-words vectorization.

    Steps, in order:
      1. lowercase the text;
      2. replace every character of ``string.punctuation`` with a space;
      3. lemmatize each token with the module-level spaCy pipeline ``nlp``;
      4. drop tokens that appear in the module-level ``english_stopwords``;
      5. delete every digit character.

    Parameters
    ----------
    sentence : str
        Raw text to clean.

    Returns
    -------
    str
        Space-separated lemmas. Digits are removed last, so a token made
        only of digits leaves an extra space behind.

    Notes
    -----
    NOTE(review): HTML line breaks such as ``<br />`` (present in the reviews)
    lose their punctuation in step 2, so a ``br`` token likely survives into
    the output — consider stripping HTML before cleaning.
    """
    # Single pass over the text instead of one str.replace() per punctuation character.
    punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    sentence = sentence.lower().translate(punctuation_to_space)

    lemmatized = ' '.join(token.lemma_ for token in nlp(sentence))

    # Set membership is O(1) per token; english_stopwords itself is left untouched.
    stopword_set = set(english_stopwords)
    sentence = ' '.join(word for word in lemmatized.split() if word not in stopword_set)

    # Raw string: '\d' in a plain literal is an invalid escape sequence and
    # triggers a DeprecationWarning / SyntaxWarning on recent Python versions.
    return re.sub(r'\d', '', sentence)

In [19]:
# Run every review of the working subset through data_cleaner, preserving order.
reviews_cleaned = [data_cleaner(review_text) for review_text in reviews]

In [22]:
# Hold out 20% of the cleaned subset for evaluation: x / y are the training texts and
# labels, x_test / y_test the held-out ones. random_state=42 keeps the split reproducible.
x, x_test, y, y_test = train_test_split(reviews_cleaned, sentiments, test_size=0.20, random_state=42)

In [23]:
# Vectorize text reviews to numbers (bag-of-words token counts).
# The vocabulary is learned from the training texts only (fit_transform); the held-out
# texts are mapped onto that same vocabulary (transform).
# The matrices are kept sparse instead of being densified with .toarray(): a dense
# documents x vocabulary array is mostly zeros and wastes memory, and MultinomialNB
# accepts sparse input (the single-sentence predictions below already pass it sparse).
# NOTE: x / x_test are rebound from text to matrices here, so re-run the split cell
# before re-running this one.
vec = CountVectorizer()
x = vec.fit_transform(x)
x_test = vec.transform(x_test)

In [24]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the training count features and labels.
model = MultinomialNB()
model.fit(x, y)

MultinomialNB()

In [25]:
# Evaluate on the held-out 20% split (score() reports mean accuracy for classifiers).
model.score(x_test, y_test)

0.823

In [32]:
# Classify an ad-hoc sentence. It goes through the same data_cleaner as the training
# texts and through the already-fitted vectorizer (transform only, no re-fitting),
# so it is encoded with the training vocabulary.
sentence = "This course is very beautiful, fun and simple."
sentence_cleaned = data_cleaner(sentence)
sentence_countv = vec.transform([sentence_cleaned])
model.predict(sentence_countv)

array(['positive'], dtype='<U8')

In [33]:
# Class probabilities for the sentence vectorized in the previous cell.
# Columns follow model.classes_ (presumably ['negative', 'positive'] — confirm).
model.predict_proba(sentence_countv)

array([[0.08681275, 0.91318725]])

In [34]:
# Second ad-hoc example, this time with negative wording; same pipeline as above:
# clean, encode with the fitted vectorizer, predict. This rebinds sentence,
# sentence_cleaned and sentence_countv from the previous example.
sentence = "This course is useless, difficult and ugly."
sentence_cleaned = data_cleaner(sentence)
sentence_countv = vec.transform([sentence_cleaned])
model.predict(sentence_countv)

array(['negative'], dtype='<U8')

In [35]:
# Class probabilities for the sentence vectorized in the previous cell.
# Columns follow model.classes_ (presumably ['negative', 'positive'] — confirm).
model.predict_proba(sentence_countv)

array([[0.91045862, 0.08954138]])