# Project Orange: IMDB reviews sentiment analysis

## Setup

Install all required dependencies in the current Jupyter kernel

In [34]:
import sys
!{sys.executable} -m pip install spacy pandas sklearn
!{sys.executable} -m spacy download en



[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/Users/sylvain/anaconda3/envs/optimize/lib/python3.7/site-packages/en_core_web_sm
-->
/Users/sylvain/anaconda3/envs/optimize/lib/python3.7/site-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [35]:
import pandas as pd
import re
import spacy
from spacy import displacy
from spacy.lang.en import English
from sklearn.model_selection import train_test_split

sp = spacy.load('en')

### Import dataset

In [36]:
# Read data from file 'filename.csv' 
# (in the same directory that your python process is based)
# Control delimiters, rows, column names with read_csv (see later) 
data = pd.read_csv("../data/IMDB Dataset.csv") 

# Keep the first 10 elements to reduce the load on cpu
data=data[:10]
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


### Text to lowercase

In [37]:
def to_lower(this_review):
    this_review=this_review.lower()
    return this_review
    
data['review'] = data['review'].map(to_lower)
data.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


### Remove HTML elements

In [38]:
REMOVE_HTML = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")

def remove_html(review):
    return REMOVE_HTML.sub("", review) 

data['cleaned_review'] = data['review'].map(remove_html)
data.head()

Unnamed: 0,review,sentiment,cleaned_review
0,one of the other reviewers has mentioned that ...,positive,one of the other reviewers has mentioned that ...
1,a wonderful little production. <br /><br />the...,positive,a wonderful little production. the filming tec...
2,i thought this was a wonderful way to spend ti...,positive,i thought this was a wonderful way to spend ti...
3,basically there's a family where a little boy ...,negative,basically there's a family where a little boy ...
4,"petter mattei's ""love in the time of money"" is...",positive,"petter mattei's ""love in the time of money"" is..."


### Identify and remove entities

In [39]:
def recognize_it(this_review):
    doc=sp(this_review)
    
    for i in doc.ents:
            i=str(i)
            this_review=this_review.replace(" "+i,"")
    return this_review

In [40]:
data['IDcleaned_review'] = data['cleaned_review'].map(recognize_it)
data.head()

Unnamed: 0,review,sentiment,cleaned_review,IDcleaned_review
0,one of the other reviewers has mentioned that ...,positive,one of the other reviewers has mentioned that ...,one of the other reviewers has mentioned that ...
1,a wonderful little production. <br /><br />the...,positive,a wonderful little production. the filming tec...,a wonderful little production. the filming tec...
2,i thought this was a wonderful way to spend ti...,positive,i thought this was a wonderful way to spend ti...,i thought this was a wonderful way to spend ti...
3,basically there's a family where a little boy ...,negative,basically there's a family where a little boy ...,basically there's a family where a little boy ...
4,"petter mattei's ""love in the time of money"" is...",positive,"petter mattei's ""love in the time of money"" is...","petter's ""love in the time of money"" is a visu..."


### Lemmatization

In [41]:
# Implementing lemmatization
def lemmatize_it(this_review):
    filtered_sent=[]

    #  "nlp" Object is used to create documents with linguistic annotations.
    lem = sp(this_review)
    
   # finding lemma for each word
    for word in lem:
        filtered_sent.append(word.lemma_)
    return filtered_sent

In [42]:
data['lemmatized_review'] = data['IDcleaned_review'].map(lemmatize_it)
data.head()

Unnamed: 0,review,sentiment,cleaned_review,IDcleaned_review,lemmatized_review
0,one of the other reviewers has mentioned that ...,positive,one of the other reviewers has mentioned that ...,one of the other reviewers has mentioned that ...,"[one, of, the, other, reviewer, have, mention,..."
1,a wonderful little production. <br /><br />the...,positive,a wonderful little production. the filming tec...,a wonderful little production. the filming tec...,"[a, wonderful, little, production, ., the, fil..."
2,i thought this was a wonderful way to spend ti...,positive,i thought this was a wonderful way to spend ti...,i thought this was a wonderful way to spend ti...,"[i, think, this, be, a, wonderful, way, to, sp..."
3,basically there's a family where a little boy ...,negative,basically there's a family where a little boy ...,basically there's a family where a little boy ...,"[basically, there, be, a, family, where, a, li..."
4,"petter mattei's ""love in the time of money"" is...",positive,"petter mattei's ""love in the time of money"" is...","petter's ""love in the time of money"" is a visu...","[petter, 's, "", love, in, the, time, of, money..."


### Tokenization (not used)

In [43]:
# "nlp" Object is used to create documents with linguistic annotations.
nlp = English()

def tokenize_review(this_review):
    my_doc = nlp(this_review)
    
    # Create list of word tokens
    token_list = []
    for token in my_doc:
        token_list.append(token.text)
    return token_list

In [44]:
#data['tokenized_review'] = data['cleaned_review'].map(tokenize_review)
#data.head()

### Adapt spacy stopwords list to our topic

In [45]:
#print stopword list from spacy
spacy_stopwords = list(spacy.lang.en.stop_words.STOP_WORDS)

remove_from_stopwordlist=["n't", "most", "much", "never", "no", "not", "nothing", "n‘t", "n’t", "really", "top", "very", "well"]
for word in spacy_stopwords:
    if word in remove_from_stopwordlist:
         spacy_stopwords.remove(word)

add_to_stopwords=['.', ',', '!', '?', ':', '&', '...', '(', ')','-', '/', '"', ';']
for word in add_to_stopwords:
    spacy_stopwords.append(word)

###  Remove stopwords and punctuation

In [46]:
def eliminate_stopwords(this_review):
    
    filtered_sent=[]

    #  "nlp" Object is used to create documents with linguistic annotations.
    doc = this_review
    
    # filtering stop words
    for word in doc:
        if word not in spacy_stopwords:
            filtered_sent.append(word)
    return filtered_sent
    

In [47]:
data['stopcleaned_review'] = data['lemmatized_review'].map(eliminate_stopwords)
data.head()

Unnamed: 0,review,sentiment,cleaned_review,IDcleaned_review,lemmatized_review,stopcleaned_review
0,one of the other reviewers has mentioned that ...,positive,one of the other reviewers has mentioned that ...,one of the other reviewers has mentioned that ...,"[one, of, the, other, reviewer, have, mention,...","[reviewer, mention, watch, oz, episode, -PRON-..."
1,a wonderful little production. <br /><br />the...,positive,a wonderful little production. the filming tec...,a wonderful little production. the filming tec...,"[a, wonderful, little, production, ., the, fil...","[wonderful, little, production, filming, techn..."
2,i thought this was a wonderful way to spend ti...,positive,i thought this was a wonderful way to spend ti...,i thought this was a wonderful way to spend ti...,"[i, think, this, be, a, wonderful, way, to, sp...","[think, wonderful, way, spend, time, hot, sit,..."
3,basically there's a family where a little boy ...,negative,basically there's a family where a little boy ...,basically there's a family where a little boy ...,"[basically, there, be, a, family, where, a, li...","[basically, family, little, boy, jake, think, ..."
4,"petter mattei's ""love in the time of money"" is...",positive,"petter mattei's ""love in the time of money"" is...","petter's ""love in the time of money"" is a visu...","[petter, 's, "", love, in, the, time, of, money...","[petter, love, time, money, visually, stunning..."


Do we do Bag of Words?
Do we do TF-IDF?

### Splitting the dataset into training and test sets

In [48]:
X = data['stopcleaned_review'] # the features we want to analyze, we can play with others too
y = data['sentiment'] # the labels, or answers, we want to test against

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=72)

#### Logistic Model

In [54]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
classifier = LogisticRegression(solver="lbfgs")

# Create pipeline using Bag of Words
pipe = Pipeline([('vectorizer', data['stopcleaned_review']), ('classifier', classifier)])

# model generation
pipe.fit(X_train,y_train)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

#### Evaluating Logistic Regression

In [55]:
from sklearn import metrics
# Predicting with a test dataset
predicted = pipe.predict(X_test)

# Model Accuracy
print(" test Accuracy:",metrics.accuracy_score(y_test, predicted))
print(" Precision:",metrics.precision_score(y_test, predicted, average=None))
print(" Recall:",metrics.recall_score(y_test, predicted, average=None))

NameError: name 'pipe' is not defined

### KNN Classification

In [56]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd

In [57]:
np.random.seed = 10

In [58]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform') #here we can change the K-neighbors

In [59]:
knn.fit(X_train, y_train)

ValueError: setting an array element with a sequence.

In [60]:
knn.score(X_test, y_test)

ValueError: setting an array element with a sequence.

### Decision Trees

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_style("darkgrid")

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [None]:
clf = DecisionTreeClassifier(criterion='entropy')

In [None]:
clf.fit(X_train, y_train)

In [None]:
clf.score(X_test,y_test)

In [None]:
clf.get_depth()

#### Tuning the depth of the tree

In [None]:
scores = []
for d in range(1, 21):
    clf = DecisionTreeClassifier(criterion='entropy', max_depth=d)
    clf.fit(X_train, y_train)
    scores.append(clf.score(X_test, y_test))

In [None]:
plt.plot(scores)
plt.ylabel('accuracy', fontsize=15)
plt.xlabel('depth', fontsize=15)

In [None]:
np.argmax(scores)

 We need to have: 
* Precision & Recall for all methods 
* Precision-Recall curve 
* Cross-validation for all methods 
