## Predict the sentiment of a movie based on the reviews
#### The below model works based on the IMDB reviews dataset from Cornell Natural Language Processing. This is a classic sentiment analysis dataset and the Random Forests are used for training and testing

In [155]:
import numpy as np  
import re  
import nltk  
from sklearn.datasets import load_files  
nltk.download('stopwords')  
import pickle  
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to /Users/swat/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [156]:
movie_data = load_files(r"/Users/swat/Desktop/ML_AN/review_polarity/txt_sentoken")  ## files should be in directory tree format with classifications as folder names,returns target(integer value for classes),data,target_names(class names)
X, y = movie_data.data, movie_data.target 

In [157]:
movie_data.target_names

['neg', 'pos']

In the below cell, the pre-processing of the input document is done. Since the dataset provided was mostly clean, processing steps such as removing punctuations, spaces and certain indicators are cleaned. 

In [158]:
documents=[]
stemmer = WordNetLemmatizer()
for sen in range(0,len(X)):
    document= re.sub(r'\W',' ', str(X[sen]))
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ',document )
    document = re.sub(r'^[a-zA-Z]\s+', ' ',document )
    document = re.sub(r'\s+',' ',document )
    document = document.split()
    document = [stemmer.lemmatize(word) for word in document]
    document = ' '.join(document)
    documents.append(document)
    


Count Vectorizer is a way of converting the words into a matrix based on their occurences in each of the document(each review). Stop words are removed and only the most frequently occuring words are considered for training. 

In [163]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
X=vectorizer.fit_transform(documents).toarray()
vectorizer.get_feature_names()

['000',
 '10',
 '13',
 '1997',
 '1998',
 '1999',
 '20',
 '80',
 '90',
 'ability',
 'able',
 'absolutely',
 'academy',
 'accent',
 'accident',
 'across',
 'act',
 'acting',
 'action',
 'actor',
 'actress',
 'actual',
 'actually',
 'adam',
 'adaptation',
 'add',
 'addition',
 'admit',
 'adult',
 'adventure',
 'affair',
 'affleck',
 'age',
 'agent',
 'ago',
 'air',
 'alan',
 'alien',
 'alive',
 'allen',
 'allows',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'amazing',
 'america',
 'american',
 'among',
 'amount',
 'amusing',
 'anderson',
 'angel',
 'angle',
 'angry',
 'animal',
 'animated',
 'animation',
 'annoying',
 'another',
 'answer',
 'anti',
 'anyone',
 'anything',
 'anyway',
 'apart',
 'apartment',
 'ape',
 'apparent',
 'apparently',
 'appeal',
 'appealing',
 'appear',
 'appearance',
 'appears',
 'appreciate',
 'approach',
 'appropriate',
 'arm',
 'army',
 'around',
 'art',
 'artist',
 'aside',
 'ask',
 'asked',
 'asks',
 'aspect',
 'atmosphere',
 

In [164]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [2, 0, 0, ..., 1, 0, 0]], dtype=int64)

The below step converts the raw frequency of the words into Tf-Idf format

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer ###transform an existing raw count to tfidf
tfidfconverter = TfidfTransformer()
X = tfidfconverter.fit_transform(X).toarray()

The below is another way of declaring Tf Idf vector. Both methods provide the same output matrix.

In [161]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidfconverter = TfidfVectorizer(max_features=1500, max_df=0.7, min_df=5, stop_words=stopwords.words('english'))
X1 = tfidfconverter.fit_transform(documents).toarray()
tfidfconverter.get_feature_names()

['000',
 '10',
 '13',
 '1997',
 '1998',
 '1999',
 '20',
 '80',
 '90',
 'ability',
 'able',
 'absolutely',
 'academy',
 'accent',
 'accident',
 'across',
 'act',
 'acting',
 'action',
 'actor',
 'actress',
 'actual',
 'actually',
 'adam',
 'adaptation',
 'add',
 'addition',
 'admit',
 'adult',
 'adventure',
 'affair',
 'affleck',
 'age',
 'agent',
 'ago',
 'air',
 'alan',
 'alien',
 'alive',
 'allen',
 'allows',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'amazing',
 'america',
 'american',
 'among',
 'amount',
 'amusing',
 'anderson',
 'angel',
 'angle',
 'angry',
 'animal',
 'animated',
 'animation',
 'annoying',
 'another',
 'answer',
 'anti',
 'anyone',
 'anything',
 'anyway',
 'apart',
 'apartment',
 'ape',
 'apparent',
 'apparently',
 'appeal',
 'appealing',
 'appear',
 'appearance',
 'appears',
 'appreciate',
 'approach',
 'appropriate',
 'arm',
 'army',
 'around',
 'art',
 'artist',
 'aside',
 'ask',
 'asked',
 'asks',
 'aspect',
 'atmosphere',
 

In [162]:
X1

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.07349223,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.05424743, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.1232798 , 0.        , 0.        , ..., 0.05277153, 0.        ,
        0.        ]])

Now splitting the training set into train and test set. This training set is again split into train and validation set in order to obtain maximum accuracy, before testing with the test set.

In [124]:
from sklearn.model_selection import train_test_split
X_trainset, X_test, y_trainset, y_test = train_test_split(X1,y, test_size=0.3, random_state=0)

X_train, X_valid, y_train, y_valid = train_test_split(X_trainset,y_trainset, test_size=0.3/0.7, random_state=0)

In [125]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [126]:
y_pred = classifier.predict(X_valid)

Confusion Matrix

In [127]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_valid, y_pred))

[[247  31]
 [ 89 233]]


Assessing the evaluation metrics

In [128]:
print(classification_report(y_valid, y_pred))

             precision    recall  f1-score   support

          0       0.74      0.89      0.80       278
          1       0.88      0.72      0.80       322

avg / total       0.81      0.80      0.80       600



In [129]:
print(accuracy_score(y_valid, y_pred))

0.8


Now predicting with the test set

In [130]:
y_pred_test = classifier.predict(X_test)

In [131]:
print(confusion_matrix(y_test, y_pred_test))

[[268  35]
 [ 82 215]]


In [132]:
print(classification_report(y_test, y_pred_test))

             precision    recall  f1-score   support

          0       0.77      0.88      0.82       303
          1       0.86      0.72      0.79       297

avg / total       0.81      0.81      0.80       600



In [133]:
print(accuracy_score(y_test, y_pred_test))

0.805


In [134]:
y_test

array([0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0,
       0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1,

Cross Validation

In [147]:
from sklearn.model_selection import cross_val_score
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
scores = cross_val_score(classifier, X_trainset, y_trainset, cv=10)
print(scores)
scores = pd.Series(scores)
print(scores.mean(), " =/- ", scores.std())

[0.74468085 0.75886525 0.82269504 0.83571429 0.87142857 0.79285714
 0.87142857 0.81294964 0.79136691 0.87769784]
0.8179684094669553  =/-  0.04696156067652733


Saving the model

In [138]:
with open('text_classifier', 'wb') as picklefile:  
    pickle.dump(classifier,picklefile)
    

In [139]:
with open('text_classifier', 'rb') as training_model:  
    model = pickle.load(training_model)

In [140]:
y_pred2 = model.predict(X_test)

print(confusion_matrix(y_test, y_pred2))  
print(classification_report(y_test, y_pred2))  
print(accuracy_score(y_test, y_pred2))  

[[268  35]
 [ 82 215]]
             precision    recall  f1-score   support

          0       0.77      0.88      0.82       303
          1       0.86      0.72      0.79       297

avg / total       0.81      0.81      0.80       600

0.805
