In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("moviereviews.tsv", sep='\t')

In [3]:
df.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [4]:
len(df)

2000

We have a dataset with 2000 movie reviews, labeled as neg for negative and pos for positive reviews.

In [5]:
# Checking for the NaN values in the dataset:
df.isnull().sum()

label      0
review    35
dtype: int64

In [6]:
df = df.dropna()
#or 
# df.dropna(inplace=True)

In [7]:
df.isnull().sum()
# Null values have been removed

label     0
review    0
dtype: int64

In [8]:
len(df)

1965

Dealing with "whitespace only" strings
In order to detect these strings we need to iterate over each row in the DataFrame. The **.itertuples()** pandas method is a good tool for this as it provides access to every field. For brevity we'll assign the names `i`, `lb` and `rv` to the `index`, `label` and `review` columns.

In [9]:
blanks = []  # start with an empty list

for i,lb,rv in df.itertuples():  # iterate over the DataFrame
    if type(rv)==str:            # avoid NaN values
        if rv.isspace():         # test 'review' for whitespace
            blanks.append(i)     # add matching index numbers to the list

In [10]:
print("Number of blank reviwes: ", len(blanks),'\n','\n','Blanks: ', blanks)

Number of blank reviwes:  27 
 
 Blanks:  [57, 71, 147, 151, 283, 307, 313, 323, 343, 351, 427, 501, 633, 675, 815, 851, 977, 1079, 1299, 1455, 1493, 1525, 1531, 1763, 1851, 1905, 1993]


In [11]:
df.drop(blanks, inplace=True)

len(df)

1938

In [12]:
df['label'].value_counts()

neg    969
pos    969
Name: label, dtype: int64

### Spliting the data into train & test sets:

In [13]:
from sklearn.model_selection import train_test_split

X = df['review']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=42)

## Building pipelines to vectorize the data, then train and fit a model
Now that we have sets to train and test, we'll develop a selection of pipelines, each with a different model.

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# Naïve Bayes:
text_clf_nb = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', MultinomialNB()),
])

# Linear SVC:
text_clf_lsvc = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LinearSVC()),
])

## Feeding the training data through the first pipeline
We'll run naïve Bayes first

In [15]:
text_clf_nb.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...rue,
        vocabulary=None)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

## Running predictions and analyze the results (naïve Bayes)

In [16]:
# Forming a prediction set
predictions = text_clf_nb.predict(X_test)

In [17]:
# Reporting the confusion matrix
from sklearn import metrics
print(metrics.confusion_matrix(y_test,predictions))

[[306  20]
 [151 202]]


In [18]:
# Classification report
print(metrics.classification_report(y_test,predictions))

              precision    recall  f1-score   support

         neg       0.67      0.94      0.78       326
         pos       0.91      0.57      0.70       353

   micro avg       0.75      0.75      0.75       679
   macro avg       0.79      0.76      0.74       679
weighted avg       0.79      0.75      0.74       679



In [19]:
# Overall accuracy
print(metrics.accuracy_score(y_test,predictions))

0.748159057437408


## ------------------------------------------------------------------------------------------

## Next we'll run Linear SVC

In [20]:
text_clf_lsvc.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [21]:
predictions = text_clf_lsvc.predict(X_test)

from sklearn import metrics
print(metrics.confusion_matrix(y_test,predictions))

[[276  50]
 [ 54 299]]


In [22]:
print(metrics.classification_report(y_test,predictions),'\n',"Accuracy : ", 
      round(metrics.accuracy_score(y_test,predictions)*100,2))

              precision    recall  f1-score   support

         neg       0.84      0.85      0.84       326
         pos       0.86      0.85      0.85       353

   micro avg       0.85      0.85      0.85       679
   macro avg       0.85      0.85      0.85       679
weighted avg       0.85      0.85      0.85       679
 
 Accuracy :  84.68
