# Text Classification

Dataset: Cornell University Movie Review polarity dataset v2.0 obtained from http://www.cs.cornell.edu/people/pabo/movie-review-data/


In [None]:
import numpy as np
import pandas as pd

# Load the tab-separated reviews file into a DataFrame.
df = pd.read_csv('moviereviews.tsv', sep='\t')
# Preview the first rows; the bare last expression is rendered as the cell output.
df.head()

In [None]:
# Total number of rows loaded, before any cleaning.
len(df)

In [None]:
from IPython.display import Markdown, display
# Render the first review as a Markdown block quote.
# .iloc[0] selects by position rather than by index label, so the cell keeps
# working when it is re-run after the row labelled 0 has been dropped.
display(Markdown('> ' + df['review'].iloc[0]))

In [None]:
# Count the NaN (missing) entries in each column of the DataFrame:
df.isnull().sum()

35 records show **NaN** (short for "not a number"; pandas uses it to mark missing values, much as plain Python uses *None*). These rows are easily removed with the pandas `.dropna()` method.
<div class="alert alert-info" style="margin: 20px">CAUTION: By setting <code>inplace=True</code>, we permanently modify the DataFrame currently in memory, and this can't be undone. However, it does <em>not</em> affect the original source file. If we needed to, we could always reload the original DataFrame from scratch.</div>

In [None]:
# Drop every row that contains a NaN value, modifying df itself (inplace=True).
df.dropna(inplace=True)

# Row count after the incomplete records have been removed.
len(df)

In [None]:
# Collect the index labels of reviews that consist solely of whitespace.
# Such entries are real strings, not NaN, so dropna() does not remove them.
# Only the 'review' column is walked, which keeps the check independent of how
# many columns the DataFrame has; isinstance() skips any non-string value.
blanks = [
    idx
    for idx, rv in df['review'].items()
    if isinstance(rv, str) and rv.isspace()
]

print(len(blanks), 'blanks: ', blanks)

In [None]:
# Remove the whitespace-only reviews whose index labels were gathered in `blanks`.
# errors='ignore' keeps the cell re-runnable: on a second execution the labels
# are already gone, and without it DataFrame.drop would raise a KeyError.
df.drop(blanks, inplace=True, errors='ignore')

# Row count after the blank reviews have been removed.
len(df)

In [None]:
# Count how many reviews carry each label, to see how balanced the classes are.
df['label'].value_counts()

In [None]:
from sklearn.model_selection import train_test_split

# Features are the raw review texts; targets are their labels.
X = df['review']
y = df['label']

# Hold out 33% of the rows for testing. random_state fixes the shuffle so the
# same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# Each pipeline turns raw review text into TF-IDF features and passes them to a
# classifier, so it can be fitted on, and predict from, plain strings.

# Naïve Bayes:
text_clf_nb = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

# Linear SVC:
# LinearSVC's solver draws random numbers while fitting; pinning random_state
# makes the reported scores repeatable from one run to the next.
text_clf_lsvc = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC(random_state=42)),
])

In [None]:
# Train the Naïve Bayes pipeline (vectorizer + classifier) on the training split.
text_clf_nb.fit(X_train, y_train)

In [None]:
# Form a prediction set: Naïve Bayes labels for the held-out test reviews
predictions = text_clf_nb.predict(X_test)

In [None]:
# Report the confusion matrix (true labels are passed first, predictions second)
from sklearn import metrics
print(metrics.confusion_matrix(y_test,predictions))

In [None]:
# Print a classification report comparing the true test labels with `predictions`
print(metrics.classification_report(y_test,predictions))

In [None]:
# Print the overall accuracy of `predictions` against the true test labels
print(metrics.accuracy_score(y_test,predictions))

In [None]:
# Train the Linear SVC pipeline on the same training split used for Naïve Bayes.
text_clf_lsvc.fit(X_train, y_train)

In [None]:
# Form a prediction set with the Linear SVC pipeline.
# NOTE: this rebinds `predictions`, replacing the Naïve Bayes results.
predictions = text_clf_lsvc.predict(X_test)

In [None]:
# Report the confusion matrix (true labels are passed first, predictions second)
from sklearn import metrics
print(metrics.confusion_matrix(y_test,predictions))

In [None]:
# Print a classification report comparing the true test labels with `predictions`
print(metrics.classification_report(y_test,predictions))

In [None]:
# Print the overall accuracy of `predictions` against the true test labels
print(metrics.accuracy_score(y_test,predictions))

In [None]:
# Custom stop-word list, later handed to TfidfVectorizer(stop_words=...).
# The enclosing brackets already let the literal span several lines, so no
# backslash continuations are needed.
stopwords = [
    'a', 'about', 'an', 'and', 'are', 'as', 'at',
    'be', 'been', 'but', 'by',
    'can',
    'even', 'ever',
    'for', 'from',
    'get',
    'had', 'has', 'have', 'he', 'her', 'hers', 'his', 'how',
    'i', 'if', 'in', 'into', 'is', 'it', 'its',
    'just',
    'me', 'my',
    'of', 'on', 'or',
    'see', 'seen', 'she', 'so',
    'than', 'that', 'the', 'their', 'there', 'they', 'this', 'to',
    'was', 'we', 'were', 'what', 'when', 'which', 'who', 'will', 'with',
    'you',
]

In [None]:

# Linear SVC pipeline whose vectorizer discards the custom stop words.
# random_state is pinned because LinearSVC's solver draws random numbers while
# fitting; a fixed seed makes the reported scores repeatable between runs.
text_clf_lsvc2 = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stopwords)),
    ('clf', LinearSVC(random_state=42)),
])
text_clf_lsvc2.fit(X_train, y_train)

In [None]:
# Predict the test set with the stop-word pipeline (rebinds `predictions` again)
# and print its confusion matrix.
predictions = text_clf_lsvc2.predict(X_test)
print(metrics.confusion_matrix(y_test,predictions))

In [None]:
# Classification report for the current `predictions` against the true test labels.
print(metrics.classification_report(y_test,predictions))

In [None]:
# Overall accuracy of the current `predictions` against the true test labels.
print(metrics.accuracy_score(y_test,predictions))

In [None]:
# A hand-written sample review used to try the trained classifiers on new text.
# Adjacent string literals inside the parentheses are joined into one string.
myreview = (
    "A movie I really wanted to love was terrible. "
    "I'm sure the producers had the best intentions, but the execution was lacking."
)

In [None]:
# Classify the sample review with the Naïve Bayes pipeline.
# predict() takes a collection of documents, so the single string is wrapped in a list.
print(text_clf_nb.predict([myreview]))  # be sure to put "myreview" inside square brackets

In [None]:
# Classify the same sample review with the Linear SVC pipeline (again passed as a one-item list).
print(text_clf_lsvc.predict([myreview]))