In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import string
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
from nltk.corpus import wordnet

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.pipeline import Pipeline, make_pipeline

from mlxtend.feature_extraction import PrincipalComponentAnalysis
from mlxtend.preprocessing import standardize
from sklearn.decomposition import LatentDirichletAllocation as LDA

In [2]:
# Read the train and test samples; the first CSV column is used as the row index.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('clean_train_sample.csv', 'clean_test_sample.csv')
)

In [3]:
# Peek at the first two rows of the test frame.
test.head(2)

Unnamed: 0,Review,Label
8638,I'm a fan of Get Shorty. This is the sequel fo...,0
6385,This film was so bad it became enjoyable. If y...,0


In [4]:
# (rows, columns) of the training frame.
train.shape

(5000, 2)

In [5]:
# Confirm the object returned by read_csv is a DataFrame.
type(train)

pandas.core.frame.DataFrame

In [6]:
def clean_text(text):
    """Turn one raw review into a list of unique, normalised tokens.

    Steps, in order: lowercase the text, strip HTML markup with
    BeautifulSoup, delete ASCII punctuation, split on whitespace, drop
    English stopwords, lemmatize each remaining word with WordNet, and
    remove duplicate tokens.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML tags.

    Returns
    -------
    list of str
        Distinct lemmatized tokens, in order of first appearance.

    Notes
    -----
    NOTE(review): because duplicates are dropped, a vectorizer that uses
    this function as its analyzer sees each token at most once per review
    -- confirm that discarding within-document counts is intended.
    """
    # Remove HTML: parse the lowercased text and keep only its text content.
    no_html = BeautifulSoup(text.lower(), 'lxml').get_text()

    # Remove punctuation in a single pass over the string.
    no_punc = no_html.translate(str.maketrans('', '', string.punctuation))

    # Build the stopword set and the lemmatizer once per call. Previously the
    # stopword list was re-read and scanned linearly for every word, and a
    # new lemmatizer was constructed for every word.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # Drop stopwords, then reduce each remaining word to its root form.
    roots = [
        lemmatizer.lemmatize(word)
        for word in no_punc.split()
        if word not in stop_words
    ]

    # dict.fromkeys de-duplicates while keeping a stable, reproducible order
    # (list(set(...)) returned the same tokens in arbitrary order).
    return list(dict.fromkeys(roots))

import nltk

# Fetch only the corpora clean_text relies on: the stopword list and WordNet.
# A bare nltk.download() opens the interactive downloader, which blocks a
# Restart-and-Run-All of the notebook.
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

In [7]:
#Sanity check: run clean_text on the first training review only and inspect the resulting tokens
train['Review'].head(1).apply(clean_text)

18288    [bozo, guillotine, psychotic, two, tube, fathe...
Name: Review, dtype: object

In [8]:
#Convert the cleaned tokens of every training review into a bag-of-words count matrix.
#The stated intent is to use n-grams (the range below is 1- to 3-grams, not only bigrams)
#to capture more context than single words.
#NOTE(review): per the scikit-learn docs, ngram_range only applies when `analyzer` is not
#a callable. Because clean_text is passed as the analyzer, the range is presumably ignored
#and only the single tokens returned by clean_text are counted -- confirm.
ngram_min = 1
ngram_max = 3
bow = CountVectorizer(analyzer=clean_text, ngram_range=(ngram_min,ngram_max)).fit_transform(train['Review'])


In [9]:
# Print the number of rows (documents) in the bag of words.
# The previous len(bow.todense()) also returned the row count -- not the number of
# words -- and built a full dense copy of the sparse matrix just to measure it.
# The vocabulary size (number of distinct tokens) is bow.shape[1].
print(bow.shape[0])

5000


In [10]:
#Normalize the bag of words
# Apply term frequency-inverse document frequency (TF-IDF) weighting, so that each token
# is represented by a weight instead of by its raw count in the document.
# NOTE: despite its name, tfidf_transformer holds the matrix returned by fit_transform,
# not the TfidfTransformer object itself.

tfidf_transformer = TfidfTransformer().fit_transform(bow)

# Model Training

In [31]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

In [43]:
# Train a multinomial Naive Bayes classifier on the TF-IDF features of the training reviews.
model = MultinomialNB()
model.fit(X=tfidf_transformer, y=train['Label'])

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

# Cross Validation

In [44]:
# 5-fold cross-validated ROC AUC of the classifier on the TF-IDF features, using all cores.
# NOTE(review): tfidf_transformer was built by fitting the vectorizer and the TF-IDF
# weighting on all of `train`, so each fold's held-out rows already influenced the
# vocabulary and IDF weights. Cross-validating the whole text-to-label pipeline on
# train['Review'] would avoid that leakage.
scores = cross_val_score(
    model,
    tfidf_transformer,
    train['Label'],
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)

In [45]:
# Mean ROC AUC across the cross-validation folds.
scores.mean()

0.9303558928121411

# Create a pipeline

In [46]:
# Chain vectorizing, TF-IDF weighting and the Naive Bayes classifier so that raw review
# text can be passed straight to fit/predict.
# NOTE(review): per the scikit-learn docs, ngram_range only applies when `analyzer` is not
# a callable, so with analyzer=clean_text the range is presumably ignored -- confirm.
pipeline = Pipeline(steps=[
    ('bow', CountVectorizer(analyzer=clean_text, ngram_range=(ngram_min, ngram_max))),
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB()),
])

In [47]:
# Fit every pipeline step, from raw review text through to the classifier, on the training data.
pipeline.fit(train['Review'], train['Label'])

Pipeline(memory=None,
         steps=[('bow',
                 CountVectorizer(analyzer=<function clean_text at 0x1a2245cd90>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 3), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('classifier',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [48]:
# Predict a label for every review in the test set with the fitted pipeline.
y_pred = pipeline.predict(test['Review'])

In [49]:
# Evaluate the test-set predictions against the true labels.
# y_pred already holds pipeline.predict(test['Review']) from the previous cell, so the
# slow prediction step is not repeated here.
print(confusion_matrix(test['Label'],y_pred))
print(classification_report(test['Label'],y_pred))

[[2304  252]
 [ 543 1901]]
              precision    recall  f1-score   support

           0       0.81      0.90      0.85      2556
           1       0.88      0.78      0.83      2444

    accuracy                           0.84      5000
   macro avg       0.85      0.84      0.84      5000
weighted avg       0.85      0.84      0.84      5000

