In [1]:
# import packages

import os
import re
import nltk
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import TreebankWordTokenizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.classification import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score

import warnings
warnings.filterwarnings("ignore")

In [2]:
path = 'train/neg/'
neg = os.listdir(path)

In [3]:
documentlist = []

for negative in neg:
    with open('train/neg/{}'.format(negative), 'r', encoding="utf8") as f1:
        documentlist.append([f1.read(), 0])

In [4]:
path = 'train/pos/'
pos = os.listdir(path)

In [5]:
for positive in pos:
    with open('train/pos/{}'.format(positive), 'r', encoding="utf8") as f1:
        documentlist.append([f1.read(), 1])

In [6]:
documentDF = pd.DataFrame(documentlist, columns = ['Content', 'Feedback'])
documentDF.tail()

Unnamed: 0,Content,Feedback
24995,"Seeing as the vote average was pretty low, and...",1
24996,"The plot had some wretched, unbelievable twist...",1
24997,I am amazed at how this movie(and most others ...,1
24998,A Christmas Together actually came before my t...,1
24999,Working-class romantic drama from director Mar...,1


In [7]:
#documentDF.to_csv('traindata.csv', index = False)

In [8]:
documentlist = []

path = 'test/neg/'
neg = os.listdir(path)

In [9]:
for negative in neg:
    with open('test/neg/{}'.format(negative), 'r', encoding="utf8") as f1:
        documentlist.append([f1.read(), 0])

In [10]:
path = 'test/pos/'
pos = os.listdir(path)

In [11]:
for positive in pos:
    with open('test/pos/{}'.format(positive), 'r', encoding="utf8") as f1:
        documentlist.append([f1.read(), 1])

In [12]:
documentDF = pd.DataFrame(documentlist, columns = ['Content', 'Feedback'])
documentDF.tail()

Unnamed: 0,Content,Feedback
24995,I was extraordinarily impressed by this film. ...,1
24996,"Although I'm not a golf fan, I attended a snea...",1
24997,"From the start of ""The Edge Of Love"", the view...",1
24998,"This movie, with all its complexity and subtle...",1
24999,I've seen this story before but my kids haven'...,1


In [13]:
#documentDF.to_csv('testdata.csv', index = False)

In [14]:
# Load the dataset
df = pd.read_csv('traindata.csv')
print(df.head())

                                             Content  Feedback
0  Story of a man who has unnatural feelings for ...         0
1  Airport '77 starts as a brand new luxury 747 p...         0
2  This film lacked something I couldn't put my f...         0
3  Sorry everyone,,, I know this is supposed to b...         0
4  When I was little my parents took me along to ...         0


In [15]:
time_start = time.time()

# declare empty list 'corpus'
corpus = []

# for loop to fill in corpus
for i in range(0, 25000):
    review = df['Content'].iloc[i]
    # retain alphabets
    review = re.sub("[^a-zA-Z]", " ", review)
    # convert to lower case
    review = review.lower()
    # tokenize
    review = review.split()
    # initialize stemmer object
    ps = PorterStemmer()
    # perform stemming
    review = [word for word in review if word not in stopwords.words('english')]
    review = [ps.stem(i) for i in review]
    # join elements of list
    review = ' '.join(review)
    # add to 'corpus'
    corpus.append(review)
    
# display 'corpus'
print(corpus)

print('Corpus created! Time elapsed: {} seconds'.format(time.time()-time_start))

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [16]:
# Instantiate count vectorizer
cv = CountVectorizer(max_features = 1500)

# Independent variable
X_train = cv.fit_transform(corpus)

# dependent variable
y_train = df['Feedback']

In [17]:
# Load the dataset
df1 = pd.read_csv('testdata.csv')
print(df1.head())

                                             Content  Feedback
0  Once again Mr. Costner has dragged out a movie...         0
1  This is an example of why the majority of acti...         0
2  First of all I hate those moronic rappers, who...         0
3  Not even the Beatles could write songs everyon...         0
4  Brass pictures (movies is not a fitting word f...         0


In [18]:
time_start = time.time()

corpus_test = []

# for loop to fill in corpus
for i in range(0, 25000):
    review = df1['Content'].iloc[i]
    # retain alphabets
    review = re.sub("[^a-zA-Z]", " ", review)
    # convert to lower case
    review = review.lower()
    # tokenize
    review = review.split()
    # initialize stemmer object
    ps = PorterStemmer()
    # perform stemming
    review = [word for word in review if word not in stopwords.words('english')]
    review = [ps.stem(i) for i in review]
    # join elements of list
    review = ' '.join(review)
    # add to 'corpus'
    corpus_test.append(review)
    
# display 'corpus'
print(corpus_test)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [19]:
print('Corpus created! Time elapsed: {} seconds'.format(time.time()-time_start))

Corpus created! Time elapsed: 1121.0855565071106 seconds


In [20]:
# Independent variable
X_test = cv.fit_transform(corpus_test)

# dependent variable
y_test = df1['Feedback']

In [22]:
# Insatntiate calssifier
rf = RandomForestClassifier(random_state=2)

# fit modelk on training data
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=2, verbose=0, warm_start=False)

In [23]:
# predict on test data
y_pred = rf.predict(X_test)

In [24]:
# calculate the accuracy score
score = accuracy_score(y_test, y_pred)

# calculate the precision
precision = precision_score(y_test, y_pred)

# display 'score' and 'precision'
print("Score: ", round(score, 4))
print("Precision: ", round(precision, 4))

Score:  0.5026
Precision:  0.5043


In [25]:
xgb_clf = XGBClassifier(base_estimator=rf, random_state=0)
xgb_clf.fit(X_train, y_train)
xgb_score = xgb_clf.score(X_test, y_test)
print("Score with XGBOOSTING: ", xgb_score)

Score with XGBOOSTING:  0.50384


In [26]:
y_pred = xgb_clf.predict(X_test)

# calculate the accuracy score
score = accuracy_score(y_test, y_pred)

# calculate the precision
precision = precision_score(y_test, y_pred)

# display 'score' and 'precision'
print("Score: ", round(score, 4))
print("Precision: ", round(precision, 4))

Score:  0.5038
Precision:  0.5034


In [27]:
nb = GaussianNB()
nb.fit(X_train.toarray(), y_train)

# generate your cross-validation prediction with 10 fold Stratified sampling
y_pred = cross_val_predict(nb, X_train.toarray(), y_test, cv=10)

print(classification_report(y_test, y_pred))
print("ACCURACY::", accuracy_score(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.69      0.85      0.76     12500
           1       0.80      0.62      0.70     12500

   micro avg       0.73      0.73      0.73     25000
   macro avg       0.75      0.73      0.73     25000
weighted avg       0.75      0.73      0.73     25000

ACCURACY:: 0.73368


In [39]:
tokenizer = TreebankWordTokenizer()

#instantiate TfidfVectorizer (with the default parameters)

vect = TfidfVectorizer()
vect.set_params(tokenizer=tokenizer.tokenize)

# remove English stop words
vect.set_params(stop_words='english')

# include 1-grams and 2-grams
vect.set_params(ngram_range=(1, 3))

# ignore terms that appear in more than 50% of the documents
vect.set_params(max_df=0.5)

# only keep terms that appear in at least 2 documents
vect.set_params(min_df=100)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=None, min_df=100,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<bound method TreebankWordTokenizer.tokenize of <nltk.tokenize.treebank.TreebankWordTokenizer object at 0x000000007053D6A0>>,
        use_idf=True, vocabulary=None)

In [40]:
train_vectors = vect.fit_transform(df['Content'])

In [41]:
nb.fit(train_vectors.toarray(), y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [42]:
test_vectors = vect.transform(df1['Content'])

In [43]:
# generate your cross-validation prediction with 10 fold Stratified sampling
y_pred = cross_val_predict(nb, test_vectors.toarray(), y_test, cv=10)

print(classification_report(y_test, y_pred))
print ("ACCURACY::",accuracy_score(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.81      0.83      0.82     12500
           1       0.82      0.81      0.82     12500

   micro avg       0.82      0.82      0.82     25000
   macro avg       0.82      0.82      0.82     25000
weighted avg       0.82      0.82      0.82     25000

ACCURACY:: 0.8174
