<a href="https://colab.research.google.com/github/Twixii99/Movie-Review-Dataset/blob/main/Movie_Review_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%autosave 1

Autosaving every 1 seconds


In [127]:
import pandas as pd
import numpy as np
import re

import os
import tarfile
import urllib

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords, wordnet, words
from nltk.stem import WordNetLemmatizer 

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, recall_score

from IPython.display import display

nltk.download('punkt')
nltk.download('words')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz -O data.tar.gz

--2021-12-14 07:17:31--  https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘data.tar.gz’


2021-12-14 07:17:37 (14.3 MB/s) - ‘data.tar.gz’ saved [84125825/84125825]



In [None]:
# Unzipping the data files.
tar = tarfile.open('data.tar.gz', 'r:gz')
tar.extractall()
tar.close()

# Data Cleaninng 
## **Goal:** Get the data in clean, standard format for further analysis.
## **HINT:** Different Types of analysis require different data formats.


1.   Corpus (I will use Pandas to build the data in this format)
2.   Document-Term Matrix

### Actually for Document-Term Matrix Format we need to do some nice things:

*   Cleaning data { numbers, punks, lowercase }
*   Tokenize text ( Spitting the data into tokens then removing the stopwords from tokens )







In [3]:
def collect_data(data_path: str):
  reviews = []
  for review_file in os.listdir(data_path):
    review_file = os.path.join(data_path, review_file)
    with open(review_file, 'r') as review:
      reviews.append(review.read().rstrip())
  return reviews  

In [4]:
def clean(data):
  cleaned_data = []
  for item in data:
    item = item.lower()
    item = re.sub(r'[^\w\s]|[\d+]', '', item)
    cleaned_data.append(item)
  return cleaned_data 

In [5]:
pos_training_data_path = '/content/aclImdb/train/pos'
neg_training_data_path = '/content/aclImdb/train/neg'
pos_testing_data_path = '/content/aclImdb/test/pos'
neg_testing_data_path = '/content/aclImdb/test/neg'

train_pos_reviews = collect_data(pos_training_data_path)
train_neg_reviews = collect_data(neg_training_data_path)
test_pos_reviews = collect_data(pos_testing_data_path)
test_neg_reviews = collect_data(neg_testing_data_path)

cleaned_train_pos_reviews = clean(train_pos_reviews)
cleaned_train_neg_reviews = clean(train_neg_reviews)
cleaned_test_pos_reviews = clean(test_pos_reviews)
cleaned_test_neg_reviews = clean(test_neg_reviews)
  
print('Number of positive reviews: {}, and the number of negative reviews is: {}'.format(len(train_pos_reviews), len(train_neg_reviews)))
print('Number of positive reviews: {}, and the number of negative reviews is: {}'.format(len(test_pos_reviews), len(test_neg_reviews)))

Number of positive reviews: 12500, and the number of negative reviews is: 12500
Number of positive reviews: 12500, and the number of negative reviews is: 12500


In [6]:
train_pos_reviews[0]

'This is a short, crudely animated series by David Lynch (as it says in the beginning), and it follows the misadventures of a backwoods, overall-wearing large man, with a wife who has a stress disorder and an annoying son. Both of those elements are harped upon repeatedly in the short episodes, and there\'s no real plot to be seen. It\'s easier if you think of this as an exceptionally odd, slightly macabre Looney Tunes- with far more gore, profanity, bloody violence, and occasional moments of hilarity.<br /><br />I bought the DVD along with Eraserhead, having previously seen Eraserhead. Don\'t look to this series if you want an artistic masterpiece- this is anything but. In fact, it seems to almost be a statement against such things, as its rough style spits in the face of any sort of animation convention you may see. As Lynch says, "If this is funny, it is only funny because we see the absurdity of it all."'

In [7]:
cleaned_train_pos_reviews[0]

'this is a short crudely animated series by david lynch as it says in the beginning and it follows the misadventures of a backwoods overallwearing large man with a wife who has a stress disorder and an annoying son both of those elements are harped upon repeatedly in the short episodes and theres no real plot to be seen its easier if you think of this as an exceptionally odd slightly macabre looney tunes with far more gore profanity bloody violence and occasional moments of hilaritybr br i bought the dvd along with eraserhead having previously seen eraserhead dont look to this series if you want an artistic masterpiece this is anything but in fact it seems to almost be a statement against such things as its rough style spits in the face of any sort of animation convention you may see as lynch says if this is funny it is only funny because we see the absurdity of it all'

In [8]:
# Preparing the stopwards in English
english_stopwords = stopwords.words('english')
# Lemmatization
lemmatizer = WordNetLemmatizer()
# Error and spelling. corrections
words = set(nltk.corpus.words.words())

In [9]:
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

In [10]:
def advanced_clean(data, potential_features: set):
  for idx, sample in enumerate(data):
    __tokens__ = []
    tokens = nltk.word_tokenize(sample)
    for token in tokens:
      if token not in english_stopwords:
        token_pos = get_wordnet_pos(token)
        lemma = lemmatizer.lemmatize(token, token_pos)
        if  lemma in words and lemma not in english_stopwords and lemma in words:
          potential_features.add(lemma)
          __tokens__.append(lemma)
    data[idx] = ''.join(__token__ + ' ' for __token__ in __tokens__)

In [11]:
# converting the representation of text to the Bag of Words (BoW) model.
# BoW is a simplified format of representing the text that ingnrs order.
# Really powerful as a first round of analysis.
potential_features = set()

advanced_clean(cleaned_train_pos_reviews, potential_features)
advanced_clean(cleaned_train_neg_reviews, potential_features)
advanced_clean(cleaned_test_pos_reviews, potential_features)
advanced_clean(cleaned_test_neg_reviews, potential_features)

In [37]:
print("number of features(unique english non-stopwords words):", len(potential_features))

number of features(unique english non-stopwords words): 29621


# Building Corups

In [None]:
# preparring the labels
labels = np.array([1 for item in range(len(train_pos_reviews))] + [0 for item in range(len(train_neg_reviews))])

In [None]:
data = {
    'reviews': train_pos_reviews + train_neg_reviews,
    'label': labels
}
corpus_df = pd.DataFrame(data=data).sample(frac = 1).reset_index(drop=True)

In [None]:
corpus_df.head(10)

Unnamed: 0,reviews,label
0,"This is a wonderful movie with a fun, clever s...",1
1,Watched on Hulu (far too many commercials!) so...,0
2,Time for Hollywood to sit up and take notice! ...,1
3,"Hitokiri (which translates roughly as ""assassi...",1
4,This film is justly famous as one of the most ...,0
5,One of the most timely and engrossing document...,1
6,I have to start off by apologizing because I t...,1
7,"First, there is NO way the remake can be as go...",1
8,I did watch all of the film through to the dis...,0
9,This is one of the best Bollywood movies i hav...,1


# Data splitting

In [87]:
full_data = cleaned_train_pos_reviews + cleaned_train_neg_reviews + cleaned_test_pos_reviews + cleaned_test_neg_reviews
labels = np.array([1 for item in range(len(cleaned_train_pos_reviews))] + [0 for item in range(len(cleaned_train_neg_reviews))] 
                  + [1 for item in range(len(cleaned_test_pos_reviews))] + [0 for item in range(len(cleaned_test_neg_reviews))])

In [88]:
len(full_data), len(labels)

(50000, 50000)

In [89]:
full_train, test, full_train_labels, test_labels = train_test_split(full_data, labels, test_size=0.2, shuffle=True, random_state=11);
train, validate, train_labels, validate_labels = train_test_split(full_train, full_train_labels, test_size=0.25, shuffle=True, random_state=11);

In [90]:
len(train), len(train_labels), len(validate), len(validate_labels), len(test), len(test_labels)

(30000, 30000, 10000, 10000, 10000, 10000)

# DTM data frame

In [91]:
cv = CountVectorizer(max_df=0.95, min_df=0.05)
X_train_data = cv.fit_transform(train)
X_validate_data = cv.transform(validate)
X_test_data = cv.transform(test)

In [85]:
X_train_data.toarray().shape

(30000, 310)

In [128]:
len(cv.get_feature_names())



310

In [109]:
train_df = pd.DataFrame(data=X_train_data.toarray(), columns=cv.get_feature_names())
validate_df = pd.DataFrame(X_validate_data.toarray(), columns = cv.get_feature_names())
test_df = pd.DataFrame(X_test_data.toarray(), columns = cv.get_feature_names())



In [132]:
display(train_df.head()), display(validate_df.head()), display(test_df.head())

Unnamed: 0,absolutely,act,action,actor,actually,add,almost,along,also,although,always,another,anyone,anything,appear,around,attempt,audience,away,awful,back,bad,base,beautiful,become,begin,believe,best,big,bit,black,book,boring,boy,budget,call,camera,cant,care,case,...,three,time,title,together,top,true,truly,try,turn,two,understand,use,version,video,view,viewer,want,war,wasnt,waste,watch,way,well,whole,wife,without,woman,wonder,wonderful,word,work,world,worth,would,write,wrong,year,yes,yet,young
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,1,0,0,0,1,0,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0
4,0,1,0,3,1,0,0,0,2,0,3,0,1,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,3,2,0,0,0,0,0,0,0,0,0,0,...,0,6,1,0,0,1,0,0,0,2,0,2,1,0,0,1,0,1,0,0,0,4,1,2,0,0,0,0,0,0,2,0,0,0,0,0,2,0,1,0


Unnamed: 0,absolutely,act,action,actor,actually,add,almost,along,also,although,always,another,anyone,anything,appear,around,attempt,audience,away,awful,back,bad,base,beautiful,become,begin,believe,best,big,bit,black,book,boring,boy,budget,call,camera,cant,care,case,...,three,time,title,together,top,true,truly,try,turn,two,understand,use,version,video,view,viewer,want,war,wasnt,waste,watch,way,well,whole,wife,without,woman,wonder,wonderful,word,work,world,worth,would,write,wrong,year,yes,yet,young
0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,2,0,0,0,0,0,0,0,0,1,2,1,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,5,1,3,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,0,1,1,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0
3,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,0,3,0,0,0,0,0,0,1,0,0,0,0,0,0


Unnamed: 0,absolutely,act,action,actor,actually,add,almost,along,also,although,always,another,anyone,anything,appear,around,attempt,audience,away,awful,back,bad,base,beautiful,become,begin,believe,best,big,bit,black,book,boring,boy,budget,call,camera,cant,care,case,...,three,time,title,together,top,true,truly,try,turn,two,understand,use,version,video,view,viewer,want,war,wasnt,waste,watch,way,well,whole,wife,without,woman,wonder,wonderful,word,work,world,worth,would,write,wrong,year,yes,yet,young
0,0,0,0,0,0,0,0,0,1,0,2,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,1,1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0
1,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0


(None, None, None)

In [115]:
from sklearn import preprocessing
X_train = pd.DataFrame(preprocessing.scale(X_train_data.toarray()), columns = cv.get_feature_names())
X_validate = pd.DataFrame(preprocessing.scale(X_validate_data.toarray()), columns = cv.get_feature_names())
X_test = pd.DataFrame(preprocessing.scale(X_test_data.toarray()), columns = cv.get_feature_names())



In [116]:
pca = PCA(n_components=0.8, svd_solver='full')
X_train_minimized = pca.fit_transform(X_train)
X_validate_minimized = pca.transform(X_validate)
X_test_minimized = pca.transform(X_test)

In [121]:
train_df2 = pd.DataFrame(data=X_train_minimized)
validate_df2 = pd.DataFrame(X_validate_minimized)
test_df2 = pd.DataFrame(X_test_minimized)

In [123]:
display(train_df2.head()), display(validate_df2.head()), display(test_df2.head())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220
0,-3.714022,0.32665,0.952677,-0.256785,-0.317663,0.067011,-0.313837,0.260127,-0.736693,-0.306876,0.257238,0.825132,1.119011,0.661598,0.729808,0.616266,0.549789,-0.740469,-0.04582,-0.766732,-0.182131,0.050254,-0.354026,-0.660534,1.843665,-0.463395,-0.848693,1.020685,0.292743,-0.923198,1.524125,0.035997,-0.070903,-0.355336,-0.041756,0.839237,-0.547357,-0.447659,-0.301166,-0.095129,...,0.281426,-0.167456,-0.188156,-0.239352,-0.032054,0.222577,0.282505,0.537704,1.050689,0.238569,-0.630353,-0.966334,-0.596238,0.005802,-0.542151,-0.036893,0.016136,-0.231373,0.011691,-0.978165,-0.240508,0.221547,0.422594,-0.313671,0.333947,-0.348984,-0.864796,0.076692,0.564008,-0.13673,0.167229,-1.036985,0.667576,-0.092401,0.28903,0.163428,-0.277421,0.088984,0.388812,-0.67124
1,-1.435664,-0.698079,0.936132,0.592507,1.834695,0.446115,0.817845,-0.128024,0.088748,-0.763717,-0.709145,-1.015218,-0.170444,1.198778,-0.103533,-1.211594,-0.239279,0.507251,0.265997,0.712011,-0.76824,-1.66994,0.007011,2.152603,-2.178905,-0.148427,0.335049,-0.105462,-0.971871,-0.634372,0.631034,0.083695,-1.167299,-0.322652,0.701396,-0.265351,0.158537,0.924733,0.089723,1.684061,...,-0.771888,0.711504,-0.68141,-2.790454,1.472794,0.238163,1.323692,0.757597,-1.665247,0.634935,-0.602532,-1.168492,0.939032,1.099448,-0.378242,-0.789704,-1.180059,1.016531,1.206541,-0.936633,-0.113951,-0.2246,0.238731,-0.556433,-0.360259,-1.374497,0.166823,0.826742,-1.141011,0.893219,1.607787,0.442518,0.247674,-1.185882,-1.184744,0.406193,-0.264499,0.936333,0.970664,0.515709
2,-0.325994,-1.069913,-0.279649,1.387446,0.323508,0.871323,1.380215,0.649811,-0.185176,-0.145812,-0.299123,-0.804413,0.668378,0.113801,-1.831866,0.475826,0.538419,0.049005,-0.040697,-0.909074,0.859959,-0.615591,1.639611,1.135,0.981666,0.471629,-0.043788,-0.536919,-1.744609,1.676735,-1.00207,-0.253054,-0.485735,-0.4703,0.651439,-0.673807,1.330782,0.432404,0.92172,0.388075,...,-1.003708,1.645949,-0.541237,0.485813,0.254745,0.708985,1.548381,0.144085,-0.272117,-0.313394,-0.674562,0.301155,-0.546234,0.037209,-0.860701,-1.310693,-0.920956,0.788346,-1.372371,-1.291845,0.029313,-1.335148,0.631903,-0.360812,-0.127511,0.649391,1.100449,-0.981244,-0.655746,0.561689,-2.449519,-1.87392,-1.059714,0.123042,-0.131417,-0.025541,-0.748026,0.498186,1.848934,0.092106
3,-2.784459,0.250436,-0.439211,0.049519,0.663685,-0.661791,1.370234,-1.27027,0.352023,0.728507,-0.079416,-0.751052,-2.216237,0.064398,1.204275,-1.005425,-1.296045,0.136044,1.390928,-0.508721,1.347658,-0.950407,0.131473,-0.264999,1.1593,0.159917,0.688112,0.432887,-1.014092,-0.302576,2.145718,-1.197859,0.490107,0.057966,-0.27495,-0.328626,-0.309419,0.324349,0.541045,-2.535868,...,0.527574,-0.573064,-1.024591,-0.21235,-0.245367,-0.11893,0.32988,-0.979946,0.148422,0.010946,-0.357013,-0.125772,0.276698,-0.398683,-0.836296,-0.356928,-0.367887,0.146069,-0.433048,-0.059641,-0.31482,-1.175827,-0.902931,-0.26686,0.014961,0.123341,0.060557,-0.298129,0.705049,-0.764632,-0.236707,-0.454125,-0.393619,-0.671575,1.117098,0.004031,0.587635,-0.305833,0.60432,-0.44521
4,13.100914,-1.650632,-3.150654,-3.30769,1.723023,0.573653,0.094679,1.789119,0.610579,-3.084891,-3.004384,1.366654,3.214039,-1.167245,0.781152,-1.400376,2.961258,0.016005,1.414072,1.819674,-0.11626,2.406677,1.626936,0.412467,-1.709257,-0.905193,-2.677871,0.185194,0.676383,1.169372,1.390289,-0.667506,-1.684203,-1.637292,0.918299,-2.964223,0.675926,0.001648,-0.949687,1.968716,...,-0.170318,0.707488,-1.832612,0.48102,-1.45691,-1.843467,3.766376,0.662864,-2.222609,0.421997,1.814085,3.190071,0.131352,-0.672481,-0.69012,-0.901215,-0.363825,-0.329444,0.507019,1.389419,0.142581,-1.124617,0.506887,-0.601229,0.805965,-2.008684,-0.038129,1.284155,-3.209923,1.366828,-1.412841,-1.501318,0.306684,2.466206,-0.421817,-1.022696,-0.611076,-4.25513,0.065985,-2.078747


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220
0,-0.074192,0.426415,0.223475,-0.409126,-1.699382,-1.667962,1.887992,-1.388866,-0.49656,0.260026,-0.16382,-1.16121,-0.15056,-0.883417,0.443879,-0.352616,0.606355,0.71115,-0.485965,-0.973103,-0.897353,0.948562,0.615492,1.521808,1.117774,1.408321,-0.342649,0.425149,-0.653847,-1.152028,1.092308,0.174649,-0.459681,-0.0088,-0.063035,1.433227,0.394872,-0.024206,-0.333515,-1.575272,...,-0.785197,-1.234854,0.524822,0.793315,2.269784,-1.245179,-0.251633,-0.307411,-1.525971,-0.012152,0.377074,1.097861,-0.698268,-1.124224,0.187966,-1.026588,-0.803819,-0.206353,-0.806045,0.228312,-1.416426,0.260019,0.697537,1.32368,2.642104,-0.665788,0.895614,1.366279,0.03064,0.673563,0.153194,1.634086,1.770616,-0.127109,1.29957,-2.269015,-0.865695,2.588001,-0.679364,1.650948
1,-1.693017,2.74784,0.543357,-1.607088,-0.05124,0.936699,-0.470229,-0.909739,0.050899,0.290958,-0.44112,0.041968,-0.130081,0.354857,0.575216,-1.220883,0.243139,-0.15551,-0.562802,0.216447,-0.07451,0.325335,-0.065496,1.118492,-1.113163,-0.336102,0.364158,-0.369678,0.116181,0.501022,0.973061,0.789417,-0.877367,0.5886,0.551978,0.381924,0.990576,0.42677,0.411894,-0.009203,...,0.05551,0.274549,-0.957998,0.630298,0.649679,0.242758,0.374081,-0.532713,1.103239,0.611472,-0.178659,-0.446219,-0.606457,0.300246,-0.272923,-0.503479,0.694932,-0.145794,0.597307,-0.011333,-0.582647,0.306106,-0.469526,-0.851783,-1.119976,-0.03162,0.232017,0.917043,-0.05367,-0.206064,0.303415,-0.747377,-0.54426,0.345479,0.544871,0.332604,0.245231,1.002741,-0.204275,-0.294221
2,0.955765,0.147581,1.440473,1.794758,1.200678,0.629506,-0.976354,0.270893,-0.338541,3.997022,-0.483769,1.251809,-0.040216,-0.025603,-0.16697,0.496429,1.646974,0.494492,1.359436,-2.550457,3.079344,0.398173,1.392837,-0.289828,0.394733,-1.502545,0.743492,0.678223,-2.399273,0.180106,0.581403,0.209262,-0.944692,-1.000993,0.201347,-0.231277,1.118368,-0.310769,-0.725931,0.231812,...,1.355572,0.499238,-0.403216,0.432947,-0.383702,1.14252,-1.467699,0.71037,-0.040782,-1.53047,-0.364398,-2.489241,-0.496196,1.116987,-0.405489,-0.991682,-0.256714,0.707204,0.281145,1.351471,1.483712,-0.462048,-1.430055,-0.630135,1.035816,-2.222483,0.870551,1.596758,0.966721,1.447452,-0.107375,-0.241678,0.634944,0.857031,1.415896,-0.190532,1.240181,1.427895,-0.010917,-0.238052
3,-3.272967,1.20773,-0.342037,0.636299,0.144812,-0.375044,-1.722947,1.739662,0.637165,-0.254103,0.027166,-0.043489,0.306515,-0.531427,0.439722,-0.815593,0.615527,-0.667879,-0.698113,0.438205,0.563597,0.172982,1.464743,0.420412,-0.512224,1.001371,0.074489,1.390343,0.379048,-0.125775,0.155525,-0.413674,0.235171,-1.20123,-1.379134,-0.005917,0.138137,-0.976772,0.005834,-0.314111,...,-0.420236,-0.308797,0.308585,-0.129463,-0.357214,-0.089016,0.665383,0.274196,0.684876,0.058445,-0.200551,0.516649,1.201196,0.052618,0.537163,0.174773,-0.342644,-0.397299,-0.029191,-0.269521,-0.129336,0.160487,0.538012,0.86759,0.212254,-0.226351,-0.033852,-0.094299,-0.667646,-0.12902,0.052676,0.371336,0.807557,-0.659911,-1.002164,-0.224915,0.406692,0.770148,-0.921087,-0.207464
4,-0.734077,-0.38407,0.701102,1.410651,2.067875,-1.364778,0.596806,0.889552,0.582695,-0.005096,0.127289,0.502014,0.10591,0.18771,-0.153903,1.177804,0.590575,-1.635215,-0.986261,0.805206,0.10156,-0.954861,1.392377,-1.026852,-0.734335,0.378313,-2.058978,-0.503743,1.27162,0.571864,-0.017421,-2.047508,2.328698,0.904992,0.43202,-0.651952,-0.351333,-0.239179,-1.040494,0.211714,...,-1.322252,0.4993,0.366744,0.170477,-0.341099,0.280951,0.236619,-0.545375,1.014833,0.462883,-1.143579,0.160542,-0.541923,-0.481837,-0.038311,0.496394,-0.350788,-1.478471,1.261517,0.617524,-1.061766,0.278792,0.898626,0.353728,0.661973,0.328768,-1.622867,0.115032,2.084601,-0.073204,-0.011319,1.037405,-0.965496,0.079312,0.865306,0.634444,-0.578221,0.618363,-0.269082,-0.324333


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220
0,2.704542,0.739213,3.294808,-2.558974,0.428805,0.086546,1.62193,2.585696,-4.682559,0.184267,3.173028,-0.421678,-0.870232,-2.875823,0.164511,0.253802,-1.437483,-0.613287,0.622051,-1.305391,2.644344,1.209035,2.026234,-1.795399,-1.926449,2.387252,0.915063,2.368032,-0.396171,1.166595,0.976737,1.82808,0.24713,-0.709413,1.602811,0.149668,0.410997,0.833133,-0.758663,1.607747,...,0.670339,-0.670495,-0.599662,-0.441375,-0.570851,-4.041239,1.877794,-0.516591,0.05896,0.391555,-0.348382,-0.414317,0.93307,-2.283436,-0.536281,0.301645,-1.3978,-0.932609,1.912265,0.38424,0.435788,1.363844,-0.660193,-0.337986,-0.26671,-0.439001,1.88192,2.066889,1.033726,1.643918,-2.079383,-0.353048,-0.769965,-1.087803,1.521669,-0.41176,-1.248425,-1.89051,0.222404,-0.411048
1,-1.898985,0.307149,-0.302975,-0.744602,-0.821057,-0.84328,1.018507,0.430109,0.215165,1.418781,-0.162231,-1.051508,-0.382592,-0.270998,0.188078,-0.208912,-1.003622,0.31268,0.476961,-1.251294,1.353997,-1.535044,-0.915389,-0.456386,0.61706,1.39797,-1.905334,0.117009,-0.042446,-0.669859,0.109794,0.824686,-1.323295,0.076737,0.949093,0.653642,1.514453,-0.429225,1.240164,0.437778,...,0.16225,0.020504,0.037892,-0.391969,-0.27977,0.558327,-1.206533,-0.221019,0.581386,-0.545524,-0.215874,-0.430724,0.623701,0.245089,-0.690813,1.174759,-0.269267,-0.058995,0.131252,-0.520902,-0.022585,1.420623,-1.975537,-0.539196,1.546597,-0.057101,-0.331283,0.531693,0.903247,-0.461062,0.224447,-0.449789,0.404158,-0.256293,0.200176,-0.097944,-0.697542,-1.745565,-1.049868,-0.420386
2,-1.211321,2.210492,1.397768,0.916468,-0.992459,0.691384,-0.492165,0.290068,0.136822,-0.086367,-1.160614,0.349771,-0.145342,-1.46536,1.202232,0.447821,0.038778,0.909596,0.554925,1.551181,0.490496,0.781256,0.496373,-0.193398,-0.383982,0.289723,-0.480775,-0.72516,0.807706,-0.523809,0.406784,1.266881,0.250105,-1.234827,0.708554,1.573969,-0.104029,1.186872,-0.768389,0.033747,...,-0.444734,0.489528,-0.055232,0.521581,0.542489,-0.055318,0.192187,-0.142685,-0.638454,-0.159873,-0.065897,0.241068,-0.376725,-0.301809,0.440986,-1.269546,0.667506,-1.437242,1.722487,0.323508,0.015753,-0.853823,-1.06301,1.143209,1.495452,-0.930316,2.005089,-0.082237,0.812785,-0.883295,-0.475279,-0.047463,0.680896,0.788628,0.644956,-0.482219,-0.551042,-0.260248,-0.481574,0.050019
3,-2.966908,0.007317,0.403432,-0.956856,0.653296,0.508187,-0.24812,-0.14687,-0.331709,-0.781726,-0.255633,0.142599,0.424121,-0.671264,-1.11988,0.887547,-1.08912,1.290727,-0.009496,-0.592438,-0.27561,-1.59378,-0.112923,0.493,-0.623869,-0.164607,-0.261597,-1.514092,-0.131527,-0.610384,0.221819,1.53614,-0.235403,0.94933,-1.406374,1.711063,-1.0685,0.935213,-0.912447,0.197651,...,0.388092,-0.765367,-0.1527,0.856062,0.689549,-0.337698,0.158906,0.100814,0.221606,0.761695,0.932909,0.596447,-1.138269,-2.127072,-0.61347,1.181181,0.660961,-0.075327,0.264016,-0.090433,1.167383,-0.722468,0.790107,-0.625303,0.415221,0.551103,-0.241084,0.574487,-0.994133,-0.444841,0.644158,-0.532353,0.847118,0.535547,0.456598,0.498083,1.080818,-0.128599,-0.221778,-0.488903
4,-0.544812,-0.646422,1.967946,-0.5164,1.326179,0.506023,0.33311,-1.405123,-1.617366,1.300108,-0.445183,1.166093,0.490471,-1.909181,-0.380856,2.072251,0.323001,-0.179872,0.084776,2.812717,-0.215993,-0.311141,-0.18289,0.08719,0.385365,-1.091985,0.044622,-0.044844,-0.46082,0.32677,0.692742,-0.929496,-0.452898,0.404348,0.331664,-0.597086,-0.771621,0.024337,0.606004,0.174228,...,-0.009889,-0.048062,-0.379123,0.616593,-0.128935,-0.337939,0.810791,-1.241359,0.406299,-1.794574,0.251381,0.945061,0.650314,0.587015,0.565341,0.353094,-1.24465,0.954839,-0.576582,-0.703331,-1.84443,0.90824,0.098051,0.52693,0.743591,0.016482,-0.966391,0.286703,-0.551375,0.421981,-0.198838,-0.527137,0.390591,2.458867,0.20836,-0.025456,-0.584592,1.299346,2.089822,0.861007


(None, None, None)

# Training

## KNN Classifier

### A quick prototype

In [137]:
knn_classifier = KNeighborsClassifier(n_neighbors=3)
knn_classifier.fit(train_df, train_labels)

KNeighborsClassifier(n_neighbors=3)

In [143]:
y_pred_proba = knn_classifier.predict_proba(validate_df)[:, 1]
accuracy_score(validate_labels, (y_pred_proba >= 0.5))

0.6437

### Hyperparameter tuning

In [159]:
optmization = {
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'n_neighbors': [1, 2, 3]
}

In [162]:
_knn = KNeighborsClassifier()
gscv = GridSearchCV(_knn, optmization, scoring='accuracy',cv=4)
lr_gs = gscv.fit(train_df, train_labels)

print('Accurecy score: %.5f' % lr_gs.best_score_)
print('Best n_neighbors: %s' % lr_gs.best_params_)

Accurecy score: 0.65187
Best n_neighbors: {'algorithm': 'auto', 'n_neighbors': 3}


### Training with the best combination

In [171]:
_knn_best = KNeighborsClassifier(**lr_gs.best_params_)

In [172]:
_knn_best.fit(train_df, train_labels)

KNeighborsClassifier(n_neighbors=3)

In [174]:
y_pred_proba = _knn_best.predict_proba(validate_df)[:, 1]
accuracy_score(validate_labels, (y_pred_proba >= 0.5))

0.6437

## XGBoost

### A quick prototype

In [175]:
import xgboost as xgb

In [184]:
dtrain = xgb.DMatrix(data=train_df, label=train_labels)
xgb_params = {
    'eta': 0.01, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'binary:logistic',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
    'eval_metric': 'auc'
}

model = xgb.train(xgb_params, dtrain, num_boost_round=100)

In [178]:
dtest = xgb.DMatrix(data=validate_df, label=validate_labels)
y_pred_proba = model.predict(dtest)
accuracy_score(validate_labels, (y_pred_proba >= 0.5))

0.7482

### Hyperparameter tuning

In [179]:
xgb_params = {"eta":[0.01, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
               "max_depth":[ 3, 4, 5, 6, 8, 10, 12, 15],
               "min_child_weight":[ 1, 3, 5, 7 ],
               "gamma":[ 0.0, 0.1, 0.2 , 0.3, 0.4 ]
              }

In [188]:
xgbC = xgb.XGBClassifier(objective="binary:logistic", use_label_encoder=False, eval_metric='auc')
xgbc_random = RandomizedSearchCV(estimator = xgbC, 
                                    param_distributions = xgb_params, 
                                    n_iter = 100,
                                    scoring = 'roc_auc',
                                    cv = 4, 
                                    verbose=2, 
                                    random_state=11);

# Fit the random search model
xgbc_random.fit(train_df, train_labels);

### Fitting with tuned hyperparameters

In [186]:
dtrain = xgb.DMatrix(data=train_df, label=train_labels)
xgb_params = {
    'eta': 0.3, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'binary:logistic',
    'nthread': 8,
    
    'seed': 86431,
    'verbosity': 1,
    'eval_metric': 'auc'
}

xgbC = xgb.train(xgb_params, dtrain, num_boost_round=100)

In [None]:
dtest = xgb.DMatrix(data=X_test, label=y_test)
y_pred_proba = xgbC.predict(dtest)
roc_auc_score(y_test, y_pred_proba)

In [187]:
dtest = xgb.DMatrix(data=validate_df, label=validate_labels)
y_pred_proba = xgbC.predict(dtest)
accuracy_score(validate_labels, (y_pred_proba >= 0.5))

0.8127

# Already given BoW

In [None]:
# Loading feature names from the "imdb.vocab" file
with open('/content/aclImdb/imdb.vocab', 'r') as feature_names:
  given_features = [given_feature.rstrip() for given_feature in feature_names if not given_feature.rstrip() in english_stopwords]

In [None]:
len(given_features)

89356

In [None]:
# Reading labeledBow.feat file which contains the data in libsvm format (sparse matrix)
# shape of data <Label> <Feature>:<Number of Apperance>
POSITIVE_REVIEW = 7
review_tokens = []
labels = []
with open('/content/aclImdb/train/labeledBow.feat', 'r') as data:
  for record in data:
    record = record.split()
    labels.append(int(record[0]) >= POSITIVE_REVIEW)
    review_tokens.append(record[1:])

In [None]:
# Extracting the words of each review coupled with its number of apperance.
# Takeing care of the stop words which isnot needed.
tuples = []
for record in review_tokens:
  sentance_tuple = {}
  for item in record:
    feature, value = item.split(':')
    if not features[int(feature)] in english_stopwords:
      sentance_tuple[features[int(feature)]] = int(value)
  tuples.append(sentance_tuple)
print("Number of trainig records: ", len(tuples))

Number of trainig records:  25000


# Model Saving and Loading

In [191]:
import pickle

In [189]:
model_name = 'xgb_model'
model_file = f'model_{model_name}.bin'

In [193]:
with open(model_file, 'wb') as m_out:
  pickle.dump(xgbC, m_out)