In [3]:
import pandas as pd
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import PorterStemmer,SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer


### Cleaning the data

In [4]:
tokenizer = RegexpTokenizer("[a-zA-Z]+")

ss = SnowballStemmer('english')
sw = set(stopwords.words('english'))

In [5]:
def pipeline_vocab(text):
    text = text.lower()
    text = text.replace("<br /><br />"," ")
    text = text.replace("\n","")
    words = tokenizer.tokenize(text)
    words = [w for w in words if w not in sw]
    for i in range(len(words)):
        words[i] = ss.stem(words[i])
    
    new_text = ' '.join(words)
    
    return new_text

In [6]:
def cleaned_text(inputFile,outputFile):
    out = open(outputFile,'w')
    reviews = inputFile.readlines() 
    for review in reviews:
        cleaned_review = pipeline_vocab(review)
        print((cleaned_review), file=out)
    out.close()

In [7]:
f = open('imdb_trainX.txt', 'r')
cleaned_text(f,'cx_train')
f.close()

In [8]:
f1 = open('imdb_testX.txt','r')
cleaned_text(f1,'cx_test')
f1.close()

### Vectorizing the data

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

In [10]:
with open("cx_train") as f:
    x_train = f.readlines()

with open("cx_test") as f:
    x_test = f.readlines()
    

In [11]:
x_vector = cv.fit_transform(x_train).toarray()

In [12]:
print(cv.vocabulary_)

{'love': 25249, 'movi': 28573, 'sinc': 39084, 'saw': 37282, 'open': 30709, 'day': 10227, 'touch': 43745, 'beauti': 3547, 'strong': 41234, 'recommend': 35052, 'see': 37874, 'watch': 47018, 'famili': 14236, 'far': 14299, 'mpaa': 28609, 'rate': 34789, 'pg': 32318, 'themat': 42947, 'element': 12948, 'prolong': 33760, 'scene': 37405, 'disastor': 11329, 'nuditi': 30135, 'sexual': 38228, 'languag': 23918, 'first': 14932, 'thing': 43054, 'edison': 12715, 'chen': 7344, 'fantast': 14287, 'believ': 3728, 'job': 22037, 'cambodian': 6235, 'hit': 19415, 'man': 26029, 'born': 5011, 'bred': 5312, 'dump': 12374, 'gladiatori': 16935, 'ring': 35939, 'hone': 19683, 'craft': 9285, 'savag': 37259, 'batteri': 3409, 'order': 30786, 'surviv': 41810, 'live': 24892, 'mantra': 26180, 'kill': 22996, 'role': 36222, 'littl': 24880, 'dialogu': 11045, 'least': 24225, 'line': 24759, 'thai': 42881, 'perform': 32127, 'compel': 8481, 'probabl': 33667, 'jet': 21924, 'li': 24556, 'vehicl': 46125, 'danni': 10063, 'dog': 1168

In [14]:
xt_vector = cv.transform(x_test).toarray()

In [15]:
print(x_vector.shape)

(25000, 48956)


In [16]:
print(xt_vector.shape)

(25000, 48956)


In [17]:
with open("imdb_trainY.txt") as f:
    y_train = f.readlines()

with open("imdb_testY.txt") as f:
    y_test = f.readlines()
    

### Multinomial Naive Bayes using SciKitLearn

In [18]:
from sklearn.naive_bayes import MultinomialNB,GaussianNB,BernoulliNB

In [19]:
mnb = MultinomialNB()
print(mnb)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


In [20]:
# Training
mnb.fit(x_vector,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [21]:
#Predictions
mnb.predict(xt_vector[:10])

array(['1\n', '7\n', '4\n', '10\n', '8\n', '1\n', '10\n', '10\n', '8\n',
       '10\n'], dtype='<U3')

### Multinomial Naive Bayes 

In [22]:
x_train = np.array(x_train)
y_train = np.array(y_train)

x_test = np.array(x_test)
y_test = np.array(y_test)

In [23]:
print(y_train)
y_train[1].replace("\n","")

['10\n' '8\n' '7\n' ... '2\n' '1\n' '1']


'8'

In [24]:
print(x_train.size)
print(x_train[0])
for i in range(y_train.size):
    y_train[i].replace("\n","")
print(np.unique(y_train))

25000
love movi sinc saw open day touch beauti strong recommend see movi watch famili far mpaa rate pg themat element prolong scene disastor nuditi sexual languag

['1' '1\n' '10\n' '2\n' '3\n' '4\n' '7\n' '8\n' '9\n']


In [25]:
def prior(y_train,label):
    num = np.sum(y_train==label)
    den = y_train.shape
    return num/den
    
def likelihood_word(x_train,y_train,label,word):
    x_filtered = x_train[y_train==label]
    num = 0
    
    for i in range(x_filtered.size):
        review = x_filtered[i]
        words = tokenizer.tokenize(review)
        for j in range(len(words)):
            if(words[j]==word):
                num+=1
    
    return num

def likelihood_den(x_train,y_train,label):
    x_filtered = x_train[y_train==label]
    den = 0
    
        
    for i in range(x_filtered.size):
        review = x_filtered[i]
        words = tokenizer.tokenize(review)
        for j in range(len(words)):
            den+=1
            
    return den

def vocab_size(x_train):
    v = 0
    for i in range(x_train.size):
        review = x_train[i]
        words = tokenizer.tokenize(review)
        for j in words:
            v+=1
            
    return v

In [28]:
def prediction(x_train,y_train,x_test):
    classes = np.unique(y_train)
    n_reviews = x_test.shape[0]
    v = vocab_size(x_train)
    post_probs = [] # List of prob for all classes and given a single testing point
    #Compute Posterior for each class
    pred = []
    for i in range(n_reviews):
        words = tokenizer.tokenize(x_test[i])
        for label in classes:
            den = likelihood_den(x_train,y_train,label)
            #Post_c = likelihood*prior
            num = 1.0
            likelihood=1
            for f in range(len(words)):
                cond = likelihood_word(x_train,y_train,label,words[f])
                num*=cond
                likelihood=(num+1)/(den+v)
        p = prior(y_train,label)
        post = likelihood*p
        post_probs.append(post)
        
        pred.append(np.unique(y_train)[np.argmax(post_probs)])
        
    return pred
    

In [32]:
pred = prediction(x_train[:1000],y_train[:1000],x_test[:10])
print(pred)

['10\n', '10\n', '10\n', '10\n', '10\n', '10\n', '10\n', '10\n', '10\n', '10\n']
