In [96]:
import glob
import numpy as np 
import re
import time
from math import log
from os import system
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [2]:
#getting all files path
def get_files(path):
    files =[]
    for file in glob.glob(path+'pos/*.*'):
        files.append(file)
    for file in glob.glob(path+'/neg/*.*'):
        files.append(file)
    return files

train_files = get_files('./dataset/train/')
test_files = get_files('./dataset/test/')

In [3]:
#reading and storing all files data,labels and ratings
def get_data(files):
    start_time = time.time()
    data = []
    labels=[]
    for file in files:
        with open(file,encoding='utf-8') as f:
            raw = f.read()
            data.append(raw)
            if 'pos' in file:
                labels.append(1)
            else:
                labels.append(0)
            f.close()
    print('end-time',time.time()-start_time)
    return data,labels
train_data,train_labels = get_data(train_files)
test_data,test_labels = get_data(test_files) 

end-time 144.26308822631836
end-time 154.29385614395142


In [4]:
#reading stopwords
def get_wordlists(file):
    data = []
    with open(file,'r') as f:
        data = [i.strip("\n").lower() for i in f.readlines()]
    f.close()
    return data
stopwords = get_wordlists('./dataset/stop_words.txt')

In [17]:
#preprocessing removing every character excpet alphabets
def preprocessing(data,stopwords):
    start_time = time.time()
    sentences = []
    tokens = []
    filter_ = lambda x: len(x.strip(" ")) > 5  
    for i,review in enumerate(data):
        sentences.append(re.sub('[^a-z]+'," ",review.lower()))
    for i in sentences:
        temp = []
        for word in i.split(" "):
            if len(word) > 5: #eliminating word with len <5
                temp.append(word)
        tokens.append(temp)
    print('end-time',time.time()-start_time)
    return tokens
train_tokens = preprocessing(train_data,stopwords)
test_tokens = preprocessing(test_data,stopwords)

end-time 3.1119191646575928
end-time 2.91359281539917


In [22]:
#extracting vocab
def extract_vocab(tokens):
    return list(set([i for x in tokens for i in x]))
vocab = extract_vocab(train_tokens)

In [24]:
#Part1
#counts of vocab in pos and negative corpus
def class_counts(tokens,labels,vocab):
    start_time = time.time()
    pos_tokens = [y for x in [tokens[i] for i in range(len(tokens)) if labels[i] == 1] for y in x]
    neg_tokens = [y for x in [tokens[i] for i in range(len(tokens)) if labels[i] == 0] for y in x]
    pos_counts = list(map(lambda x: pos_tokens.count(x),vocab))
    neg_counts = list(map(lambda x: neg_tokens.count(x),vocab))
    print('end-time',time.time()-start_time)
    return pos_counts,neg_counts
pos,neg = class_counts(train_tokens,train_labels,vocab)

end-time 1824.8439095020294


In [55]:
    #flatten neg and pos corpuses
    train_positive_corpus = [y for x in [train_tokens[i] for i in range(len(train_tokens)) if train_labels[i] == 1] for y in x]
    train_negative_corpus = [y for x in [train_tokens[i] for i in range(len(train_tokens)) if train_labels[i] == 0] for y in x]

In [49]:
#Naive Bayes variables
prior_class = log(0.5)
docs_per_class = 12500
total_docs = 25000
vocab_len = len(vocab)
pos_corp_len = len(train_positive_corpus)
neg_corp_len = len(train_negative_corpus)

#prediction function to sum and compute likelihoods
def predict(pos_counts,neg_counts,vocab,subject):
    negative_probs =[]
    positive_probs = []
    for i in range(len(subject)):
        pos_count = 0
        neg_count = 0
        if subject[i] in vocab:
            index = vocab.index(subject[i])
            pos_count = pos_counts[index]
            neg_count = neg_counts[index]
        negative_probs.append( log ((neg_count+1)/(neg_corp_len + vocab_len)) )
        positive_probs.append( log ((pos_count+1)/(pos_corp_len + vocab_len)) )
    return np.argmax([np.sum(negative_probs) + prior_class , np.sum(positive_probs) + prior_class])

In [50]:
#predicitng test corpus
correct = 0
start_time = time.time()
predictions = list(map(lambda x: predict(pos,neg,vocab,x),test_tokens))
print('end-time',time.time()-start_time)

end-time 5421.596265792847


In [54]:
#calculating accuracy
correct = 0
for i,x in enumerate(predictions):
    if x == test_labels[i]:
        correct+=1
print("ACCURRACY ON TEST DATA",correct/len(test_labels))

ACCURRACY ON TEST DATA 0.81012


In [83]:
#Part2
#convert tokens to strings
def tokens_to_str(tokens):
    return [" ".join(i) for i in tokens]
train = tokens_to_str(train_tokens)
test = tokens_to_str(test_tokens)


In [84]:
#count vectorizer model
model = CountVectorizer().fit(train)

In [89]:
#tranforming train and test corpus using model fitted on train corpus
train_vec = model.transform(train)
test_vec = model.transform(test)
#initializing naive bayes
clf = MultinomialNB().fit(train_vec,train_labels)

In [94]:
#prediciting test corpus
predictions2 = clf.predict(test_vec)
print('ACCURACY SCORE:',accuracy_score(predictions2,test_labels))

ACCURACY SCORE: 0.81012


In [100]:
confusion_matrix(predictions2,test_labels,labels=[0,1])

array([[10854,  3101],
       [ 1646,  9399]], dtype=int64)