# Naive Bayes with Count Vectorizer and Preprocessing
______

##### Importing Libraries

In [1]:
import os
import re
import glob
import nltk
import string
import random
import numpy as np
from warnings import simplefilter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
simplefilter(action='ignore', category=FutureWarning)

In [None]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

##### Importing Dataset

In [2]:
train_positive_path = 'C:/Users/susya/TCS iON Projects/Projects/Project 10-20/aclImdb/train/pos/'
train_negative_path = 'C:/Users/susya/TCS iON Projects/Projects/Project 10-20/aclImdb/train/neg/'
test_positive_path = 'C:/Users/susya/TCS iON Projects/Projects/Project 10-20/aclImdb/test/pos/'
test_negative_path = 'C:/Users/susya/TCS iON Projects/Projects/Project 10-20/aclImdb/test/neg/'

##### Reading Data from Function

In [3]:
def read_files_in_path(path):
    positive_training_data=[]
    for filepath in glob.glob(os.path.join(path, '*.txt')):
        with open(filepath,encoding='utf8') as f:
            content = f.read()
            positive_training_data.append(content)
    return positive_training_data

##### Reading training data

In [4]:
positive_training_data = read_files_in_path(train_positive_path)
negative_training_data = read_files_in_path(train_negative_path)
whole_training_data = positive_training_data+negative_training_data

##### Reading Test Data

In [5]:
positive_test_data = read_files_in_path(test_positive_path)
negative_test_data = read_files_in_path(test_negative_path)
whole_test_data = positive_test_data+negative_test_data

##### Remove HTML tags from dataset

In [6]:
def cleanhtml(whole_training_data):
    no_HTML_whole_training_data = []
    cleaner = re.compile('<.*?>')
    for review in whole_training_data:
        cleantext = re.sub(cleaner, '', review)
        no_HTML_whole_training_data.append(cleantext)
    return no_HTML_whole_training_data

##### Removing stop words and punctuation

In [7]:
def remove_punc_stop_words(no_HTML_whole_training_data):
    clean_training_data = []
    for review in no_HTML_whole_training_data:
        #remove punctuation marks
        review = review.lower()
        for ch in string.punctuation:
            review = review.replace(ch,' ')
        #Remove stop words
        stop_words = set(stopwords.words('english')) 
        word_tokens = word_tokenize(review) 
        filtered_words = [w for w in word_tokens if not w in stop_words] 
        #construct the review again
        sentence =''
        for word in filtered_words:
            sentence+=lemmatize_word(word)
            sentence+=' '
        clean_training_data.append(sentence)
    return clean_training_data

##### Lemmatize words

In [8]:
def lemmatize_word(word):
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

##### Cleaning data from HTML tags

In [9]:
no_HTML_whole_training_data = cleanhtml(whole_training_data)

In [10]:
print(whole_training_data[210])

The Movie was sub-par, but this Television Pilot delivers a great springboard into what has become a Sci-Fi fans Ideal program. The Actors deliver and the special effects (for a television series) are spectacular. Having an intelligent interesting script doesn't hurt either.<br /><br />Stargate SG1 is currently one of my favorite programs.


In [11]:
print(no_HTML_whole_training_data[210])

The Movie was sub-par, but this Television Pilot delivers a great springboard into what has become a Sci-Fi fans Ideal program. The Actors deliver and the special effects (for a television series) are spectacular. Having an intelligent interesting script doesn't hurt either.Stargate SG1 is currently one of my favorite programs.


In [12]:
clean_training_data = remove_punc_stop_words(no_HTML_whole_training_data)

In [13]:
print(set(stopwords.words('english')))

{'now', 'it', 'ain', 'who', "wouldn't", 'all', 'each', "weren't", 'below', 'only', 'there', 'my', 'has', 'aren', 'up', "mightn't", 'very', 'same', 'most', 'few', 'have', 'her', 'weren', 'under', "it's", 'm', 'theirs', 'their', 'y', 'don', 'isn', 'about', 'from', 'your', "hadn't", 'were', 'having', 'when', 'at', 'before', 'between', 'itself', 'because', 'you', 'what', 'how', "didn't", 'mustn', "hasn't", 'yourself', 'had', 'the', 'through', 'once', 'does', "haven't", 'hers', 'for', 'over', 'its', 'doing', 'couldn', 'is', 'not', 'of', 'am', 'been', 'are', 'we', "should've", 'after', 'a', 'ma', "shan't", 'his', 'that', 'yourselves', 'being', 'd', 'while', 'needn', 'himself', 'but', "you'll", 'further', 'until', 'own', 'why', 'again', 'to', 'll', 'will', 'them', "that'll", 'can', 'too', "couldn't", 'doesn', 'whom', 'or', 'herself', 'so', 's', 've', 'didn', 'ours', 'do', "shouldn't", 'wouldn', "wasn't", 'during', "mustn't", 'they', 'those', 'shan', 'in', 'yours', 'this', 'out', "needn't", 'w

In [15]:
print(no_HTML_whole_training_data[210])
print(clean_training_data[210])

The Movie was sub-par, but this Television Pilot delivers a great springboard into what has become a Sci-Fi fans Ideal program. The Actors deliver and the special effects (for a television series) are spectacular. Having an intelligent interesting script doesn't hurt either.Stargate SG1 is currently one of my favorite programs.
movie sub par television pilot delivers great springboard become sci fi fan ideal program actor deliver special effect television series spectacular intelligent interesting script hurt either stargate sg1 currently one favorite program 


In [18]:
print(no_HTML_whole_training_data[310])
print('\n',clean_training_data[310])

I'm not great at writing reviews, so I'll just spout my opinions...I loved this series at first. The adventure, the action, the comedy, the drama... I thought it was all brilliant. Anderson, Tapping, Shanks, Judge, Davis... I loved them all. Davis, it seemed, was the fifth-most important person in the cast. Not a big deal. But when his character (General Hammond) left at the end of the seventh season, and Anderson's character (Colonel O'Neill) moved from the field to the office, the quality of the series suddenly fell off a cliff. I don't know whether it's because Hammond was more important that I realized or what, but for some reason, after the seventh season, the series turned to ****.The first seven seasons, though, were awesome. The movie Stargate seemed mediocre the first time I saw it, but it turned out to be, even if this wasn't the original intention, a brilliant setup to the series. I recommend that you watch the movie first, then watch the first season of the TV series, then 

In [20]:
print(no_HTML_whole_training_data[4510])
print('\n',clean_training_data[4510])

Undying is a very good game which brings some new elements on the tired genre of first person shoot em ups. It tells the story of Patrick Galloway an expert of the occult and a formidable fighter who is summoned by a friend to his estate in Ireland to investigate some weird phainomena. The game is set in Ireland after World War one so don't expect to find weapons like chainguns or rocket launchers.All the weapons in the game can be considered antiques but the real fun in the game are its spells and the system they operate on.Our hero is ambidexterous so he can use both his hands at the same time: he casts spells with his right arm and uses his guns with the left.So you can shoot and cast spells at the same time which as you understand very fun and also unique to this game! The graphics are great and they can run very well on a medium power P.C..Level design is also cool and atmospheric. Mostly the game revolves around the Covenant estate and the mansion but there are many other locatio

In [21]:
no_HTML_whole_test_data = cleanhtml(whole_test_data)
clean_test_data = remove_punc_stop_words(no_HTML_whole_test_data)

##### Creating labels for the training data and test data first 12500 are positive and the remaining 12500 are negative

In [24]:
train_labels = np.asarray([1]*len(positive_training_data) + [0]*len(negative_training_data))
test_labels = np.asarray([1]*len(positive_test_data) + [0]*len(negative_test_data))

##### Randomizing test data

In [25]:
z = list(zip(clean_training_data, train_labels))
random.shuffle(z)
random_clean_training_data, random_train_labels = zip(*z)

##### Apply feature extraction: countVectorizer which is based on bag of words algorithm

In [26]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(ngram_range=(1,4))
train_vector = vectorizer.fit_transform(random_clean_training_data)
test_vector = vectorizer.transform(clean_test_data)

##### Printing some visualization of the sizes of the train and test vectors
train_vector has 25000 row corresponding to the 25000 review and 74849 feature extracted

In [27]:
print(train_labels.shape)
print(train_vector.shape)

(25000,)
(25000, 7423470)


##### Printing shapes of test vector and label

In [28]:
print(test_labels.shape)
print(test_vector.shape)

(25000,)
(25000, 7423470)


In [29]:
from sklearn.naive_bayes import MultinomialNB
gnb = MultinomialNB()
gnb.fit(train_vector, random_train_labels)
print("Naive Bayes with text preprocessing using CountVectorizer classification accuracy:\n",gnb.score(test_vector,test_labels)*100,"%")

Naive Bayes with text preprocessing using CountVectorizer classification accuracy:
 85.636 %
