#CLEANING THE REVIEWS USING NLTK

In [1]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer 
from nltk.corpus import stopwords

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\91763\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
tokenizer=RegexpTokenizer('[a-zA-Z]+')
stopwords=set(stopwords.words('english'))
ps=PorterStemmer()

In [4]:
important_words=["not","don't","doesn't","didn'","didn't","doesn'","doesn't","don","dont't","wasn","wasn't"]
stopwords=[i for i in stopwords if i not in important_words]

In [5]:
def cleaned_reviews(review):
    review=review.lower()
    review=review.replace("<br /><br />"," ")
    #tokenize
    tokens=tokenizer.tokenize(review)
    new_tokens=[token for token in tokens if token not in stopwords]
    stemmed_tokens=[ps.stem(token)for token in new_tokens]
    cleaned_review=' '.join(stemmed_tokens)
    return cleaned_review
    

#NOW WE WILL WRITE A FUNCTION THAT WILL ACCEPT A INPUT FILE AND RETURNS A CLEANED DOCUMENT

In [6]:
def cleaned_document(inputfile,outputfile):
    out=open(outputfile,'w',encoding="utf8")
    with open(inputfile,encoding="utf8")as f:
        reviews=f.readlines()
    for review in reviews:
        cleaned_review=cleaned_reviews(review)
        print(cleaned_review,file=out)
    out.close()    

#CLEANING THE TRAINING DATA

In [7]:
cleaned_document('imdb_trainX.txt','cleaned_input_reviews')

#CLEANING THE TESTING DATA

In [8]:
cleaned_document('imdb_testX.txt','cleaned_output_reviews')

#VECTORIZATION

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(ngram_range=(1,2))

In [10]:
cleaned_rev=[]
with open('cleaned_input_reviews','r',encoding="utf8") as f:
    cleaned_rev=f.readlines()    

In [11]:
len(cleaned_rev)

25000

In [12]:
for i in range(len(cleaned_rev)):
    cleaned_rev[i]=cleaned_rev[i][:-1]


In [13]:
train_vec=cv.fit_transform(cleaned_rev[:500]).toarray()


In [14]:
train_vec.shape

(500, 65143)

#VECTORIZATION ON TESTING DATA

In [15]:
xtest_=[]
with open('cleaned_output_reviews','r',encoding="utf8") as f:
    xtest_=f.readlines()
    
    

In [16]:
test_vec=cv.transform(xtest_[:500]).toarray()
test_vec.shape

(500, 65143)

#USING MULTINOMIAL NAIVE BAYES 

In [17]:
ratings=[]
with open('imdb_trainY.txt','r',encoding="utf8") as f:
    ratings=f.readlines()
    for i in range (500):
        ratings[i]=int(ratings[i][:2])    
        

In [18]:
ratings=ratings[:500]
len(ratings)

500

In [79]:
from sklearn.naive_bayes import MultinomialNB,GaussianNB,BernoulliNB

In [20]:
mnb=MultinomialNB()
model=mnb.fit(train_vec,ratings)

#training accuracy

In [21]:
model.score(train_vec,ratings)*100

100.0

In [22]:
predictions=model.predict(test_vec)

In [50]:
output_ratings=[]
with open('imdb_testY.txt','r',encoding="utf8") as f:
    output_ratings=f.readlines()

In [51]:
output_ratings=output_ratings[:500]
for i in range (500):
        output_ratings[i]=int(output_ratings[i][:2]) 


In [55]:
import numpy as np
predictions=np.array(predictions)
output_ratings=np.array(output_ratings)

#CALCULATING TESTING ACCURACY

In [56]:
num=np.sum(predictions==output_ratings)
accuracy=num/len(predictions)
print(accuracy*100)

42.6


#LETS TRY TFIDFVECTORIZER FOR VECTORIZATION WHICH ASSIGN WEIGHTS TO EACH OF THE VECTORS , THE IDEA BEHIND THIS ALGO IS THAT THE WORD WHICH IS APPEARING IN MOST OF THE DOCUMENTS SHOULD HAVE A LESS WEIGHT AS IT IS GIVING US THE LEAST INFORMATION

In [57]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer()

#vectorizing the training data

In [66]:
train_vec_2=tfidf.fit_transform(cleaned_rev[:500]).toarray()
train_vec_2.shape

(500, 9178)

In [67]:
#vectorizing the testing data

test_vec_2=tfidf.transform(xtest_[:500]).toarray()
test_vec_2.shape

(500, 9178)

In [74]:
mnb=MultinomialNB()
model=mnb.fit(train_vec_2,ratings)

In [75]:
model.score(train_vec_2,ratings)*100

49.8

In [76]:
predictions_tf=model.predict(test_vec_2)

In [77]:
predictions_tf=np.array(predictions_tf)

In [78]:
num=np.sum(predictions_tf==output_ratings)
accuracy=num/len(predictions_tf)
print(accuracy*100)

41.0


#UNFORTUNATELY THIS IS NOT WORKING  AS GOOD AS COUNTVECTORIZER, LET US TRY WITH A DIFFERENT NAIVE BAYES MODEL , LETS GO WITH BERNOULLI

In [109]:
bnb=BernoulliNB(binarize=0.0)
model=bnb.fit(train_vec,ratings)

In [110]:
model.score(train_vec,ratings)*100

52.800000000000004

In [111]:
predictions_bnb=model.predict(test_vec)
predictions_bnb=np.array(predictions_bnb)
num=np.sum(predictions_bnb==output_ratings)
accuracy=num/len(predictions_bnb)
print(accuracy*100)


41.0
