In [51]:
import numpy as np
import pandas as pd

In [52]:
# Load the labelled training reviews from Train.csv (path relative to the working directory).
Train= pd.read_csv('Train.csv')

In [53]:
# Preview the first 15 training rows (review text plus its string label).
Train.head(15)

Unnamed: 0,review,label
0,mature intelligent and highly charged melodram...,pos
1,http://video.google.com/videoplay?docid=211772...,pos
2,Title: Opera (1987) Director: Dario Argento Ca...,pos
3,I think a lot of people just wrote this off as...,pos
4,This is a story of two dogs and a cat looking ...,pos
5,Steve Carell comes into his own in his first s...,pos
6,I'm only going to write more because it's requ...,neg
7,"OK, it was a ""risky"" move to rent this flick, ...",neg
8,"Cannibalism, a pair of cinematic references to...",pos
9,This is one of the great modern kung fu films....,pos


In [54]:
# Load the test reviews from Test.csv (path relative to the working directory).
Test= pd.read_csv('Test.csv')

In [55]:
# Preview the first rows of the test frame.
Test.head()

Unnamed: 0,review
0,Remember those old kung fu movies we used to w...
1,This movie is another one on my List of Movies...
2,How in the world does a thing like this get in...
3,"""Queen of the Damned"" is one of the best vampi..."
4,The Caprica episode (S01E01) is well done as a...


In [56]:
# Keep only the review text of the test set, as a bare array, and confirm its type
test = Test.loc[:, 'review'].values
print(type(test))

<class 'numpy.ndarray'>


In [57]:
from sklearn.preprocessing import LabelEncoder

# Learn the label vocabulary first, then overwrite the column with its integer codes.
# NOTE: this replaces the string labels in Train in place.
le = LabelEncoder().fit(Train['label'])
Train['label'] = le.transform(Train['label'])

In [58]:
# Re-inspect the training frame now that `label` holds the encoded integers.
Train.head(15)

Unnamed: 0,review,label
0,mature intelligent and highly charged melodram...,1
1,http://video.google.com/videoplay?docid=211772...,1
2,Title: Opera (1987) Director: Dario Argento Ca...,1
3,I think a lot of people just wrote this off as...,1
4,This is a story of two dogs and a cat looking ...,1
5,Steve Carell comes into his own in his first s...,1
6,I'm only going to write more because it's requ...,0
7,"OK, it was a ""risky"" move to rent this flick, ...",0
8,"Cannibalism, a pair of cinematic references to...",1
9,This is one of the great modern kung fu films....,1


In [59]:
# X: raw review strings, Y: encoded labels -- both pulled out as plain arrays
X, Y = (Train[column].values for column in ('review', 'label'))

In [60]:
# Sanity check: the feature and target arrays should have the same length.
print(X.shape, Y.shape)

(40000,) (40000,)


# Tokenizing and Stemming

In [61]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [62]:
# Split text into runs of word characters (letters, digits, underscore).
tokenizer = RegexpTokenizer(r'\w+')

# Re-import the corpus reader under an alias before binding `stopwords` to a set.
# The original `stopwords = set(stopwords.words(...))` overwrote the imported
# reader, so running this cell a second time raised
# AttributeError: 'set' object has no attribute 'words'.
from nltk.corpus import stopwords as nltk_stopwords
stopwords = set(nltk_stopwords.words('english'))  # set -> O(1) membership tests

# Porter stemmer used to reduce each token to its stem.
ps = PorterStemmer()

In [83]:
def clean_review(text):
    """Turn one raw review into a space-separated string of stems.

    The text is lower-cased, each doubled "<br /><br />" break is swapped for
    a space, the result is split with the notebook-level ``tokenizer``, tokens
    present in ``stopwords`` are dropped, and the survivors are stemmed with
    ``ps``.
    """
    text = text.lower().replace("<br /><br />", " ")
    kept = (word for word in tokenizer.tokenize(text) if word not in stopwords)
    return ' '.join(ps.stem(word) for word in kept)


# One cleaned string per training review, in the same order as X
cleaned_data = [clean_review(raw_text) for raw_text in X]
    

In [84]:
# Should equal the number of training reviews.
print(len(cleaned_data))

40000


Apply the same cleaning steps (lower-casing, `<br />` removal, tokenizing, stop-word filtering, stemming) to the test dataset.

In [65]:
# Same recipe as for the training reviews: lower-case, replace doubled <br />
# breaks with a space, tokenize, drop stop words, stem, and re-join with spaces.
cleaned_data_test = [
    ' '.join(
        ps.stem(word)
        for word in tokenizer.tokenize(raw_text.lower().replace("<br /><br />", " "))
        if word not in stopwords
    )
    for raw_text in test
]

print(len(cleaned_data_test))

10000


# Vectorization

In [68]:
from sklearn.feature_extraction.text import CountVectorizer

In [69]:
# Token-count vectorizer with every parameter left at its default.
cv= CountVectorizer()

In [86]:
# Learn the vocabulary from the cleaned training reviews and encode them as a count matrix.
x_vec=cv.fit_transform(cleaned_data)

In [None]:
# Shape of the training count matrix: (number of reviews, vocabulary size).
print(x_vec.shape)

In [87]:
# Encode the test reviews with the vocabulary already learned from the training
# set (transform only -- the vectorizer is not re-fitted here).
xt_vec = cv.transform(cleaned_data_test)


# Multinomial Naive Bayes

In [88]:
# NOTE(review): GaussianNB is imported here but never used in the visible notebook.
from sklearn.naive_bayes import MultinomialNB,BernoulliNB, GaussianNB
# Multinomial Naive Bayes classifier with default hyper-parameters.
mnb = MultinomialNB()
print(mnb)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


In [89]:
# Confirm the target vector length before fitting.
Y.shape

(40000,)

In [90]:
# Fit the multinomial model on the training count matrix and encoded labels.
mnb.fit(x_vec,Y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [91]:
# Predict an encoded class (0 or 1) for every vectorized test review.
Y_pred=mnb.predict(xt_vec)

In [136]:
# Accuracy on (x_vec, Y) -- the same data the model was fitted on, so this is
# training accuracy, not an estimate of performance on unseen reviews.
mnb.score(x_vec,Y)

0.89035

In [93]:
# One prediction per test review.
Y_pred.shape

(10000,)

In [113]:
# Wrap the multinomial model's predictions in a single-column DataFrame.
df=pd.DataFrame(Y_pred)

In [114]:
# Name the single prediction column 'label'.
df.columns=['label']

In [121]:
# Translate the integer class codes back to their string labels (0 -> neg, 1 -> pos).
# The result is assigned back rather than calling an inplace method on the
# `df['label']` selection: under pandas Copy-on-Write a chained in-place update
# acts on a temporary and never reaches `df`, which would leave 0/1 in the frame.
# Re-running the cell is harmless: already-translated strings match no key.
df['label'] = df['label'].replace({0:"neg", 1:"pos"})

In [122]:
# Check that the labels are now strings.
df.head()

Unnamed: 0,label
0,neg
1,neg
2,neg
3,neg
4,pos


In [123]:
# Write the multinomial predictions to disk; index=True also writes the row
# index as the first column.
df.to_csv('Y_Pred.csv', index=True)

# Multivariate Bernoulli Event Model Naive Bayes


In [124]:
# Bernoulli Naive Bayes; binarize=0.0 is the threshold used to turn the count
# features into binary present/absent features before fitting and predicting.
bnb = BernoulliNB(binarize=0.0)


In [125]:
# Fit the Bernoulli model on the same training matrix and labels.
bnb.fit(x_vec,Y)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [126]:
# Per-review class probabilities for the test set; one column per class,
# ordered as in bnb.classes_.
bnb.predict_proba(xt_vec)

array([[9.99799804e-01, 2.00195805e-04],
       [7.03861842e-01, 2.96138158e-01],
       [1.00000000e+00, 3.66759666e-13],
       ...,
       [1.32119807e-11, 1.00000000e+00],
       [1.33207994e-04, 9.99866792e-01],
       [1.00000000e+00, 1.78283727e-12]])

In [135]:
# Accuracy on (x_vec, Y) -- the same data the model was fitted on, so this is
# training accuracy, not an estimate of performance on unseen reviews.
bnb.score(x_vec,Y)

0.885725

In [127]:
# Predict an encoded class (0 or 1) for every test review with the Bernoulli model.
Y_pred2=bnb.predict(xt_vec)

In [128]:
# One prediction per test review.
Y_pred2.shape

(10000,)

In [129]:
# Wrap the Bernoulli model's predictions in a single-column DataFrame.
df2=pd.DataFrame(Y_pred2)

In [130]:
# Peek at the raw predictions (the column is still named 0 at this point).
df2.head()

Unnamed: 0,0
0,0
1,0
2,0
3,1
4,1


In [132]:
# Name the single prediction column 'label' and show the result.
df2.columns=['label']
df2.head()

Unnamed: 0,label
0,0
1,0
2,0
3,1
4,1


In [133]:
# Translate the integer class codes back to their string labels (0 -> neg, 1 -> pos).
# The result is assigned back rather than calling an inplace method on the
# `df2['label']` selection: under pandas Copy-on-Write a chained in-place update
# acts on a temporary and never reaches `df2`, which would leave 0/1 in the frame.
# Re-running the cell is harmless: already-translated strings match no key.
df2['label'] = df2['label'].replace({0:"neg", 1:"pos"})
df2.head()

Unnamed: 0,label
0,neg
1,neg
2,neg
3,pos
4,pos


In [134]:
# Write the Bernoulli predictions to disk; index=True also writes the row
# index as the first column.
df2.to_csv('Y_pred2.csv', index=True)