In [1]:
import pandas as pd
import numpy as np


In [2]:
x=pd.read_csv('Train.csv')
y=pd.read_csv('Test.csv')

In [3]:
x.head()

Unnamed: 0,review,label
0,mature intelligent and highly charged melodram...,pos
1,http://video.google.com/videoplay?docid=211772...,pos
2,Title: Opera (1987) Director: Dario Argento Ca...,pos
3,I think a lot of people just wrote this off as...,pos
4,This is a story of two dogs and a cat looking ...,pos


In [4]:
y.head()

Unnamed: 0,review
0,Remember those old kung fu movies we used to w...
1,This movie is another one on my List of Movies...
2,How in the world does a thing like this get in...
3,"""Queen of the Damned"" is one of the best vampi..."
4,The Caprica episode (S01E01) is well done as a...


In [5]:
x_train=x.values[:,:-1]
y_train=x.values[:,-1]
x_test=y.values

##CLEANING THE REVIEWS

In [6]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer 
from nltk.corpus import stopwords

In [7]:
tokenizer=RegexpTokenizer('[a-zA-Z]+')
stopwords=set(stopwords.words('english'))
ps=PorterStemmer()

In [8]:
important_words=["not","don't","doesn't","didn'","didn't","doesn'","doesn't","don","dont't","wasn","wasn't"]
stopwords=[i for i in stopwords if i not in important_words]

In [9]:
def cleaned_reviews(review):
    review=str(review)[1:-1]
    review=review.lower()
    review=review.replace("<br /><br />"," ")
    #tokenize
    tokens=tokenizer.tokenize(review)
    new_tokens=[token for token in tokens if token not in stopwords]
    stemmed_tokens=[ps.stem(token)for token in new_tokens]
    cleaned_review=' '.join(stemmed_tokens)
    return cleaned_review
    

#CLEANING THE TRAINING DATA

In [10]:
for i in range (x.shape[0]):
    x_train[i]=np.array(cleaned_reviews(x_train[i]))
    

#CLEANING THE TESTING DATA

In [11]:
for i in range(y.shape[0]):
    x_test[i]=np.array(cleaned_reviews(x_test[i]))
    

#VECTORIZATION

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer()

In [13]:
ready_data=[]
for i in x_train:
    
    ready_data.append(str(i))
    

In [14]:
len(ready_data)

40000

In [15]:
for i in range (len(ready_data)):
    ready_data[i]=ready_data[i][3:-3]

In [16]:
train_vec=cv.fit_transform(ready_data[:10000]).toarray()

In [17]:
train_vec.shape

(10000, 38029)

In [18]:
readytestdata=[]
for i in x_test:
    readytestdata.append(str(i))

In [19]:
for i in range (len(readytestdata)):
    readytestdata[i]=readytestdata[i][3:-3]

In [30]:
test_vec=cv.transform(readytestdata).toarray()

In [20]:
from sklearn.naive_bayes import MultinomialNB,GaussianNB,BernoulliNB

In [21]:
y_train.shape

(40000,)

In [22]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
y_train=le.fit_transform(y_train)


In [23]:
y_train=y_train[:10000]

In [24]:
mnb=MultinomialNB()
model=mnb.fit(train_vec,y_train)

In [33]:
model.score(train_vec,y_train)*100

92.58999999999999

In [60]:
predictions=model.predict(test_vec)
predictions.shape

(10000,)

In [62]:
predictions=le.inverse_transform(predictions)

In [63]:
ind=np.arange(x_test.shape[0])
ind=ind.reshape((-1,1))
predictions=predictions.reshape((-1,1))
predictions=np.hstack((ind,predictions))
predictions

array([[0, 'neg'],
       [1, 'neg'],
       [2, 'neg'],
       ...,
       [9997, 'pos'],
       [9998, 'pos'],
       [9999, 'neg']], dtype=object)

In [70]:
pred=pd.DataFrame(predictions,columns=['Id','label'])

In [71]:
pred.head()

Unnamed: 0,Id,label
0,0,neg
1,1,neg
2,2,neg
3,3,neg
4,4,pos


In [72]:
pred.to_csv('prediction.csv',index=False)