In [30]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import sys
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [74]:
df = pd.read_csv("Train.csv")
test = pd.read_csv("Test.csv").values

In [20]:
df.head
print(df.columns)

Index(['review', 'label'], dtype='object')


In [21]:
df.head(n=5)

Unnamed: 0,review,label
0,mature intelligent and highly charged melodram...,pos
1,http://video.google.com/videoplay?docid=211772...,pos
2,Title: Opera (1987) Director: Dario Argento Ca...,pos
3,I think a lot of people just wrote this off as...,pos
4,This is a story of two dogs and a cat looking ...,pos


In [22]:
df = df.values
review = df[:,0]
label = df[:,1]

In [24]:
print(review.shape)
print(label.shape)

(40000,)
(40000,)


In [134]:
en_stopwords = set(stopwords.words('english'))
ps = PorterStemmer()
tokenizer = RegexpTokenizer(r'[a-zA-Z]+')

def getStemmedReview(review):
    review = review.lower()
    review = review.replace("<br /><br />"," ")
    tokens = tokenizer.tokenize(review)
    new_tokens = [token for token in tokens if token not in en_stopwords]
    stemmed_tokens = [ps.stem(token) for token in new_tokens]
    cleaned_review = " ".join(stemmed_tokens)
    return cleaned_review

In [135]:
review_clean = []
for i in review:
    rev = getStemmedReview(i)
    review_clean.append(rev)


In [136]:
len(review_clean)
review_clean

['matur intellig highli charg melodrama unbelivebl film china wei wei stun perform catylast love triangl simpli stun oppurun see magnific film take',
 'http video googl com videoplay docid hl en distribut tri opt mass appeal want best possibl view rang forgo profit continu manual labor job gladli entertain work view texa tale pleas write like like alex like stuie texa texa tale write opinion rule',
 'titl opera director dario argento cast cristina masillach ian charleson urbano barberini daria nicolodi review argento movi seen suspiria one blew away style color spooki stori line next decid go opera told one best man think discov ultim one favorit horror director opera young opera singer get big break main star creepi modern opera take mc beth get hit car betti understudi get part bad psycho make watch brutal murder friend co worker wow id heard good thing flick prepar level great film would take yeah movi shortcom ill get later part movi blew away first movi fill lot color suspiria exp

In [156]:
cv = CountVectorizer(ngram_range=(1,2))
review_clean_vec = cv.fit_transform(review_clean)


In [157]:
print(type(review_clean_vec))

<class 'scipy.sparse.csr.csr_matrix'>


In [158]:
cv.vocabulary_

{'matur': 1219170,
 'intellig': 1001470,
 'highli': 913936,
 'charg': 314991,
 'melodrama': 1234174,
 'unbelivebl': 2065816,
 'film': 717192,
 'china': 328868,
 'wei': 2157672,
 'stun': 1900290,
 'perform': 1447331,
 'catylast': 295927,
 'love': 1165810,
 'triangl': 2038863,
 'simpli': 1787180,
 'oppurun': 1393728,
 'see': 1720815,
 'magnific': 1184470,
 'take': 1940657,
 'matur intellig': 1219283,
 'intellig highli': 1001763,
 'highli charg': 913988,
 'charg melodrama': 315194,
 'melodrama unbelivebl': 1234410,
 'unbelivebl film': 2065817,
 'film china': 718328,
 'china wei': 329090,
 'wei wei': 2157685,
 'wei stun': 2157683,
 'stun perform': 1900576,
 'perform catylast': 1447712,
 'catylast love': 295928,
 'love triangl': 1169082,
 'triangl simpli': 2038951,
 'simpli stun': 1788193,
 'stun oppurun': 1900562,
 'oppurun see': 1393729,
 'see magnific': 1723372,
 'magnific film': 1184575,
 'film take': 723722,
 'http': 946704,
 'video': 2113452,
 'googl': 839234,
 'com': 365962,
 'videop

In [159]:
print(cv.get_feature_names())

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [160]:
print(test.shape)
test = test.reshape((10000,))

(10000,)


In [161]:
reviewt_clean = []
for i in test:
    rev = getStemmedReview(i)
    reviewt_clean.append(rev)

In [162]:
from sklearn.naive_bayes import MultinomialNB,BernoulliNB

In [163]:
mnb = MultinomialNB()

In [164]:
mnb.fit(review_clean_vec,label)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [165]:
reviewt_clean_vec = cv.transform(reviewt_clean)
print(reviewt_clean_vec)
print(reviewt_clean_vec.shape)

  (0, 13479)	1
  (0, 14485)	1
  (0, 20759)	1
  (0, 82664)	2
  (0, 82823)	1
  (0, 82952)	1
  (0, 126774)	1
  (0, 126969)	1
  (0, 141999)	1
  (0, 148982)	1
  (0, 149204)	1
  (0, 174475)	1
  (0, 175911)	1
  (0, 193582)	2
  (0, 194337)	1
  (0, 195307)	1
  (0, 195357)	1
  (0, 206794)	1
  (0, 288120)	1
  (0, 295984)	1
  (0, 295986)	1
  (0, 309945)	1
  (0, 312163)	1
  (0, 314991)	1
  (0, 319105)	1
  :	:
  (9999, 2018866)	2
  (9999, 2019691)	1
  (9999, 2019913)	1
  (9999, 2063570)	1
  (9999, 2071905)	1
  (9999, 2074876)	1
  (9999, 2090704)	1
  (9999, 2092248)	1
  (9999, 2135357)	1
  (9999, 2142943)	1
  (9999, 2146297)	1
  (9999, 2149155)	2
  (9999, 2150194)	1
  (9999, 2151149)	1
  (9999, 2159292)	2
  (9999, 2162010)	1
  (9999, 2162999)	1
  (9999, 2163085)	1
  (9999, 2171668)	1
  (9999, 2172008)	1
  (9999, 2196870)	1
  (9999, 2198452)	1
  (9999, 2221029)	1
  (9999, 2234093)	2
  (9999, 2234454)	1
(10000, 2235661)


In [166]:
prediction = mnb.predict(reviewt_clean_vec)

In [167]:
print(prediction.shape)

(10000,)


In [168]:
df = pd.DataFrame(data = prediction,columns=["label"])

In [169]:
df.to_csv('prediction2.csv',index = True)

In [170]:
bnb = BernoulliNB()

In [171]:
bnb.fit(review_clean_vec,label)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [172]:
predictionBernoulli = bnb.predict(reviewt_clean_vec)

In [173]:
df = pd.DataFrame(data = predictionBernoulli,columns=["label"])

In [174]:
df.to_csv('predictionBernoulli2.csv',index = True)