In [1]:
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize 
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import string

In [2]:
# Load the training data; the path is relative to the notebook's working directory.
# Later cells read the 'review' (text) and 'label' columns from this frame.
df = pd.read_csv("Train.csv")

In [3]:
# Raw NumPy view of the frame; the value displayed by this cell is its
# (rows, columns) shape.
# No `df.head()` call here: a notebook cell only renders its LAST expression,
# so a preview placed above `ds.shape` would be computed and silently discarded.
ds = df.values
ds.shape

(40000, 2)

In [4]:
# Preview the first rows of the raw review text before any cleaning is applied.
df['review'].head()

0    mature intelligent and highly charged melodram...
1    http://video.google.com/videoplay?docid=211772...
2    Title: Opera (1987) Director: Dario Argento Ca...
3    I think a lot of people just wrote this off as...
4    This is a story of two dogs and a cat looking ...
Name: review, dtype: object

## data cleaning 

In [5]:
# Build the cleaned corpus for the first 15000 reviews.
#
# The stop-word set and the stemmer do not depend on the review being
# processed, so they are created once up front. Rebuilding
# set(stopwords.words('english')) for every single word makes the loop
# dramatically slower without changing the result.
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

corpus = []
for i in range(0, 15000):
    # Replace every character that is not an ASCII letter with a space,
    # lowercase the text, then split it into whitespace-separated tokens.
    review = re.sub('[^a-zA-Z]', ' ', df['review'][i])
    review = review.lower().split()
    # Drop English stop words and reduce the remaining tokens to their stems.
    review = [ps.stem(word) for word in review if word not in stop_words]
    # Re-join into one space-separated string, the input format CountVectorizer expects.
    review = ' '.join(review)
    corpus.append(review)
# NOTE(review): the recorded corpus output contains many stray "br" tokens,
# presumably left over from HTML line-break tags in the raw text -- confirm,
# and consider stripping markup before the regex above.

In [6]:
# Show only a small sample: displaying the whole list would render all
# cleaned reviews and bloat the notebook output.
corpus[:3]

[u'matur intellig highli charg melodrama unbelivebl film china wei wei stun perform catylast love triangl simpli stun oppurun see magnific film take',
 u'http video googl com videoplay docid hl en distribut tri br br opt mass appeal br br want best possibl view rang forgo profit continu manual labor job gladli entertain work br br view texa tale pleas write like like alex like stuie texa texa tale write br br opinion rule',
 u'titl opera director dario argento cast cristina masillach ian charleson urbano barberini daria nicolodi review argento movi seen suspiria one blew away style color spooki stori line next decid go opera told one best man think discov ultim one favorit horror director br br opera young opera singer get big break main star creepi modern opera take mc beth get hit car betti understudi get part bad psycho make watch brutal murder friend co worker br br wow id heard good thing flick prepar level great film would take yeah movi shortcom ill get later part movi blew away

## vectorization

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
# Bag-of-words token counts over the cleaned corpus, using CountVectorizer's defaults.
cv=CountVectorizer()
# NOTE(review): fit_transform() itself returns a sparse matrix; the trailing
# .toarray() converts it to a DENSE array. With the shape printed further down
# (15000 x 41064, ~6.2e8 cells) that is presumably several GB of memory --
# confirm the dtype, and consider keeping the sparse result if every
# downstream consumer accepts it.
vect_x=cv.fit_transform(corpus).toarray()    # dense ndarray (the sparse result is densified here)
print(vect_x)



[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [9]:
# (number of documents, number of distinct tokens in the fitted vocabulary).
print(vect_x.shape)

(15000, 41064)


## labels 

In [10]:
# Labels for the same first 15000 rows that were cleaned into `corpus`,
# so features and targets stay aligned row-for-row.
y = df['label'][:15000]
print(y)

0        pos
1        pos
2        pos
3        pos
4        pos
5        pos
6        neg
7        neg
8        pos
9        pos
10       neg
11       pos
12       pos
13       neg
14       pos
15       pos
16       pos
17       neg
18       pos
19       pos
20       neg
21       pos
22       pos
23       neg
24       pos
25       pos
26       pos
27       neg
28       pos
29       pos
        ... 
14970    neg
14971    pos
14972    pos
14973    pos
14974    pos
14975    pos
14976    neg
14977    pos
14978    neg
14979    pos
14980    pos
14981    pos
14982    pos
14983    neg
14984    neg
14985    pos
14986    neg
14987    pos
14988    neg
14989    pos
14990    neg
14991    neg
14992    pos
14993    pos
14994    pos
14995    pos
14996    neg
14997    pos
14998    neg
14999    pos
Name: label, Length: 15000, dtype: object


In [15]:
# Shape of the label vector as a NumPy array; its length should match the
# number of rows in the feature matrix.
ys=y.values.shape
print(ys)

(15000,)


## splitting into train and test data 

In [19]:
from sklearn.model_selection import train_test_split

In [21]:
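# Alternative (disabled): a random 80/20 split via train_test_split, instead of
# the fixed, unshuffled row slices used in the next cell.
#x_train,x_test,y_train,y_test=train_test_split(vect_x,y,test_size=0.2)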

In [29]:
# Fixed, unshuffled hold-out split: rows 0-7999 are used for training and
# rows 8000-14999 for testing; features and labels are cut at the same indices.
x_train, x_test = vect_x[:8000, :], vect_x[8000:15000, :]
y_train, y_test = y[:8000], y[8000:15000]

In [30]:
# Sanity check: both feature matrices must have the same number of columns.
print(x_train.shape,x_test.shape)

((8000, 41064), (7000, 41064))


In [31]:
# Sanity check: label vector lengths must match the row counts printed for x_train / x_test.
print(y_train.shape,y_test.shape)

((8000,), (7000,))


## using multinomial NB 

In [32]:
from sklearn.naive_bayes import MultinomialNB

In [33]:
# Multinomial Naive Bayes classifier with the library's default hyper-parameters.
mnb=MultinomialNB()

In [34]:
# Fit the classifier on the training slice; the targets are the string labels from df['label'].
mnb.fit(x_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [35]:
# Predicted class label for every held-out review. The result is only
# displayed here, not stored in a variable.
mnb.predict(x_test)

array(['neg', 'neg', 'pos', ..., 'neg', 'neg', 'pos'], dtype='|S3')

In [36]:
# Per-class probability estimates: one row per held-out review, one column per class.
# NOTE(review): column order presumably follows mnb.classes_ (the recorded
# output is consistent with column 0 being 'neg') -- confirm before indexing.
mnb.predict_proba(x_test)

array([[1.00000000e+00, 3.66609518e-17],
       [9.99999261e-01, 7.38693230e-07],
       [4.88465381e-09, 9.99999995e-01],
       ...,
       [9.29361482e-01, 7.06385184e-02],
       [1.00000000e+00, 9.88310323e-16],
       [7.71557567e-27, 1.00000000e+00]])

In [38]:
# True labels of the held-out rows, printed so they can be compared by eye
# with the predictions displayed above.
print(y_test)

8000     neg
8001     neg
8002     pos
8003     neg
8004     pos
8005     neg
8006     pos
8007     neg
8008     pos
8009     neg
8010     neg
8011     neg
8012     pos
8013     neg
8014     neg
8015     neg
8016     neg
8017     neg
8018     neg
8019     pos
8020     neg
8021     neg
8022     neg
8023     pos
8024     pos
8025     neg
8026     pos
8027     neg
8028     pos
8029     neg
        ... 
14970    neg
14971    pos
14972    pos
14973    pos
14974    pos
14975    pos
14976    neg
14977    pos
14978    neg
14979    pos
14980    pos
14981    pos
14982    pos
14983    neg
14984    neg
14985    pos
14986    neg
14987    pos
14988    neg
14989    pos
14990    neg
14991    neg
14992    pos
14993    pos
14994    pos
14995    pos
14996    neg
14997    pos
14998    neg
14999    pos
Name: label, Length: 7000, dtype: object
