In [1]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords


In [2]:
# Shared text-preprocessing helpers, built once and reused for every review.
ps = PorterStemmer()                           # Porter stemmer applied to each kept token
en_stopwords = {*stopwords.words('english')}   # set, so membership tests are fast
tokenizer = RegexpTokenizer(r'\w+')            # a token is a run of word characters

In [3]:
def getStemmedReview(review):
    """Normalise a raw review into a space-separated string of stemmed tokens.

    The text is lower-cased, HTML line-break tags are replaced by spaces, the
    result is split with the module-level ``tokenizer``, tokens present in
    ``en_stopwords`` are dropped, and the remaining tokens are stemmed with
    ``ps``.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces.
    """
    review = review.lower()
    # Replace every "<br />" tag, not only doubled ones: a lone tag would
    # otherwise be split by the \w+ tokenizer into a spurious "br" token.
    review = review.replace("<br />", " ")

    # Tokenize, drop stopwords, then stem what is left.
    tokens = tokenizer.tokenize(review)
    stemmed_tokens = [ps.stem(token) for token in tokens if token not in en_stopwords]

    return ' '.join(stemmed_tokens)
    

# Importing data

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [7]:
# Load the labelled reviews and preview the first five rows.
data = pd.read_csv("Train/Train.csv")
data.head()

Unnamed: 0,review,label
0,mature intelligent and highly charged melodram...,pos
1,http://video.google.com/videoplay?docid=211772...,pos
2,Title: Opera (1987) Director: Dario Argento Ca...,pos
3,I think a lot of people just wrote this off as...,pos
4,This is a story of two dogs and a cat looking ...,pos


In [39]:
# Positional split: the first 30,000 rows are used for training and the
# remaining rows are held out for testing. Column 0 is the input, column 1 the label.
X_train, Y_train = (np.array(data.values[:30000, col]) for col in (0, 1))
X_test, Y_test = (np.array(data.values[30000:, col]) for col in (0, 1))

# Sanity-check the sizes of both partitions.
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(30000,) (30000,)
(10000,) (10000,)


In [40]:
# Run every review through the same cleaning pipeline.
X_train_clean = [getStemmedReview(review) for review in X_train]
X_test_clean = [getStemmedReview(review) for review in X_test]

# Preview a few cleaned reviews only; echoing the whole list would write
# every training review into the cell output.
X_train_clean[:3]

['matur intellig highli charg melodrama unbelivebl film china 1948 wei wei stun perform catylast love triangl simpli stun oppurun see magnific film take',
 'http video googl com videoplay docid 211772166650071408 hl en distribut tri opt mass appeal want best possibl view rang forgo profit continu manual labor job gladli entertain work view texa tale pleas write like like alex like stuie texa texa tale write opinion rule',
 'titl opera 1987 director dario argento cast cristina masillach ian charleson urbano barberini daria nicolodi review argento movi seen suspiria one blew away style color spooki stori line next decid go opera told one best man think discov ultim one favorit horror director opera young opera singer get big break main star creepi modern opera take mc beth get hit car betti understudi get part bad psycho make watch brutal murder friend co worker wow id heard good thing flick prepar level great film would take yeah movi shortcom ill get later part movi blew away first mov

## Vectorization

In [41]:
from sklearn.feature_extraction.text import CountVectorizer

In [42]:
# Unigram bag-of-words: learn the vocabulary from the cleaned training
# reviews and encode them as a sparse document-term count matrix.
cv = CountVectorizer(ngram_range=(1, 1))
x_vec = cv.fit_transform(list(X_train_clean))

print(x_vec)
print(x_vec.shape)

  (0, 49881)	1
  (0, 30960)	1
  (0, 44860)	1
  (0, 36696)	1
  (0, 46225)	1
  (0, 52049)	1
  (0, 30329)	1
  (0, 9113)	1
  (0, 38312)	1
  (0, 48812)	2
  (0, 55607)	2
  (0, 381)	1
  (0, 9889)	1
  (0, 18397)	2
  (0, 52954)	1
  (0, 32516)	1
  (0, 9529)	1
  (0, 23535)	1
  (0, 25601)	1
  (0, 31896)	1
  (1, 43380)	1
  (1, 36670)	1
  (1, 48806)	1
  (1, 2232)	1
  (1, 29693)	3
  :	:
  (29999, 47546)	3
  (29999, 23976)	1
  (29999, 8483)	2
  (29999, 41511)	1
  (29999, 17172)	1
  (29999, 6257)	1
  (29999, 44643)	1
  (29999, 30176)	1
  (29999, 28996)	1
  (29999, 56631)	1
  (29999, 19524)	2
  (29999, 55418)	2
  (29999, 4541)	1
  (29999, 37750)	2
  (29999, 23683)	1
  (29999, 20536)	1
  (29999, 24125)	1
  (29999, 50837)	2
  (29999, 13228)	1
  (29999, 36551)	11
  (29999, 34149)	7
  (29999, 5827)	1
  (29999, 52043)	1
  (29999, 44860)	2
  (29999, 46225)	1
(30000, 57707)


In [43]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use get_feature_names_out() when it exists and fall back to
# the old method on older installations.
if hasattr(cv, "get_feature_names_out"):
    # .tolist() turns the returned array into a plain list so the printed
    # format matches the old API.
    feature_names = cv.get_feature_names_out().tolist()
else:
    feature_names = cv.get_feature_names()
print(feature_names)

['00', '000', '0000000000001', '00000001', '00001', '00015', '001', '003830', '006', '007', '0079', '0080', '0083', '009', '0093638', '00am', '00pm', '00schneider', '01', '0126', '0148', '02', '020410', '0230', '029', '03', '039', '04', '05', '050', '06', '0615', '07', '07b', '08', '087', '089', '08th', '09', '0f', '0ne', '0r', '0s', '0tt', '10', '100', '1000', '10000', '1000000', '10000000000', '10000000000000', '10000th', '1001', '1004', '100ib', '100k', '100m', '100mile', '100min', '100mph', '100th', '100x', '100yard', '100â', '101', '101st', '102', '102nd', '103', '104', '1040', '1040a', '105', '105lb', '106', '106min', '107', '108', '1080p', '109', '10_', '10ft', '10k', '10line', '10mil', '10min', '10minut', '10p', '10pm', '10star', '10th', '10x', '10yo', '10yr', '10â', '11', '110', '1100', '1100ad', '110mph', '111', '112', '113', '1138', '113min', '113minut', '114', '1146', '115', '116', '116minut', '117', '118', '119', '11am', '11f', '11m', '11th', '11yr', '12', '120', '1200', '

In [45]:
# Encode the test reviews with the vocabulary already learned from the
# training data. Calling fit_transform here would refit the vectorizer on the
# test set, so test columns would no longer line up with x_vec.
x_test_vec = cv.transform(X_test_clean)