In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the labelled training reviews: tab-separated, column names on the first row.
# quoting=3 is csv.QUOTE_NONE, so double quotes in the file are kept as literal characters.
df = pd.read_csv(
    'labeledTrainData.tsv',
    sep='\t',
    header=0,
    quoting=3,
)
df.shape

(25000, 3)

In [3]:
# Preview the first rows of the training frame.
df.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [4]:
# Summary statistics of the numeric columns of the training frame.
df.describe()

Unnamed: 0,sentiment
count,25000.0
mean,0.5
std,0.50001
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


In [5]:
# Display the raw text of the review stored under index label 0.
df.review[0]

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally

In [6]:
from bs4 import BeautifulSoup as bs
import re
import nltk
from nltk.corpus import stopwords

In [7]:
def _englishStopwords():
    """Return NLTK's English stop words as a frozenset, loading the corpus only once."""
    cached = getattr(_englishStopwords, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words("english"))
        _englishStopwords._cache = cached
    return cached


def transformReview(rawReview):
    """
    Turn a raw review string into a cleaned, space-separated string of words.

    Steps applied, in order:
    1. strip HTML markup (parsed with BeautifulSoup's "lxml" parser)
    2. replace every character that is not an ASCII letter with a space
       (this removes punctuation and digits alike)
    3. convert the text to lower case
    4. split the string into words on whitespace
    5. drop English stop words

    Parameters
    ----------
    rawReview : str
        The review text, possibly containing HTML tags.

    Returns
    -------
    str
        The remaining words joined by single spaces; empty if nothing is left.
    """
    # remove HTML tags
    noHtml = bs(rawReview, "lxml").get_text()

    # keep letters only; everything else becomes a word separator
    letters_only = re.sub("[^a-zA-Z]", " ", noHtml)

    # lower-case, then split on whitespace
    words = letters_only.lower().split()

    # The stop-word set is built once and reused: this function is called for
    # every review, and rebuilding the set on each call is loop-invariant work.
    stopwordsSet = _englishStopwords()

    # form new review from the words that are not stop words
    return " ".join(w for w in words if w not in stopwordsSet)

In [8]:
# Smoke-test the cleaning function on a single review.
# Stored under its own name: `clean_review` is the list of all cleaned
# training reviews, and one name should not hold both a str and a list.
first_clean_review = transformReview(df.review[0])
first_clean_review

'stuff going moment mj started listening music watching odd documentary watched wiz watched moonwalker maybe want get certain insight guy thought really cool eighties maybe make mind whether guilty innocent moonwalker part biography part feature film remember going see cinema originally released subtle messages mj feeling towards press also obvious message drugs bad kay visually impressive course michael jackson unless remotely like mj anyway going hate find boring may call mj egotist consenting making movie mj fans would say made fans true really nice actual feature film bit finally starts minutes excluding smooth criminal sequence joe pesci convincing psychopathic powerful drug lord wants mj dead bad beyond mj overheard plans nah joe pesci character ranted wanted people know supplying drugs etc dunno maybe hates mj music lots cool things like mj turning car robot whole speed demon sequence also director must patience saint came filming kiddy bad sequence usually directors hate workin

In [9]:
# Clean every training review.
# Iterate over the column's values instead of looking rows up with
# df.review[i]: that form is a label lookup and only lines up with
# range(sze) while the frame still has its default 0..n-1 index.
sze = len(df.review)  # number of training reviews
clean_review = [transformReview(review) for review in df.review]

In [10]:
# form vocabulary and select top 5000 words
# Set up the bag-of-words vectorizer; the vocabulary is capped at the top 5000 words.
print("creating bag of words...\n")

from sklearn.feature_extraction.text import CountVectorizer

vectorized = CountVectorizer(
    max_features=5000,
    analyzer='word',
)

creating bag of words...



In [11]:
print("extracting features from bag of words...")
# Learn the vocabulary and count word occurrences per review, then turn the
# result into a dense array in a single step.
train_features = vectorized.fit_transform(clean_review).toarray()
train_features.shape

extracting features from bag of words...


(25000, 5000)

In [12]:
# Vocabulary words, in the same order as the columns of train_features.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the new method when it exists and fall back
# to the old one on older installations.
if hasattr(vectorized, "get_feature_names_out"):
    vocab = list(vectorized.get_feature_names_out())
else:
    vocab = vectorized.get_feature_names()

In [13]:
# Total number of occurrences of each vocabulary word, summed over all training reviews
dist = train_features.sum(axis=0)
print(dist.shape)

# Uncomment to list every vocabulary word next to its total count:
# for word, frq in zip(vocab, dist):
#     print(word + "\t" + str(frq))

(5000,)
abandoned	187
abc	125
abilities	108
ability	454
able	1259
abraham	85
absence	116
absent	83
absolute	352
absolutely	1485
absurd	306
abuse	192
abusive	91
abysmal	98
academy	297
accent	485
accents	203
accept	300
acceptable	130
accepted	144
access	92
accident	318
accidentally	200
accompanied	88
accomplished	124
according	296
account	186
accuracy	81
accurate	284
accused	123
achieve	179
achieved	139
achievement	124
acid	90
across	971
act	1251
acted	658
acting	6490
action	3354
actions	311
activities	83
actor	2389
actors	4486
actress	1219
actresses	369
acts	394
actual	793
actually	4237
ad	148
adam	302
adams	98
adaptation	453
adaptations	80
adapted	154
add	810
added	439
adding	166
addition	347
adds	337
adequate	113
admire	124
admit	621
admittedly	134
adorable	101
adult	510
adults	376
advance	100
advanced	90
advantage	153
adventure	510
adventures	204
advertising	91
advice	259
advise	90
affair	346
affect	93
affected	113
afford	104
aforementioned	126
afraid	343
africa	212
african	255
after

haunted	217
haunting	229
hawke	79
hbo	108
head	1541
headed	169
heads	291
health	137
hear	733
heard	1111
hearing	231
heart	1328
hearted	225
hearts	135
heat	128
heaven	320
heavily	180
heavy	492
heck	222
heights	85
held	391
helen	152
helicopter	97
hell	1025
hello	90
help	1895
helped	324
helping	176
helps	360
hence	155
henry	407
hero	1056
heroes	318
heroic	115
heroine	291
heston	136
hey	409
hidden	342
hide	210
hideous	103
hiding	144
high	2161
higher	289
highest	106
highlight	202
highlights	125
highly	1147
hilarious	973
hilariously	86
hill	243
hills	152
hint	147
hints	103
hip	181
hippie	84
hire	130
hired	188
historical	407
historically	86
history	1332
hit	1088
hitchcock	209
hitler	305
hits	272
hitting	137
ho	126
hoffman	188
hold	545
holding	209
holds	300
hole	167
holes	367
holiday	148
hollow	113
holly	107
hollywood	1907
holmes	163
holy	113
homage	133
home	1877
homeless	140
homer	103
homosexual	90
honest	481
honestly	453
honesty	99
hong	191
honor	173
hood	162
hook	99
hooked	139
hop	95
hope	1

typical	778
typically	130
ugly	354
uk	229
ultimate	248
ultimately	521
ultra	140
un	190
unable	247
unaware	83
unbearable	116
unbelievable	434
unbelievably	116
uncle	335
uncomfortable	149
unconvincing	186
underground	174
underlying	82
underrated	235
understand	1643
understandable	97
understanding	275
understated	90
understood	179
undoubtedly	107
uneven	107
unexpected	250
unexpectedly	79
unfair	80
unfolds	104
unforgettable	143
unfortunate	208
unfortunately	1352
unfunny	267
unhappy	96
uninspired	123
unintentional	108
unintentionally	135
uninteresting	198
union	128
unique	634
unit	89
united	216
universal	217
universe	197
university	131
unknown	286
unless	675
unlike	585
unlikely	211
unnecessary	307
unoriginal	83
unpleasant	110
unpredictable	82
unreal	84
unrealistic	226
unseen	83
unsettling	99
unusual	310
unwatchable	106
uplifting	81
upon	859
upper	158
ups	266
upset	154
urban	189
urge	102
us	3794
usa	164
use	1803
used	1879
useful	94
useless	128
user	102
uses	540
using	801
ustinov	83
usual	965

In [14]:
# training random forest
# Train a 100-tree random forest on the bag-of-words counts.
print("training random forest...\n")
from sklearn.ensemble import RandomForestClassifier

# A fixed random_state makes the forest, and so the predictions written to
# the submission file, reproducible across notebook runs. Any integer works.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf = rf.fit(train_features, df.sentiment)

training random forest...



  from numpy.core.umath_tests import inner1d


In [15]:
# make predictions on the test dataset

# load (same tab-separated layout and quoting as the training file)
test_data = pd.read_csv("testData.tsv", delimiter="\t", header=0, quoting=3)
print(test_data.shape)
print(test_data.columns)

# transform all reviews in the test data
n_reviews = test_data.shape[0]

print('Cleaning reviews...\n')
# Iterate over the values directly; test_data.review[i] is a label lookup
# that only matches range(n_reviews) for a default 0..n-1 index.
clean_reviews = [transformReview(review) for review in test_data.review]

# Use transform, not fit_transform: the test reviews must be encoded with
# the vocabulary already fitted on the training reviews.
test_data_vector = vectorized.transform(clean_reviews).toarray()

# predict using random forest
test_pred = rf.predict(test_data_vector)


(25000, 2)
Index(['id', 'review'], dtype='object')
Cleaning reviews...



In [21]:
# Assemble the submission table: one predicted sentiment per test review id.
output = pd.DataFrame({"id": test_data["id"], "sentiment": test_pred})

# Write it to CSV without the index column. quoting=3 is csv.QUOTE_NONE, so
# no extra quoting is added; the ids were read with the same setting and
# still carry their literal double quotes.
output.to_csv("BoW_model.csv", quoting=3, index=False)

In [17]:
# Only the last expression of a cell is rendered, so a bare `output.shape`
# before `output.head()` is never shown; print it explicitly.
print(output.shape)
output.head()

Unnamed: 0,id,review
0,"""12311_10""",1
1,"""8348_2""",0
2,"""5828_4""",1
3,"""7186_2""",1
4,"""12128_7""",1


In [18]:
# Preview the first rows of the test frame (columns: id, review).
test_data.head()

Unnamed: 0,id,review
0,"""12311_10""","""Naturally in a film who's main themes are of ..."
1,"""8348_2""","""This movie is a disaster within a disaster fi..."
2,"""5828_4""","""All in all, this is a movie for kids. We saw ..."
3,"""7186_2""","""Afraid of the Dark left me with the impressio..."
4,"""12128_7""","""A very accurate depiction of small time mob l..."


In [19]:
# Show the first training rows again (same call as earlier in the notebook).
df.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [20]:
# Load sampleSubmission.csv with pandas' default quoting and preview its first rows.
# NOTE(review): presumably the reference layout for BoW_model.csv — confirm.
sample = pd.read_csv("sampleSubmission.csv")
sample.head()
# Leftover probe for the Python type of the first sample id:
# type(sample.id[0])

Unnamed: 0,id,sentiment
0,12311_10,0
1,8348_2,0
2,5828_4,0
3,7186_2,0
4,12128_7,0
