In [47]:
%matplotlib inline
import re

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from bs4 import BeautifulSoup as bs
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

# Reading Data

In [48]:
train = pd.read_csv('data/labeledTrainData.tsv', delimiter='\t', quoting=3)
train.head(5)

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [49]:
train.shape

(25000, 3)

# Preprocessing

In [50]:
stops = set(stopwords.words('english'))
pat = re.compile(r'[^a-zA-Z]')
def review_to_words(raw_review):
    review_text = bs(raw_review, 'lxml').get_text()
    letters_only = pat.sub(' ', review_text)
    words = letters_only.lower().split()
    meaningful_words = [w for w in words if w not in stops]
    return ' '.join(meaningful_words)

train['words'] = train.review.map(review_to_words)

In [51]:
train.head(5)

Unnamed: 0,id,sentiment,review,words
0,"""5814_8""",1,"""With all this stuff going down at the moment ...",stuff going moment mj started listening music ...
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ...",classic war worlds timothy hines entertaining ...
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell...",film starts manager nicholas bell giving welco...
3,"""3630_4""",0,"""It must be assumed that those who praised thi...",must assumed praised film greatest filmed oper...
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ...",superbly trashy wondrously unpretentious explo...


In [52]:
vectorizer = CountVectorizer(max_features=5000)

features = vectorizer.fit_transform(train.words).toarray()

In [53]:
features.shape

(25000, 5000)

# Training the model

In [54]:
clf = RandomForestClassifier(n_estimators=100)

clf.fit(features, train.sentiment)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

# Predicting

In [55]:
test = pd.read_csv('data/testData.tsv', delimiter='\t', quoting=3)
test.head(5)

Unnamed: 0,id,review
0,"""12311_10""","""Naturally in a film who's main themes are of ..."
1,"""8348_2""","""This movie is a disaster within a disaster fi..."
2,"""5828_4""","""All in all, this is a movie for kids. We saw ..."
3,"""7186_2""","""Afraid of the Dark left me with the impressio..."
4,"""12128_7""","""A very accurate depiction of small time mob l..."


In [56]:
test['words'] = test.review.map(review_to_words)

In [57]:
features = vectorizer.transform(test.words).toarray()
features.shape

(25000, 5000)

In [58]:
h = clf.predict(features)
h

array([1, 0, 1, ..., 0, 1, 1])

In [59]:
df = pd.DataFrame({'id': test.id, 'sentiment': h})
df.to_csv('data/submit.csv', index=False, quoting=3)