In [1]:
import pandas as pd
# Load the labelled training reviews from a tab-separated file whose first row
# is the header. quoting=3 is csv.QUOTE_NONE: quote characters inside a review
# are kept as ordinary text instead of being treated as field quoting.
train = pd.read_csv(
    "labeledTrainData.tsv",
    sep="\t",
    header=0,
    quoting=3,
)

In [2]:
# Display the column labels of the loaded frame as an array.
train.columns.values

array(['id', 'sentiment', 'review'], dtype=object)

In [3]:
from bs4 import BeautifulSoup
import re

In [4]:
import nltk
# nltk.download() #download text data set, including stop words

In [5]:
from nltk.corpus import stopwords

In [6]:
# Show NLTK's English stop-word list (the words removed during cleaning).
print(stopwords.words("english"))

[u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u'your', u'yours', u'yourself', u'yourselves', u'he', u'him', u'his', u'himself', u'she', u'her', u'hers', u'herself', u'it', u'its', u'itself', u'they', u'them', u'their', u'theirs', u'themselves', u'what', u'which', u'who', u'whom', u'this', u'that', u'these', u'those', u'am', u'is', u'are', u'was', u'were', u'be', u'been', u'being', u'have', u'has', u'had', u'having', u'do', u'does', u'did', u'doing', u'a', u'an', u'the', u'and', u'but', u'if', u'or', u'because', u'as', u'until', u'while', u'of', u'at', u'by', u'for', u'with', u'about', u'against', u'between', u'into', u'through', u'during', u'before', u'after', u'above', u'below', u'to', u'from', u'up', u'down', u'in', u'out', u'on', u'off', u'over', u'under', u'again', u'further', u'then', u'once', u'here', u'there', u'when', u'where', u'why', u'how', u'all', u'any', u'both', u'each', u'few', u'more', u'most', u'other', u'some', u'such', u'no', u'nor', u

In [7]:
_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stop words as a set, reading the corpus only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # A set gives fast membership tests; building it once avoids
        # re-reading the corpus for every single review.
        _ENGLISH_STOPWORDS = set(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def review_to_words(raw_review, parser="lxml"):
    """Convert one raw review into a cleaned, space-separated string of words.

    Processing steps, in order:
      1. strip HTML markup and keep only the text,
      2. replace every character that is not an ASCII letter with a space,
      3. lower-case and split on whitespace,
      4. drop English stop words,
      5. join the remaining words with single spaces.

    Parameters
    ----------
    raw_review : str
        The review text, possibly containing HTML.
    parser : str, optional
        Parser name handed to BeautifulSoup. Naming it explicitly silences the
        "no parser was explicitly specified" warning and keeps the result the
        same across machines. The default is the parser that warning
        recommended for this notebook; pass "html.parser" if lxml is not
        installed.

    Returns
    -------
    str
        The cleaned review; an empty string if no words survive.
    """
    # Remove HTML.
    review_text = BeautifulSoup(raw_review, parser).get_text()

    # Remove everything that is not a letter.
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)

    # Lower-case and tokenise so stop words can be filtered out.
    words = letters_only.lower().split()

    # Remove stop words.
    stops = _english_stopwords()
    meaningful_words = [w for w in words if w not in stops]

    # Join back into a single string.
    return " ".join(meaningful_words)



In [8]:
# Try the cleaner on a single training review (the row labelled 1).
clean_review = review_to_words(train.loc[1, "review"])
print(clean_review)

classic war worlds timothy hines entertaining film obviously goes great effort lengths faithfully recreate h g wells classic book mr hines succeeds watched film appreciated fact standard predictable hollywood fare comes every year e g spielberg version tom cruise slightest resemblance book obviously everyone looks different things movie envision amateur critics look criticize everything others rate movie important bases like entertained people never agree critics enjoyed effort mr hines put faithful h g wells classic novel found entertaining made easy overlook critics perceive shortcomings




 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


In [9]:
# stemming
from nltk.stem import PorterStemmer
# One shared Porter stemmer instance, used by stem_review() for every word.
stemmer = PorterStemmer()

def stem_review(clean_review):
    """Stem each word of an already-cleaned review.

    The input string is split on whitespace, every token is passed through
    the shared ``stemmer``, and the stems are joined back with single spaces.

    The literal token "oed" is dropped instead of being stemmed. The original
    author marked it as the single exception in the whole data set
    (presumably it made the stemmer fail -- TODO confirm).
    """
    kept_tokens = (token for token in clean_review.split() if token != "oed")
    return " ".join(stemmer.stem(token) for token in kept_tokens)

In [10]:
# Stem the sample review that was cleaned above and show the result.
stemmed_review = stem_review(clean_review)
print(stemmed_review)

classic war world timothi hine entertain film obvious goe great effort length faith recreat h g well classic book mr hine succe watch film appreci fact standard predict hollywood fare come everi year e g spielberg version tom cruis slightest resembl book obvious everyon look differ thing movi envis amateur critic look critic everyth other rate movi import base like entertain peopl never agre critic enjoy effort mr hine put faith h g well classic novel found entertain made easi overlook critic perceiv shortcom


In [12]:
# Number of training reviews (kept because later cells read it).
num_reviews = train["review"].size

# Clean every training review. Iterating over the column's values directly
# avoids a per-row label lookup (train["review"][i]), which only works while
# the frame has a default 0..n-1 index, and avoids the Python-2-only xrange.
clean_train_reviews = [review_to_words(raw_review) for raw_review in train["review"]]

In [13]:
# Stem each cleaned training review, in the same order as clean_train_reviews.
stemmed_train_reviews = [
    stem_review(clean_train_reviews[idx]) for idx in range(num_reviews)
]

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words model. max_features caps the vocabulary at the 5000 most
# frequent words, so each review becomes a vector of 5000 word counts.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)

# fit_transform() takes a list of strings and does two things: it learns the
# vocabulary, then turns the training reviews into feature vectors.
# The result is converted to a NumPy array, which is easier to work with.
train_data_features = vectorizer.fit_transform(stemmed_train_reviews).toarray()

In [15]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw word counts with TF-IDF (rows L2-normalised) and keep the
# result as a dense array, like the count features.
tfidf_transformer = TfidfTransformer(norm="l2")
TfIdf_train_data_features = tfidf_transformer.fit_transform(train_data_features).toarray()

In [16]:
# (number of reviews, vocabulary size) of the count features.
print(train_data_features.shape)

(25000, 5000)


In [17]:
# (number of reviews, vocabulary size) of the TF-IDF features.
print(TfIdf_train_data_features.shape)

(25000, 5000)


In [18]:
# Vocabulary learned by the vectorizer, in feature-column order.
# scikit-learn 1.2 removed get_feature_names() in favour of
# get_feature_names_out(); use the new accessor when it exists and fall back
# to the old one on older releases so the cell runs on both.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()
print(vocab)

[u'abandon', u'abc', u'abil', u'abl', u'abomin', u'aborigin', u'abort', u'abound', u'abraham', u'abrupt', u'abruptli', u'absenc', u'absent', u'absolut', u'absorb', u'absurd', u'abund', u'abus', u'abysm', u'academi', u'accent', u'accept', u'access', u'accid', u'accident', u'acclaim', u'accompani', u'accomplish', u'accord', u'account', u'accur', u'accuraci', u'accus', u'ace', u'achiev', u'acid', u'acknowledg', u'acquaint', u'acquir', u'across', u'act', u'action', u'activ', u'actor', u'actress', u'actual', u'ad', u'adam', u'adapt', u'add', u'addict', u'addit', u'address', u'adequ', u'admir', u'admit', u'admittedli', u'adolesc', u'adopt', u'ador', u'adult', u'advanc', u'advantag', u'adventur', u'advertis', u'advic', u'advis', u'aesthet', u'affair', u'affect', u'affleck', u'afford', u'aforement', u'afraid', u'africa', u'african', u'afternoon', u'afterward', u'age', u'agenc', u'agenda', u'agent', u'aggress', u'ago', u'agre', u'ah', u'ahead', u'aid', u'aim', u'air', u'airplan', u'airport', u'

In [19]:
import numpy as np

# Total occurrences of each vocabulary word: column-wise sum of the counts.
dist = train_data_features.sum(axis=0)

# One line per vocabulary word: "<total count> <word>".
for count, tag in zip(dist, vocab):
    print("%s %s" % (count, tag))

288 abandon
125 abc
562 abil
1259 abl
83 abomin
69 aborigin
92 abort
63 abound
93 abraham
76 abrupt
60 abruptli
118 absenc
83 absent
1845 absolut
154 absorb
403 absurd
66 abund
398 abus
110 abysm
298 academi
704 accent
781 accept
165 access
344 accid
246 accident
118 acclaim
197 accompani
271 accomplish
311 accord
297 account
349 accur
82 accuraci
204 accus
75 ace
578 achiev
102 acid
109 acknowledg
73 acquaint
97 acquir
971 across
8794 act
3694 action
268 activ
6876 actor
1588 actress
5065 actual
793 ad
409 adam
835 adapt
1147 add
261 addict
499 addit
183 address
148 adequ
440 admir
738 admit
134 admittedli
112 adolesc
162 adopt
226 ador
887 adult
275 advanc
172 advantag
773 adventur
228 advertis
262 advic
195 advis
78 aesthet
419 affair
428 affect
66 affleck
139 afford
126 aforement
343 afraid
212 africa
284 african
197 afternoon
183 afterward
1726 age
79 agenc
86 agenda
455 agent
103 aggress
1033 ago
779 agre
119 ah
396 ahead
289 aid
325 aim
842 air
106 airplan
96 airport
195 aka
67 

In [20]:
from sklearn.ensemble import RandomForestClassifier

# Random forests are randomised; a fixed seed makes the fitted models (and the
# submission file built from them) reproducible from one run to the next.
RANDOM_SEED = 0

# Two Random Forest classifiers with 100 trees each, both using the sentiment
# labels as the response variable:
#   forest       - trained on the raw bag-of-words counts
#   forest_tfidf - trained on the TF-IDF weighted features
forest = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED).fit(
    train_data_features, train["sentiment"]
)
forest_tfidf = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED).fit(
    TfIdf_train_data_features, train["sentiment"]
)


In [22]:
# Load the unlabelled test reviews (same file format as the training data).
test = pd.read_csv("testData.tsv", header=0, delimiter="\t", quoting=3)

num_test_reviews = test["review"].size

# Apply exactly the same cleaning and stemming as for the training reviews.
# Iterating over the values directly avoids per-row label lookups and the
# Python-2-only xrange.
clean_test_reviews = [review_to_words(raw_review) for raw_review in test["review"]]
stemmed_test_reviews = [stem_review(cleaned) for cleaned in clean_test_reviews]

# Reuse the vectorizer and TF-IDF transformer fitted on the training data:
# transform() only, so the test set never influences the vocabulary or weights.
test_data_features = vectorizer.transform(stemmed_test_reviews)
TfIdf_test_data_features = tfidf_transformer.transform(test_data_features)

test_data_features = test_data_features.toarray()
TfIdf_test_data_features = TfIdf_test_data_features.toarray()

# Predictions from both models.
result = forest.predict(test_data_features)
result_tfidf = forest_tfidf.predict(TfIdf_test_data_features)

# Build the submission frame ("id" and "sentiment" columns) from the TF-IDF
# model and write it as CSV without quoting.
# To submit the plain bag-of-words model instead, use `result` as the
# "sentiment" column and write to "Bag_of_Words_model.csv".
output = pd.DataFrame(data={"id": test["id"], "sentiment": result_tfidf})
output.to_csv("Bag_of_Words_model_withStemmingAndTfIdf.csv", index=False, quoting=3)