In [1]:
import pandas as pd       
train = pd.read_csv("./Data/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)

In [2]:
train.shape

(25000, 3)

In [3]:
train.head(3)

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."


Beautiful soup remove all the html related tags and give only the text

In [11]:
from bs4 import BeautifulSoup  

In [9]:
example1 = BeautifulSoup(train["review"][0])  

# print train["review"][0]
#print example1.get_text()

Removing all punctuations and all non alphabet characters with space 

In [10]:
import re
# Use regular expressions to do a find-and-replace
letters_only = re.sub("[^a-zA-Z]",           # The pattern to search for
                      " ",                   # The pattern to replace it with
                      example1.get_text() )  # The text to search
# print letters_only

In [14]:
lower_case = letters_only.lower() 
words = lower_case.split()    
print len(words)

437


In [20]:
from nltk.corpus import stopwords

In [22]:
words = set([w for w in words if not w in stopwords.words("english")])
print len(words)

166


In [23]:
def review_to_words( raw_review ):
    
    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review).get_text() 

    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", review_text) 

    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()                             

    # 4. Get stopwords Set
    stops = set(stopwords.words("english"))                  
    
    # 5. Remove stop words
    meaningful_words = [w for w in words if not w in stops]   

    # 6. Join the words back into one string
    return( " ".join( meaningful_words )) 

In [24]:
clean_review = review_to_words( train["review"][0] )
print clean_review

stuff going moment mj started listening music watching odd documentary watched wiz watched moonwalker maybe want get certain insight guy thought really cool eighties maybe make mind whether guilty innocent moonwalker part biography part feature film remember going see cinema originally released subtle messages mj feeling towards press also obvious message drugs bad kay visually impressive course michael jackson unless remotely like mj anyway going hate find boring may call mj egotist consenting making movie mj fans would say made fans true really nice actual feature film bit finally starts minutes excluding smooth criminal sequence joe pesci convincing psychopathic powerful drug lord wants mj dead bad beyond mj overheard plans nah joe pesci character ranted wanted people know supplying drugs etc dunno maybe hates mj music lots cool things like mj turning car robot whole speed demon sequence also director must patience saint came filming kiddy bad sequence usually directors hate working

Can also do Stemming and Lemmatizing. But I haven't done it here.

In [27]:
print "Cleaning and parsing the training set movie reviews...\n"
clean_train_reviews = []
for i in xrange( 0, num_reviews ):
    # If the index is evenly divisible by 1000, print a message
    if( (i+1)%5000 == 0 ):
        print "Review %d of %d\n" % ( i+1, num_reviews )                                                                    
    clean_train_reviews.append( review_to_words( train["review"][i] ))

Cleaning and parsing the training set movie reviews...

Review 5000 of 25000

Review 10000 of 25000

Review 15000 of 25000

Review 20000 of 25000

Review 25000 of 25000



In [47]:
from sklearn.model_selection import train_test_split
X = clean_train_reviews
y = train["sentiment"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [50]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the "CountVectorizer" object, which is scikit-learn's
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 5000) 

train_data_features = vectorizer.fit_transform(X_train)
train_data_features = train_data_features.toarray()

In [51]:
train_data_features.shape

(16750, 5000)

In [52]:
vocab = vectorizer.get_feature_names()
print vocab[:20]

[u'abandoned', u'abc', u'abilities', u'ability', u'able', u'abraham', u'abrupt', u'absence', u'absolute', u'absolutely', u'absorbed', u'absurd', u'abuse', u'abusive', u'abysmal', u'academy', u'accent', u'accents', u'accept', u'acceptable']


### print the counts of each word in the vocabulary

In [53]:
import numpy as np
dist = np.sum(train_data_features, axis=0)
for tag, count in zip(vocab, dist):
    print count, tag

133 abandoned
86 abc
61 abilities
294 ability
849 able
62 abraham
53 abrupt
71 absence
237 absolute
970 absolutely
55 absorbed
198 absurd
134 abuse
63 abusive
72 abysmal
184 academy
325 accent
141 accents
205 accept
84 acceptable
94 accepted
62 access
209 accident
120 accidentally
56 acclaimed
64 accompanied
56 accomplish
80 accomplished
192 according
128 account
59 accuracy
171 accurate
88 accused
121 achieve
91 achieved
81 achievement
56 acid
655 across
799 act
421 acted
4341 acting
2282 action
212 actions
55 activities
1622 actor
3030 actors
794 actress
239 actresses
259 acts
539 actual
2860 actually
91 ad
196 adam
60 adams
311 adaptation
53 adaptations
110 adapted
543 add
269 added
60 addicted
106 adding
237 addition
53 additional
213 adds
80 adequate
81 admire
421 admit
84 admittedly
64 adorable
348 adult
256 adults
74 advance
95 advantage
329 adventure
131 adventures
58 advertising
175 advice
53 advise
239 affair
67 affect
79 affected
70 afford
92 aforementioned
234 afraid
129 af

## Random Forest

In [54]:
print "Training the random forest..."
from sklearn.ensemble import RandomForestClassifier

# Random Forest classifier with 100 trees
forest = RandomForestClassifier(n_estimators = 100) 
forest = forest.fit( train_data_features, y_train)

Training the random forest...


### very important don't fit the test data to the model. Only do the Transform

In [55]:
# use only transform and not fit_transform
test_data_features = vectorizer.transform(X_test)
test_data_features = test_data_features.toarray()
test_data_features.shape

(8250, 5000)

In [56]:
result = forest.predict(test_data_features)

In [63]:
from sklearn import metrics
print ('{0:2.4f}%'.format(metrics.accuracy_score(result,y_test)*100))

84.4364%
