In [1]:
#Imports
import webbrowser
site = "https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-1-for-beginners-bag-of-words"
webbrowser.open_new(site)


import pandas as pd
import nltk
from nltk.corpus import stopwords

import string
import re #Regular Expressions - for removing punctuation

#For Vectorizing
from sklearn.feature_extraction.text import CountVectorizer

#For Training
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.cross_validation import train_test_split

from bs4 import BeautifulSoup 

In [2]:
dataset = pd.read_csv('BOPlabeledTrainData.tsv', sep = '\t', header = 0, quoting=3) #quoting = 3 ignores tripple quotes

dataset.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [3]:
#25,000 reviews
dataset.shape

(25000, 3)

In [4]:
dataset['sentiment'].shape

(25000L,)

In [5]:
dataset.columns.values

array(['id', 'sentiment', 'review'], dtype=object)

In [6]:
#Check out a review

dataset['review'][2]

'"The film starts with a manager (Nicholas Bell) giving welcome investors (Robert Carradine) to Primal Park . A secret project mutating a primal animal using fossilized DNA, like \xc2\xa8Jurassik Park\xc2\xa8, and some scientists resurrect one of nature\'s most fearsome predators, the Sabretooth tiger or Smilodon . Scientific ambition turns deadly, however, and when the high voltage fence is opened the creature escape and begins savagely stalking its prey - the human visitors , tourists and scientific.Meanwhile some youngsters enter in the restricted area of the security center and are attacked by a pack of large pre-historical animals which are deadlier and bigger . In addition , a security agent (Stacy Haiduk) and her mate (Brian Wimmer) fight hardly against the carnivorous Smilodons. The Sabretooths, themselves , of course, are the real star stars and they are astounding terrifyingly though not convincing. The giant animals savagely are stalking its prey and the group run afoul and 

In [7]:
#Need to remove HTML
#Instantiate a soup object. Identifies a structure in the html
example1 = BeautifulSoup(dataset['review'][2])



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


In [8]:
#Calling get_text() gives you the text of the review, without tags or markup

no_html = example1.get_text()
no_html

u'"The film starts with a manager (Nicholas Bell) giving welcome investors (Robert Carradine) to Primal Park . A secret project mutating a primal animal using fossilized DNA, like \xa8Jurassik Park\xa8, and some scientists resurrect one of nature\'s most fearsome predators, the Sabretooth tiger or Smilodon . Scientific ambition turns deadly, however, and when the high voltage fence is opened the creature escape and begins savagely stalking its prey - the human visitors , tourists and scientific.Meanwhile some youngsters enter in the restricted area of the security center and are attacked by a pack of large pre-historical animals which are deadlier and bigger . In addition , a security agent (Stacy Haiduk) and her mate (Brian Wimmer) fight hardly against the carnivorous Smilodons. The Sabretooths, themselves , of course, are the real star stars and they are astounding terrifyingly though not convincing. The giant animals savagely are stalking its prey and the group run afoul and fight a

In [9]:
#Example to take out punctuation using re.sub
#[a-zA-Z] are letters
#[^a-zA-Z] are non-letters

no_punc = re.sub("[^a-zA-Z]", #The pattern to search for - non-letters
                " ", # The pattern to replace it with
                no_html) #The text to use
print no_punc

 The film starts with a manager  Nicholas Bell  giving welcome investors  Robert Carradine  to Primal Park   A secret project mutating a primal animal using fossilized DNA  like  Jurassik Park   and some scientists resurrect one of nature s most fearsome predators  the Sabretooth tiger or Smilodon   Scientific ambition turns deadly  however  and when the high voltage fence is opened the creature escape and begins savagely stalking its prey   the human visitors   tourists and scientific Meanwhile some youngsters enter in the restricted area of the security center and are attacked by a pack of large pre historical animals which are deadlier and bigger   In addition   a security agent  Stacy Haiduk  and her mate  Brian Wimmer  fight hardly against the carnivorous Smilodons  The Sabretooths  themselves   of course  are the real star stars and they are astounding terrifyingly though not convincing  The giant animals savagely are stalking its prey and the group run afoul and fight against on

In [6]:
#The method above takes out all non-letters where the one below only takes out punctuation

#Or
no_punc2 = [char for char in no_html if char not in string.punctuation]
no_punc2 = "".join(no_punc2)
no_punc2

u'The film starts with a manager Nicholas Bell giving welcome investors Robert Carradine to Primal Park  A secret project mutating a primal animal using fossilized DNA like \xa8Jurassik Park\xa8 and some scientists resurrect one of natures most fearsome predators the Sabretooth tiger or Smilodon  Scientific ambition turns deadly however and when the high voltage fence is opened the creature escape and begins savagely stalking its prey  the human visitors  tourists and scientificMeanwhile some youngsters enter in the restricted area of the security center and are attacked by a pack of large prehistorical animals which are deadlier and bigger  In addition  a security agent Stacy Haiduk and her mate Brian Wimmer fight hardly against the carnivorous Smilodons The Sabretooths themselves  of course are the real star stars and they are astounding terrifyingly though not convincing The giant animals savagely are stalking its prey and the group run afoul and fight against one natures most fears

In [10]:
#Function to loop through to process text
def review_to_words(raw_review):
    #Input is a single (raw) string and output is a single (processed) string
    
    #Step 1: Remove HTML
    html_gone = BeautifulSoup(raw_review).get_text()
    
    #Step 2: Remove punctuation (not always the best but we will do it here)
    punc_gone = re.sub("[^a-zA-Z]", " ", html_gone)
    
    #Step 3: Put in lowercase and split into individual words
    words = punc_gone.lower().split()
    
    #Step 4: Turn stopwords to a set rather than a list. This is done for speed. Set always faster than list
    stops = set(stopwords.words('english'))
    
    #Step 5: Remove Stopwords
    clean_text = [w for w in words if w not in stops]
    
    #Step 6: Join words back into one string seperated by space and return result
    return(" ".join(clean_text))
    


In [11]:
#Get number of reviews
num_reviews = dataset['review'].size
num_reviews

25000

In [12]:
#Initialize an empty list to hold reviews
clean_train_reviews = []

In [13]:
#Create a loop to clean all reviews

#Loop over each review: 
for i in xrange(0, num_reviews):
    #To see progress
    if ((i+1)%1000 == 0):
        print "Review {} of {}".format(i+1, num_reviews)
    clean_train_reviews.append(review_to_words(dataset['review'][i]))

Review 1000 of 25000
Review 2000 of 25000
Review 3000 of 25000
Review 4000 of 25000
Review 5000 of 25000
Review 6000 of 25000
Review 7000 of 25000
Review 8000 of 25000
Review 9000 of 25000
Review 10000 of 25000
Review 11000 of 25000
Review 12000 of 25000
Review 13000 of 25000
Review 14000 of 25000
Review 15000 of 25000
Review 16000 of 25000
Review 17000 of 25000
Review 18000 of 25000
Review 19000 of 25000
Review 20000 of 25000
Review 21000 of 25000
Review 22000 of 25000
Review 23000 of 25000
Review 24000 of 25000
Review 25000 of 25000


In [14]:
# Time to Vectorize (once data has been cleaned and stopwords removed)
vectorizer = CountVectorizer(analyzer = "word", max_features = 5000)

# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of 
# strings.

train_data_features = vectorizer.fit_transform(clean_train_reviews)

#Then convert to array
train_data_features = train_data_features.toarray()

In [15]:
print train_data_features.shape

(25000L, 5000L)


In [16]:
#Look at vocabulary
#Each word is a new feature. This is important to understand
vocab = vectorizer.get_feature_names()


In [17]:
#Will train using Random Forest

forest = RandomForestClassifier(n_estimators = 100)

forest.fit(train_data_features, dataset['sentiment'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [18]:
#Bring in the testing data
#Notice that this does not have sentiments. Maybe you have to make a submission to see results?
#Try splitting traing data


testing_data = pd.read_csv("BOPtestData.tsv", sep = '\t', quoting = 3)
testing_data.head()

Unnamed: 0,id,review
0,"""12311_10""","""Naturally in a film who's main themes are of ..."
1,"""8348_2""","""This movie is a disaster within a disaster fi..."
2,"""5828_4""","""All in all, this is a movie for kids. We saw ..."
3,"""7186_2""","""Afraid of the Dark left me with the impressio..."
4,"""12128_7""","""A very accurate depiction of small time mob l..."


In [19]:
print testing_data.shape

(25000, 2)


In [24]:
#Clean up all testing data
num_test_reviews = len(testing_data['review'])
clean_test_reviews = []


In [27]:
print "Cleaning and parsing the test set movie reviews...\n"
for i in xrange(0,num_reviews):
    if( (i+1) % 1000 == 0 ):
        print "Review {} of {}".format(i+1, num_test_reviews)
    clean_review = review_to_words( testing_data["review"][i] )
    clean_test_reviews.append( clean_test_reviews )

Cleaning and parsing the test set movie reviews...

Review 1000 of 25000
Review 2000 of 25000
Review 3000 of 25000
Review 4000 of 25000
Review 5000 of 25000
Review 6000 of 25000
Review 7000 of 25000
Review 8000 of 25000
Review 9000 of 25000
Review 10000 of 25000
Review 11000 of 25000
Review 12000 of 25000
Review 13000 of 25000
Review 14000 of 25000
Review 15000 of 25000
Review 16000 of 25000
Review 17000 of 25000
Review 18000 of 25000
Review 19000 of 25000
Review 20000 of 25000
Review 21000 of 25000
Review 22000 of 25000
Review 23000 of 25000
Review 24000 of 25000
Review 25000 of 25000


In [29]:
test_data_features = vectorizer.fit_transform(clean_test_reviews)
#Then convert to array
test_data_features = test_data_features.toarray()

AttributeError: 'list' object has no attribute 'lower'

In [15]:
print "Check me out in {}, and {}".format('one', 'two')

Check me out in one, and two


In [10]:
review_to_words(dataset['review'][3])

u'must assumed praised film greatest filmed opera ever read somewhere either care opera care wagner care anything except desire appear cultured either representation wagner swan song movie strikes unmitigated disaster leaden reading score matched tricksy lugubrious realisation text questionable people ideas opera matter play especially one shakespeare allowed anywhere near theatre film studio syberberg fashionably without smallest justification wagner text decided parsifal bisexual integration title character latter stages transmutes kind beatnik babe though one continues sing high tenor actors film singers get double dose armin jordan conductor seen face heard voice amfortas also appears monstrously double exposure kind batonzilla conductor ate monsalvat playing good friday music way transcendant loveliness nature represented scattering shopworn flaccid crocuses stuck ill laid turf expedient baffles theatre sometimes piece imperfections thoughts think syberberg splice parsifal gurnema

In [17]:
#Test
lowerC = no_punc2.lower()
lowerC

u' the film starts with a manager  nicholas bell  giving welcome investors  robert carradine  to primal park   a secret project mutating a primal animal using fossilized dna  like  jurassik park   and some scientists resurrect one of nature s most fearsome predators  the sabretooth tiger or smilodon   scientific ambition turns deadly  however  and when the high voltage fence is opened the creature escape and begins savagely stalking its prey   the human visitors   tourists and scientific meanwhile some youngsters enter in the restricted area of the security center and are attacked by a pack of large pre historical animals which are deadlier and bigger   in addition   a security agent  stacy haiduk  and her mate  brian wimmer  fight hardly against the carnivorous smilodons  the sabretooths  themselves   of course  are the real star stars and they are astounding terrifyingly though not convincing  the giant animals savagely are stalking its prey and the group run afoul and fight against 

In [20]:
print lowerC.split()

[u'the', u'film', u'starts', u'with', u'a', u'manager', u'nicholas', u'bell', u'giving', u'welcome', u'investors', u'robert', u'carradine', u'to', u'primal', u'park', u'a', u'secret', u'project', u'mutating', u'a', u'primal', u'animal', u'using', u'fossilized', u'dna', u'like', u'jurassik', u'park', u'and', u'some', u'scientists', u'resurrect', u'one', u'of', u'nature', u's', u'most', u'fearsome', u'predators', u'the', u'sabretooth', u'tiger', u'or', u'smilodon', u'scientific', u'ambition', u'turns', u'deadly', u'however', u'and', u'when', u'the', u'high', u'voltage', u'fence', u'is', u'opened', u'the', u'creature', u'escape', u'and', u'begins', u'savagely', u'stalking', u'its', u'prey', u'the', u'human', u'visitors', u'tourists', u'and', u'scientific', u'meanwhile', u'some', u'youngsters', u'enter', u'in', u'the', u'restricted', u'area', u'of', u'the', u'security', u'center', u'and', u'are', u'attacked', u'by', u'a', u'pack', u'of', u'large', u'pre', u'historical', u'animals', u'which

In [7]:
stopwords.words('english')

[u'i',
 u'me',
 u'my',
 u'myself',
 u'we',
 u'our',
 u'ours',
 u'ourselves',
 u'you',
 u'your',
 u'yours',
 u'yourself',
 u'yourselves',
 u'he',
 u'him',
 u'his',
 u'himself',
 u'she',
 u'her',
 u'hers',
 u'herself',
 u'it',
 u'its',
 u'itself',
 u'they',
 u'them',
 u'their',
 u'theirs',
 u'themselves',
 u'what',
 u'which',
 u'who',
 u'whom',
 u'this',
 u'that',
 u'these',
 u'those',
 u'am',
 u'is',
 u'are',
 u'was',
 u'were',
 u'be',
 u'been',
 u'being',
 u'have',
 u'has',
 u'had',
 u'having',
 u'do',
 u'does',
 u'did',
 u'doing',
 u'a',
 u'an',
 u'the',
 u'and',
 u'but',
 u'if',
 u'or',
 u'because',
 u'as',
 u'until',
 u'while',
 u'of',
 u'at',
 u'by',
 u'for',
 u'with',
 u'about',
 u'against',
 u'between',
 u'into',
 u'through',
 u'during',
 u'before',
 u'after',
 u'above',
 u'below',
 u'to',
 u'from',
 u'up',
 u'down',
 u'in',
 u'out',
 u'on',
 u'off',
 u'over',
 u'under',
 u'again',
 u'further',
 u'then',
 u'once',
 u'here',
 u'there',
 u'when',
 u'where',
 u'why',
 u'how',
 u'all

In [8]:
sw = set(stopwords.words('english'))
sw

{u'a',
 u'about',
 u'above',
 u'after',
 u'again',
 u'against',
 u'ain',
 u'all',
 u'am',
 u'an',
 u'and',
 u'any',
 u'are',
 u'aren',
 u'as',
 u'at',
 u'be',
 u'because',
 u'been',
 u'before',
 u'being',
 u'below',
 u'between',
 u'both',
 u'but',
 u'by',
 u'can',
 u'couldn',
 u'd',
 u'did',
 u'didn',
 u'do',
 u'does',
 u'doesn',
 u'doing',
 u'don',
 u'down',
 u'during',
 u'each',
 u'few',
 u'for',
 u'from',
 u'further',
 u'had',
 u'hadn',
 u'has',
 u'hasn',
 u'have',
 u'haven',
 u'having',
 u'he',
 u'her',
 u'here',
 u'hers',
 u'herself',
 u'him',
 u'himself',
 u'his',
 u'how',
 u'i',
 u'if',
 u'in',
 u'into',
 u'is',
 u'isn',
 u'it',
 u'its',
 u'itself',
 u'just',
 u'll',
 u'm',
 u'ma',
 u'me',
 u'mightn',
 u'more',
 u'most',
 u'mustn',
 u'my',
 u'myself',
 u'needn',
 u'no',
 u'nor',
 u'not',
 u'now',
 u'o',
 u'of',
 u'off',
 u'on',
 u'once',
 u'only',
 u'or',
 u'other',
 u'our',
 u'ours',
 u'ourselves',
 u'out',
 u'over',
 u'own',
 u're',
 u's',
 u'same',
 u'shan',
 u'she',
 u'shoul