In [96]:
#imports
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
import nltk
#nltk.download()
from nltk.corpus import stopwords # Import the stop word list
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

lemmatizer=WordNetLemmatizer()

<h2>I. Reading Data</h2>

In [74]:

#read the training file such that first line is header, file is tab delimited, ignore the double quotes
train=pd.read_csv(r"C:\Users\Priyanka\Documents\Priya_Documents\IU_Data_Science\NLP_IMDB_review\Data\labeledTrainData.tsv",\
                 header=0,delimiter="\t",quoting=3)

#read the test file such that first line is header, file is tab delimited, ignore the double quotes
test=pd.read_csv(r"C:\Users\Priyanka\Documents\Priya_Documents\IU_Data_Science\NLP_IMDB_review\Data\testData.tsv",\
                 header=0,delimiter="\t",quoting=3)


In [17]:
#quick look at the data
print(train.shape)
print(train.columns.values)
print(train['id'][0])
print(train['sentiment'][0])
print(train['review'][0])

(25000, 3)
['id' 'sentiment' 'review']
"5814_8"
1
"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br

Clearly the above text needs cleaning. There are html tags that needs to be removed etc. etc.
<h2>II. Cleaning and Pre-processing the data</h2>
<ol>1. Removing HTML markup using the Beautiful Soup package</ol>
<ol>2. Dealing with punctuation, numbers etc. using Regular Expression. To start, we will only keep the letters and space, but enhance it further by keeping emoticons like :) etc.</ol>
<ol>3. Convert to lowercase</ol>
<ol>4. Tokenize to words using NLTK</ol>
<ol>5. Lemmatize using NLTK</ol>
<ol>6. Remove stop words using NLTK</ol>

In [68]:
def data_cleanup( raw_text ):
    # Function to convert a raw review to a string of words
    # The input is a single string (a raw movie review), and 
    # the output is a single string (a preprocessed movie review)
    #
    # 1. Remove HTML
    text = BeautifulSoup(raw_text).get_text() 
    #
    # 2. Remove non-letters        
    text = re.sub("[^a-zA-Z]", " ", text) 
    #
    # 3. Convert to lower case
    words = text.lower()   
    #
    # 4. Tokenize
    words=word_tokenize(words)
    #
    # 5. Lemmatize
    words = [lemmatizer.lemmatize(w) for w in words]
    #
    # 6. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))                  
    # 
    # 7. Remove stop words
    words = [w for w in words if not w in stops]   
    #
    # 8. Join the words back into one string separated by space, 
    # and return the result.
    return( " ".join( words )) 

In [69]:
#Text the data_cleanup function with a review
clean_review = data_cleanup( train["review"][0] )
print (clean_review)



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


stuff going moment mj started listening music watching odd documentary watched wiz watched moonwalker maybe want get certain insight guy thought wa really cool eighty maybe make mind whether guilty innocent moonwalker part biography part feature film remember going see cinema wa originally released ha subtle message mj feeling towards press also obvious message drug bad kay visually impressive course michael jackson unless remotely like mj anyway going hate find boring may call mj egotist consenting making movie mj fan would say made fan true really nice actual feature film bit finally start minute excluding smooth criminal sequence joe pesci convincing psychopathic powerful drug lord want mj dead bad beyond mj overheard plan nah joe pesci character ranted wanted people know supplying drug etc dunno maybe hate mj music lot cool thing like mj turning car robot whole speed demon sequence also director must patience saint came filming kiddy bad sequence usually director hate working one k

<h3>Training Data Claeanup </h3>

In [73]:
# Get the number of reviews based on the dataframe column size
num_reviews = train["review"].size

# Loop over each review; create an index i that goes from 0 to the length
# of the movie review list 
print ("Cleaning and parsing the training set movie reviews...\n")

# Initialize an empty list to hold the clean reviews
clean_train_reviews = []
for i in range( 0, num_reviews ):
    # If the index is evenly divisible by 1000, print a message
    if( (i+1)%1000 == 0 ):
        print ("Review %d of %d\n" % ( i+1, num_reviews ) )
    # Call our function for each one, and add the result to the list of
    # clean reviews
    clean_train_reviews.append( data_cleanup( train["review"][i] ))

Cleaning and parsing the training set movie reviews...





 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


Review 1000 of 25000

Review 2000 of 25000

Review 3000 of 25000

Review 4000 of 25000

Review 5000 of 25000

Review 6000 of 25000

Review 7000 of 25000

Review 8000 of 25000

Review 9000 of 25000

Review 10000 of 25000

Review 11000 of 25000

Review 12000 of 25000

Review 13000 of 25000

Review 14000 of 25000

Review 15000 of 25000

Review 16000 of 25000

Review 17000 of 25000

Review 18000 of 25000

Review 19000 of 25000

Review 20000 of 25000

Review 21000 of 25000

Review 22000 of 25000

Review 23000 of 25000

Review 24000 of 25000

Review 25000 of 25000



<h3>Test Data Claeanup </h3>

In [75]:
# Get the number of reviews based on the dataframe column size
num_reviews_test = test["review"].size

# Loop over each review; create an index i that goes from 0 to the length
# of the movie review list 
print ("Cleaning and parsing the test data movie reviews...\n")

# Initialize an empty list to hold the clean reviews
clean_test_reviews = []
for i in range( 0, num_reviews_test ):
    # If the index is evenly divisible by 1000, print a message
    if( (i+1)%1000 == 0 ):
        print ("Review %d of %d\n" % ( i+1, num_reviews_test ) )
    # Call our function for each one, and add the result to the list of
    # clean reviews
    clean_test_reviews.append( data_cleanup( test["review"][i] ))

Cleaning and parsing the test data movie reviews...





 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


Review 1000 of 25000

Review 2000 of 25000

Review 3000 of 25000

Review 4000 of 25000

Review 5000 of 25000

Review 6000 of 25000

Review 7000 of 25000

Review 8000 of 25000

Review 9000 of 25000

Review 10000 of 25000

Review 11000 of 25000

Review 12000 of 25000

Review 13000 of 25000

Review 14000 of 25000

Review 15000 of 25000

Review 16000 of 25000

Review 17000 of 25000

Review 18000 of 25000

Review 19000 of 25000

Review 20000 of 25000

Review 21000 of 25000

Review 22000 of 25000

Review 23000 of 25000

Review 24000 of 25000

Review 25000 of 25000



<h2>III. Creating features from bag of words (unigram) using scikit-learn countVectorizer</h2>
<br>
So far we have our dataset all tidied up. Next, we need some kind of numeric representation of the data. We will use the bag of words approach to create a sparse vector with count of words. In the IMDB data, we have a very large number of reviews, which will give us a large vocabulary. To limit the size of the feature vectors, we should choose some maximum vocabulary size. Below, we use the 5000 most frequent words (remembering that stop words have already been removed).

We'll be using the feature_extraction module from scikit-learn to create bag-of-words features.

In [90]:
print("creating the bag of words....\n")
from sklearn.feature_extraction.text import CountVectorizer


#initialize the CountVectorizer (scikit learn's bag-of-words tool)
vectorizer = CountVectorizer(analyzer="word", tokenizer=None, preprocessor=None, 
                            stop_words=None, max_features=5000)
#fit_transform will fit the model to learn the vocabulary with the data and then transform into feature vector...
train_data_features=vectorizer.fit_transform(clean_train_reviews)
#transform the test data...
test_data_features=vectorizer.transform(clean_test_reviews)

#convert to numpy arrays
train_data_features=train_data_features.toarray()
test_data_features=test_data_features.toarray()
                                        
#print the shape
print(train_data_features.shape)
print(train_data_features[0])

creating the bag of words....

(25000, 5000)
[0 0 0 ..., 0 0 0]


In [88]:
#A quick look at the features...
vocab=vectorizer.get_feature_names()
#print(vocab)

<h2>III. Classification Algorithm -scikit learn</h2>


In [100]:
# k-fold cross validation...
from sklearn.cross_validation import KFold, cross_val_score
# random forest classifier
from sklearn.ensemble import RandomForestClassifier

forest=RandomForestClassifier(n_estimators = 100)

scores=cross_val_score(estimator=forest,
                 X=train_data_features,
                 y=train["sentiment"],
                 cv=10)

print('CV scores:\n',scores)

print([np.mean(scores),np.std(scores)])

CV scores:
 [ 0.838   0.8424  0.8652  0.8372  0.836   0.8288  0.8332  0.8548  0.8312
  0.8492]
0.8416 0.0109354469502


In [102]:
#The score with random forest on training / cross-validation data
print("The Random Forest Classifier Score with 10 fold cross validation")
print([np.mean(scores),np.std(scores)])

The Random Forest Classifier Score with 10 fold cross validation
[0.84160000000000001, 0.010935446950170792]


In [106]:
# Logistic regression claasifier with 10 fold cross validation
from sklearn.linear_model import LogisticRegression

lr=LogisticRegression(random_state=0)

scoresLR=cross_val_score(estimator=lr,
                         X=train_data_features,
                         y=train["sentiment"],
                         cv=10)

print('CV scores:\n',scoresLR)

#The score with Logistic Regression on training / cross-validation data
print("The Logistic Regression Classifier Score with 10 fold cross validation")
print([np.mean(scoresLR),np.std(scoresLR)])


CV scores:
 [ 0.8624  0.866   0.8756  0.8564  0.8392  0.8524  0.864   0.8624  0.8596
  0.8604]
The Logistic Regression Classifier Score with 10 fold cross validation
[0.85984000000000016, 0.0090016887304549854]


In [108]:
# Up until this point, we have trained the classifier on 90 percent of the data...
#We will now need to fit the classifiers on the entire datasets...This step is important as we do not 
#want to waste any training data.
forest.fit(train_data_features,train["sentiment"].values)
lr.fit(train_data_features,train["sentiment"].values)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

<h2>Predict test data sentiment and prepare the output file for kaggle </h2

In [110]:
# Use the random forest to make sentiment label predictions
resultRF = forest.predict(test_data_features)

# Copy the results to a pandas dataframe with an "id" column and
# a "sentiment" column
outputRF = pd.DataFrame( data={"id":test["id"], "sentiment":resultRF} )

# Use pandas to write the comma-separated output file
outputRF.to_csv( r"C:\Users\Priyanka\Documents\Priya_Documents\IU_Data_Science\NLP_IMDB_review\results\Bag_of_Words_model_RF.csv", index=False, quoting=3 )

In [111]:
# Use the random forest to make sentiment label predictions
resultLR = lr.predict(test_data_features)

# Copy the results to a pandas dataframe with an "id" column and
# a "sentiment" column
outputLR = pd.DataFrame( data={"id":test["id"], "sentiment":resultLR} )

# Use pandas to write the comma-separated output file
outputLR.to_csv( r"C:\Users\Priyanka\Documents\Priya_Documents\IU_Data_Science\NLP_IMDB_review\results\Bag_of_Words_model_LR.csv", index=False, quoting=3 )