# Preprocessing data

In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

# Load the labeled training set from a tab-separated file. header=0 takes the
# column names from the first line; quoting=3 is csv.QUOTE_NONE, so quote
# characters are kept as ordinary text instead of being treated as field
# quoting.
train = pd.read_csv("data/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)

# Cache for the stop-word set so it is loaded from NLTK only once instead of
# once per review.
_STOPWORD_CACHE = {}


def _english_stopwords():
    # Return the NLTK English stop words as a frozenset, loading them lazily.
    # A set is used because membership tests are much faster than on a list.
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def review_to_words(raw_review):
    """Convert a raw movie review into a cleaned, space-separated string.

    Steps, in order:
      1. Strip HTML markup with BeautifulSoup.
      2. Replace every character that is not an ASCII letter with a space.
      3. Lower-case the text and split it on whitespace.
      4. Drop English stop words (NLTK list).
      5. Join the remaining words with single spaces.

    Args:
        raw_review: a single string holding one raw review.

    Returns:
        A single string of the remaining lower-case words; empty if no
        words survive the filtering.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so results could differ
    # between machines.
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()

    # Digits, punctuation and any non-ASCII characters all become spaces.
    letters_only = re.sub(r"[^a-zA-Z]", " ", review_text)

    words = letters_only.lower().split()

    stops = _english_stopwords()
    return " ".join(w for w in words if w not in stops)

# Get the number of reviews based on the dataframe column size
num_reviews = train["review"].size

# Clean every review, in order. Iterating over the column directly avoids the
# Python-2-only xrange() and the label-based lookup train["review"][i], which
# only works while the index is exactly 0..num_reviews-1.
clean_train_reviews = [review_to_words(review) for review in train["review"]]

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool. The reviews were already cleaned by review_to_words(),
# so no extra tokenizer, preprocessor or stop-word list is requested here;
# max_features caps the size of the learned vocabulary at 5000 terms.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)

# fit_transform() does two things: first, it fits the model and learns the
# vocabulary; second, it transforms the training data into feature vectors.
# The input to fit_transform should be a list of strings.
train_data_features = vectorizer.fit_transform(clean_train_reviews)

# Numpy arrays are easy to work with, so convert the result to a dense array.
# NOTE(review): the dense copy can be large for big corpora; the result of
# fit_transform() could be kept as-is if memory becomes a problem.
train_data_features = train_data_features.toarray()

# Train-test split procedure

In [None]:
# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.cross_validation module was removed in 0.20. Fall back to
# the old location only on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Features are the bag-of-words counts; the target is the "sentiment" column.
X = train_data_features
y = train["sentiment"]

# Hold out part of the data for evaluation. The split proportions are left at
# the library default; random_state=3 fixes the shuffle so the split is
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=3)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score as AUC

# Fit a random forest with 100 trees on the training split. No random_state
# is passed, so the scores vary slightly from run to run.
forest = RandomForestClassifier(n_estimators=100)
forest = forest.fit(X_train, y_train)

# Hard class predictions feed the accuracy score; class probabilities feed
# the ROC AUC.
y_pred = forest.predict(X_test)
prob = forest.predict_proba(X_test)

# Column 1 of predict_proba is the probability of the second class label
# (presumably sentiment == 1 -- confirm against the data).
auc = AUC(y_test, prob[:, 1])
acc = accuracy_score(y_test, y_pred)

# Printing one pre-formatted string works on both Python 2 and Python 3,
# unlike the Python-2-only "print a, b" statement.
print("%s %s" % (auc, acc))  # recorded from an earlier run: 0.917725832785 0.84112

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score as AUC

# Fit a multinomial naive Bayes classifier on the training split.
# NOTE(review): the variable is called "knn" although the model is
# MultinomialNB, not k-nearest neighbours; the name is kept so that any other
# cell referring to it keeps working.
knn = MultinomialNB()
knn = knn.fit(X_train, y_train)

# Hard class predictions feed the accuracy score; class probabilities feed
# the ROC AUC.
y_pred = knn.predict(X_test)
prob = knn.predict_proba(X_test)

# Column 1 of predict_proba is the probability of the second class label
# (presumably sentiment == 1 -- confirm against the data).
auc = AUC(y_test, prob[:, 1])
acc = accuracy_score(y_test, y_pred)

# Printing one pre-formatted string works on both Python 2 and Python 3,
# unlike the Python-2-only "print a, b" statement.
print("%s %s" % (auc, acc))  # recorded from an earlier run: 0.913334167559 0.84896

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score as AUC
from sklearn.metrics import classification_report

# Fit a logistic regression classifier with default settings on the
# training split.
lr = LogisticRegression()
lr = lr.fit(X_train, y_train)

# Hard class predictions feed the accuracy score and the report; class
# probabilities feed the ROC AUC.
y_pred = lr.predict(X_test)
prob = lr.predict_proba(X_test)

# Column 1 of predict_proba is the probability of the second class label
# (presumably sentiment == 1 -- confirm against the data).
auc = AUC(y_test, prob[:, 1])
acc = accuracy_score(y_test, y_pred)

# Printing one pre-formatted string works on both Python 2 and Python 3,
# unlike the Python-2-only "print a, b" statement.
print("%s %s" % (auc, acc))  # recorded from an earlier run: 0.923799636764 0.85792

# Per-class precision/recall/F1. target_names are applied in sorted label
# order, so this assumes the lower label means "negative" and the higher one
# "positive" -- TODO confirm against the dataset.
print(classification_report(y_test, y_pred, target_names=["negative", "positive"]))