# Sentiment analysis of movie reviews with a bag-of-words model (Kaggle Word2Vec NLP tutorial)
https://www.kaggle.com/c/word2vec-nlp-tutorial

In [None]:
import pandas as pd
pd.set_option('display.max_colwidth', 100)
from bs4 import BeautifulSoup  
import re
import nltk
#nltk.download() 
from nltk.corpus import stopwords # Import the stop word list
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the labeled training reviews and the unlabeled test reviews.
# Both files are tab-separated with a header row; quoting=3 (csv.QUOTE_NONE)
# turns off quote handling so quotation marks in the text are kept literally.
train = pd.read_csv(
    "labeledTrainData.tsv",
    sep="\t",
    header=0,
    quoting=3,
)
test = pd.read_csv(
    "testData.tsv",
    sep="\t",
    header=0,
    quoting=3,
)

## Data Cleaning and Text Preprocessing

In [None]:
# Worked example on the first training review: the cleaning steps spelled
# out one at a time.

# Parse the review with BeautifulSoup to remove HTML tags and markup
example1 = BeautifulSoup(train['review'][0], 'lxml')
example_text = example1.get_text()

# Use a regular expression to replace punctuation and digits with spaces.
# [] indicates group membership and ^ means "not", so the pattern matches
# every character that is NOT a lowercase (a-z) or uppercase (A-Z) letter.
letters_only = re.sub(r"[^a-zA-Z]",   # The pattern to search for
                      " ",            # The text to replace it with
                      example_text)   # The text to search
lower_case = letters_only.lower()     # Convert to lower case
words = lower_case.split()            # Split into words

# Remove stop words from "words". The stop word list is loaded once into a
# set: calling stopwords.words() inside the comprehension would rebuild the
# list for every word and then scan it linearly.
english_stops = set(stopwords.words("english"))
words = [w for w in words if w not in english_stops]

In [None]:
# Matches every character that is not an ASCII letter. Compiled once because
# review_to_words() is applied to every review.
_NON_LETTERS = re.compile(r"[^a-zA-Z]")

# Lazily filled cache for the English stop words (see _english_stopwords).
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them once."""
    if "english" not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORDS_CACHE["english"]


def review_to_words(raw_review):
    """Convert a raw movie review into a string of cleaned words.

    Steps: strip HTML markup, replace every non-letter character with a
    space, lower-case the text, split it on whitespace, and drop English
    stop words.

    Args:
        raw_review: A single raw review as a string (may contain HTML).

    Returns:
        A single string with the remaining words joined by single spaces.
    """
    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review, 'lxml').get_text()

    # 2. Remove non-letters
    letters_only = _NON_LETTERS.sub(" ", review_text)

    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()

    # 4. Searching a set is much faster than searching a list, and the set
    #    is built only on the first call instead of once per review.
    stops = _english_stopwords()

    # 5. Remove stop words
    meaningful_words = [w for w in words if w not in stops]

    # 6. Join the words back into one space-separated string
    return " ".join(meaningful_words)

In [None]:
# Clean every training review and keep the result in a new column
train['review_preprocessed'] = train['review'].apply(review_to_words)
train.head()

### Convert text to a bag-of-words representation

In [None]:
# Create scikit-learn's bag-of-words tool, CountVectorizer, with a vocabulary
# capped at 5000 words. No extra tokenizer, preprocessor or stop word list is
# supplied here.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)
# fit_transform() first learns the vocabulary from the training reviews and
# then turns each review into a vector of word counts. It expects a list of
# strings. The result is converted to a NumPy array, which is easier to work
# with in the following cells.
train_data_features = vectorizer.fit_transform(
    train['review_preprocessed'].tolist()
).toarray()
train_data_features.shape

In [None]:
# Take a look at the words in the vocabulary.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
# The newer method returns a NumPy array of strings instead of a list; both
# can be iterated and zipped the same way.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = vectorizer.get_feature_names_out()
else:
    vocab = vectorizer.get_feature_names()

In [None]:
# Total number of occurrences of each vocabulary word: sum the count matrix
# down its rows (axis 0), leaving one total per vocabulary column
dist = train_data_features.sum(axis=0)

# Print every vocabulary word preceded by its total count in the training set
for count, tag in zip(dist, vocab):
    print(count, tag)

## Train the model and predict the sentiment

In [None]:
# Build a Random Forest classifier with 100 trees ...
forest = RandomForestClassifier(n_estimators=100)
# ... and fit it with the bag-of-words counts as features and the sentiment
# labels as the response variable
forest.fit(train_data_features, train["sentiment"])

In [None]:
# Clean every review in the test set the same way as the training reviews
test['review_preprocessed'] = test['review'].apply(review_to_words)
# Turn the cleaned test reviews into word-count vectors using the vocabulary
# already learned from the training set, and convert to a NumPy array
test_data_features = vectorizer.transform(
    test['review_preprocessed'].tolist()
).toarray()

In [None]:
# Predict a sentiment label for every test review with the fitted forest
result = forest.predict(X=test_data_features)

In [None]:
# Collect the predictions in a pandas DataFrame with an "id" column taken
# from the test set and a "sentiment" column holding the predicted labels
output = pd.DataFrame({"id": test["id"], "sentiment": result})

# Write the result as a comma-separated file, without the row index and
# without adding any quoting (quoting=3)
output.to_csv("Bag_of_Words_model.csv", quoting=3, index=False)