# Use different NLP methods to predict the sentiment of movie reviews
https://www.kaggle.com/c/word2vec-nlp-tutorial

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
pd.set_option('display.max_colwidth', 100)
from bs4 import BeautifulSoup  
import re
import nltk
#nltk.download() 
from nltk.corpus import stopwords # Import the stop word list
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import gensim
from gensim.models import word2vec
import logging

In [None]:
# Load the three tab-separated data files; the first row of each file holds the column names.
# quoting=3 is the value of csv.QUOTE_NONE, so quote characters inside the
# reviews are read as ordinary text rather than as field delimiters.
tsv_options = {'header': 0, 'delimiter': "\t", 'quoting': 3}
train = pd.read_csv('labeledTrainData.tsv', **tsv_options)
test = pd.read_csv('testData.tsv', **tsv_options)
unlabeled_train = pd.read_csv('unlabeledTrainData.tsv', **tsv_options)

## Data Cleaning and Text Preprocessing for Bag of Words

Removing HTML Markup and dealing with Punctuation, Numbers and Stopwords

In [None]:
# Worked example on the first training review: the cleaning steps are spelled
# out one at a time so that each intermediate value can be inspected.
#
# Example BeautifulSoup object to remove HTML tags and markup
example1 = BeautifulSoup(train['review'][0], 'lxml')
example1_text = example1.get_text()
# Use a regular expression to do a find-and-replace for punctuation and numbers.
# [] indicates group membership and ^ means "not". In other words, the re.sub() statement says, "Find anything that is NOT a lowercase letter (a-z) or an upper case letter (A-Z), and replace it with a space."
letters_only = re.sub("[^a-zA-Z]",     # The pattern to search for
                      " ",             # The replacement text
                      example1_text)   # The text to search
lower_case = letters_only.lower()      # Convert to lower case
words = lower_case.split()             # Split into words
# Remove stop words from "words".
# The stop-word list is loaded once and turned into a set: calling
# stopwords.words() inside the comprehension would reload the list for every
# single word and then search it linearly.
english_stops = set(stopwords.words("english"))
words = [w for w in words if w not in english_stops]

In [None]:
def review_to_words(raw_review):
    """Turn one raw movie review into a cleaned, space-separated string of words.

    The review is stripped of HTML markup, every character that is not an
    ASCII letter is replaced by a space, the text is lower-cased and split on
    whitespace, and NLTK's English stop words are dropped.

    Parameters
    ----------
    raw_review : str
        A single raw movie review, possibly containing HTML.

    Returns
    -------
    str
        The remaining words joined by single spaces (empty string when no
        word survives the cleaning).
    """
    # Strip HTML tags and markup, keeping only the text content
    plain_text = BeautifulSoup(raw_review, 'lxml').get_text()

    # Blank out everything that is not a letter, then normalise case and tokenize
    tokens = re.sub("[^a-zA-Z]", " ", plain_text).lower().split()

    # Membership tests against a set are much faster than against a list
    stop_set = set(stopwords.words("english"))

    kept_tokens = filter(lambda token: token not in stop_set, tokens)
    return " ".join(kept_tokens)

In [None]:
# Clean every training review and store the result in a new column next to the raw text
train['review_preprocessed'] = train['review'].apply(review_to_words)
train.head()

### Convert text to a bag-of-words representation

In [None]:
# Set up scikit-learn's bag-of-words tool. The reviews are already cleaned,
# so no extra tokenizer, preprocessor or stop-word list is configured here;
# max_features keeps only the 5000 most frequent words as features.
vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)
# fit_transform() first learns the vocabulary and then turns the training
# reviews into feature vectors; it expects a list of strings. The result is
# converted to a dense numpy array, which is easier to work with.
preprocessed_train_reviews = train['review_preprocessed'].tolist()
train_data_features = vectorizer.fit_transform(preprocessed_train_reviews).toarray()
train_data_features.shape

In [None]:
# The words in the vocabulary.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use the new method when it exists and
# fall back to the old one on older installations. list() keeps vocab a
# plain Python list in both cases.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()

# Sum up the counts of each vocabulary word (column-wise over all reviews)
dist = np.sum(train_data_features, axis=0)

# For each pair (word in the vocabulary, count of the vocabulary word)
# print the number of times the word appears in the training set and the word itself
for tag, count in zip(vocab, dist):
    print(count, tag)

## Train a random forest model and predict the sentiment with a bag of words representation

In [None]:
# Initialize a Random Forest classifier with 100 trees and fit it to the
# training set, using the bag of words as features and the sentiment labels
# as the response variable.
# A random forest is stochastic: fixing random_state makes the fitted model
# (and therefore the predictions written later) identical on every
# Restart & Run All.
RANDOM_SEED = 42
forest = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED).fit(
    X=train_data_features,
    y=train['sentiment'],
)

In [None]:
# Clean every review of the test set in the same way as the training reviews
test['review_preprocessed'] = test['review'].apply(review_to_words)
# Only transform() is called here (no fitting), so the test reviews are
# encoded with the vocabulary already held by the vectorizer; the result is
# converted to a dense numpy array.
preprocessed_test_reviews = test['review_preprocessed'].tolist()
test_data_features = vectorizer.transform(preprocessed_test_reviews).toarray()

In [None]:
# Predict one sentiment label per test review with the fitted random forest
result = forest.predict(X=test_data_features)

In [None]:
# Assemble the submission table: the "id" column of the test set next to the
# predicted "sentiment" labels
submission_columns = {"id": test["id"], "sentiment": result}
output = pd.DataFrame(data=submission_columns)

# Write the table as a comma-separated file without the index column;
# quoting=3 is the value of csv.QUOTE_NONE, so pandas adds no quote characters
output.to_csv('Bag_of_Words_model.csv', index=False, quoting=3)

# Data Cleaning and Text Preprocessing for Word2Vec

Word2Vec expects single sentences, each one as a list of words. In other words, the input format is a list of lists. For this reason, we'll use NLTK's punkt tokenizer for sentence splitting.

To train Word2Vec it is better not to remove stop words because the algorithm relies on the broader context of the sentence in order to produce high-quality word vectors. For this reason, we will make stop word removal optional in the function below. 

In [None]:
def review_to_sentenceslist(review, tokenizer, remove_stopwords=False):
    """Split a review into sentences and turn each sentence into a list of words.

    Every sentence is stripped of HTML markup, every character that is not an
    ASCII letter is replaced by a space, and the text is lower-cased and split
    on whitespace.

    Parameters
    ----------
    review : str
        Raw review text, possibly containing HTML.
    tokenizer : object
        Sentence splitter exposing ``tokenize(text)`` that returns the
        sentences as strings (the notebook passes NLTK's punkt tokenizer).
    remove_stopwords : bool, optional
        When True, NLTK's English stop words are removed from every sentence.
        Defaults to False.

    Returns
    -------
    list of list of str
        One word list per non-empty raw sentence, in order. A word list can
        itself be empty when a sentence contains no letters (or only stop
        words when ``remove_stopwords`` is True).
    """
    # Load the stop words once per call rather than once per sentence; the
    # set is only needed (and only built) when stop-word removal is requested.
    stops = set(stopwords.words("english")) if remove_stopwords else None

    sentences = []
    # 1. Use the tokenizer to split the paragraph into sentences
    for raw_sentence in tokenizer.tokenize(review.strip()):
        # 2. Skip empty sentences
        if not raw_sentence:
            continue
        # Remove HTML
        sentence_text = BeautifulSoup(raw_sentence, 'lxml').get_text()
        # Remove non-letters
        sentence_text = re.sub("[^a-zA-Z]", " ", sentence_text)
        # Convert words to lower case and split them
        words = sentence_text.lower().split()
        # Optionally remove stop words (off by default)
        if stops is not None:
            words = [w for w in words if w not in stops]
        sentences.append(words)

    # 3. Return the list of sentences (each sentence is a list of words,
    #    so this is a list of lists)
    return sentences

In [None]:
# Load the punkt tokenizer for sentence splitting
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')


def _review_to_sentences(review):
    """Split one review into word lists, one per sentence, keeping stop words."""
    return review_to_sentenceslist(review, tokenizer=tokenizer, remove_stopwords=False)


# Apply the helper to every review of both training sets; each cell of the new
# 'sentencelist' column holds a list of sentences, each sentence being a list of words
print('parse unlabeld train sentences')
unlabeled_train['sentencelist'] = unlabeled_train['review'].apply(_review_to_sentences)
print('parse labeld train sentences')
train['sentencelist'] = train['review'].apply(_review_to_sentences)

#### we need to concatenate the list of lists

`append` adds a single element to a list, whereas `extend` concatenates the list with another list (or any other iterable, not necessarily a list).


* append adds its argument as a single element to the end of a list. The length of the list itself will increase by one.

x = [1, 2, 3]  
x.append([4, 5])  
print (x)  

result: [1, 2, 3, [4, 5]]

* extend iterates over its argument, adding each element to the list and thereby extending it. The length of the list will increase by however many elements were in the iterable argument.

x = [1, 2, 3]  
x.extend([4, 5])  
print (x)  

result: [1, 2, 3, 4, 5]

In [None]:
# Flatten the per-review sentence lists of both training sets into one long
# list of sentences (each sentence is a list of words), labeled reviews first
sentences = []

print('extend sentences list with labeled training set sentences')
sentences += [
    sentence
    for review_sentences in train['sentencelist']
    for sentence in review_sentences
]

print('extend sentences list with unlabeled training set sentences')
sentences += [
    sentence
    for review_sentences in unlabeled_train['sentencelist']
    for sentence in review_sentences
]

print('number of sentences:', len(sentences))

## Train and save Word2Vec model

In [None]:
# Show INFO-level log messages so the progress of the Word2Vec training is visible
logging.basicConfig(level=logging.INFO, format='%(asctime)s : %(levelname)s : %(message)s')

# Training settings, mirrored in the model name below:
# 300-dimensional vectors, words seen fewer than 40 times are ignored,
# context window of 10 words, down-sampling threshold 1e-3, 4 worker threads.
# NOTE(review): gensim 4 renamed `size` to `vector_size` and deprecated
# init_sims(); this cell is written against the older gensim API -- confirm
# the installed version before running.
word2vec_params = dict(workers=4, size=300, min_count=40, window=10, sample=1e-3)

print('train Word2Vec model...')
word2vec_model = word2vec.Word2Vec(sentences=sentences, **word2vec_params)

# The model is not trained any further, so let init_sims() replace the
# vectors in place, which makes the model much more memory-efficient
word2vec_model.init_sims(replace=True)

# Save the model under a descriptive name for later use;
# it can be loaded again with Word2Vec.load()
model_name = '300features_40minwords_10context_word2vec_model'
word2vec_model.save(model_name)

In [None]:
# Words whose vectors are closest to 'awful'. The query goes through the
# model's KeyedVectors (.wv): calling most_similar() directly on the Word2Vec
# model has been deprecated since gensim 1.0 and was removed in gensim 4.
word2vec_model.wv.most_similar('awful')