In [1]:
import os
import re
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

In [2]:
import nltk
#nltk.download()
from nltk.corpus import stopwords

### Read labeledTrainData into a dataframe for analysis

In [3]:
# Load the labeled training reviews: a tab-separated file in which a
# backslash acts as the escape character.
datafile = os.path.join('..', 'data', 'labeledTrainData.tsv')
df = pd.read_csv(
    datafile,
    sep='\t',
    escapechar='\\',
)
print('Number of reviews: {}'.format(df.shape[0]))
df.head()

### Perform the following steps to normalize the review data:

1. remove HTML markup
1. replace every non-letter character (punctuation, digits, etc.) with a space
1. lowercase the text and split it into words
1. remove all stopwords
1. recombine into a single string

In [4]:
def display(text, title):
    """Print `title`, a dashed separator line, and then `text`.

    Each of the three items is written on its own line. `text` may be any
    object; it is rendered the way `print` renders it.

    NOTE(review): inside a notebook kernel this name shadows IPython's
    built-in `display` helper -- confirm that is intended.
    """
    separator = '-----------------------'
    print(title, separator, text, sep='\n')

In [5]:
# Take the review stored under index label 1 as the worked example for the
# cleaning steps below.
raw_example = df.loc[1, 'review']
display(raw_example, 'raw text')

In [6]:
# Step 1: parse the raw review with the 'html.parser' backend and keep only
# its text content, dropping the markup.
example = BeautifulSoup(raw_example, 'html.parser').get_text()
display(example, 'no markup')

In [7]:
# Step 2: replace every character that is not an ASCII letter (punctuation,
# digits, ...) with a single space.
example_letters = re.sub(r'[^a-zA-Z]', ' ', example)
display(example_letters, 'only letters')

In [8]:
# Step 3: lowercase the text, then split it on whitespace into a list of words.
words = example_letters.lower().split()
display(words, 'word list')

In [9]:
# Step 4: drop the English stopwords.
# The stopword list is loaded once and turned into a set: calling
# stopwords.words('english') inside the comprehension would rebuild the list
# for every single word and then scan it linearly.
example_stopwords = set(stopwords.words('english'))
words_nostop = [w for w in words if w not in example_stopwords]
display(words_nostop, 'no stopwords')

In [10]:
# English stopwords as a set, built once and shared by every clean_text call.
eng_stopwords = set(stopwords.words('english'))


def clean_text(text):
    """Normalize one raw review into a space-separated string of words.

    Steps: strip the HTML markup, replace every non-letter character with a
    space, lowercase, split on whitespace, drop the words found in
    `eng_stopwords`, and join what is left with single spaces.
    """
    markup_free = BeautifulSoup(text, 'html.parser').get_text()
    letters_only = re.sub(r'[^a-zA-Z]', ' ', markup_free)
    kept = (
        token
        for token in letters_only.lower().split()
        if token not in eng_stopwords
    )
    return ' '.join(kept)

In [11]:
# Run the full cleaning pipeline on the sample review; the returned string is
# the cell's last expression.
clean_text(raw_example)

### Clean review text and add to dataframe

In [12]:
# Clean every review and store the result in a new 'clean_review' column.
df['clean_review'] = df['review'].map(clean_text)
df.head()

### Fit the bag-of-words vectorizer and build the word-count feature matrix

In [13]:
# Fit a count vectorizer capped at 5000 features on the cleaned reviews and
# convert the resulting matrix to a dense array.
vectorizer = CountVectorizer(max_features=5000)
train_data_features = vectorizer.fit_transform(df['clean_review']).toarray()
train_data_features.shape

### Train Classifier

In [14]:
# Train a 100-tree random forest on the bag-of-words features.
# random_state is fixed so the randomness used while building the trees --
# and therefore the fitted model and every prediction derived from it --
# is identical on each Restart & Run All.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
forest = forest.fit(train_data_features, df.sentiment)

#### The result should be more or less perfect. Evaluating on the training data is bad practice, but here it only serves as a sanity check that the fitted model is consistent with the data it was trained on.

In [15]:
# Compare the training labels with the model's predictions on the same
# training features.
confusion_matrix(
    y_true=df['sentiment'],
    y_pred=forest.predict(train_data_features),
)

### Clear up some resources

In [16]:
# Drop the references to the training frame and its dense feature array.
del df, train_data_features

### Read test data and perform predictions

In [17]:
# Load the test reviews (same tab-separated layout as the training file)
# and clean them with the same pipeline.
datafile = os.path.join('..', 'data', 'testData.tsv')
df = pd.read_csv(
    datafile,
    sep='\t',
    escapechar='\\',
)
print('Number of reviews: {}'.format(df.shape[0]))
df['clean_review'] = df['review'].map(clean_text)
df.head()

In [18]:
# Vectorize the test reviews with the already-fitted vectorizer (transform
# only, no refit) and convert the result to a dense array.
test_data_features = vectorizer.transform(df['clean_review']).toarray()
test_data_features.shape

In [19]:
# Predict a sentiment label for each test review and pair it with the
# review's id.
result = forest.predict(test_data_features)
output = pd.DataFrame(
    {
        'id': df['id'],
        'sentiment': result,
    }
)

In [20]:
# Preview the first rows of the predictions table.
output.head()

In [22]:
# Write the predictions to ../data/Bag_of_Words_model.csv, omitting the
# DataFrame index column.
output.to_csv(os.path.join('..', 'data', 'Bag_of_Words_model.csv'), index=False)

In [23]:
# Drop the references to the test frame and its dense feature array.
del df, test_data_features