In [67]:
# Read and concatenate data into test and train sets.
import numpy as np
import pandas as pd

yelp = pd.read_csv('/Users/danielrigney/Documents/KAGGLE/SentimentAnalysis/input/yelp_labelled.txt', sep='\t', names=['text', 'label'])
imdb = pd.read_csv('/Users/danielrigney/Documents/KAGGLE/SentimentAnalysis/input/imdb_labelled.txt', sep='\t', names=['text', 'label'])
test = pd.read_csv('/Users/danielrigney/Documents/KAGGLE/SentimentAnalysis/input/amazon_cells_labelled.txt', sep='\t', names=['text', 'label'])

train = pd.concat([imdb,yelp], axis=0)

To the clean the data we execute the following tasks.
- Ensure all text is lower case.
- Remove any punctuation.
- Remove common words that add no value (stop words).
- Remove domain specific words.
- Remove numerical values 0 and 1.

In [68]:
## Ensure all letters are lowercase.
train['text'] = train['text'].str.lower()

## Remove punctuation.
train['text'] = train['text'].str.replace(r'[^\w\s]', '')

## Remove stop words.
from nltk.corpus import stopwords

stop_words = [r'\b' + stopword + r'\b' for stopword in stopwords.words('english')]
stop_words = '|'.join(stop_words)
train['text'] = train['text'].str.replace(stop_words, '')

## Remove domain specific words.
domain_specific_words = r'\bactors\b|\bacting\b|\bcast\b|\bcharacters\b|\bfood\b|\bfilm\b|\bmovie\b|\brestaurant\b'
train['text'] = train['text'].str.replace(domain_specific_words, '')

## Remove numerical values.
numerical_values = r'\b0\b|\b1\b'
train['text'] = train['text'].str.replace(numerical_values, '')

## Print the ten most frequently used words.
frequency = ' '.join(train['text']).split()
frequency = pd.Series(frequency).value_counts()
print(frequency[:10])

LookupError: 
**********************************************************************
  Resource 'corpora/stopwords' not found.  Please use the NLTK
  Downloader to obtain the resource:  >>> nltk.download()
  Searched in:
    - '/Users/danielrigney/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************

First we must extract our features.
We will use the frequency of each word in each review as our feature. This means our feature space is the size of all distinct words in the sample. (an extension of this method is to use the sequence of n consecutive words (n-gram) as features.)
After counting the frequecy of each word, we normalise the frequency by scaling down the weight of words that occur often and scaling up the weight of words that occur less often. This is method is called term frequency - inverse term document frequency.
Next we train the model using the features and the acutal sentiment scores. 
Finally, we use the trained model to predict the sentiment of the test reviews.
Three models have been tested in this analysis. 
Naive Bayes - assumes each feature is independent, works well with categorical input.
Logistic Regression - used to estimate the probability of a binary event. 
Support Vector Machine - more efficient than logisitic regression when the number of features is high (as is the case with this task).

In [72]:
#Q3 soln.
# 3. Train the model.
#from sklearn.feature_extraction.text import CountVectorizer

#count_vectorizer = CountVectorizer()
#train_word_count = count_vectorizer.fit_transform(train['text'])

#from sklearn.feature_extraction.text import TfidfTransformer

#tfidf_transformer = TfidfTransformer()
#train_tfidf = tfidf_transformer.fit_transform(train_word_count)

## Naive Bayes Model
#from sklearn.naive_bayes import MultinomialNB

#clf_nb = MultinomialNB().fit(train_tfidf, train['label'])
#y_pred_nb = clf_nb.predict(tfidf_transformer.transform(count_vectorizer.transform(test['text'])))

# Logistic Regression Model
#from sklearn.linear_model import LogisticRegression

#clf_log = LogisticRegression(random_state=345).fit(train_tfidf, train['label'])
#y_pred_log = clf_log.predict(tfidf_transformer.transform(count_vectorizer.transform(test['text'])))

# Support Vector Machine
#from sklearn.svm import LinearSVC

#clf_svm = LinearSVC(random_state=345).fit(train_tfidf, train['label'])
#y_pred_svm = clf_svm.predict(tfidf_transformer.transform(count_vectorizer.transform(test['text'])))

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

# Naive Bayes Model
pipeline = Pipeline(steps=[('count_vectorizer', CountVectorizer()),
                           ('tfidf', TfidfTransformer()),
                           ('classifier', MultinomialNB())
                          ])
pipeline.fit(train['text'], train['label'])
predictions_nb = pipeline.predict(test['text'])

# Logistic Regression Model
pipeline = Pipeline(steps=[('count_vectorizer', CountVectorizer()), 
                          ('tfidf', TfidfTransformer()),
                          ('classifier', LogisticRegression(random_state=345))
                          ])
pipeline.fit(train['text'], train['label'])
predictions_log = pipeline.predict(test['text'])

# Support Vector Machine Model
pipeline = Pipeline(steps=[('count_vectorizer', CountVectorizer()), 
                          ('tfidf', TfidfTransformer()),
                          ('classifier', LinearSVC(random_state=345))
                          ])
pipeline.fit(train['text'], train['label'])
predictions_svm = pipeline.predict(test['text'])


  (0, 2661)	0.187502656021
  (0, 2890)	0.224242606308
  (0, 1538)	0.129022749621
  (0, 1928)	0.226226815093
  (0, 3164)	0.262370939054
  (0, 1160)	0.153389126166
  (0, 1776)	0.227250814139
  (0, 2935)	0.130492025595
  (0, 2177)	0.291231562208
  (0, 1541)	0.129684987284
  (0, 1477)	0.321457750403
  (0, 1376)	0.21381256343
  (0, 2883)	0.08997459118
  (0, 3065)	0.262370939054
  (0, 3042)	0.324235155546
  (0, 1253)	0.223280672168
  (0, 425)	0.247603184942
  (0, 651)	0.376544613376
  (1, 1263)	0.339877460144
  (1, 471)	0.501576444364
  (1, 1017)	0.48181406076
  (1, 3088)	0.633055758523
  (2, 1160)	0.363736833495
  (2, 2883)	0.213359797456
  (2, 1280)	0.398142856633
  :	:
  (1999, 2890)	0.166322158229
  (1999, 2935)	0.0967867600452
  (1999, 1541)	0.0961881746297
  (1999, 1477)	0.119213622484
  (1999, 2883)	0.200204168653
  (1999, 425)	0.183648847032
  (1999, 1456)	0.160367130109
  (1999, 2888)	0.187369726766
  (1999, 1962)	0.111106041745
  (1999, 3156)	0.254806545614
  (1999, 1892)	0.1179089

"\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.naive_bayes import MultinomialNB\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.svm import LinearSVC\n\n# Naive Bayes Model\npipeline = Pipeline(steps=[('count_vectorizer', CountVectorizer()),\n                           ('tfidf', TfidfTransformer()),\n                           ('classifier', MultinomialNB())\n                          ])\npipeline.fit(train['text'], train['label'])\npredictions_nb = pipeline.predict(test['text'])\n\n# Logistic Regression Model\npipeline = Pipeline(steps=[('count_vectorizer', CountVectorizer()), \n                          ('tfidf', TfidfTransformer()),\n                          ('classifier', LogisticRegression(random_state=345))\n                          ])\npipeline.fit(train['text'], train['label'])\npredictions_log = pipeline.predict(test['text'])\n\n# Support Vector Machine Model\npipeline = Pipeline(steps=[('count_vectorizer', CountVectorizer()), \n                    

To evaluate the performance of the model we use three metrics.

Recall = ratio of correctly identified positive reviews to the total number of actual positive reviews.

Precision = ratio of correctly identified positive reviews to the total predicted positive reviews.

Accuracy = ratio of correctly classified reviews divided by the total number of reviews.


In [66]:
# 4. Evaluate the model.

from sklearn.metrics import accuracy_score, precision_score, recall_score

pd.DataFrame(data={'naive_bayes': [accuracy_score(test['label'], y_pred_nb),
                                  precision_score(test['label'], y_pred_nb),
                                  recall_score(test['label'], y_pred_nb)],
                  'logistic_regression': [accuracy_score(test['label'], y_pred_log),
                                         precision_score(test['label'], y_pred_log),
                                         recall_score(test['label'], y_pred_log)],
                  'support_vector_machine': [accuracy_score(test['label'], y_pred_svm),
                                         precision_score(test['label'], y_pred_svm),
                                         recall_score(test['label'], y_pred_svm)]},
             index=['accuracy', 'precision', 'recall'])

Unnamed: 0,logistic_regression,naive_bayes,support_vector_machine
accuracy,0.708556,0.740642,0.736631
precision,0.793706,0.824324,0.807818
recall,0.588083,0.632124,0.642487


Overall, the accuracy of the naive bayes model performs better than the other two models with the sentiment of 74% of reviews correctly identified. The precision of the naive bayes model is quite high while the recall of the support vector machine is higher.