# Import modules

In [0]:
import pandas as pd
import numpy as np
import requests

import sklearn
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,precision_score,recall_score,f1_score
from sklearn.utils import shuffle
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.feature_selection import chi2, SelectKBest

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/c1977808/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

# Read data

In [0]:
def read_data(path_pos, path_neg):
    """Load positive and negative reviews into one shuffled DataFrame.

    Each input file is expected to hold one review per line; empty lines
    are skipped and every other line is taken verbatim as the review text.

    Parameters
    ----------
    path_pos : str or path-like
        Text file of positive reviews; its rows are labelled ``positive = 1``.
    path_neg : str or path-like
        Text file of negative reviews; its rows are labelled ``positive = 0``.

    Returns
    -------
    pandas.DataFrame
        Columns ``review`` and ``positive``, shuffled with a fixed seed (42).
        Each row keeps its position within its own source file as index, so
        index values repeat across the two classes.
    """
    def _read_reviews(path, label):
        # Read the lines directly: recent pandas versions reject sep="\n"
        # in read_csv, and no CSV parsing is wanted for free text anyway.
        with open(path, encoding="utf-8") as handle:
            lines = [line.rstrip("\r\n") for line in handle]
        frame = pd.DataFrame({'review': [text for text in lines if text]})
        frame['positive'] = label
        return frame

    # DataFrame.append was removed in pandas 2.0; pd.concat keeps the
    # per-file indexes exactly as append did.
    combined_df = pd.concat([_read_reviews(path_pos, 1),
                             _read_reviews(path_neg, 0)])
    combined_df = shuffle(combined_df, random_state=42)
    return combined_df

In [0]:
# Load the train / dev / test splits, each from its positive and negative review file.
train = read_data("Data/IMDb/train/imdb_train_pos.txt",
                  "Data/IMDb/train/imdb_train_neg.txt")

dev = read_data("Data/IMDb/dev/imdb_dev_pos.txt",
                "Data/IMDb/dev/imdb_dev_neg.txt")

test = read_data("Data/IMDb/test/imdb_test_pos.txt",
                 "Data/IMDb/test/imdb_test_neg.txt")

# Explore data and shuffle

In [0]:
# Show the first rows of the training frame (review text and its 0/1 label).
train.head()

Unnamed: 0,review,positive
4016,This is a comedy/romance movie directed by And...,0
6475,"During the Sci-Fi TZ marathon of January 31, 1...",1
5684,radio is possibly one of the best films i have...,0
862,I was -Unlike most of the reviewers- not born ...,1
5970,"When i started watching ""Surface""for the first...",1


In [0]:
# Count the labels once, then report each class (1 = positive, 0 = negative).
label_counts = train['positive'].value_counts()
print("No of positive reviews\n-----")
print(label_counts[1])
print("\nNo of negative reviews\n-----")
print(label_counts[0])

No of positive reviews
-----
7483

No of negative reviews
-----
7517


# Feature 1 - tf-idf

The first feature is based on a vocabulary of tokens collected from the training data. Term Frequency times Inverse Document Frequency (tf-idf) will be used. Term Frequency (tf) counts the number of times a token in the vocabulary is used in each review, relative to the length of that review. This acts as a way of normalising the count. Inverse Document Frequency (idf) penalises tokens that appear across many reviews, as these terms offer less information than those that appear in fewer reviews.

## Create set of stopwords to remove later

In [0]:
# The stop-word corpus is a separate NLTK resource from the VADER lexicon
# fetched with the imports; without it the lookup below raises LookupError
# on a fresh environment.
nltk.download('stopwords', quiet=True)

# take set of stopwords from nltk
stopwords = set(nltk.corpus.stopwords.words('english'))

# manually add more punctuation, contraction fragments and the "br" HTML tag remnant
stopwords.update([
    ".", ",", "--", "``", "#", "@", ":", "'s", "’", "...", "n't", "'re",
    "'", "-", ";", "/", ">", "<", "br", "(", ")", "''", "&",
])

## Define custom transformers

A simple transformer is needed so that the dataset itself can be fed into the full pipeline later. Currently, the tf-idf pipelines operate on `train['review']`, while the sentiment pipeline takes the whole `train` DataFrame as its argument. This needs to be consistent.

In [0]:
class selectReview(BaseEstimator, TransformerMixin):
    """Stateless transformer that picks the ``review`` column out of its input."""

    def __init__(self):
        # No hyperparameters to store.
        pass

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X['review']``."""
        return X['review']

## Create transformation pipeline

In [0]:
# Unigram tf-idf over the 500 most frequent non-stop-word tokens.
# stop_words is documented as 'english', a list or None; newer scikit-learn
# validates the type and rejects a set, so pass a (sorted, deterministic) list.
feature_1_vocab = Pipeline([
    ('select_review', selectReview()),
    ('count', CountVectorizer(stop_words=sorted(stopwords), max_features=500)),
    ('tfidf', TfidfTransformer())
])

# Feature 2 - tf-idf (bi-grams)

In [0]:
# Bigram tf-idf over the 500 most frequent bigrams (ngram_range=(2, 2)).
# stop_words is documented as 'english', a list or None; newer scikit-learn
# validates the type and rejects a set, so pass a (sorted, deterministic) list.
feature_2_vocab = Pipeline([
    ('select_review', selectReview()),
    ('count', CountVectorizer(stop_words=sorted(stopwords), max_features=500, ngram_range=(2,2))),
    ('tfidf', TfidfTransformer())
])

# Feature 3 - Sentiment

In [0]:
# Module-level VADER analyzer; getSentiment.transform reads this name to score each review.
vader = SentimentIntensityAnalyzer()

## Define custom transformers

In [0]:
class getSentiment(BaseEstimator, TransformerMixin):
    """Stateless transformer turning each review into VADER sentiment scores.

    Relies on the module-level ``vader`` analyzer.
    """

    def __init__(self):
        # No hyperparameters to store.
        pass

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X):
        """Score every review in ``X['review']``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame with a ``review`` column of text.

        Returns
        -------
        numpy.ndarray
            Array with one row per review and the columns
            ``[pos, neu, neg]`` in that order.
        """
        features_array = []
        for text in X['review']:
            # Score each review once; all three values come from the same call.
            scores = vader.polarity_scores(text)
            features_array.append([scores['pos'], scores['neu'], scores['neg']])
        # reshape keeps the result 2-D (0 x 3) even when X has no rows, so it
        # can still be stacked next to the other feature blocks.
        return np.asarray(features_array, dtype=float).reshape(-1, 3)

## Create transformation pipeline

In [0]:
# Single-step pipeline wrapping the sentiment transformer.
feature_3_sentiment = Pipeline(steps=[("get_sentiment", getSentiment())])

# Combine all features

In [0]:
# Stack the three feature pipelines side by side into one feature matrix.
feature_engineering = FeatureUnion([
    ("feature_1_vocab", feature_1_vocab),
    ("feature_2_vocab", feature_2_vocab),
    ("feature_3_sentiment", feature_3_sentiment),
])

In [0]:
# Fit all feature pipelines on the training split and build its feature matrix.
X_train = feature_engineering.fit_transform(train)

In [0]:
# Dimensions (rows, columns) of the combined training feature matrix.
print(X_train.shape)

(15000, 1003)


# Perform feature selection

*Currently unsure whether to perform feature selection as part of Pipeline or through a standard function. Pipeline offers auto tuning of parameters but function allows flexibility to specify test, dev, or train set to be used!*

In [0]:
def feature_selection(matrix, data_split, k, selector=None, return_selector=False):
    """Reduce a feature matrix to its best columns by chi-squared score.

    With the default arguments this fits a new ``SelectKBest(chi2, k=k)`` on
    ``matrix`` and returns the reduced matrix. To reduce a dev or test matrix
    to the *same* columns without refitting on its labels, fit on the training
    data with ``return_selector=True`` and pass the result back via ``selector``.

    Parameters
    ----------
    matrix : array-like or sparse matrix
        Feature matrix to reduce.
    data_split : pandas.DataFrame
        Frame whose ``positive`` column holds the labels for ``matrix``.
        Only read when a new selector is fitted.
    k : int or "all"
        Number of features to keep; ignored when ``selector`` is given.
    selector : fitted selector, optional
        Previously fitted selector to apply instead of fitting a new one.
    return_selector : bool, default False
        When True, return ``(reduced_features, selector)``.

    Returns
    -------
    reduced_features, or ``(reduced_features, selector)`` if requested.
    """
    if selector is None:
        labels = np.asarray(data_split['positive'])
        selector = SelectKBest(chi2, k=k).fit(matrix, labels)
    reduced_features = selector.transform(matrix)
    if return_selector:
        return reduced_features, selector
    return reduced_features

In [0]:
# Keep the 10 best chi-squared features of the training matrix.
X_train_reduced = feature_selection(X_train, train, 10)

# Run and evaluate different models

Try a series of different machine learning algorithms here, such as SVM (linear, polynomial and RBF), SGD, decision trees and logistic regression, using `GridSearchCV` to test for the best hyperparameters.

In [0]:
# Label vectors for the train and dev splits.
y_train = train['positive'].to_numpy()
y_dev = dev['positive'].to_numpy()

# Dev features come from the pipelines already fitted on the training split.
X_dev = feature_engineering.transform(dev)

## SVM

### Linear

In [0]:
# Linear SVM on scaled features. with_mean=False scales without centering.
# The hinge-loss LinearSVC solver draws on a random generator, so fix
# random_state to make the reported scores repeatable between runs.
svm_clf = Pipeline([
    ("scaler", StandardScaler(with_mean=False)),
    ("svm_clf", sklearn.svm.LinearSVC(loss='hinge', random_state=42))
])

In [0]:
# Train on the training split, then predict labels for the dev split.
svm_clf_pred = svm_clf.fit(X_train, y_train).predict(X_dev)



In [0]:
# Per-class precision / recall / F1 of the linear SVM on the dev split.
print(classification_report(y_dev, svm_clf_pred))

              precision    recall  f1-score   support

           0       0.83      0.85      0.84      2482
           1       0.85      0.82      0.84      2518

    accuracy                           0.84      5000
   macro avg       0.84      0.84      0.84      5000
weighted avg       0.84      0.84      0.84      5000



### RBF

In [0]:
# RBF-kernel SVM on scaled (uncentred) features.
rbf_svm_clf = Pipeline(steps=[
    ("scaler", StandardScaler(with_mean=False)),
    ("svm_clf", sklearn.svm.SVC(kernel="rbf")),
])

In [0]:
# Train on the training split, then predict labels for the dev split.
rbf_svm_clf_pred = rbf_svm_clf.fit(X_train, y_train).predict(X_dev)

In [0]:
# Per-class precision / recall / F1 of the RBF SVM on the dev split.
print(classification_report(y_dev, rbf_svm_clf_pred))

              precision    recall  f1-score   support

           0       0.85      0.85      0.85      2482
           1       0.85      0.85      0.85      2518

    accuracy                           0.85      5000
   macro avg       0.85      0.85      0.85      5000
weighted avg       0.85      0.85      0.85      5000



### Polynomial

In [0]:
# Degree-3 polynomial-kernel SVM (coef0=1) on scaled (uncentred) features.
poly_svm_clf = Pipeline(steps=[
    ("scaler", StandardScaler(with_mean=False)),
    ("svm_clf", sklearn.svm.SVC(kernel="poly", degree=3, coef0=1)),
])

In [0]:
# Train on the training split, then predict labels for the dev split.
poly_svm_clf_pred = poly_svm_clf.fit(X_train, y_train).predict(X_dev)

In [0]:
# Per-class precision / recall / F1 of the polynomial SVM on the dev split.
print(classification_report(y_dev, poly_svm_clf_pred))

              precision    recall  f1-score   support

           0       0.85      0.85      0.85      2482
           1       0.85      0.85      0.85      2518

    accuracy                           0.85      5000
   macro avg       0.85      0.85      0.85      5000
weighted avg       0.85      0.85      0.85      5000



**NEXT STEPS: run polynomial and RBF SVM using the kernel trick. Then run a decision tree and logistic regression to shortlist ~ five suitable models. Then run `GridSearchCV` to find the best combination of hyperparameters for each of the ~ five models. Run this inside a loop that performs feature selection to reduce the number of variables to 10, 100, 500, 1000?. Finally, run the best performing model on the unseen test data.**