## Introduction and Dataset
TODO: Introduce the Dataset

In [1]:
import pandas as pd

# Train and Test are used for training/hyperparameter tuning
# Validation set is used exclusively for set validation and comparison

# Using the static split so that the results are comparable with the paper being referenced.
train_df = pd.read_table("data/train.tsv", header=None)
test_df = pd.read_table("data/test.tsv", header=None)
validation_df = pd.read_table("data/dev.tsv", header=None)

print(train_df)

train_y = train_df[0].to_numpy()
train_x = train_df[1].to_numpy()

test_y = test_df[0].to_numpy()
test_x = test_df[1].to_numpy()

validation_y = validation_df[0].to_numpy()
validation_x = validation_df[1].to_numpy()

      0                                                  1
0     1  a stirring , funny and finally transporting re...
1     0  apparently reassembled from the cutting-room f...
2     0  they presume their audience wo n't sit still f...
3     1  this is a visually stunning rumination on love...
4     1  jonathan parker 's bartleby should have been t...
...  ..                                                ...
6915  1  painful , horrifying and oppressively tragic ,...
6916  0  take care is nicely performed by a quintet of ...
6917  0  the script covers huge , heavy topics in a bla...
6918  0  a seriously bad film with seriously warped log...
6919  1  a deliciously nonsensical comedy about a city ...

[6920 rows x 2 columns]


In [2]:
print(train_x)
print(train_y)

print(test_x)
print(test_y)

['a stirring , funny and finally transporting re-imagining of beauty and the beast and 1930s horror films'
 'apparently reassembled from the cutting-room floor of any given daytime soap .'
 "they presume their audience wo n't sit still for a sociology lesson , however entertainingly presented , so they trot out the conventional science-fiction elements of bug-eyed monsters and futuristic women in skimpy clothes ."
 ...
 "the script covers huge , heavy topics in a bland , surfacey way that does n't offer any insight into why , for instance , good things happen to bad people ."
 'a seriously bad film with seriously warped logic by writer-director kurt wimmer at the screenplay level .'
 'a deliciously nonsensical comedy about a city coming apart at its seams .']
[1 0 0 ... 0 0 1]
['no movement , no yuks , not much of anything .'
 "a gob of drivel so sickly sweet , even the eager consumers of moore 's pasteurized ditties will retch it up like rancid crème brûlée ."
 'gangs of new york is a

In [3]:
import nltk

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')

from nltk.tokenize import word_tokenize #import the tokenize package
from nltk.corpus import wordnet
from nltk.stem.snowball import SnowballStemmer  # import the snowball stemmer (also known as Porter2)

from nltk.stem import WordNetLemmatizer

from sklearn.base import BaseEstimator, TransformerMixin

#https://medium.com/@maleeshadesilva21/preprocessing-steps-for-natural-language-processing-nlp-a-beginners-guide-d6d9bf7689c9
nltk.download('averaged_perceptron_tagger_eng')

# Based on stopwords.words("english")
stopwords_english_tuned = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves']

# noinspection PyPep8Naming
class PreProcessor(BaseEstimator, TransformerMixin):

    def __init__(self):
        self.all_stopwords = stopwords_english_tuned
        self.lemmatizer = WordNetLemmatizer()
        self.stemmer = SnowballStemmer("english")

    def fit(self, X, y=None):
        # Strip out words in 75% or more of the texts
        token_counts = {}
        for x in X:
            tokens = word_tokenize(x)
            for token in tokens:
                token_counts[token] = token_counts.get(token, 0) + 1

        common_threshold = len(X) * 0.75
        for token in token_counts:
            if token_counts[token] >= common_threshold:
                self.all_stopwords.append(token)

        return self

    def posToWordnetPos(self, pos):
        #https://medium.com/@maleeshadesilva21/preprocessing-steps-for-natural-language-processing-nlp-a-beginners-guide-d6d9bf7689c9
        tag_mapping = {"J": wordnet.ADJ,
                    "N": wordnet.NOUN,
                    "V": wordnet.VERB,
                    "R": wordnet.ADV}
        return tag_mapping.get(pos[0].upper(), wordnet.NOUN)

    def transform(self, X, y=None):
        prep_text = []
        for x in X:
            tokens = nltk.pos_tag(word_tokenize(x))
            tokens_stripped = [token for token in tokens if token[0] not in self.all_stopwords]
            tokens_lemmatized = [[self.lemmatizer.lemmatize(token[0], self.posToWordnetPos(token[1])) for token in tokens_stripped]]
            tokens_stemmed = [[self.stemmer.stem(token[0]) for token in tokens_stripped]]
            prep_text += tokens_stemmed

        prep_sentences = [" ".join(sentence) for sentence in prep_text]
        return prep_sentences

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/reecemackie/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/reecemackie/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/reecemackie/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/reecemackie/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/reecemackie/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [46]:
import numpy as np
# Simply test this shiz works

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, TfidfTransformer
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score

from gensim.models import Word2Vec

class Word2VecTransformer(BaseEstimator, TransformerMixin):

    def __init__(self, vector_size=300, window=5, min_count=1, workers=4):
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.workers = workers
        self.model = None

    def fit(self, X, y=None):
        self.model = Word2Vec(X, vector_size=self.vector_size, window=self.window, min_count=self.min_count, workers=self.workers)
        return self
    
    def _word2vec_rep(self, sentence):
        embs = [self.model.wv[word] for word in sentence if word in self.model.wv.index_to_key]
        if len(embs) == 0:
            return np.zeros(self.vector_size)
        sent_emb = np.mean(np.array(embs), axis=0)
        return sent_emb

    def transform(self, X, y=None):
        return np.array([self._word2vec_rep(doc) for doc in X])

text_clf = Pipeline([
    ('prep', PreProcessor()),
    #('hash', HashingVectorizer()),
    #('count', CountVectorizer(max_features=300)),
    ('vec', TfidfVectorizer()),
    #('vec', Word2VecTransformer()),
    #('rep', TfidfTransformer()),
    #('mod', KNeighborsClassifier()),
    ('mod', MLPClassifier(solver='lbfgs', hidden_layer_sizes=(32,)))
])

text_clf.fit(train_x, train_y)
predictions = text_clf.predict(test_x)
acc = accuracy_score(test_y, predictions)
print(acc)

0.7539813289401428


## Representation Learning

In [5]:
import random
# Need to test evaluate: CountVec, Word2Vec, TfidfVec, Hash and Hash + TfidfTransformer.
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV

# Ensure consistency between runs
seed = 1337
random.seed(seed)
np.random.seed = seed

# Prepare to measure performance between rep. learning methods
accuracy_scores = {}
f1_scores = {}

In [6]:
# Count Vectorizer

from sklearn.feature_extraction.text import CountVectorizer

count_pipeline = Pipeline([
    ("pre", PreProcessor()),
    ("vec", CountVectorizer()),
    ("clf", KNeighborsClassifier()),
])

param_grid = {
    #"vec__max_features": [100, 200, 300, 400, 500, 600],
    "vec__max_features": [400],
}

count_grid = GridSearchCV(count_pipeline, param_grid)
count_grid.fit(train_x, train_y)

print(count_grid.best_params_)

predictions = count_grid.predict(test_x)
accuracy_scores["count"] = accuracy_score(test_y, predictions)
f1_scores["count"] = f1_score(test_y, predictions)

{'vec__max_features': 400}


In [101]:
# Word2Vec

class Word2VecTransformer(BaseEstimator, TransformerMixin):

    def __init__(self, vector_size=300, window=5, min_count=1, workers=4):
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.workers = workers
        self.model = None

    def fit(self, X, y=None):
        self.model = Word2Vec(X, vector_size=self.vector_size, window=self.window, min_count=self.min_count, workers=self.workers)
        return self

    def _word2vec_rep(self, sentence):
        embs = [self.model.wv[word] for word in sentence if word in self.model.wv.index_to_key]
        if len(embs) == 0:
            return np.zeros(self.vector_size)
        sent_emb = np.mean(np.array(embs), axis=0)
        return sent_emb

    def transform(self, X, y=None):
        return np.array([self._word2vec_rep(doc) for doc in X])

w2v_pipeline = Pipeline([
    ("pre", PreProcessor()),
    ("vec", Word2VecTransformer()),
    #("clf", KNeighborsClassifier()),
    ("clf", MLPClassifier(solver='lbfgs', hidden_layer_sizes=(64,32,), max_iter=800))
])

param_grid = {
    #"vec__vector_size": [100, 200, 300, 400],
    #"vec__window": [2, 3, 5],
    #"vec__min_count": [1,2,3],
    "vec__vector_size": [300],
    "vec__window": [5],
    "vec__min_count": [1],
}

w2v_grid = GridSearchCV(w2v_pipeline, param_grid)
w2v_grid.fit(train_x, train_y)

print(w2v_grid.best_params_)

predictions = w2v_grid.predict(test_x)
accuracy_scores["w2v"] = accuracy_score(test_y, predictions)
f1_scores["w2v"] = f1_score(test_y, predictions)

print(accuracy_scores["w2v"])
print(f1_scores["w2v"])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

{'vec__min_count': 1, 'vec__vector_size': 300, 'vec__window': 5}
0.5606809445359693
0.5820271682340648


In [8]:
# TF-IDF Vectorizer

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_pipeline = Pipeline([
    ("pre", PreProcessor()),
    ("vec", TfidfVectorizer()),
    ("clf", KNeighborsClassifier()),
])

param_grid = {
    #"vec__lowercase": [True, False],
    #"vec__max_features": [100, 200, 300, 400],
    #"vec__ngram_range": [(1, 1), (1,2), (2,2)],
    #"vec__norm": [None, "l1", "l2"],
    "vec__lowercase": [True],
    "vec__max_features": [200],
    "vec__ngram_range": [(1,2)],
    "vec__norm": ["l2"],
}

tfidf_grid = GridSearchCV(tfidf_pipeline, param_grid)
tfidf_grid.fit(train_x, train_y)

print(tfidf_grid.best_params_)

predictions = tfidf_grid.predict(test_x)
accuracy_scores["tfidf"] = accuracy_score(test_y, predictions)
f1_scores["tfidf"] = f1_score(test_y, predictions)

{'vec__lowercase': True, 'vec__max_features': 200, 'vec__ngram_range': (1, 2), 'vec__norm': 'l2'}


In [9]:
# Hashing Vectorizer

from sklearn.feature_extraction.text import HashingVectorizer

hash_vectorizer_pipeline = Pipeline([
    ("pre", PreProcessor()),
    ("hash", HashingVectorizer()),
    ("clf", KNeighborsClassifier()),
])

param_grid = {
    #"hash__lowercase": [True, False],
    #"hash__strip_accents": ["unicode", "ascii"],
    #"hash__norm": [None, "l1", "l2"],
    "hash__lowercase": [True],
    "hash__strip_accents": ["unicode"],
    "hash__norm": ["l2"],
}

hash_grid = GridSearchCV(hash_vectorizer_pipeline, param_grid)
hash_grid.fit(train_x, train_y)

print(hash_grid.best_params_)

predictions = hash_grid.predict(test_x)
accuracy_scores["hash"] = accuracy_score(test_y, predictions)
f1_scores["hash"] = f1_score(test_y, predictions)

{'hash__lowercase': True, 'hash__norm': 'l2', 'hash__strip_accents': 'unicode'}


In [10]:
# Hashing Vectorizer + TF-IDF Transformer

from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer

hash_tfidf_vectorizer_pipeline = Pipeline([
    ("pre", PreProcessor()),
    ("hash", HashingVectorizer()),
    ("tfidf", TfidfTransformer()),
    ("clf", KNeighborsClassifier()),
])

param_grid = {
    # "hash__lowercase": [True, False],
    # "hash__strip_accents": ["unicode", "ascii"],
    # "hash__norm": [None, "l1", "l2"],
    # "tfidf__norm": [None, "l1", "l2"],
    # "tfidf__use_idf": [True, False],
    # "tfidf__smooth_idf": [True, False],
    # "tfidf__sublinear_tf": [True, False],
    "hash__lowercase": [True],
    "hash__strip_accents": ["unicode"],
    "hash__norm": ["l2"],
    "tfidf__norm": ["l2"],
    "tfidf__use_idf": [True],
    "tfidf__smooth_idf": [True],
    "tfidf__sublinear_tf": [False],
}

hash_tfidf_grid = GridSearchCV(hash_tfidf_vectorizer_pipeline, param_grid)
hash_tfidf_grid.fit(train_x, train_y)

print(hash_tfidf_grid.best_params_)

predictions = hash_tfidf_grid.predict(test_x)
accuracy_scores["hash_tfidf"] = accuracy_score(test_y, predictions)
f1_scores["hash_tfidf"] = f1_score(test_y, predictions)

{'hash__lowercase': True, 'hash__norm': 'l2', 'hash__strip_accents': 'unicode', 'tfidf__norm': 'l2', 'tfidf__smooth_idf': True, 'tfidf__sublinear_tf': False, 'tfidf__use_idf': True}


In [11]:
print(accuracy_scores)
print(f1_scores)

{'count': 0.5689181768259198, 'w2v': 0.5112575507962658, 'tfidf': 0.6166941241076331, 'hash': 0.6743547501372872, 'hash_tfidf': 0.742998352553542}
{'count': 0.5290941811637673, 'w2v': 0.5514112903225806, 'tfidf': 0.6096196868008948, 'hash': 0.6957414058491534, 'hash_tfidf': 0.7513283740701382}


After comparing Word-2-Vector, CountVectorizer, TF-IDF, HashingVectorizer and HashingVectorizer combined with TF-IDF; Hash + TF-IDF resulted in the most performant model when used for kNN classification. As such, this will be the representation used for the remainder of this work.

## NLP Algorithms

In [126]:
# TODO: implement LSTM and another algorithm

from tensorflow import keras
from tensorflow.keras import layers
from keras.preprocessing import sequence
from scikeras.wrappers import KerasClassifier

max_len = 0
for x in train_x:
    max_len = max(max_len, len(x))
for x in test_x:
    max_len = max(max_len, len(x))

# Keras input length pre-process
class SequencePadding(BaseEstimator, TransformerMixin):
    def __init__(self, max_len):
        self.max_len = max_len

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return sequence.pad_sequences(X.toarray(), maxlen=self.max_len)

model = keras.Sequential(
    [
        layers.Input((400,)),
        #layers.Embedding(input_dim=2**20, output_dim=256),
        layers.Embedding(input_dim=400, output_dim=64),
        layers.SimpleRNN(400, activation="relu"),
        #layers.Dense(256, activation="relu"),
        #layers.LSTM(256),
        #layers.Dense(256, activation='relu'),
        layers.Dense(1, activation='sigmoid'),
    ]
)

#model.summary()
#model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

epochs = 1
batch_size = 64

def get_model(hidden_layer_dim, meta):
    # note that meta is a special argument that will be
    # handed a dict containing input metadata
    n_features_in_ = meta["n_features_in_"]
    X_shape_ = meta["X_shape_"]
    n_classes_ = meta["n_classes_"]

    print(meta)
    print(n_features_in_)
    print(X_shape_)
    print(n_classes_)

    print(X_shape_[1:])

    model = keras.Sequential([
        layers.Input(X_shape_[1:]),
        #layers.Dense(n_features_in_, activation="relu"),
        #layers.Dense(hidden_layer_dim, activation="relu"),

        layers.Embedding(n_features_in_, hidden_layer_dim, input_length=max_len),
        #layers.Bidirectional(layers.LSTM(hidden_layer_dim)),
        layers.SimpleRNN(hidden_layer_dim, activation="relu"),
        #layers.Dense(n_features_in_, activation="relu"),
        layers.Dense(1, activation="sigmoid"),

        #layers.Embedding(n_features_in_, 128),
        #layers.Dropout(0.5),
        #layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3),
        #layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3),
        #layers.GlobalMaxPooling1D(),
        #layers.Dense(128, activation="relu"),
        #layers.Dropout(0.5),
        #layers.Dense(1, activation="sigmoid"),
    ])

    model.summary()
    return model

clf = KerasClassifier(
    get_model,
    loss="binary_crossentropy",
    optimizer="adam",
    hidden_layer_dim=64,
    metrics=["accuracy"],
    epochs=10,
    #batch_size=batch_size,
)

pipeline = Pipeline([
    ("pre", PreProcessor()),
    #("hash", HashingVectorizer()),
    #("tfidf", TfidfTransformer()),
    #("vec", CountVectorizer(max_features=400)),
    #("vec", Word2VecTransformer()),
    ("vec", TfidfVectorizer(max_features=400)),
    #("vec", TfidfVectorizer(max_features=800)),
    #("pad", SequencePadding(400)),
    ("clf", KerasClassifier(model, epochs=epochs, batch_size=batch_size)),
    #("clf", KNeighborsClassifier()),
    #("clf", MLPClassifier(solver='lbfgs', hidden_layer_sizes=(32,)))
    ("clf", clf),
])

pipeline.fit(train_x, train_y)
predictions = pipeline.predict(test_x)
acc = accuracy_score(test_y, predictions)
print(acc)

{'classes_': array([0, 1]), 'target_type_': 'binary', 'y_dtype_': dtype('int64'), 'y_ndim_': 1, 'X_dtype_': dtype('float32'), 'X_shape_': (6920, 400), 'n_features_in_': 400, 'target_encoder_': ClassifierLabelEncoder(loss='binary_crossentropy'), 'n_classes_': 2, 'n_outputs_': 1, 'n_outputs_expected_': 1, 'feature_encoder_': FunctionTransformer()}
400
(6920, 400)
2
(400,)




Epoch 1/10
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 28ms/step - accuracy: 0.5130 - loss: 0.6933
Epoch 2/10
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 28ms/step - accuracy: 0.5202 - loss: 0.6925
Epoch 3/10
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 28ms/step - accuracy: 0.5172 - loss: 0.6927
Epoch 4/10
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 28ms/step - accuracy: 0.5217 - loss: 0.6923
Epoch 5/10
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 28ms/step - accuracy: 0.5182 - loss: 0.6926
Epoch 6/10
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 28ms/step - accuracy: 0.5129 - loss: 0.6931
Epoch 7/10
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 29ms/step - accuracy: 0.5289 - loss: 0.6916
Epoch 8/10
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 29ms/step - accuracy: 0.5185 - loss: 0.6925
Epoch 9/10
[1m217/217[0m [32m

## Evaluation

## Paper Overview

## Implementation of C-LSTM Algorithm
- Word2Vec for vectorization

## Paper Evaluation