## Introduction and Dataset
asdf

In [23]:
import pandas as pd
from sklearn.neural_network import MLPClassifier

# Train and Test are used for training/hyperparameter tuning
# Validation set is used exclusively for set validation and comparison

# Using the static split so that the results are comparable with the paper being referenced.

train_df = pd.read_table("data/train.tsv", header=None)
test_df = pd.read_table("data/test.tsv", header=None)
validation_df = pd.read_table("data/dev.tsv", header=None)

print(train_df)

train_y = train_df[0].to_numpy()
train_x = train_df[1].to_numpy()

test_y = test_df[0].to_numpy()
test_x = test_df[1].to_numpy()

validation_y = validation_df[0].to_numpy()
validation_x = validation_df[1].to_numpy()

      0                                                  1
0     1  a stirring , funny and finally transporting re...
1     0  apparently reassembled from the cutting-room f...
2     0  they presume their audience wo n't sit still f...
3     1  this is a visually stunning rumination on love...
4     1  jonathan parker 's bartleby should have been t...
...  ..                                                ...
6915  1  painful , horrifying and oppressively tragic ,...
6916  0  take care is nicely performed by a quintet of ...
6917  0  the script covers huge , heavy topics in a bla...
6918  0  a seriously bad film with seriously warped log...
6919  1  a deliciously nonsensical comedy about a city ...

[6920 rows x 2 columns]


In [4]:
print(train_x)
print(train_y)

print(test_x)
print(test_y)

['a stirring , funny and finally transporting re-imagining of beauty and the beast and 1930s horror films'
 'apparently reassembled from the cutting-room floor of any given daytime soap .'
 "they presume their audience wo n't sit still for a sociology lesson , however entertainingly presented , so they trot out the conventional science-fiction elements of bug-eyed monsters and futuristic women in skimpy clothes ."
 ...
 "the script covers huge , heavy topics in a bland , surfacey way that does n't offer any insight into why , for instance , good things happen to bad people ."
 'a seriously bad film with seriously warped logic by writer-director kurt wimmer at the screenplay level .'
 'a deliciously nonsensical comedy about a city coming apart at its seams .']
[1 0 0 ... 0 0 1]
['no movement , no yuks , not much of anything .'
 "a gob of drivel so sickly sweet , even the eager consumers of moore 's pasteurized ditties will retch it up like rancid crème brûlée ."
 'gangs of new york is a

In [71]:
import nltk #import the natural language toolkit

nltk.download('punkt') #download the package in nltk which supports tokenization
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords') #download the nltk package for stopwords

from nltk.tokenize import word_tokenize #import the tokenize package
from nltk.corpus import stopwords, wordnet  # import the package from the corpus
from nltk.stem.snowball import SnowballStemmer, PorterStemmer  # import the snowball stemmer (also known as Porter2)

from nltk.stem import WordNetLemmatizer

from sklearn.base import BaseEstimator, TransformerMixin

print(stopwords.words("english"))

#https://medium.com/@maleeshadesilva21/preprocessing-steps-for-natural-language-processing-nlp-a-beginners-guide-d6d9bf7689c9
nltk.download('averaged_perceptron_tagger_eng')

class PreProcessor(BaseEstimator, TransformerMixin):

    def __init__(self):
        # TODO: Find words that appear in > 50% of the examples. and words appearing in 1-2 examples.
        #       Add these to the stopwords
        self.all_stopwords = stopwords.words("english")
        self.lemmatizer = WordNetLemmatizer()
        self.stemmer = PorterStemmer()

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # convert POS tag to WordNet format
        def get_wordnet_pos(word):
            tag = nltk.pos_tag([word])[0][1][0].upper()
            tag_dict = {"J": wordnet.ADJ,
                        "N": wordnet.NOUN,
                        "V": wordnet.VERB,
                        "R": wordnet.ADV}
            return tag_dict.get(tag, wordnet.NOUN)
        
        prep_text = []
        for x in X:
            token_text = word_tokenize(x)
            #normd_text = [token.lower() for token in token_text if token.isalpha()]
    
            # Remove stopwords
            #swr_text = [token for token in normd_text if token not in self.all_stopwords]
    
            # Lemmatizer works well, stopwords is minor reduction, normalization is major reduction
            prep_text += [[self.lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in token_text]]
            #prep_text += [[self.stemmer.stem(word) for word in swr_text]]
    
        prep_sentences = [" ".join(sentence) for sentence in prep_text]
        return prep_sentences

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/reecemackie/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/reecemackie/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/reecemackie/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/reecemackie/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/reecemackie/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [75]:
import numpy as np
# Simply test this shiz works

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

from gensim.models import Word2Vec

class Word2VecTransformer(BaseEstimator, TransformerMixin):

    def __init__(self, vector_size=300, window=5, min_count=1, workers=4):
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.workers = workers
        self.model = None

    def fit(self, X, y=None):
        self.model = Word2Vec(X, vector_size=self.vector_size, window=self.window, min_count=self.min_count, workers=self.workers)
        return self
    
    def _word2vec_rep(self, sentence):
        embs = [self.model.wv[word] for word in sentence if word in self.model.wv.index_to_key]
        if len(embs) == 0:
            return np.zeros(self.vector_size)
        sent_emb = np.mean(np.array(embs), axis=0)
        return sent_emb

    def transform(self, X, y=None):
        return np.array([self._word2vec_rep(doc) for doc in X])

text_clf = Pipeline([
    ('prep', PreProcessor()),
    #('count', CountVectorizer(max_features=300)),
    ('vec', TfidfVectorizer()),
    #('vec', Word2VecTransformer()),
    #('rep', TfidfTransformer()),
    #('mod', KNeighborsClassifier()),
    ('mod', MLPClassifier(solver='lbfgs', hidden_layer_sizes=(32,)))
])


text_clf.fit(train_x, train_y)
predictions = text_clf.predict(test_x)
acc = accuracy_score(predictions, test_y)
print(acc)

0.7880285557386052


## Representation Learning

## NLP Algorithms

## Evaluation

## Paper Overview

## Implementation of C-LSTM Algorithm

## Paper Evaluation