In [1]:
# VADER: https://towardsdatascience.com/sentiment-analysis-in-10-minutes-with-rule-based-vader-and-nltk-72067970fb71

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import numpy as np
import scipy.sparse

In [2]:
def load_cleaned_submissions():
    data = pd.read_pickle("./data/cleaned_submissions.pkl")
    data = data.loc[data["sentiment"] != "", ["text", "sentiment"]] # Only get labeled instances
    return data

In [3]:
def encode_vectorize_data(data):
    # Encode labels
    lenc = LabelEncoder()
    y = lenc.fit_transform(data["sentiment"])

    # Vectorize text using tfidf
    tfidf = TfidfVectorizer(preprocessor=' '.join, lowercase=False, min_df=5) # min_df = Minimum occurance of words
    X = tfidf.fit_transform(data["text"])
    return X, y

In [4]:
def load_sparse_matrices():
    X = scipy.sparse.load_npz('./data/X_sparse.npz')
    y = np.load("./data/y_sparse.npy")
    return X, y

In [5]:
def split_train_test(X, y):
    # Train-Test Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)
    return X_train, X_test, y_train, y_test

# SVM

In [14]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV

pipeline = Pipeline([
        ("svc", SVC())
    ])

param_grid = {"svc__kernel": ["poly"], "svc__C": [3]} #["poly", "rbf", "sigmoid", "linear"], [3, 4, 5, 6, 7]
CV = GridSearchCV(pipeline, param_grid, cv = 5)
# pipeline.get_params().keys() See all available parameters
CV.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=Pipeline(steps=[('svc', SVC())]),
             param_grid={'svc__C': [3], 'svc__kernel': ['poly']})

In [17]:
import pickle
pickle.dump(CV, open("./data/svm_first_model_poly_3.sav", "wb"))

In [18]:
loaded_model = pickle.load(open("./data/svm_first_model_poly_3.sav", "rb"))

In [19]:
print("Accuracy of {} can be achieved with the following parameters: {}".format(loaded_model.score(X_test, y_test), CV.best_params_))

Accuracy of 0.8837895792141246 can be achieved with the following parameters: {'svc__C': 3, 'svc__kernel': 'poly'}


# Naive Bayes

In [20]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV

pipeline = Pipeline([
        ("mnb", MultinomialNB())
    ])

param_grid = {"mnb__alpha": [1]}
CV = GridSearchCV(pipeline, param_grid, cv = 5)
# pipeline.get_params().keys() See all available parameters
CV.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=Pipeline(steps=[('mnb', MultinomialNB())]),
             param_grid={'mnb__alpha': [1]})

In [23]:
import pickle
pickle.dump(CV, open("./data/nb_first_model_mnb.sav", "wb"))

In [24]:
loaded_model = pickle.load(open("./data/nb_first_model_mnb.sav", "rb"))

In [25]:
print("Accuracy of {} can be achieved with the following parameters: {}".format(loaded_model.score(X_test, y_test), CV.best_params_))

Accuracy of 0.7632626918042831 can be achieved with the following parameters: {'mnb__alpha': 1}


# LSTM

In [6]:
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D

In [9]:
X, y = load_sparse_matrices()
X_train, X_test, y_train, y_test = split_train_test(X, y)
del X, y

In [8]:
import tensorflow as tf
print(tf.test.gpu_device_name())
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))


/device:GPU:0
Num GPUs Available:  1
Num GPUs Available:  1


In [20]:
# Flatten array
# Limit Size for DEV
X_train_lstm = X_train.toarray()[:, :, None]
y_train_lstm = y_train[:]
del X_train
del y_train

MemoryError: Unable to allocate 20.6 GiB for an array with shape (143635, 19267) and data type float64

In [10]:
import tensorflow as tf
print(tf.__version__)
print("Num GPUs Available: ", len(tf.config.list_physical_devices("GPU")))
print(tf.config.list_physical_devices())

2.7.0
Num GPUs Available:  1
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [13]:
physical_devices = tf.config.list_physical_devices('GPU')
try:
  tf.config.experimental.set_memory_growth(physical_devices[0], True)
except:
  # Invalid device or cannot modify virtual devices once initialized.
  pass

In [21]:
from tensorflow.keras.layers import Lambda
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Layer # for creating custom layers

class OneHotEncoding(Layer):
     def __init__(self, num_classes=None, sequence_length=None):
         if num_classes is None or sequence_length is None:
             raise ValueError("Can't leave params @num_classes or @sequence_length empty")
         super(OneHotEncoding, self).__init__()
         self.num_classes = num_classes
         self.sequence_length = sequence_length
     def encode(self, inputs):
         return K.one_hot(indices=inputs,
                          num_classes=self.num_classes)
     def call(self, inputs):
         return Lambda(function=self.encode,
                       input_shape=(self.sequence_length,))(inputs)

In [25]:
from keras.models import Sequential
from keras.layers import Embedding, Dense, CuDNNLSTM

def build_model(dim, hidden_states, opt):
    model = Sequential()
    # model.add(Embedding(19267, dim)) # Input dim is X_train_lstm.shape[1], dim is the output dimensionality
    model.add(CuDNNLSTM(hidden_states))
    model.add(Dense(1, activation = "softmax"))
    model.compile(opt, "categorical_crossentropy", metrics = ["acc"])
    return model

In [None]:
# Implements a generator to feed the model batches of the data
dim = [256] #[256, 512, 1024]
hidden_states = [16] #[16, 32, 64]
optimi = ["rmsprop"] #["rmsprop", "SGD", "Adam"]

acc = []
val_acc = []
loss = []
val_loss = []

param_list = []


for d in dim:
    for state in hidden_states:
        for opt in optimi:
            model = build_model(d, state, opt)
            history = model.fit(X_train, y_train,
                        epochs=2,
                        batch_size=8,
                        validation_split=0.2,
                        verbose=2)
            acc.append(history.history['acc'])
            val_acc.append(history.history['val_acc'])
            loss.append(history.history['loss'])
            val_loss.append(history.history['val_loss'])
            param_list.append("Optimizer: " + opt + " - States: " + str(state) + " - Dimensions:" + str(d))


In [15]:
# Original implementation. Runs out of memory when using the full dataset.
dim = [256] #[256, 512, 1024]
hidden_states = [16] #[16, 32, 64]
optimi = ["rmsprop"] #["rmsprop", "SGD", "Adam"]

acc = []
val_acc = []
loss = []
val_loss = []

param_list = []

for d in dim:
    for state in hidden_states:
        for opt in optimi:
            model = build_model(d, state, opt)
            history = model.fit(X_train_lstm, y_train_lstm,
                        epochs=2,
                        batch_size=8,
                        validation_split=0.2,
                        verbose=2)
            acc.append(history.history['acc'])
            val_acc.append(history.history['val_acc'])
            loss.append(history.history['loss'])
            val_loss.append(history.history['val_loss'])
            param_list.append("Optimizer: " + opt + " - States: " + str(state) + " - Dimensions:" + str(d))


InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run _EagerConst: Dst tensor is not initialized.

In [30]:
# DEV!!!
model = Sequential()
model.add(Embedding(19267, 256))
model.add(LSTM(16))
model.add(Dense(1, activation = "softmax"))
model.compile("rmsprop", "categorical_crossentropy", metrics = ["acc"])
history = model.fit(X_train_lstm, y_train_lstm,
                        epochs=2,
                        batch_size=32,
                        verbose=0)

In [54]:
# Save Model
from keras.models import load_model
history.model.save('./data/small_lstm_model.h5')

In [55]:
loaded_model = load_model('./data/small_lstm_model.h5')

In [57]:
print(loaded_model.history)

None


In [62]:
print(loaded_model.history)

None


In [42]:
print(history.history) # Loss and Accuracy

{'loss': [0.0, 0.0], 'acc': [0.6518188714981079, 0.6518188714981079]}
{'verbose': 0, 'epochs': 2, 'steps': 4489}


In [64]:
import pickle
with open('./data/train_history', 'wb') as file_pi:
    pickle.dump(history.history, file_pi)

In [65]:
with open('./data/train_history', 'rb') as file_pi:
    h = pickle.load(file_pi)

In [66]:
print(h)

{'loss': [0.0, 0.0], 'acc': [0.6518188714981079, 0.6518188714981079]}


In [32]:
import matplotlib.pyplot as plt

def plot_history(data_list, label_list, title, xlabel='Epochs', ylabel=None):
    ''' Plots a list of vectors.

    Parameters:
        data_list  : list of vectors containing the values to plot
        label_list : list of labels describing the data, one per vector
        title      : title of the plot
        ylabel     : label for the y axis
    '''
    epochs = range(1, len(data_list[0]) + 1)

    for data, label in zip(data_list, label_list):
        plt.plot(epochs, data, label=label)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.legend()

    plt.show()


In [33]:
plot_history(data_list=val_loss,
             label_list=[[""]*len(param_list)], #param_list,
             title='Comparison of different recurrent layer types',
             ylabel='Loss')
plot_history(data_list=val_acc,
             label_list=[[""]*len(param_list)], #param_list,
             title='Comparison of different recurrent layer types',
             ylabel='Validation accuracy')

NameError: name 'val_loss' is not defined

# BERT