In [None]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

from sklearn.model_selection import train_test_split

import tensorflow as tf
import matplotlib.pyplot as plt

# Parameters

In [None]:
# --- Vocabulary filtering ---------------------------------------------------
# A token is kept only if it occurs at least this many times in the corpus.
min_occurance = 10

# --- Word2Vec ---------------------------------------------------------------
# Output dimension handed to the Embedding layer.
size_word2vec = 100
# Presumably Word2Vec's `min_count`; not referenced in the visible cells -- TODO confirm.
min_count_word2vec = 10

# --- Settings shared by the neural networks ---------------------------------
# Fraction of the rows held out by train_test_split.
test_size = 0.25
epochs = 50
batch_size = 512

# --- LSTM layers ------------------------------------------------------------
lstm_size = 512
lstm_recurrent_dropout = 0.0
lstm_dropout = 0.5

# Import cleaned_data CSV

In [None]:
# Load the cleaned tweets.
# NOTE(review): the path below is absolute and machine-specific; consider moving it
# to a configurable data directory so the notebook runs on other machines.
df = pd.read_csv(
    "C:/Users/Théo/Documents/twitter_sentiment_analysis/data/cleaned_data.csv",
    # "C:/Users/HENAFF/Documents/Cours Polytech/S9 en Roumanie/Machine Learning - ML/data/mid_cleaned_data.csv",
    # nrows=20000,
    encoding='latin-1')
# Cast every entry to str so that each row can be split below.
df['clean_text'] = df.clean_text.astype(str)

# dict_word : token -> number of occurrences over the whole corpus
# sentences : one list of tokens per row of `clean_text`
dict_word = {}
sentences = []
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for _, text in df['clean_text'].items():
    words = text.split(" ")
    sentences.append(words)
    for word in words:
        dict_word[word] = dict_word.get(word, 0) + 1


# Tokens that occur at least `min_occurance` times.
cleaned_dict_word = [k for k, c in dict_word.items() if c >= min_occurance]

# Load Word2Vec model

In [None]:
from gensim.models import KeyedVectors

# Load the pretrained GoogleNews vectors. These are read-only word vectors: the
# file does not include the intermediate training state, so the model cannot be
# refined with additional data.
#
# `Word2Vec.load_word2vec_format` is deprecated in gensim in favour of
# `KeyedVectors.load_word2vec_format`, and the `norm_only` keyword is no longer
# accepted, so the vectors are normalised explicitly below instead.
#
# NOTE(review): the file name indicates 300-dimensional vectors, whereas
# `size_word2vec` is 100 -- confirm which embedding is intended.
# NOTE(review): a later cell rebinds the name `model` to the Keras network, so
# this cell must be re-run before the embedding matrix is rebuilt.
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

# Stand-in for the old `norm_only=True`: keep only the unit-normalised vectors.
model.init_sims(replace=True)

# Other cells read the vectors through `model.wv`; expose that attribute when the
# installed gensim's KeyedVectors does not provide it.
if not hasattr(model, 'wv'):
    model.wv = model

# Define functions to create the word2vec weight matrix

In [None]:
def load_embedding(filename, encoding='utf-8'):
    """Read a text embedding file into a ``{word: vector}`` dictionary.

    The first line of the file is skipped. Every following non-blank line is
    split on whitespace: the first field is the word and the remaining fields
    are parsed as the components of its vector.

    Parameters
    ----------
    filename : str
        Path of the text file to read.
    encoding : str, optional
        Text encoding used to open the file. Defaults to ``'utf-8'`` so the
        result does not depend on the platform's default encoding.

    Returns
    -------
    dict
        Maps each word (str) to a ``float32`` numpy array.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a vector component cannot be parsed as a float.
    """
    embedding = dict()
    # The context manager closes the file even if parsing fails part-way through.
    with open(filename, 'r', encoding=encoding) as file:
        # Skip the first line; the default keeps an empty file from raising.
        next(file, None)
        for line in file:
            parts = line.split()
            if not parts:
                # Blank line (e.g. a trailing newline): nothing to store.
                continue
            # key is string word, value is numpy array for vector
            embedding[parts[0]] = np.asarray(parts[1:], dtype='float32')
    return embedding


def get_weight_matrix(vocab, embedding=None, dim=100):
    """Build the weight matrix whose row ``i`` is the vector of the word indexed ``i``.

    Parameters
    ----------
    vocab : dict
        Maps each word to its integer index (e.g. a tokenizer's ``word_index``).
    embedding : mapping, optional
        Anything that returns a vector for ``embedding[word]`` and raises
        ``KeyError`` for unknown words (a plain dict works). When omitted, the
        notebook-level ``model`` is used, through ``model.wv`` if present.
    dim : int, optional
        Number of columns of the matrix, i.e. the expected vector length.
        Defaults to 100.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(vocab) + 1, dim)``. Rows of words missing from the
        embedding, and any row no word maps to, stay all zeros.

    Raises
    ------
    ValueError
        If a vector's shape is not ``(dim,)``. This used to be swallowed
        silently, leaving an all-zero matrix.
    """
    if embedding is None:
        # Backward-compatible default: fall back to the global word-vector model.
        embedding = getattr(model, 'wv', model)

    vocab_size = len(vocab) + 1
    # define weight matrix dimensions with all 0
    weight_matrix = np.zeros((vocab_size, dim))

    for w, i in vocab.items():
        # Indices that do not fit in the matrix are skipped; later words are still processed.
        if i >= vocab_size:
            continue
        try:
            vect = np.asarray(embedding[w])
        except KeyError:
            # Word absent from the embedding: its row keeps only zeros.
            continue
        if vect.shape != (dim,):
            raise ValueError(
                "vector for %r has shape %r, expected (%d,)" % (w, vect.shape, dim))
        weight_matrix[i] = vect
    return weight_matrix

# Convert text to vector

In [None]:
# The raw documents, used both to fit the tokenizer and to encode the corpus.
texts = df['clean_text'].values

# Length of the longest document, counted in whitespace-separated tokens.
max_length = max(len(doc.split()) for doc in df['clean_text'])

# Fit the tokenizer on the documents (lower-casing enabled).
tk = Tokenizer(lower=True)
tk.fit_on_texts(texts)

# Encode each document as a sequence of integers, then pad at the end
# ('post') so that every sequence has exactly `max_length` entries.
X_seq = tk.texts_to_sequences(texts)
X_pad = pad_sequences(X_seq, padding='post', maxlen=max_length)

# Vocabulary size: one more than the number of indexed words.
vocab_size = 1 + len(tk.word_index)

# Embedding read from file. Note that it is not passed to get_weight_matrix
# below and is not used again in this cell.
raw_embedding = load_embedding('embedding_word2vec.txt')

# One row per tokenizer index, in tokenizer order.
embedding_vectors = get_weight_matrix(tk.word_index)

# Frozen (non-trainable) embedding layer initialised with those rows.
embedding_layer = Embedding(
    vocab_size,
    size_word2vec,
    weights=[embedding_vectors],
    input_length=max_length,
    trainable=False,
)

# Split test/train data

In [None]:
# Shuffle the rows and hold out `test_size` of them for testing; the fixed
# random_state keeps the split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X_pad,
    df['polarity'].values,
    test_size=test_size,
    shuffle=True,
    random_state=1,
)

# The first `batch_size` training rows become the validation set;
# the remaining rows are what the network is actually fitted on.
X_valid, y_valid = X_train[:batch_size], y_train[:batch_size]
X_train1, y_train1 = X_train[batch_size:], y_train[batch_size:]

# Model description

In [None]:
# Network: embedding layer -> two stacked LSTMs -> one sigmoid output unit.
# The first LSTM returns the full sequence so that the second one can consume it.
model = Sequential([
    embedding_layer,
    LSTM(
        lstm_size,
        dropout=lstm_dropout,
        recurrent_dropout=lstm_recurrent_dropout,
        return_sequences=True,
    ),
    LSTM(
        lstm_size,
        dropout=lstm_dropout,
        recurrent_dropout=lstm_recurrent_dropout,
    ),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy loss, Adam optimiser, accuracy reported as the metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train on the reduced training set; the held-back rows are used for validation.
# `history` keeps the per-epoch metrics.
history = model.fit(
    X_train1,
    y_train1,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(X_valid, y_valid),
    shuffle=True,
)

# Evaluate the model performance

In [None]:
# Score the trained network on the held-out test set; scores[1] is printed as
# the accuracy (the model's single compiled metric).
scores = model.evaluate(X_test, y_test, verbose=0)
print("Test accuracy : ", scores[1])


# Per-epoch curves recorded during training.
loss = history.history['loss']
loss_val = history.history['val_loss']
accuracy = history.history['accuracy']
accuracy_val = history.history['val_accuracy']
# Use a dedicated name for the x-axis: assigning this range to `epochs` would
# overwrite the integer hyper-parameter and break a re-run of the training cell.
epoch_axis = range(1, len(loss) + 1)

# Blue = loss, red = accuracy; solid = training, dashed = validation.
plt.plot(epoch_axis, loss, 'b', label='Training loss')
plt.plot(epoch_axis, loss_val, 'b--', label='validation loss')
plt.plot(epoch_axis, accuracy, 'r', label='Training accuracy')
plt.plot(epoch_axis, accuracy_val, 'r--', label='validation accuracy')

plt.title('Training and Validation loss')
plt.xlabel('Epochs')
plt.legend()
plt.grid()
plt.show()
