In [123]:

# Sentence ---> Cleaning ---> Normalization ---> Tokenizer ---> Vocab2number ---> Padding ---> Numerical input --->
#               Word2vecModel---> Embedding(sentences, words, features) ---> LSTM (sentences, words, features) ---> Dense ---> Prediction
#                 ||                      ||                                                ||                        ||
#             (sentences, features)     (sentences, words, features)                  (sentences, features)         (sentences, labels)

In [124]:

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense
from keras.models import Sequential
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec


In [125]:
# Read the tab-separated dataset, then split it into the tweet text (X) and
# the label column (Y). The label column name, as written here, ends with a
# trailing space.
df = pd.read_csv('HoF_data.tsv', sep='\t')
X = df['Tweet']
Y = df['Task A ']



In [126]:

def text_transformation(text_data):
    """Fit a Keras ``Tokenizer`` on ``text_data`` and encode the same texts.

    Parameters
    ----------
    text_data : iterable of str
        Texts used both to build the vocabulary and to be converted to ids.

    Returns
    -------
    tuple
        ``(tokenizer, padded_text)``: the fitted tokenizer and the result of
        ``pad_sequences`` applied to the encoded texts, with zeros inserted
        in front of each sequence.
    """
    fitted_tokenizer = Tokenizer()
    fitted_tokenizer.fit_on_texts(text_data)
    # Words -> integer ids, then pad on the left with 0.
    id_sequences = fitted_tokenizer.texts_to_sequences(text_data)
    return fitted_tokenizer, pad_sequences(id_sequences, padding="pre", value=0)


In [127]:
def inverse_transformation(tokenizer, test_data):
    """Tokenize ``test_data`` and map every token id back to its word.

    Parameters
    ----------
    tokenizer : object
        Fitted tokenizer exposing ``word_index`` (word -> id) and
        ``texts_to_sequences``.
    test_data : iterable of str
        Texts to tokenize.

    Returns
    -------
    list of list of str
        One list of words per input text, in the order the tokenizer
        produced the ids.
    """
    # Invert the word -> id mapping once so each lookup below is a dict hit.
    id_to_word = {index: word for word, index in tokenizer.word_index.items()}
    return [
        [id_to_word[token_id] for token_id in encoded_text]
        for encoded_text in tokenizer.texts_to_sequences(test_data)
    ]


In [128]:
def pretrained_embedding_creation(tokenizer, text_data, vector_size, model_name):
    """Train a Word2Vec model on ``text_data`` and save it under ``model_name``.

    The texts are first split into words with ``inverse_transformation`` so
    the training vocabulary is the one produced by ``tokenizer``.

    Parameters
    ----------
    tokenizer : fitted tokenizer passed through to ``inverse_transformation``.
    text_data : iterable of str
        Texts to train on.
    vector_size : int
        Dimensionality of the trained word vectors.
    model_name : str
        Path the trained model is saved to.

    Returns
    -------
    None
    """
    sentences_as_words = inverse_transformation(tokenizer, text_data)
    # NOTE(review): `size=` is the keyword this notebook's gensim accepts;
    # newer gensim releases name it `vector_size` -- confirm before upgrading.
    Word2Vec(
        sentences_as_words, size=vector_size, window=3, min_count=1
    ).save(model_name)


In [129]:
def label_encoder(Y):
    """Fit a ``LabelEncoder`` on ``Y`` and return it with the encoded labels.

    Parameters
    ----------
    Y : array-like
        Class labels.

    Returns
    -------
    tuple
        ``(encoder, y_transformed)`` where ``y_transformed`` is the integer
        encoding of ``Y`` reshaped to a single column, shape ``(-1, 1)``.
    """
    encoder = LabelEncoder()
    encoder.fit(Y)
    encoded_column = encoder.transform(Y).reshape(-1, 1)
    return encoder, encoded_column



In [130]:

def map_vector(tokenizer, word2vec_model, vector_dim):
    """Build an embedding weight matrix aligned with the tokenizer's word ids.

    Row ``i`` of the result holds the vector of the word whose tokenizer
    index is ``i``. Row 0 and the rows of words that the embedding does not
    contain stay all-zero.

    Parameters
    ----------
    tokenizer : object
        Fitted tokenizer exposing ``word_index`` (word -> id).
    word2vec_model : object
        Either a model that exposes its vectors through a ``.wv`` attribute,
        or an object that can be indexed by word directly.
    vector_dim : int
        Length of each word vector (number of matrix columns).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(word_index) + 1, vector_dim)``.

    Raises
    ------
    ValueError
        If a looked-up vector does not have length ``vector_dim``.
    """
    vocab = tokenizer.word_index
    weight_matrix = np.zeros((len(vocab) + 1, vector_dim))

    # Resolve the lookup table once. Falling back to the object itself keeps
    # this working for vector stores that have no `.wv`; previously that
    # AttributeError was swallowed and the matrix silently stayed all zeros.
    vectors = getattr(word2vec_model, "wv", word2vec_model)

    for word, i in vocab.items():
        try:
            weight_matrix[i] = vectors[word]
        except KeyError:
            # Word is not in the embedding: leave its row as zeros.
            continue
    return weight_matrix


In [131]:
# Parameter and hyper-parameter initialization
embedding_dim = [25, 50, 100, 200]  # vector sizes compared in this notebook
lstm_units = 50
batch_size = 10
epoches = 10
test_data = 0.2  # fraction of the samples held out by train_test_split
random_seed = 42  # fixed so the train/test split is identical on every run

Tokenizer_obj, x_data = text_transformation(X)
# NOTE(review): this rebinds the name `label_encoder` from the helper function
# to the fitted encoder object, so re-running this cell without re-running the
# cell that defines the function fails. Later cells use `label_encoder` as the
# encoder, so the name is kept as is.
label_encoder, y_data = label_encoder(Y)
pretrained_embedding_creation(Tokenizer_obj, X, embedding_dim[1], 'HoF_embedding.50d.txt')

X_train, X_test, Y_train, Y_test = train_test_split(
    x_data, y_data, test_size=test_data, shuffle=True, random_state=random_seed
)


In [132]:
# Reload the Word2Vec model stored under the file name that
# pretrained_embedding_creation was given, and show its summary line.
local_embedding_path = 'HoF_embedding.50d.txt'
w2v_model = Word2Vec.load(local_embedding_path)
print(w2v_model)

Word2Vec(vocab=15497, size=50, alpha=0.025)


In [133]:
# Working with GloVe word embeddings
# Steps:
# 1. Download the GloVe pretrained embeddings. Here we use the Twitter embeddings, which provide 25-, 50-, 100- and 200-dimensional vectors. We use all four dimensionalities and compare the results.
# 2. Use the gensim library to convert the GloVe vectors to the word2vec text format.
# 3. Load the converted vectors with gensim's KeyedVectors.load_word2vec_format function.

# !wget http://downloads.cs.stanford.edu/nlp/data/glove.twitter.27B.zip
# !unzip glove*.zip

In [134]:
# Map numerical representation with pretrained embedding with respect to the token

from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models.keyedvectors import KeyedVectors


def _load_glove_weights(word2vec_file, vector_dim):
    """Load converted GloVe vectors and map them onto ``Tokenizer_obj``.

    Returns ``(keyed_vectors, weight_matrix)``: the vectors loaded from
    ``word2vec_file`` and the matrix ``map_vector`` builds from them.
    """
    keyed_vectors = KeyedVectors.load_word2vec_format(word2vec_file, binary=False)
    return keyed_vectors, map_vector(Tokenizer_obj, keyed_vectors, vector_dim)


# The commented-out glove2word2vec calls are the conversion step that writes
# the word2vec-format files loaded below.

# glove2word2vec(glove_input_file="glove.twitter.27B.25d.txt", word2vec_output_file="word2vec_glove_vectors.25d.txt")
glove_model_25d, glove_25_weight_matrix = _load_glove_weights("word2vec_glove_vectors.25d.txt", embedding_dim[0])

# glove2word2vec(glove_input_file="glove.twitter.27B.50d.txt", word2vec_output_file="word2vec_glove_vectors.50d.txt")
glove_model_50d, glove_50_weight_matrix = _load_glove_weights("word2vec_glove_vectors.50d.txt", embedding_dim[1])

# glove2word2vec(glove_input_file="glove.twitter.27B.100d.txt", word2vec_output_file="word2vec_glove_vectors.100d.txt")
glove_model_100d, glove_100_weight_matrix = _load_glove_weights("word2vec_glove_vectors.100d.txt", embedding_dim[2])

# glove2word2vec(glove_input_file="glove.twitter.27B.200d.txt", word2vec_output_file="word2vec_glove_vectors.200d.txt")
glove_model_200d, glove_200_weight_matrix = _load_glove_weights("word2vec_glove_vectors.200d.txt", embedding_dim[3])


  if __name__ == '__main__':


In [135]:
# Weight matrix built from the locally trained Word2Vec model, using the
# second entry of embedding_dim as the vector length.
local_weight_matrix = map_vector(
    tokenizer=Tokenizer_obj,
    word2vec_model=w2v_model,
    vector_dim=embedding_dim[1],
)

In [136]:

'''
vocab_size = 1 x 10000
output = 1 x 100
1.  Pretrained embeddings (PE)
2.  Input(Embedding Layer) == Output(Pretrained Embedding)
tokenizer.word_index = {'the':1, 'is':2, 'us':3, ......., 'home':10000}
PE = {'is':1, 'home':2, 'us':3, ......, 'the':10010}
'''


def singleLayerModel(vocab_size, embedding_dim, lstm_units, pretrained_embedding):
    """Build and compile an Embedding -> LSTM -> Dense(1, sigmoid) classifier.

    Parameters
    ----------
    vocab_size : int
        Number of rows of the embedding table (``input_dim``).
    embedding_dim : int
        Length of each embedding vector (``output_dim``).
    lstm_units : int
        Number of units of the single LSTM layer.
    pretrained_embedding : array-like
        Initial weights for the embedding layer. The layer is created with
        ``trainable=False``, so these weights are not updated by training.

    Returns
    -------
    The compiled ``Sequential`` model (Adam optimizer, binary cross-entropy
    loss, accuracy metric).
    """
    model = Sequential()
    # mask_zero=True: index 0 is the padding value used by pad_sequences here.
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[pretrained_embedding], trainable=False, mask_zero=True))
    model.add(LSTM(units=lstm_units, return_sequences=False))
    model.add(Dense(1, activation='sigmoid'))
    # summary() prints the table itself and returns None; wrapping it in
    # print() emitted a stray "None" line after the table.
    model.summary()
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


In [137]:
# Model initialised from glove_25_weight_matrix (vector length embedding_dim[0]).
sDLM_25 = singleLayerModel(
    vocab_size=len(Tokenizer_obj.word_index) + 1,
    embedding_dim=embedding_dim[0],
    lstm_units=lstm_units,
    pretrained_embedding=glove_25_weight_matrix,
)
sDLM_25_history = sDLM_25.fit(
    X_train,
    Y_train,
    epochs=epoches,
    batch_size=batch_size,
    validation_data=(X_test, Y_test),
)

Model: "sequential_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_12 (Embedding)    (None, None, 25)          387450    
                                                                 
 lstm_11 (LSTM)              (None, 50)                15200     
                                                                 
 dense_11 (Dense)            (None, 1)                 51        
                                                                 
Total params: 402,701
Trainable params: 15,251
Non-trainable params: 387,450
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Model initialised from glove_50_weight_matrix (vector length embedding_dim[1]).
sDLM_50 = singleLayerModel(
    vocab_size=len(Tokenizer_obj.word_index) + 1,
    embedding_dim=embedding_dim[1],
    lstm_units=lstm_units,
    pretrained_embedding=glove_50_weight_matrix,
)
sDLM_50_history = sDLM_50.fit(
    X_train,
    Y_train,
    epochs=epoches,
    batch_size=batch_size,
    validation_data=(X_test, Y_test),
)

In [None]:
# Model initialised from glove_100_weight_matrix (vector length embedding_dim[2]).
sDLM_100 = singleLayerModel(
    vocab_size=len(Tokenizer_obj.word_index) + 1,
    embedding_dim=embedding_dim[2],
    lstm_units=lstm_units,
    pretrained_embedding=glove_100_weight_matrix,
)
sDLM_100_history = sDLM_100.fit(
    X_train,
    Y_train,
    epochs=epoches,
    batch_size=batch_size,
    validation_data=(X_test, Y_test),
)

In [None]:
# Model initialised from glove_200_weight_matrix (vector length embedding_dim[3]).
sDLM_200 = singleLayerModel(
    vocab_size=len(Tokenizer_obj.word_index) + 1,
    embedding_dim=embedding_dim[3],
    lstm_units=lstm_units,
    pretrained_embedding=glove_200_weight_matrix,
)
sDLM_200_history = sDLM_200.fit(
    X_train,
    Y_train,
    epochs=epoches,
    batch_size=batch_size,
    validation_data=(X_test, Y_test),
)

In [None]:
# Model initialised from local_weight_matrix (vector length embedding_dim[1]).
# The model is named DLM_local; its training history is sDLM_local_history.
DLM_local = singleLayerModel(
    vocab_size=len(Tokenizer_obj.word_index) + 1,
    embedding_dim=embedding_dim[1],
    lstm_units=lstm_units,
    pretrained_embedding=local_weight_matrix,
)
sDLM_local_history = DLM_local.fit(
    X_train,
    Y_train,
    epochs=epoches,
    batch_size=batch_size,
    validation_data=(X_test, Y_test),
)

In [None]:
# Gather the per-epoch curves of every training run, keyed by the run's name.
history = [sDLM_local_history, sDLM_25_history, sDLM_50_history, sDLM_100_history, sDLM_200_history]
history_name = ['sDLM_local_history', 'sDLM_25_history', 'sDLM_50_history', 'sDLM_100_history', 'sDLM_200_history']

named_runs = list(zip(history_name, history))

his_train_acc = {run_name: run.history['accuracy'] for run_name, run in named_runs}
his_val_acc = {run_name: run.history['val_accuracy'] for run_name, run in named_runs}
his_train_loss = {run_name: run.history['loss'] for run_name, run in named_runs}
his_val_loss = {run_name: run.history['val_loss'] for run_name, run in named_runs}

In [None]:
# Training accuracy: one column per run, one row per entry of the recorded curve.
train_acc_table = pd.DataFrame(his_train_acc)
print(train_acc_table)

In [None]:
# Training loss: one column per run, one row per entry of the recorded curve.
train_loss_table = pd.DataFrame(his_train_loss)
print(train_loss_table)

In [None]:
# Validation accuracy: one column per run, one row per entry of the recorded curve.
val_acc_table = pd.DataFrame(his_val_acc)
print(val_acc_table)

In [None]:
# Validation loss: one column per run, one row per entry of the recorded curve.
val_loss_table = pd.DataFrame(his_val_loss)
print(val_loss_table)

In [None]:
# Prediction is demonstrated with a single trained model, DLM_local. The other
# trained models can be used for prediction and evaluation in the same way.
# predict() must be called on the model itself: sDLM_local_history is only the
# object returned by DLM_local.fit(), not the model.
predictedion = DLM_local.predict(X_test)

In [None]:
# Threshold the model outputs at 0.5 to obtain hard 0/1 class ids. The
# predictions are flattened first so each element is a single probability.
predict_class = [1 if pred_prob >= 0.5 else 0 for pred_prob in predictedion.reshape(-1)]

predicted_labels = label_encoder.inverse_transform(predict_class)
# Y_test is a single-column array (the labels were reshaped to (-1, 1));
# flatten it to 1-D, the shape inverse_transform expects, instead of relying
# on an implicit conversion.
Y_test_labels = label_encoder.inverse_transform(Y_test.reshape(-1))

In [None]:
from sklearn import metrics

# Classification report and confusion matrix, both computed on the decoded
# labels and both using the encoder's class list.
score = metrics.classification_report(
    Y_test_labels,
    predicted_labels,
    target_names=label_encoder.classes_,
)
cf_matrix = metrics.confusion_matrix(
    Y_test_labels,
    predicted_labels,
    labels=label_encoder.classes_,
)
for report in (score, cf_matrix):
    print(report)


In [None]:
import seaborn as sn
import matplotlib.pyplot as plt


# Build one annotation per confusion-matrix cell: name, raw count and share
# of all samples. The 2x2 reshape below fixes the layout to two classes.
group_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
flat_counts = cf_matrix.flatten()
group_counts = ['{0:0.0f}'.format(count) for count in flat_counts]
group_percentages = ['{0:.2%}'.format(share) for share in flat_counts / np.sum(cf_matrix)]
labels = np.asarray(
    [f'{v1}\n{v2}\n{v3}' for v1, v2, v3 in zip(group_names, group_counts, group_percentages)]
).reshape(2, 2)

sn.set(font_scale=1.4)  # for label size
ax = plt.subplot()
# fmt='' because the annotations are already formatted strings.
sn.heatmap(cf_matrix, annot=labels, fmt='')
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
for axis in (ax.xaxis, ax.yaxis):
    axis.set_ticklabels(label_encoder.classes_)

plt.show()