In [1]:
# Reload imported modules automatically before each cell runs, so edits to the
# local helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
from our_functionsv3 import *

import tensorflow.keras as keras
from tensorflow import set_random_seed
set_random_seed(0)
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
from keras.callbacks import EarlyStopping
from keras import regularizers
import time
import matplotlib
import matplotlib.pyplot as plt

Using TensorFlow backend.


In [3]:
from LSTM_functions import *

In [13]:
from keras.layers import Dense, Conv2D, MaxPooling2D, Dropout, Flatten

### Load data

In [14]:
# Read the preprocessed positive and negative training tweets.
# NOTE(review): get_data is a project helper; presumably it returns the
# sentences and one label per sentence - confirm in its definition.
X_train, Y_train = get_data(
    pos="twitter-datasets/train_pos_preprocessed_to_use.txt",
    neg="twitter-datasets/train_neg_preprocessed_to_use.txt",
)

# One-hot encode the labels over C=2 classes for the softmax output.
Y_train_oh = convert_to_one_hot(Y_train, C=2)

In [15]:
# Only the first return value (the ids later written to the submission CSV) is
# kept from the raw test file; the second one is discarded.
ids, _ = get_test_data("twitter-datasets/test_data.txt")
# The test sentences themselves come from the preprocessed file.
# NOTE(review): presumably row-aligned with `ids` - confirm in preprocessing.
X_test = read_data("twitter-datasets/test_preprocessed_to_use.txt")

In [16]:
# Load the 200-dimensional GloVe Twitter vectors and the word <-> index lookups.
# NOTE(review): the "only_alpha" suffix suggests non-alphabetic tokens are
# filtered out - confirm in the helper's definition.
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs_only_alpha('dictionnary/glove.twitter.27B.200d.txt')

### Converts an array of sentences (strings) into an array of indices corresponding to words in the sentences

In [17]:
# Sequence length used for both the index arrays and the model input shape.
# It is derived from the training set only.
# NOTE(review): presumably the longest training sentence in words - confirm.
max_length = get_max_length(X_train)

In [18]:
# Map every sentence to a row of word indices using the GloVe vocabulary.
# The same max_length (computed from the training set) is used for both sets.
# NOTE(review): how test sentences longer than max_length are handled depends
# on sentences_to_indices - confirm it truncates rather than raising.
X_train_indices = sentences_to_indices(X_train, word_to_index, max_length)
X_test_indices = sentences_to_indices(X_test, word_to_index, max_length)

37711 words were not in the dictionary
1898 words were not in the dictionary


### Model

In [19]:
def smiley_LSTM_improved(input_shape, word_to_vec_map, word_to_index, dropout_rate, hidden_layers, reg, l1_lambda, l2_lambda):
    """Build a CNN sentiment classifier on top of pretrained GloVe embeddings.

    Despite its (historical) name, the network is convolutional:
    Embedding -> Conv2D(32) -> MaxPooling2D -> Conv2D(64) -> Dropout ->
    Flatten -> Dense(128) -> Dropout -> Dense(2) -> softmax.

    Arguments:
    input_shape -- shape of one input sample, e.g. (max_length,)
    word_to_vec_map -- mapping from words to their GloVe vectors
    word_to_index -- mapping from words to their vocabulary index
    dropout_rate -- accepted for interface compatibility; currently unused,
                    the dropout layers use a fixed rate of 0.5
    hidden_layers -- accepted for interface compatibility; currently unused
    reg -- 'l2', 'l1', or anything else for no kernel regularization
    l1_lambda -- L1 penalty factor, used when reg == 'l1'
    l2_lambda -- L2 penalty factor, used when reg == 'l2'

    Returns:
    model -- an uncompiled Keras Model mapping index sequences to 2-class
             softmax probabilities
    """
    # Reshape is not part of the notebook's top-level layer imports.
    from keras.layers import Reshape

    fixed_dropout = 0.5

    if reg == 'l2':
        regularizer = regularizers.l2(l2_lambda)
    elif reg == 'l1':
        regularizer = regularizers.l1(l1_lambda)
    else:
        regularizer = None

    # Input of the graph: integer word indices of shape input_shape.
    sentence_indices = Input(shape=input_shape, dtype='int32')

    # Embedding layer initialised from the GloVe vectors.
    # NOTE(review): pretrained_embedding_layer is a project helper; presumably
    # it returns a Keras Embedding layer - confirm.
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    embeddings = embedding_layer(sentence_indices)

    # Conv2D needs a 4-D input (batch, rows, cols, channels), while an
    # embedding output is 3-D (batch, sequence, embedding_dim). Add a single
    # trailing channel axis (assumes the default channels_last data format).
    embedding_dim = int(embeddings.shape[-1])
    X = Reshape(tuple(input_shape) + (embedding_dim, 1))(embeddings)

    X = Conv2D(32, kernel_size=(3, 3), activation='relu', kernel_regularizer=regularizer)(X)
    X = MaxPooling2D(pool_size=(2, 2))(X)
    X = Conv2D(64, (3, 3), activation='relu', kernel_regularizer=regularizer)(X)
    X = Dropout(fixed_dropout)(X)

    # Collapse the feature maps to one vector per sample so the dense head
    # produces a (batch, 2) output matching the one-hot labels.
    X = Flatten()(X)
    # The layer must be called on X; assigning the bare layer object would
    # make the following Dropout receive a layer instead of a tensor.
    X = Dense(128, activation='relu', kernel_regularizer=regularizer)(X)
    X = Dropout(fixed_dropout)(X)

    # Project to 2 logits, then turn them into class probabilities.
    X = Dense(2)(X)
    X = Activation('softmax')(X)

    # Model instance which converts sentence_indices into X.
    model = Model(inputs=sentence_indices, outputs=X)

    return model

In [20]:
def complete_model_improved(X_train_indices, Y_train_oh, word_to_vec_map, word_to_index, max_length, summary = False, dropout_rate = 0.25, batch_size = 128, 
                   epochs = 50, loss ='categorical_crossentropy', optimizer ='adam', hidden_layers = 1, 
                            reg = '', l1_lambda = 0, l2_lambda = 0):
    """Build, compile and train the model from smiley_LSTM_improved.

    Arguments:
    X_train_indices -- training inputs as rows of word indices
    Y_train_oh -- one-hot encoded training labels
    word_to_vec_map, word_to_index -- GloVe lookups forwarded to the model builder
    max_length -- sequence length; the model input shape is (max_length,)
    summary -- if True, print the Keras model summary before training
    dropout_rate, hidden_layers, reg, l1_lambda, l2_lambda -- forwarded
        unchanged to smiley_LSTM_improved
    batch_size -- mini-batch size passed to fit
    epochs -- maximum number of epochs passed to fit (early stopping may end
              training sooner)
    loss, optimizer -- passed to model.compile

    Returns:
    history_lstm -- the History object returned by model.fit
    model -- the trained Keras model
    """
    model = smiley_LSTM_improved((max_length,), word_to_vec_map, word_to_index, dropout_rate, hidden_layers, reg, l1_lambda, l2_lambda)

    if summary:
        model.summary()

    model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])

    # Stop once validation accuracy has not improved by at least 1e-4 for
    # 3 consecutive epochs.
    # NOTE(review): newer Keras versions log this metric as 'val_accuracy';
    # confirm the key matches the installed version.
    earlystop = EarlyStopping(monitor='val_acc', min_delta=0.0001, patience=3, verbose=1, mode='auto')
    callbacks_list = [earlystop]

    start = time.time()
    # Honour the caller's epochs / batch_size instead of hard-coded values;
    # the last 10% of the training data is held out for validation.
    history_lstm = model.fit(X_train_indices, Y_train_oh, epochs=epochs, callbacks=callbacks_list,
                             batch_size=batch_size, validation_split=0.1, shuffle=True)
    elapsed = time.time() - start
    print("Model took {} seconds (which is {} minutes or {} hours) to train".format(elapsed, elapsed/60, elapsed/3600))

    return history_lstm, model

In [21]:
# L2 penalty factors to try; one model is trained per value.
l2_lambda = [0]
# The submission path does not depend on the loop variable, so every iteration
# overwrites the same file: only the last trained model's predictions remain.
path = 'submissions/submission_model_cnn.csv'

for l2 in l2_lambda:
    history, model = complete_model_improved(
        X_train_indices, Y_train_oh, word_to_vec_map, word_to_index, max_length,
        summary=True,
        dropout_rate=0.5,  # a dropout rate is a fraction in [0, 1); 50 was out of range
        batch_size=128,
        epochs=50,
        loss='categorical_crossentropy',
        optimizer='adam',
        hidden_layers=0,
        reg='l2',
        l1_lambda=0,
        l2_lambda=l2,
    )
    label = predict_lstm(model, X_test_indices)
    create_csv_submission(ids, label, path)

KeyboardInterrupt: 