In [105]:
import pandas as pd
import pickle
import numpy as np
import os
from keras.models import Sequential, load_model, Model
from keras.layers import Input, Dense, Dropout, Activation, Flatten, Embedding, Concatenate
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.layers import LSTM, Bidirectional, Convolution1D, Conv1D, AveragePooling1D, MaxPooling1D, GlobalMaxPooling1D
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.optimizers import SGD

from gensim.scripts.glove2word2vec import glove2word2vec 
from gensim.models.keyedvectors import KeyedVectors
from gensim.models import Word2Vec

import html, re
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from string import punctuation

In [5]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/syht/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/syht/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
path = "data/"
GLOVE_DIR = "data/glove.twitter.27B/"

In [7]:
input_dim = 100

In [8]:
def ReadCSV(datafile, labelfile):
    inputdata = pd.io.parsers.read_csv(open(datafile, "r"),delimiter=",")
    data = inputdata.as_matrix()
    #data = (tmpdata/255.0)-0.5
    label = np.loadtxt(open(labelfile, "rb"),delimiter=",")
    return data, label

In [9]:
def strip_punctuation(s):
    return ''.join(c for c in s if c not in punctuation)

def stopwordsremoval(sentence):
    stopwords_removed = [word for word in sentence.split(' ') if word not in stopwords.words('english')]
    return stopwords_removed

def clean_str(string):
    string = html.unescape(string)
    string = string.replace("\\n", " ")
    #string = string.replace("_NEG", "")
    #string = string.replace("_NEGFIRST", "")
    string = re.sub(r"@[A-Za-z0-9_s(),!?\'\`]+", "", string) #removes @---, 
    string = re.sub(r"\*", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'m", " \'m", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " ,", string)
    string = re.sub(r"!", " !", string)
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ?", string)
    string = re.sub(r"\s{2,}", " ", string)
    return stopwordsremoval(strip_punctuation(string.strip().lower()))

def preprocessing(train_file): ## we will return everything as dictionaries
    corpus_dict = {}
    intensity_dict = {}
    affect_dict = {}
    df=pd.read_csv(train_file,encoding='utf-8')
    id = df['ID'] # not used
    train_sentences=df['Tweet']
    intensity_scores=df['Intensity Score']
    affect_dimension = df['Affect Dimension']
    
    for (k1,v1),(k2,v2),(k3,v3) in zip(train_sentences.iteritems(), intensity_scores.iteritems(), affect_dimension.iteritems()):
        intensity_dict[k2] = v2
        affect_dict[k3] = v3
        # adding processed tweets in a dict
        sentence = sent_tokenize(v1) # sentence tokenize, list of sentences
        processed_tweet = []
        for sen in sentence:
            sen1=""
            sen1 = clean_str(sen)
            processed_tweet = processed_tweet+sen1
        corpus_dict[k1]=processed_tweet 
    return corpus_dict,affect_dict,intensity_dict

In [10]:
def one_hot_encoding(y):
    y = to_categorical(y)
    return y[:,1:] #remove extra zero column at the first

In [11]:
def prepare_data(data_file_name):
    data_path = path + data_file_name
    processed_data_path = path + 'processed-' + data_file_name
    # check if file is processed
    if os.path.isfile(processed_data_path):
        print("Processed file:", data_file_name)
        df = pd.read_csv(processed_data_path)
        inputs = [str(x).split() for x in df.iloc[:, 1].values]
        labels = df.iloc[:, 0].values
        return (inputs, labels)
    
    # preprocessing and save into csv file
    print("Preprocessing data file:", data_file_name)
    inputs, labels, _ = preprocessing(data_path)

    # convert dict into array
    inputs = dict_to_array(inputs)
    labels = dict_to_array(labels)

    # save into csv
    df_save = pd.DataFrame({'x': [' '.join(x) for x in inputs], 'label': labels})
    df_save.to_csv(processed_data_path, encoding='utf-8', index=False)
    
    return (inputs, labels)    

In [13]:
# convert glove to w2v
glove_input_file = GLOVE_DIR + 'glove.twitter.27B.100d.txt'
word2vec_output_file = GLOVE_DIR + 'word2vec.twitter.27B.100d.txt'

if not os.path.isfile(word2vec_output_file):
    glove2word2vec(glove_input_file, word2vec_output_file)
    print("Glove to Word2Vec conversion Done!")

word2vec = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)
print("Load word2vec done!")

Glove to Word2Vec conversion Done!
Load word2vec done!


In [125]:
# read data file
train_data, train_label = prepare_data('EI-reg-En-full-train.csv')
dev_data, dev_label = prepare_data('EI-reg-En-full-dev.csv')
test_data, test_label = prepare_data('EI-reg-En-part-test.csv')

print("Train:", len(train_data), len(train_label))
print("Val:", len(dev_data), len(dev_label))
print("Test:", len(test_data), len(test_label))

Processed file: EI-reg-En-full-train.csv
Processed file: EI-reg-En-full-dev.csv
Processed file: EI-reg-En-part-test.csv
Train: 7102 7102
Val: 1464 1464
Test: 2000 2000


In [15]:
input_data = np.concatenate((train_data, dev_data, test_data))
max_sequence_length = max([len(x) for x in input_data])
print("Max sequence length:", max_sequence_length)

Max sequence length: 23


In [16]:
# embedding data
def embedding(data, max_len):
    data_eb = [] #np.zeros((len(data), max_len, input_dim))
    for i in range(len(data)):
        row_eb = []
        for j, token in enumerate(data[i]):
            if token in word2vec:
                #data_eb[i][-len(data[i]) + j] = word2vec[token]     
                row_eb.append(word2vec[token])
        data_eb.append(row_eb)
    return pad_sequences(data_eb, maxlen=max_len)

train_data = embedding(train_data, max_sequence_length)
dev_data = embedding(dev_data, max_sequence_length)
test_data = embedding(test_data, max_sequence_length)

print("Train embedding:", train_data.shape, train_label.shape)
print("Dev embedding:", dev_data.shape, dev_label.shape)
print("Test embedding:", test_data.shape, test_label.shape)

Train embedding: (7102, 23, 100) (7102,)
Dev embedding: (1464, 23, 100) (1464,)
Test embedding: (2000, 23, 100) (2000,)


In [129]:
# convert label to one-hot vector
labels = np.concatenate((train_label, dev_label, test_label))
number_classes = len(np.unique(labels))
print("Number class:", number_classes)
y_oh = one_hot_encoding(labels)

train_label = y_oh[:train_label.shape[0]]
dev_label = y_oh[train_label.shape[0]:train_label.shape[0] + dev_label.shape[0]]
test_label = y_oh[-test_label.shape[0]:]

print("One-hot encoded:", train_label.shape, dev_label.shape, test_label.shape)

Number class: 4
One-hot encoded: (7102, 4) (1464, 4) (2000, 4)


In [18]:
def compile_model_lstm(input_dim, latent_dim, num_class):
    '''Create model

    Args:
        input_dim (int): dim of embedding vector (glove dimension)
        latent_dim (int): dim of output from LSTM layer
        num_class (int): number output class
    '''
    inputs = Input(shape=(None, input_dim))
    lstm = LSTM(latent_dim)(inputs)
    drop = Dropout(0.3)(lstm)
    #flat = Flatten()(drop)
    out = Dense(num_class, activation='softmax')(drop)

    model = Model(inputs, out)

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    
    return model

In [19]:
def compile_model_bi_lstm(input_dim, latent_dim, num_class):
    '''Create model

    Args:
        input_dim (int): dim of embedding vector (glove dimension)
        latent_dim (int): dim of output from LSTM layer
        num_class (int): number output class
    '''
    inputs = Input(shape=(None, input_dim))
    lstm = Bidirectional(LSTM(latent_dim))(inputs)
    drop = Dropout(0.5)(lstm)
    out = Dense(num_class, activation='softmax')(drop)

    model = Model(inputs, out)

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    
    return model

In [102]:
def compile_model_cnn(input_dim, max_len, num_class):
    inputs = Input(shape=(max_len, input_dim), name='input', dtype='float32')
    
    conv = Convolution1D(256, kernel_size=3, padding='valid', activation='relu')(inputs)
    conv = MaxPooling1D(pool_size=3)(conv)
    
    conv = Flatten()(conv)
    conv = Dropout(0.2)(conv)
    
    # Dense
    z = Dense(1024, activation='relu')(conv)
    z = Dropout(0.5)(z)
    
    # Dense
    z = Dense(1024, activation='relu')(conv)
    z = Dropout(0.5)(z)
    
    # Output dense
    out = Dense(num_class, activation='softmax')(z)
    
    model = Model(inputs, out)
    sgd = SGD(lr=0.01, momentum=0.9)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    
    return model

In [103]:
def compile_model_cnn_2(input_dim, max_len, num_class):
    inputs = Input(shape=(max_len, input_dim), name='input', dtype='float32')
    
    filter_kernels = [5, 3, 3]
    nb_filter = 256
    convs = []
    # Convolution
    for fz in filter_kernels:
        conv = Convolution1D(nb_filter, kernel_size=fz, padding='valid', activation='relu')(inputs)
        pool = MaxPooling1D(pool_size=3)(conv)
        flatten = Flatten()(pool)
        convs.append(flatten)
    
    conv_out = Concatenate()(convs)

    # Dense
    z = Dense(1024, activation='relu')(conv_out)
    z = Dropout(0.5)(z)
    
    # Dense
    z = Dense(1024, activation='relu')(z)
    z = Dropout(0.5)(z)
    
    # Output dense
    out = Dense(num_class, activation='softmax')(z)
    
    model = Model(inputs, out)
    sgd = SGD(lr=0.01, momentum=0.9)
    model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])
    model.summary()
    
    return model

In [52]:
def compile_model_cnn_lstm(input_dim, latent_dim, num_class):
    inputs = Input(shape=(None, input_dim))
    
    conv = Convolution1D(256, kernel_size=3, padding='valid', activation='relu')(inputs)
    conv = MaxPooling1D(pool_size=2)(conv)
    
    lstm = LSTM(latent_dim)(conv)
    drop = Dropout(0.3)(lstm)
    
    out = Dense(num_class, activation='softmax')(drop)

    model = Model(inputs, out)

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    
    return model

In [110]:
# Concat to train on both train + dev set, only validate on test set
X_train = np.concatenate((train_data, dev_data))
y_train = np.concatenate((train_label, dev_label))
print("Training size:", X_train.shape)
print("Test size:", test_data.shape)

Training size: (8566, 23, 100)
Test size: (2000, 23, 100)


In [111]:
def run_lstm(epochs=1, batch_size=128):
    # create lstm model
    model = compile_model_lstm(input_dim, 64, number_classes)

    checkpointer = ModelCheckpoint(filepath='twitter-emotion-lstm.h5', verbose=1, save_best_only=True)
    model.fit(X_train, y_train, validation_data=(test_data, test_label), callbacks=[checkpointer], 
              shuffle=True, epochs=epochs, batch_size=batch_size, verbose=1)

In [112]:
def run_bi_lstm(epochs=1, batch_size=128):
    # create bi-lstm model
    model = compile_model_bi_lstm(input_dim, 64, number_classes)

    checkpointer = ModelCheckpoint(filepath='twitter-emotion-bi_lstm.h5', verbose=1, save_best_only=True)
    model.fit(X_train, y_train, validation_data=(test_data, test_label), callbacks=[checkpointer], 
              shuffle=True, epochs=epochs, batch_size=batch_size, verbose=1)

In [113]:
def run_cnn_lstm(epochs=1, batch_size=128):
    # create cnn-lstm model
    model = compile_model_cnn_lstm(input_dim, 64, number_classes)

    checkpointer = ModelCheckpoint(filepath='twitter-emotion-cnn-lstm.h5', verbose=1, save_best_only=True)
    model.fit(X_train, y_train, validation_data=(test_data, test_label), callbacks=[checkpointer], 
              shuffle=True, epochs=epochs, batch_size=batch_size, verbose=1)


In [114]:
def run_cnn(epochs=1, batch_size=128):
    # create cnn model
    model = compile_model_cnn(input_dim, max_sequence_length, number_classes)

    checkpointer = ModelCheckpoint(filepath='twitter-emotion-cnn.h5', verbose=1, save_best_only=True)
    model.fit(X_train, y_train, validation_data=(test_data, test_label), callbacks=[checkpointer], 
              shuffle=True, epochs=epochs, batch_size=batch_size, verbose=1)

In [115]:
def run_cnn_2(epochs=1, batch_size=128):
    # create cnn model
    model = compile_model_cnn_2(input_dim, max_sequence_length, number_classes)

    checkpointer = ModelCheckpoint(filepath='twitter-emotion-cnn_2.h5', verbose=1, save_best_only=True)
    model.fit(X_train, y_train, validation_data=(test_data, test_label), callbacks=[checkpointer], 
              shuffle=True, epochs=epochs, batch_size=batch_size, verbose=1)

In [117]:
#run_lstm(10)
run_bi_lstm(10)
#run_cnn_lstm(10)
#run_cnn(10)
#run_cnn_2(10)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_27 (InputLayer)        (None, None, 100)         0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 128)               84480     
_________________________________________________________________
dropout_32 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_42 (Dense)             (None, 4)                 516       
Total params: 84,996
Trainable params: 84,996
Non-trainable params: 0
_________________________________________________________________
Train on 8566 samples, validate on 2000 samples
Epoch 1/10
Epoch 00001: val_loss improved from inf to 1.31252, saving model to twitter-emotion-bi_lstm.h5
 - 11s - loss: 1.3541 - acc: 0.3324 - val_loss: 1.3125 - val_acc: 0.3605
Epoch 2/10
Epoch 00002: val_loss improved f