First, we import Google's trained word2Vec model

In [1]:
import gensim



In [2]:
word2vec = gensim.models.KeyedVectors.load_word2vec_format ('models/GoogleNews-vectors-negative300.bin', binary=True)  

Now, we prepare the data by encoding every sentence as a sequence of word2vec-encoded words

In [4]:
import pandas as pd
import numpy as np
import nltk
import csv
from nltk.corpus import stopwords

In [32]:
raw_data_train = pd.read_csv("data/train.csv")
raw_data_test = pd.read_csv("data/test.csv")

In [34]:
frames = [raw_data_train, raw_data_test]

In [36]:
raw_data = pd.concat(frames, axis = 0)

In [37]:
print("The combined dataset contains {0} rows and {1} columns".format(len(raw_data), len(raw_data.columns)))

The combined dataset contains 5783 rows and 7 columns


We only need to keep three columns: The topic to compute the topic-relevance of a sentence, the sentence itself and the label of the argument

In [41]:
raw_data = raw_data[["topic", "candidate", "label"]]

In [42]:
raw_data.head()

Unnamed: 0,topic,candidate,label
0,We should limit executive compensation,A say on pay - a non-binding vote of the gener...,0
1,We should limit executive compensation,"A February 2009 report, published by the Insti...",1
2,We should limit executive compensation,The Financial Crisis has had a relatively smal...,0
3,We should limit executive compensation,"1990-1992 Lineberger Cancer Center, SPA person...",0
4,We should limit executive compensation,Countering the public uproar over excessive ex...,0


In [46]:
raw_data = raw_data.rename(index=str, columns={"candidate": "sentence", "label": "annotation"})

In [47]:
#from keras.preprocessing.sequence import pad_sequences

from sklearn.metrics.pairwise import cosine_similarity
def encode_sentences(data):
    N_sentences = len(data) 
    encoded_sentences = []
    print("---------------------Now encoding sentences!---------------------")
    print("Max iterations:", N_sentences)
    # shuffle the dataframe rows
    data = data.sample(frac=1)
    
    labels = data.annotation.copy()
    labels = labels.values
    
    
    # take the topic that the sentence comes from,
    # to compute topic relevance
    topics = data.topic
    topics = list(topics)
    
    # Store the different amount of word counts,
    # together with the indices of the sentences
    # that contain this amount of words
    word_counts = {}
    
    max_words = 0
    
    
    # for each sentence:
    for i in range(N_sentences):
        # take the sentence from the dataframe
        sentence = data.sentence.iloc[i]
        # tokenize the sentence
        tokens = nltk.word_tokenize(sentence)
        # filter puncuation and stop words from the tokens
        words = []
        for token in tokens:
            if(token[0] not in ".,:;[](){}!?-_`'~\"^/1234567890"):
                words.append(token)
        N_words = len(words)
        
        # keep track of the maximum sentence length
        if(N_words > max_words):
            max_words = N_words
        
        
        # if this amount of words has been
        # encountered before, add the index
        # of the sentence
        if(N_words in word_counts):
            word_counts[N_words].append(i)
        # else, create new entry with index
        else:
             word_counts[N_words] = [i]
     
        # encode topic and add similarity of sentence to topic
        # as additional feature
        topic = topics[i]
        topic_words = topic.split()
        topic_vectors = []
        # compute the average word vector for the topic
        for word in topic_words:
            if(word in word2vec):
                word_vector = word2vec[word]
            else:
                word_vector = np.random.uniform(low = -0.01, high = 0.01, size = (300))
            topic_vectors.append(word_vector)
        topic_vectors = np.asarray(topic_vectors)
        avg_topic_vector = np.mean(topic_vectors, axis = 0)
            
        # store a sentence as a sequence of word vectors
        sequence = []
        for word in words:
            # embed a word using the Google word2vec model,
            # if it exists in the dictionary
            if(word in word2vec):
                 word_vector = word2vec[word]
            # if word does not exist in the word2vec model, 
            # add a randomized word vector instead
            else:
                word_vector = np.random.uniform(low = -0.01, high = 0.01, size = (300))
        
            
            # compute similarity between word and topic, then add as feature
            similarity = cosine_similarity([word_vector], [avg_topic_vector])
            #print("Current word:", word, "Curren topic:",  topic,  "similarity:", similarity)
            word_vector = np.append(word_vector, similarity)     
            # add word to the sequence
            sequence.append(word_vector)
        # convert list sequence to numpy array for convenience
        sequence = np.asarray(sequence)
        # print progress every 1000 epochs
        if(i % 1000 == 0):
            print("iteration :", i )
        encoded_sentences.append(sequence)
        
    encoded_sentences = np.asarray(encoded_sentences)
    
    
    """print("Now zero padding..")
    for i in range(N_sentences):
        # print progress every 1000 epochs
        if(i % 1000 == 0):
            print("iteration :", i )
        # compute how much zero padding is needed
        N_words = len(encoded_sentences[i])
        padding_needed = max_words - N_words
        for j in range(padding_needed):
            encoded_sentences[i] = np.append(encoded_sentences[i], [None], axis = 0)"""
        
    
    
    # create batches to speed-up training
    # group sentences with equal word counts into the same batches
    all_batches = []
    label_batches = []
    #print(max_words)
    #print(word_counts)
    for count in word_counts:
        # get the sentences with this amount of words
        sentence_idx = word_counts[count]
        batch = []
        label_batch = []
        # add each sentence with this amount of words
        # to the batch
        for idx in sentence_idx:
            batch.append(encoded_sentences[idx])
            label_batch.append(labels[idx])
            #print(label_batch)
        batch = np.asarray(batch)
        label_batch = np.asarray(label_batch)
        
        all_batches.append(batch)
        label_batches.append(label_batch)
        
    all_batches = np.asarray(all_batches)
    label_batches = np.asarray(label_batches)
    # now, all the different batches are stored in an
    # array, where each batch can be accessed by an 
    # index
    return all_batches, label_batches, data
    
    
    #return encoded_sentences, labels
        

Take the annotation as the labels and convert all the arguments to the positive class and the non-arguments to the negative class

In [48]:
encoded_sentences, labels, shuffled_data = encode_sentences(raw_data)

---------------------Now encoding sentences!---------------------
Max iterations: 5783
iteration : 0
iteration : 1000
iteration : 2000
iteration : 3000
iteration : 4000
iteration : 5000


In [49]:
encoded_sentences.shape

(77,)

In [17]:
#encoded_sentences, labels = encode_sentences(raw_data)

In [18]:
#labels.shape

In [19]:
#encoded_sentences[100][0]

In [20]:
#from keras.preprocessing.sequence import pad_sequences

In [21]:
#print(encoded_sentences.shape)
#encoded_sentences = pad_sequences(encoded_sentences, padding = "post", value = np.zeros(301), maxlen = 20)

Create train and test sets

In [50]:
N = len(encoded_sentences)
train_test_split = 0.5
validation_size = (1 - train_test_split) / 2
x_train = encoded_sentences[:int(train_test_split*N)]
y_train = labels[:int(train_test_split*N)]

x_val = encoded_sentences[int(train_test_split*N) : int(train_test_split*N) + int(validation_size*N)]
y_val = labels[int(train_test_split*N) : int(train_test_split*N) + int(validation_size*N)]

x_test = encoded_sentences[int(train_test_split*N) + int(validation_size*N):]
y_test = labels[int(train_test_split*N) + int(validation_size*N):]

initialize the Keras LSTM model

In [51]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import LSTM
from keras.layers import Bidirectional
from keras.optimizers import RMSprop

def init_model(size = 50, dropout = 0.5, learning_rate = 0.01):
    model = Sequential()
    model.add(Bidirectional(LSTM(size), merge_mode='concat', input_shape=(None, 301)))
    model.add(Dropout(dropout))
    model.add(Dense(1, activation='sigmoid'))

    rmsprop = RMSprop(lr=learning_rate, rho=0.9, epsilon=None, decay=0.0)
    model.compile(loss='binary_crossentropy',
                  optimizer=rmsprop,
                  metrics=['accuracy'])
    return model




Using TensorFlow backend.


In [52]:
epochs = 10

In [53]:
def get_accuracy(data, labels):
    total_accuracy = 0
    for i in range(len(data)):
        score = model.evaluate(data[i], labels[i], verbose = 0)
        #print(score)
        total_accuracy += score[1]
    total_accuracy = total_accuracy / len(data)
    return total_accuracy

In [38]:
from sklearn.model_selection import KFold

def kfold(data, labels, folds, epochs):
    kf = KFold(n_splits=folds)

    avg_accuracy = 0
    current_fold = 0
    for train_index, test_index in kf.split(data):
        # reset the classifier
        model = init_model()
        
        
        Xtrain = data[train_index]
        Xtest = data[test_index]
        
        Ytrain = labels[train_index]
        Ytest = labels[test_index]
        
        # train on the train data
        for e in range(epochs):
            for i in range(len(Xtrain)):
                model.fit(Xtrain[i], Ytrain[i], epochs=1, verbose = 0, batch_size = Xtrain[i].shape[0])
            # get train accuracy
            train_acc = get_accuracy(Xtrain, Ytrain)
            print("Fold: {0} \n Epoch: {1} \n Train accuracy: {2}".format(current_fold, e, train_acc))
                
        # test on the test data
        test_acc = get_accuracy(Xtest, Ytest)
        print("Fold: {0} \n Test accuracy: {2}".format(current_fold, test_acc))
        avg_accuracy += test_acc 
        current_fold += 1
            
    avg_accuracy /= folds
    print("{0}-fold cross validation accuracy: {1}".format(folds, avg_accuracy))

In [39]:
kfold(encoded_sentences, labels, 5, 10)

Fold: 0 
 Epoch: 0 
 Train accuracy: 0.5035470218752873
Fold: 0 
 Epoch: 1 
 Train accuracy: 0.5035470218752873
Fold: 0 
 Epoch: 2 
 Train accuracy: 0.5035470218752873
Fold: 0 
 Epoch: 3 
 Train accuracy: 0.5035470218752873
Fold: 0 
 Epoch: 4 
 Train accuracy: 0.5035470218752873


KeyboardInterrupt: 

In [57]:
model = init_model()

In [58]:
print("Amount of batches:", len(x_train))
for e in range(epochs):
    print("--------------Training epoch:--------------", e)
    total_correct = 0
    total = 0
    
    # shuffle training data and labels
    rng_state = np.random.get_state()
    np.random.shuffle(x_train)
    np.random.set_state(rng_state)
    np.random.shuffle(y_train)
    np.random.seed()
    
    for i in range(len(x_train)):
        correct = 0
        total += len(x_train[i])
        history = model.fit(x_train[i], y_train[i], epochs=1, verbose = 0, batch_size = x_train[i].shape[0])
        acc = history.history['acc'][0]
        #print("Batch-accuracy:", acc, "Samples:", len(encoded_sentences[i]))

        correct = acc * len(x_train[i])
        total_correct += correct
    print("Train accuracy:", total_correct / total)
    print("Validation accuracy:", get_accuracy(x_val, y_val))
    print("Test accuracy:", get_accuracy(x_test, y_test))
    
    #print("Accuracy:", get_accuracy(encoded_sentences, labels))   
        

    #acc = get_accuracy(x_train, y_train)   
    #print("Train accuracy:", acc)
    #acc = get_accuracy(x_test, y_test)   
    #print("Test accuracy:", acc)


Amount of batches: 38
--------------Training epoch:-------------- 0
Train accuracy: 0.6435624024408501
Validation accuracy: 0.6741855400834673
Test accuracy: 0.6953373059630394
--------------Training epoch:-------------- 1
Train accuracy: 0.7041864164562022
Validation accuracy: 0.6372867213774114
Test accuracy: 0.6566964372992515
--------------Training epoch:-------------- 2
Train accuracy: 0.7452606650137035
Validation accuracy: 0.6935771669399236
Test accuracy: 0.6681547701358795
--------------Training epoch:-------------- 3
Train accuracy: 0.784952606717463
Validation accuracy: 0.6924459774280382
Test accuracy: 0.8132440507411957
--------------Training epoch:-------------- 4
Train accuracy: 0.7964060012763146
Validation accuracy: 0.6470128891666582
Test accuracy: 0.6022817522287369
--------------Training epoch:-------------- 5
Train accuracy: 0.8384676166055327
Validation accuracy: 0.7086559744744388
Test accuracy: 0.620089291036129
--------------Training epoch:-------------- 6


KeyboardInterrupt: 

In [59]:
print("Test accuracy:", get_accuracy(x_test, y_test))

Test accuracy: 0.7735615119338035


In [60]:
model.save("models/bilstm_evidence.h5")

In [46]:
#import keras
#model = keras.models.load_model("models/bilstm_cos.h5")

In [47]:
print("Test accuracy:", get_accuracy(x_test, y_test))

Test accuracy: 0.76


TypeError: get_accuracy() missing 2 required positional arguments: 'data' and 'labels'