First, we import Google's trained word2Vec model

In [1]:
import gensim



In [2]:
word2vec = gensim.models.KeyedVectors.load_word2vec_format ('GoogleNews-vectors-negative300.bin', binary=True)  

Now, we prepare the data by encoding every sentence as a sequence of word2vec-encoded words

In [171]:
import pandas as pd
import numpy as np
import nltk
import csv
from nltk.corpus import stopwords

In [172]:
raw_data_abo = pd.read_csv("data/abortion.tsv", sep = "\t")
raw_data_clo = pd.read_csv("data/cloning.tsv", sep = "\t")
raw_data_dp = pd.read_csv("data/death_penalty.tsv", sep = "\t") 
raw_data_gun = pd.read_csv("data/gun_control.tsv", sep = "\t")
raw_data_mari = pd.read_csv("data/marijuana_legalization.tsv", sep = "\t")
raw_data_wage = pd.read_csv("data/minimum_wage.tsv", sep = "\t")
raw_data_nuc = pd.read_csv("data/nuclear_energy.tsv", sep = "\t")
raw_data_school = pd.read_csv("data/school_uniforms.tsv", sep = "\t", quoting=csv.QUOTE_NONE)

In [173]:
frames = [raw_data_abo, raw_data_clo, raw_data_dp, raw_data_gun, raw_data_mari, raw_data_wage, raw_data_nuc, raw_data_school]

In [174]:
raw_data = pd.concat(frames, axis = 0)

In [175]:
print("The combined dataset contains {0} rows and {1} columns".format(len(raw_data), len(raw_data.columns)))

The combined dataset contains 24507 rows and 7 columns


We only need to keep three columns: The topic to compute the topic-relevance of a sentence, the sentence itself and the label of the argument

In [176]:
raw_data = raw_data[["topic", "sentence", "annotation"]]

In [177]:
raw_data.head()

Unnamed: 0,topic,sentence,annotation
0,abortion,This means it has to steer monetary policy to ...,NoArgument
1,abortion,Where did you get that ?,NoArgument
2,abortion,Nathanson later became pro-life .,NoArgument
3,abortion,In this case we may never do evil ( directly a...,Argument_against
4,abortion,With that I would like to give everyone someth...,NoArgument


In [230]:
#from keras.preprocessing.sequence import pad_sequences

from sklearn.metrics.pairwise import cosine_similarity
def encode_sentences(data, labels):
    N_sentences = len(data) 
    encoded_sentences = []
    print("---------------------Now encoding sentences!---------------------")
    print("Max iterations:", N_sentences)
    # shuffle the dataframe rows
    data = data.sample(frac=1)
    
    # store the maximum amount of words, 
    # for zero padding later
    max_words = 0
    
    # take the topic that the sentence comes from,
    # to compute topic relevance
    topics = data.topic
    topics = list(topics)
    
    # Store the different amount of word counts,
    # together with the indices of the sentences
    # that contain this amount of words
    word_counts = {}
    
    
    # for each sentence:
    for i in range(N_sentences):
        # take the sentence from the dataframe
        sentence = data.sentence.iloc[i]
        # tokenize the sentence
        tokens = nltk.word_tokenize(sentence)
        # filter puncuation and stop words from the tokens
        words = []
        for token in tokens:
            if(token[0] not in ".,:;[](){}!?-_`'~\"^/1234567890"  and token not in stopwords.words("English")):
                words.append(token)
        N_words = len(words)
        
        # if this amount of words has been
        # encountered before, add the index
        # of the sentence
        if(N_words in word_counts):
            word_counts[N_words].append(i)
        # else, create new entry with index
        else:
             word_counts[N_words] = [i]
        # update max words 
        if(N_words > max_words):
            max_words = N_words
        # store a sentence as a sequence of word vectors
        sequence = []
        for word in words:
            # embed a word using the Google word2vec model,
            # if it exists in the dictionary
            if(word in word2vec):
                 word_vector = word2vec[word]
            # if word does not exist in the word2vec model, 
            # add a randomized word vector instead
            else:
                word_vector = np.random.uniform(low = -0.01, high = 0.01, size = (300))
            
            # encode topic and add similarity of sentence to topic
            # as additional feature
            topic = topics[i]
            topic_words = topic.split()
            topic_vectors = []
            # compute the average word vector for the topic
            for word in topic_words:
                if(word in word2vec):
                    word_vector = word2vec[word]
                else:
                    word_vector = np.random.uniform(low = -0.01, high = 0.01, size = (300))
                topic_vectors.append(word_vector)
            topic_vectors = np.asarray(topic_vectors)
            avg_topic_vector = np.mean(topic_vectors, axis = 0)
            
            # compute similarity between word and topic, then add as feature
            similarity = cosine_similarity([word_vector], [avg_topic_vector])
            #print("Current word:", word, "Curren topic:",  topic,  "similarity:", similarity)
            word_vector = np.append(word_vector, similarity)
            
            
            # add word to the sequence
            sequence.append(word_vector)
        # convert list sequence to numpy array for convenience
        sequence = np.asarray(sequence)
        # print progress every 1000 epochs
        if(i % 1000 == 0):
            print("iteration :", i )
        encoded_sentences.append(sequence)
        
    encoded_sentences = np.asarray(encoded_sentences)
    
    # create batches to speed-up training
    # group sentences with equal word counts into the same batches
    all_batches = []
    labels_batches = []
    for count in word_counts:
        # get the sentences with this amount of words
        sentence_idx = word_counts[count]
        batch = []
        label_batch = []
        # add each sentence with this amount of words
        # to the batch
        for idx in sentence_idx:
            batch.append(encoded_sentences[idx])
            label_batch.append(labels[i])
        batch = np.asarray(batch)
        label_batch = np.asarray(label_batch)
        
        all_batches.append(batch)
        labels_batches.append(label_batch)
        
    all_batches = np.asarray(all_batches)
    labels_batches = np.asarray(labels_batches)
    # now, all the different batches are stored in an
    # array, where each batch can be accessed by an 
    # index
    return all_batches, label_batches, data 
       
        

In [None]:
labels = shuffled_data.annotation.copy()
labels[labels == "NoArgument"] = 0
labels[labels != 0] = 1

In [231]:
encoded_sentences, labels, shuffled_data = encode_sentences(raw_data, labels)

---------------------Now encoding sentences!---------------------
Max iterations: 24507
iteration : 0
iteration : 1000
iteration : 2000
iteration : 3000
iteration : 4000
iteration : 5000
iteration : 6000
iteration : 7000
iteration : 8000
iteration : 9000
iteration : 10000
iteration : 11000
iteration : 12000
iteration : 13000
iteration : 14000
iteration : 15000
iteration : 16000
iteration : 17000
iteration : 18000
iteration : 19000
iteration : 20000
iteration : 21000
iteration : 22000
iteration : 23000
iteration : 24000


Create train and test sets

In [238]:
N = len(encoded_sentences)
train_test_split = 0.5
x_train = encoded_sentences[:int(train_test_split*N)]
y_train = labels.values[:int(train_test_split*N)]

x_test = encoded_sentences[int(train_test_split*N):]
y_test = labels.values[int(train_test_split*N):]

initialize the Keras LSTM model

In [239]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import LSTM
from keras.layers import Bidirectional


model = Sequential()
model.add(Bidirectional(LSTM(128), merge_mode='concat', input_shape = (None, 301)))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])


In [240]:
epochs = 5

In [241]:
def get_accuracy(data, labels):
    correct = 0
    for i in range(len(data)):
        sample = data[i].reshape(1, data[i].shape[0], data[i].shape[1])
        label = np.array(labels[i])
        label = label.reshape((1,1))
        score = model.evaluate(sample, label, batch_size=1, verbose = 0)
        correct += score[1]
    return correct / len(data)

In [249]:
x_train[1].shape

(1335, 13, 301)

In [253]:
print("Amount of batches:", len(x_train))
for e in range(epochs):
    print("--------------Training epoch:--------------", e)
    for i in range(len(x_train)):
        print("Batch:", i)
        model.fit(x_train[i], y_train[i], epochs=1, verbose = 0, batch_size = x_train[i].shape[0])

Amount of batches: 65
--------------Training epoch:-------------- 0
Batch: 0


AttributeError: 'int' object has no attribute 'ndim'

In [208]:
"""for e in range(epochs):
    print("--------------Training epoch:--------------", e)
    for i in range(len(x_train)):
        if(i % 1000 == 0):
            print("Batch:", i)
        sample = x_train[i].reshape(1, x_train[i].shape[0], x_train[i].shape[1])
        label = np.array(y_train[i])
        label = label.reshape((1,1))
        model.fit(sample, label, epochs=1, verbose = 0)
    acc = get_accuracy(x_train, y_train)
    print("Epoch: {0}, accuracy: {1}".format(e, acc))"""
        

--------------Training epoch:-------------- 0
Batch: 0
Batch: 1000
Batch: 2000
Batch: 3000
Batch: 4000
Batch: 5000
Batch: 6000
Batch: 7000
Batch: 8000
Batch: 9000
Batch: 10000
Batch: 11000
Batch: 12000
Batch: 13000
Batch: 14000
Batch: 15000
Batch: 16000
Batch: 17000
Batch: 18000
Batch: 19000
Epoch: 0, accuracy: 2.340269277845777
--------------Training epoch:-------------- 1
Batch: 0


KeyboardInterrupt: 

In [211]:
get_accuracy(x_test, y_test)

0.5648714810281518

In [None]:
def detect_arguments(model, post):
    # split text into sentences
    post_sentences = post.split(".")
    # count amount of sentences
    n_sentences = len(post_sentences)
    # add topic
    topics = pd.Series(np.zeros(n_sentences), name = "topic")
    # make data ready for encoding
    sentences = pd.DataFrame(post_sentences)
    df = pd.concat([sentences,topics], axis = 1)
    df = df.rename(index=str, columns={0: "sentence"})
    encoded_sentences, shuffled = encode_sentences(df)
    
    arguments = []
    non_arguments = []
    
    # feed sentences into LSTM and get prediction
    for i in range(len(encoded_sentences)):
        n_words = encoded_sentences[i].shape[0]
        if(n_words > 0):
            n_features = encoded_sentences[i].shape[1]
            prediction = model.predict(encoded_sentences[i].reshape(1, n_words, n_features), batch_size=1, verbose=0)
            if(prediction > 0.5):
                arguments.append((prediction, post_sentences[i]))
            else:
                non_arguments.append((prediction, post_sentences[i]))
    return arguments, non_arguments           
    

In [None]:
post = "Immigrants shouldn't force people in their new home country to adapt to them. In fact, they should adapt to the customs of the new country, and if necessary, change their own behaviour so as to fit the new country's culture. To me, it's pretty simple. If you come into someone else's home, you should respect their rules. You shouldn't be bringing in your own preferences into the new country unless it doesn't affect the existing citizens negatively. For example, let's say you don't take off your shoes at home. Now, you enter into the home of someone who takes off their shoes, and makes that clear to you that you should take off your shoes. What makes more sense, to take off your shoes or insist on wearing your shoes in their house, complaining that they shouldn't be discriminating against your culture by forcing you to adapt to theirs, even though it's their home? It certainly makes a lot more sense to me that the visitor takes off their shoes. This view is somewhat inspired by hearing about stories like immigrants in Europe wanting to wear clothes like the hijab against the wishes of the community and/or government. I've also heard of another case, where a family that moved from China asked their Indian neighbours to stop cooking curry because they didn't like the pungent smell. Anyways, convince me why it is ok for new immigrants to refuse to adapt to new societal customs, or to force their new countrymen to adapt to theirs."

In [None]:
post

In [None]:
arguments, non_arguments = detect_arguments(model, post)

In [None]:
arguments

In [None]:
non_arguments