In [79]:
import glob
import os
import numpy as np
import io
import pickle
from keras.models import Model, Sequential
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense, Embedding, Input, LSTM, Bidirectional, Dropout, GlobalMaxPool1D, Conv1D, MaxPooling1D
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam, SGD
from keras.callbacks import Callback
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

In [2]:
from numpy.random import seed
seed(1)
from tensorflow import set_random_seed
set_random_seed(2)


In [3]:
def loadDataTask3(folder):
    result = []
    fileNames = glob.glob(folder + "/*.txt")
    for fileName in fileNames:
        articleId = fileName.split("/")[-1].split(".")[0]
        f = open(fileName, "r", encoding="utf8")
        data = f.read()
        f.close()
        labels = readLabelTask3(folder + "/" + articleId + ".task3.labels")
        result.append({"id": articleId, "data": data, "labels": labels})
        
    return result

def readLabelTask3(fileName):
    result = []
    f = open(fileName, "r")
    result = f.readlines()
    f.close()
    result = [x.replace("\n", "").split("\t") for x in result]
    return result

In [4]:
train_data = loadDataTask3("train-split/tasks-2-3/train-train/")

In [5]:
label2index = {
    "Appeal_to_Authority": 0,
    "Appeal_to_fear-prejudice": 1,
    "Bandwagon": 2,
    "Black-and-White_Fallacy": 3,
    "Causal_Oversimplification": 4,
    "Doubt": 5,
    "Exaggeration,Minimisation": 6,
    "Flag-Waving": 7,
    "Loaded_Language": 8,
    "Name_Calling,Labeling": 9,
    "Obfuscation,Intentional_Vagueness,Confusion": 10,
    "Red_Herring": 11,
    "Reductio_ad_hitlerum": 12,
    "Repetition": 13,
    "Slogans": 14,
    "Straw_Men": 15,
    "Thought-terminating_Cliches": 16,
    "Whataboutism": 17
}

index2label = [
    "Appeal_to_Authority",
    "Appeal_to_fear-prejudice",
    "Bandwagon",
    "Black-and-White_Fallacy",
    "Causal_Oversimplification",
    "Doubt",
    "Exaggeration,Minimisation",
    "Flag-Waving",
    "Loaded_Language",
    "Name_Calling,Labeling",
    "Obfuscation,Intentional_Vagueness,Confusion",
    "Red_Herring",
    "Reductio_ad_hitlerum",
    "Repetition",
    "Slogans",
    "Straw_Men",
    "Thought-terminating_Cliches",
    "Whataboutism"
              ]

In [23]:
def labelOutcome(label):
    indx = label2index[label]
    result = np.zeros(18, dtype=int)
    result[indx] = 1
    return result

def getLabelsData(data):
    trainX = []
    trainY = []
    for x in data:
        for y in x["labels"]:
            sen = [z for z in x["data"][int(y[2]):int(y[3])].split("\n") if z != ""]
            lab = [labelOutcome(y[1]) for z in range(len(sen))]
            
            trainX += sen
            trainY += lab
        
    return trainX, trainY

In [24]:
train_X, train_Y = getLabelsData(train_data)

In [25]:
train_Y

[array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [26]:
### tokenization
MAX_VOCAB_SIZE = 50000
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(train_X)
word2idx = tokenizer.word_index
idx2word = {v:k for k, v in word2idx.items()}
num_words = min(MAX_VOCAB_SIZE, len(word2idx) + 1)
with open('tokenizer_label.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [27]:
max_sequence_length = max(len(s) for s in tokenizer.texts_to_sequences(train_X))
print('Max sequence length:', max_sequence_length)
print(num_words)

Max sequence length: 81
6798


In [87]:
EMBEDDING_DIM = 200
# load in pre-trained word vectors
print('Loading word vectors...')
word2vec = {}
with open(os.path.join('glove.6B/glove.6B.%sd.txt' % EMBEDDING_DIM)) as f:
    # is just a space-separated text file in the format:
    # word vec[0] vec[1] vec[2] ...
    for line in f:
        values = line.split()
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        word2vec[word] = vec
print('Found %s word vectors.' % len(word2vec))

Loading word vectors...
Found 400000 word vectors.


In [88]:
# prepare embedding matrix
print('Filling pre-trained embeddings...')

embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word2idx.items():
    if i < MAX_VOCAB_SIZE:
        embedding_vector = word2vec.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all zeros.
            embedding_matrix[i] = embedding_vector

Filling pre-trained embeddings...


In [51]:
LATENT_DIM = 32
print('Building model...')

model = Sequential()
model.add(Embedding(num_words, EMBEDDING_DIM))
model.add(Bidirectional(LSTM(LATENT_DIM, return_sequences=True, recurrent_dropout=0.2)))
model.add(GlobalMaxPool1D())
model.add(Dropout(0.2))
model.add(Dense(128, activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(18, activation="softmax"))
model.compile(
  loss='binary_crossentropy',
  # optimizer='rmsprop',
  optimizer=Adam(lr=0.01),
  # optimizer=SGD(lr=0.01, momentum=0.9),
  metrics=['accuracy']
)

print(model.summary())

Building model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, None, 20)          135960    
_________________________________________________________________
bidirectional_7 (Bidirection (None, None, 64)          13568     
_________________________________________________________________
global_max_pooling1d_6 (Glob (None, 64)                0         
_________________________________________________________________
dropout_11 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_11 (Dense)             (None, 128)               8320      
_________________________________________________________________
dropout_12 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_12 (Dense)             (None, 18)                232

In [52]:
   
input_sequences = pad_sequences(tokenizer.texts_to_sequences(train_X),
                                maxlen=max_sequence_length, padding='post')
output_sequences = np.array(train_Y)
print('Shape of data tensor:', input_sequences.shape)
print('Shape of output tensor:', output_sequences.shape)

VALIDATION_SPLIT = 0.2
BATCH_SIZE = 128
EPOCHS = 2
print('Training model...')
z = np.zeros((len(input_sequences), LATENT_DIM))
seed(1)
set_random_seed(2)
model.compile(
  loss='categorical_crossentropy',
  # optimizer='rmsprop',
  optimizer=Adam(lr=0.01),
  # optimizer=SGD(lr=0.01, momentum=0.9),
  metrics=['accuracy']
)
model.fit(
  input_sequences,
  output_sequences,
  batch_size=BATCH_SIZE,
  epochs=EPOCHS,
  validation_split=VALIDATION_SPLIT
)
# with open('model_{:}_{:}.json'.format(label2index[label], version), "w") as json_file:
#    json_file.write(model.to_json())
# model.save_weights('model_{:}_{:}.h5'.format(label2index[label], version))
# print("Saved model to disk")
pred_Y = model.predict(input_sequences).round()

f1 = f1_score(output_sequences, pred_Y, average='micro')
print(f1)



Shape of data tensor: (4817, 81)
Shape of output tensor: (4817, 18)
Training model...
Train on 3853 samples, validate on 964 samples
Epoch 1/2
Epoch 2/2
0.5216186252771619


In [40]:
dev_data = loadDataTask3("train-split/tasks-2-3/train-dev/")

In [41]:
dev_X, dev_Y = getLabelsData(dev_data)
input_sequences_dev = pad_sequences(tokenizer.texts_to_sequences(dev_X),
                                maxlen=max_sequence_length, padding='post')
output_sequences_dev = np.array(dev_Y)

In [53]:
pred_Y = model.predict(input_sequences_dev).round()

f1 = f1_score(output_sequences_dev, pred_Y, average='micro')
print(f1)

0.3219741480611046


In [89]:
LATENT_DIM = 32
print('Building model...')

modelCNN = Sequential()
modelCNN.add(Embedding(num_words, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False))
#modelCNN.add(Conv1D(filters=32, kernel_size=5))
modelCNN.add(Conv1D(filters=LATENT_DIM, kernel_size=5, padding="same"))
modelCNN.add(MaxPooling1D(pool_size=3, strides=1, padding="same"))
modelCNN.add(Conv1D(filters=LATENT_DIM, kernel_size=4, padding="same"))
modelCNN.add(MaxPooling1D(pool_size=4, strides=1, padding="same"))
modelCNN.add(Conv1D(filters=LATENT_DIM, kernel_size=3, padding="same"))
#modelCNN.add(MaxPooling1D(pool_size=5, strides=1, padding="same"))
modelCNN.add(GlobalMaxPool1D())
modelCNN.add(Dropout(0.2))
modelCNN.add(Dense(128, activation="relu"))
modelCNN.add(Dropout(0.2))
modelCNN.add(Dense(18, activation="softmax"))
modelCNN.compile(
  loss='binary_crossentropy',
  # optimizer='rmsprop',
  optimizer=Adam(lr=0.01),
  # optimizer=SGD(lr=0.01, momentum=0.9),
  metrics=['accuracy']
)

print(modelCNN.summary())

Building model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_15 (Embedding)     (None, None, 200)         1359600   
_________________________________________________________________
conv1d_7 (Conv1D)            (None, None, 32)          32032     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, None, 32)          0         
_________________________________________________________________
conv1d_8 (Conv1D)            (None, None, 32)          4128      
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, None, 32)          0         
_________________________________________________________________
conv1d_9 (Conv1D)            (None, None, 32)          3104      
_________________________________________________________________
global_max_pooling1d_10 (Glo (None, 32)                0  

In [90]:
input_sequences = pad_sequences(tokenizer.texts_to_sequences(train_X),
                                maxlen=max_sequence_length, padding='post')
output_sequences = np.array(train_Y)
print('Shape of data tensor:', input_sequences.shape)
print('Shape of output tensor:', output_sequences.shape)

VALIDATION_SPLIT = 0.2
BATCH_SIZE = 128
EPOCHS = 5
print('Training model...')
z = np.zeros((len(input_sequences), LATENT_DIM))
seed(1)
set_random_seed(2)
modelCNN.compile(
  loss='categorical_crossentropy',
  # optimizer='rmsprop',
  optimizer=Adam(lr=0.01),
  # optimizer=SGD(lr=0.01, momentum=0.9),
  metrics=['accuracy']
)
modelCNN.fit(
  input_sequences,
  output_sequences,
  batch_size=BATCH_SIZE,
  epochs=EPOCHS,
  validation_split=VALIDATION_SPLIT
)
# with open('model_{:}_{:}.json'.format(label2index[label], version), "w") as json_file:
#    json_file.write(model.to_json())
# model.save_weights('model_{:}_{:}.h5'.format(label2index[label], version))
# print("Saved model to disk")
pred_Y = model.predict(input_sequences).round()

f1 = f1_score(output_sequences, pred_Y, average='micro')
print(f1)

Shape of data tensor: (4817, 81)
Shape of output tensor: (4817, 18)
Training model...
Train on 3853 samples, validate on 964 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
0.5216186252771619


In [77]:
with open('model_label_cnn_v1.json', "w") as json_file:
        json_file.write(modelCNN.to_json())
modelCNN.save_weights('model_label_cnn_v1.h5')
print("Saved model to disk")

Saved model to disk


In [91]:
pred_Y = modelCNN.predict(input_sequences_dev).round()

f1 = f1_score(output_sequences_dev, pred_Y, average='micro')
print(f1)

0.34559643255295425
