In [162]:
import glob
import os
import numpy as np
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense, Embedding, Input, LSTM
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam, SGD
from keras.callbacks import Callback
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

In [165]:
class Metrics(Callback):
    def on_train_begin(self, logs={}):
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []
 
    def on_epoch_end(self, epoch, logs={}):
        print(self.model)
        val_predict = (np.asarray(self.model.predict(self.model.validation_data[0]))).round()
        val_targ = self.model.validation_data[1]
        _val_f1 = f1_score(val_targ, val_predict)
        _val_recall = recall_score(val_targ, val_predict)
        _val_precision = precision_score(val_targ, val_predict)
        self.val_f1s.append(_val_f1)
        self.val_recalls.append(_val_recall)
        self.val_precisions.append(_val_precision)
        print(" — val_f1: %f — val_precision: %f — val_recall %f" %(_val_f1, _val_precision, _val_recall))
        return
 

metrics = Metrics()

In [115]:
def loadDataTask3(folder):
    result = []
    fileNames = glob.glob(folder + "/*.txt")
    for fileName in fileNames:
        articleId = fileName.split("/")[-1].split(".")[0]
        f = open(fileName, "r")
        data = f.read()
        f.close()
        sentences = [x for x in data.split("\n") if x != ""]
        labels = readLabelTask3(folder + "/" + articleId + ".task3.labels")
        labeledSentences = createLabeledSentences(data, labels)
        propagandas = extractPropagandas(data, labels)
        result.append({"id": articleId, "data": data, "sentences": sentences, "labels": labels, 
                      "labeledSentences": labeledSentences, "propagandas": propagandas})
        
    return result

def readLabelTask3(fileName):
    result = []
    f = open(fileName, "r")
    result = f.readlines()
    f.close()
    result = [x.replace("\n", "").split("\t") for x in result]
    return result


def updateSentence(sentence, char, start=None, stop=None):
    if start is None:
        start = 0;
        
    if stop is None:
        stop = len(sentence)
        
    s = list(sentence)
    for i in range(start, stop):
        if s[i] not in [" ", "\n"]:
            s[i] = char
    return "".join(s)

def word2label(word):
    if word[0] == "A":
        return 0
    else:
        return 1
    
def sen2label(sen):
    result = [word2label(x) for x in sen if x != ""]
    return result

def createLabeledSentences(data, labels):
    newData = updateSentence(data, "A")
    for x in labels:
        newData = updateSentence(newData, "B", int(x[2]), int(x[3]))
    result = [sen2label(x.split(" ")) for x in newData.split("\n") if x != ""]
    
    return result

def updateStartIndex(data, index):
    if index == 0:
        return index
    #if data[index] not in [" ", "\n"] and data[index - 1] in [" ", "\n"]:
    #    return index
    while index < len(data) and (data[index] in [" ", "\n"] or data[index - 1] not in [" ", "\n"]):
        index += 1
        
    return index

def updateStopIndex(data, index):
    if index == len(data) - 1:
        return index
    #if data[index] not in [" ", "\n"] and data[index - 1] in [" ", "\n"]:
    #    return index
    while index + 1 < len(data) and (data[index + 1] in [" ", "\n"] or data[index] not in [" ", "\n"]):
        index += 1
        
    return index

def extractPropagandas(data, labels):
    return [[data[updateStartIndex(data, int(x[2])):updateStopIndex(data, int(x[3]))], labelOutcome(x[1])] for x in labels]

label2index = {
    "Appeal_to_Authority": 0,
    "Appeal_to_fear-prejudice": 1,
    "Bandwagon": 2,
    "Black-and-White_Fallacy": 3,
    "Causal_Oversimplification": 4,
    "Doubt": 5,
    "Exaggeration,Minimisation": 6,
    "Flag-Waving": 7,
    "Loaded_Language": 8,
    "Name_Calling,Labeling": 9,
    "Obfuscation,Intentional_Vagueness,Confusion": 10,
    "Red_Herring": 11,
    "Reductio_ad_hitlerum": 12,
    "Repetition": 13,
    "Slogans": 14,
    "Straw_Men": 15,
    "Thought-terminating_Cliches": 16,
    "Whataboutism": 17
}

def labelOutcome(label):
    indx = label2index[label]
    result = np.zeros(18, dtype=int)
    result[indx] = 1
    return result
    
    
        

In [116]:
train_data = loadDataTask3("tasks-2-3/train")

In [117]:
train_data[0]

{'data': 'Vote targeting Jewish student politician was not anti-Semitic: McGill report\n\nMONTREAL \xe2\x80\x93 It was political disagreement \xe2\x80\x94 not anti-Semitism \xe2\x80\x94 that led to a Jewish student being voted off the McGill University student council\xe2\x80\x99s board of directors, an investigation ordered by the principal has concluded.\nBut Jewish groups on and off campus have denounced the report as flawed, saying it missed the significance of an anti-Semitic text circulated online before Noah Lew was removed from the board last fall.\nA joint statement from five campus Jewish groups says the report, released this week by principal Suzanne Fortier, \xe2\x80\x9cappears to condone discrimination against Jewish students at McGill based on the cultural and religious organizations they affiliate with.\xe2\x80\x9d\nThe controversy has its roots in McGill\xe2\x80\x99s long-running debate over the anti-Israel Boycott, Divestment and Sanctions movement.\nAt an Oct. 23 gene

In [197]:
### tokenization
all_lines = []
all_labels = []
for item in train_data:
    all_lines += item["sentences"]
    all_labels += item["labeledSentences"]

MAX_VOCAB_SIZE = 50000
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(all_lines)
input_sequences = tokenizer.texts_to_sequences(all_lines)
word2idx = tokenizer.word_index

In [198]:
# find max seq length
max_sequence_length_from_data = max(len(s) for s in input_sequences)
max_sequence_length_from_labels = max(len(s) for s in all_labels)
print('Max sequence length:', max_sequence_length_from_data)
print('Max label length:', max_sequence_length_from_labels)

('Max sequence length:', 129)
('Max label length:', 129)


In [199]:
MAX_SEQUENCE_LENGTH = 150
# pad sequences so that we get a N x T matrix
max_sequence_length = min(max_sequence_length_from_data, MAX_SEQUENCE_LENGTH)
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='post')
output_sequences = pad_sequences(all_labels, maxlen=max_sequence_length, padding='post')
output_sequences = np.reshape(output_sequences, output_sequences.shape + (1,))
print('Shape of data tensor:', input_sequences.shape)
print('Shape of output tensor:', output_sequences.shape)

('Shape of data tensor:', (14264, 129))
('Shape of output tensor:', (14264, 129, 1))


In [189]:
EMBEDDING_DIM = 50
# load in pre-trained word vectors
print('Loading word vectors...')
word2vec = {}
with open(os.path.join('glove.6B/glove.6B.%sd.txt' % EMBEDDING_DIM)) as f:
    # is just a space-separated text file in the format:
    # word vec[0] vec[1] vec[2] ...
    for line in f:
        values = line.split()
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        word2vec[word] = vec
print('Found %s word vectors.' % len(word2vec))

Loading word vectors...
Found 400000 word vectors.


In [200]:
# prepare embedding matrix
print('Filling pre-trained embeddings...')
num_words = min(MAX_VOCAB_SIZE, len(word2idx) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word2idx.items():
    if i < MAX_VOCAB_SIZE:
        embedding_vector = word2vec.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all zeros.
            embedding_matrix[i] = embedding_vector

Filling pre-trained embeddings...


In [201]:
# load pre-trained word embeddings into an Embedding layer
embedding_layer = Embedding(
  num_words,
  EMBEDDING_DIM,
  weights=[embedding_matrix],
  # trainable=False
)

In [202]:
LATENT_DIM = 25
print('Building model...')

# create an LSTM network with a single LSTM
input_ = Input(shape=(max_sequence_length,))
initial_h = Input(shape=(LATENT_DIM,))
initial_c = Input(shape=(LATENT_DIM,))
x = embedding_layer(input_)
lstm = LSTM(LATENT_DIM, return_sequences=True, return_state=True)
x, _, _ = lstm(x, initial_state=[initial_h, initial_c]) # don't need the states here
dense = Dense(1, activation='sigmoid')
output = dense(x)

model = Model([input_, initial_h, initial_c], output)
model.compile(
  loss='binary_crossentropy',
  # optimizer='rmsprop',
  optimizer=Adam(lr=0.01),
  # optimizer=SGD(lr=0.01, momentum=0.9),
  metrics=['accuracy']
)

Building model...


In [203]:
print(model.summary())

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_19 (InputLayer)           (None, 129)          0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 129, 50)      1031600     input_19[0][0]                   
__________________________________________________________________________________________________
input_20 (InputLayer)           (None, 25)           0                                            
__________________________________________________________________________________________________
input_21 (InputLayer)           (None, 25)           0                                            
__________________________________________________________________________________________________
lstm_7 (LS

In [204]:
VALIDATION_SPLIT = 0.2
BATCH_SIZE = 128
EPOCHS = 20
print('Training model...')
z = np.zeros((len(input_sequences), LATENT_DIM))
r = model.fit(
  [input_sequences, z, z],
  output_sequences,
  batch_size=BATCH_SIZE,
  epochs=EPOCHS,
  validation_split=VALIDATION_SPLIT
)

Training model...
Train on 11411 samples, validate on 2853 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [205]:
pred = r.model.predict([input_sequences, z, z]).round()
pred = np.reshape(pred, pred.shape[:2])
target = np.reshape(output_sequences, output_sequences.shape[:2])
f1_score(target, pred)

0.70875624324899289

In [119]:
pro_lines = []
pro_output = []
for item in train_data:
    pro_lines += [x[0] for x in item["propagandas"]]
    pro_output += [x[1] for x in item["propagandas"]]
pro_sequences = tokenizer.texts_to_sequences(pro_lines)

In [123]:
pro_sequences
pro_output


[array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [113]:
np.zeros(18, dtype=int)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [124]:
pro_lines

['But Jewish groups on and off campus have denounced the report as flawed, saying it missed the significance of an anti-Semitic text circulated online before Noah Lew was removed from the board last',
 'now public, and a target was placed squarely',
 'Lew wrote, \xe2\x80\x9cMy Jewish identity was now public, and a target was placed squarely upon me by the McGill',
 'BDS movement.\xe2\x80\x9d',
 'material that was \xe2\x80\x9cinsensitive to',
 'as corrupt and politically powerful.\xe2\x80\x9d\nThe Jewish groups said this anti-Semitic rhetoric \xe2\x80\x9cwas used to encourage students to vote specifically against Noah Lew.\nIt is under this context that the (general assembly) occurred, and the report fundamentally misunderstands this, which',
 'to take this opportunity to make it',
 'were overwhelmingly',
 'CEO of',
 'then called out',
 '\xe2\x80\x9cChristianity is Europe\xe2\x80\x99s last hope.\xe2\x80\x9d',
 'The first',
 'and Soros, calling',
 'adds:\nSoros, for his part, compared Or

In [174]:
np.reshape(output_sequences, output_sequences.shape[:2])

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)