# IMPORT OF PACKAGES

In [None]:
import glob
import io
import os
import pickle

import numpy as np
from keras.callbacks import Callback
from keras.layers import (Dense, Embedding, Input, LSTM, Bidirectional, Dropout, MaxPooling1D,
                          GlobalMaxPool1D, Conv1D, TimeDistributed)
from keras.models import Model, Sequential
from keras.optimizers import Adam, SGD
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

In [None]:
# Fix the random seeds up front: NumPy's global generator first, then
# TensorFlow's. Both helpers are called again right before each training run.
from numpy.random import seed
seed(1)
# NOTE(review): `set_random_seed` is imported from the top-level `tensorflow`
# package, which looks like the TensorFlow 1.x API — confirm the installed version.
from tensorflow import set_random_seed
set_random_seed(2)


# DATA LOAD

In [None]:
def loadDataTask3(folder):
    """Load every ``*.txt`` article in *folder* together with its task-3 labels.

    For each article file ``<id>.txt`` the companion file
    ``<id>.task3.labels`` in the same folder is parsed with
    ``readLabelTask3``.

    Args:
        folder: Directory containing the article and label files.

    Returns:
        A list with one dict per article, ordered by file name, with keys
        ``"id"`` (file name up to its first dot), ``"data"`` (the full text),
        ``"sentences"`` (the non-empty lines of the text) and ``"labels"``
        (the rows returned by ``readLabelTask3``).
    """
    result = []
    # Sort the matches so the sample order does not depend on the order in
    # which the file system happens to list the directory.
    for fileName in sorted(glob.glob(os.path.join(folder, "*.txt"))):
        # basename copes with either path separator, unlike splitting on "/".
        articleId = os.path.basename(fileName).split(".")[0]
        with open(fileName, "r", encoding="utf8") as f:
            data = f.read()
        sentences = [x for x in data.split("\n") if x != ""]
        labels = readLabelTask3(os.path.join(folder, articleId + ".task3.labels"))
        result.append({"id": articleId, "data": data, "sentences": sentences, "labels": labels})

    return result

def readLabelTask3(fileName):
    """Parse a tab-separated task-3 label file.

    Args:
        fileName: Path of the label file.

    Returns:
        A list with one entry per non-blank line; each entry is the list of
        tab-separated fields of that line, all kept as strings.
    """
    # Read as UTF-8, the same encoding used for the article text, instead of
    # relying on the platform default.
    with open(fileName, "r", encoding="utf8") as f:
        # Blank lines are skipped: they would otherwise become [""] rows and
        # break callers that index the span-offset columns.
        return [line.rstrip("\n").split("\t") for line in f if line.strip()]

In [None]:
# Load the two splits: each call returns one dict per article
# (id, raw text, non-empty lines, label rows).
train_data = loadDataTask3("train-split/tasks-2-3/train-train/")
dev_data = loadDataTask3("train-split/tasks-2-3/train-dev/")

In [None]:
# Technique names; a name's position in this list is its class index.
index2label = [
    "Appeal_to_Authority",
    "Appeal_to_fear-prejudice",
    "Bandwagon",
    "Black-and-White_Fallacy",
    "Causal_Oversimplification",
    "Doubt",
    "Exaggeration,Minimisation",
    "Flag-Waving",
    "Loaded_Language",
    "Name_Calling,Labeling",
    "Obfuscation,Intentional_Vagueness,Confusion",
    "Red_Herring",
    "Reductio_ad_hitlerum",
    "Repetition",
    "Slogans",
    "Straw_Men",
    "Thought-terminating_Cliches",
    "Whataboutism",
]

# Reverse lookup (technique name -> class index), derived from the list so the
# two tables always agree.
label2index = {name: position for position, name in enumerate(index2label)}

# PROPAGANDA IDENTIFICATION MODELLING

## Data prep

In [None]:
def updateSentence(sentence, char, start=None, stop=None):
    """Mask the characters of *sentence* in ``[start, stop)`` with *char*.

    Inside the range, punctuation characters and spaces become a space,
    newlines are kept, and every other character is replaced by *char*.
    Characters outside the range are returned unchanged.

    Args:
        sentence: Text to mask.
        char: Replacement written over each non-separator character.
        start: First index to mask. Defaults to 0; negative values are
            clamped to 0 instead of wrapping around to the end of the text.
        stop: Index one past the last character to mask. Defaults to, and is
            capped at, ``len(sentence)``.

    Returns:
        The masked copy of *sentence*.
    """
    # Raw string: the set includes a literal backslash, and "\]" is not a
    # valid escape sequence in a normal string literal.
    separators = r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~ '

    # Clamp the range once instead of bounds-checking every index.
    start = 0 if start is None else max(start, 0)
    stop = len(sentence) if stop is None else min(stop, len(sentence))

    s = list(sentence)
    for i in range(start, stop):
        if s[i] in separators:
            s[i] = " "
        elif s[i] != "\n":
            s[i] = char
    return "".join(s)

def word2label(word):
    """Return 0 when *word* starts with "A", otherwise 1."""
    return 0 if word[0] == "A" else 1
    
def sen2label(sen):
    """Return the ``word2label`` value of every non-empty word in *sen*, in order."""
    labels = []
    for word in sen:
        if word == "":
            # Consecutive separators produce empty tokens; they carry no label.
            continue
        labels.append(word2label(word))
    return labels

def getTrainData(data):
    """Build the inputs and per-word targets for the span-identification model.

    Args:
        data: Iterable of article dicts with keys ``"sentences"``, ``"data"``
            and ``"labels"``; columns 2 and 3 of each label row are read as
            the start and stop character offsets of a labelled span.

    Returns:
        A ``(texts, targets)`` tuple: *texts* is the concatenation of every
        article's sentences, *targets* holds one list of 0/1 word labels per
        non-empty line of the article's character mask.
    """
    texts = []
    targets = []
    for article in data:
        texts.extend(article["sentences"])

        # Start from an all-"A" mask, then overwrite each labelled span with "B".
        mask = updateSentence(article["data"], "A")
        for row in article["labels"]:
            span_start, span_stop = int(row[2]), int(row[3])
            mask = updateSentence(mask, "B", span_start, span_stop)

        for mask_line in mask.split("\n"):
            if mask_line != "":
                targets.append(sen2label(mask_line.split(" ")))

    return texts, targets

In [None]:
# Sentences and per-word 0/1 targets for the span-identification model.
train_X, train_Y = getTrainData(train_data)

## TOKENIZATION AND WORD EMBEDDING

In [None]:
### tokenization
MAX_VOCAB_SIZE = 50000
# An explicit out-of-vocabulary token keeps one index per word in every
# sequence. Without it, texts_to_sequences drops unknown words, which shifts
# the remaining tokens relative to the per-word labels built by getTrainData.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(train_X)
word2idx = tokenizer.word_index
idx2word = {v:k for k, v in word2idx.items()}
# +1 because word indices start at 1; index 0 is left for padding.
num_words = min(MAX_VOCAB_SIZE, len(word2idx) + 1)
# Persist the fitted tokenizer so the same vocabulary can be reloaded later.
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Length of the longest tokenised training sentence.
max_sequence_length = max(map(len, tokenizer.texts_to_sequences(train_X)))
print('Max sequence length:', max_sequence_length)
print(num_words)

In [None]:
EMBEDDING_DIM = 200
# load in pre-trained word vectors
print('Loading word vectors...')
word2vec = {}
glove_path = os.path.join('glove.6B', 'glove.6B.%sd.txt' % EMBEDDING_DIM)
# Decode explicitly as UTF-8 rather than with the platform default encoding,
# which cannot be relied on to read the vector file's non-ASCII tokens.
with open(glove_path, encoding='utf8') as f:
    # is just a space-separated text file in the format:
    # word vec[0] vec[1] vec[2] ...
    for line in f:
        values = line.split()
        if not values:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        word2vec[word] = vec
print('Found %s word vectors.' % len(word2vec))

In [None]:
# prepare embedding matrix
print('Filling pre-trained embeddings...')

# Every row starts at zero, so a word without a pre-trained vector keeps an
# all-zero embedding.
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word2idx.items():
    if i >= MAX_VOCAB_SIZE:
        # Index falls outside the vocabulary cap; no row to fill.
        continue
    embedding_vector = word2vec.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

## Model Estimation

In [None]:
LATENT_DIM = 32

print('Building model...')

# Span-identification network: frozen pre-trained embeddings, three Conv1D +
# MaxPooling1D stages, then a per-position dense head ending in one sigmoid unit.
model = Sequential()
model.add(Embedding(num_words, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False))
# Every convolution and pooling layer uses padding="same" and the pooling
# layers use strides=1, so the sequence length is kept through the stack.
model.add(Conv1D(filters=LATENT_DIM, kernel_size=5, padding="same"))
model.add(MaxPooling1D(pool_size=3, strides=1, padding="same"))
model.add(Conv1D(filters=LATENT_DIM, kernel_size=4, padding="same"))
model.add(MaxPooling1D(pool_size=4, strides=1, padding="same"))
model.add(Conv1D(filters=LATENT_DIM, kernel_size=3, padding="same"))
model.add(MaxPooling1D(pool_size=5, strides=1, padding="same"))
model.add(TimeDistributed(Dense(20, activation="relu")))
model.add(Dense(1, activation="sigmoid"))
model.compile(
  loss='binary_crossentropy',
  optimizer=Adam(lr=0.01),
  metrics=['accuracy']
)

# summary() prints the table itself and returns None, so it is not wrapped in
# print() (that would emit an extra "None" line).
model.summary()

In [None]:
# Turn the training sentences into index sequences and pad both inputs and
# targets at the end ('post') to max_sequence_length.
input_sequences = pad_sequences(tokenizer.texts_to_sequences(train_X),
                                maxlen=max_sequence_length, padding='post')
output_sequences = pad_sequences(train_Y, maxlen=max_sequence_length, padding='post')
# Append a trailing axis of size 1 to the targets.
output_sequences = np.reshape(output_sequences, output_sequences.shape + (1,))

print('Shape of data tensor:', input_sequences.shape)
print('Shape of output tensor:', output_sequences.shape)

VALIDATION_SPLIT = 0.2
BATCH_SIZE = 128
EPOCHS = 5
print('Training model...')
# NOTE(review): `z` is not read anywhere in the visible source — confirm it is still needed.
z = np.zeros((len(input_sequences), LATENT_DIM))
# Re-seed both generators immediately before training.
seed(1)
set_random_seed(2)
# Same loss/optimizer/metrics as the compile call made when the model was built.
model.compile(
  loss='binary_crossentropy',
  optimizer=Adam(lr=0.01),
  metrics=['accuracy']
)
model.fit(
  input_sequences,
  output_sequences,
  batch_size=BATCH_SIZE,
  epochs=EPOCHS,
  validation_split=VALIDATION_SPLIT
)

# Score the model on the data it was trained on: round the outputs, then keep
# only the first two axes of predictions and targets.
pred_Y = model.predict(input_sequences).round()
pred_Y = np.reshape(pred_Y, pred_Y.shape[:2])
act_Y = np.reshape(output_sequences, output_sequences.shape[:2])

f1 = f1_score(act_Y, pred_Y, average='micro')
print(f1)

In [None]:
# Store the architecture as JSON and the weights in a separate file.
with open('model_prob_cnn_v1.json', "w") as json_file:
    architecture_json = model.to_json()
    json_file.write(architecture_json)
model.save_weights('model_prob_cnn_v1.h5')
print("Saved model to disk")

## Model Validation

In [None]:
# Build the development-set tensors with the already-fitted tokenizer and the
# padded length computed on the training sentences.
dev_X, dev_Y = getTrainData(dev_data)
dev_token_ids = tokenizer.texts_to_sequences(dev_X)
input_sequences_dev = pad_sequences(dev_token_ids, maxlen=max_sequence_length, padding='post')
output_sequences_dev = pad_sequences(dev_Y, maxlen=max_sequence_length, padding='post')
# Reshaping to the array's own shape leaves its contents as they are.
output_sequences_dev = output_sequences_dev.reshape(output_sequences_dev.shape)

In [None]:
# Micro-averaged F1 of the span model on the development set.
dev_scores = model.predict(input_sequences_dev)
pred_Y = np.round(dev_scores)
# Keep only the first two axes so predictions line up with the 2-D targets.
pred_Y = pred_Y.reshape(pred_Y.shape[:2])

f1 = f1_score(output_sequences_dev, pred_Y, average='micro')
print(f1)

# LABELS MODELLING

## Data Prep

In [None]:
def labelOutcome(label):
    """Return the one-hot vector for the technique named *label*.

    Args:
        label: A key of ``label2index``.

    Returns:
        An integer NumPy vector with one slot per entry of ``label2index``,
        all zeros except for a 1 at the label's index.

    Raises:
        KeyError: If *label* is not a key of ``label2index``.
    """
    indx = label2index[label]
    # Size the vector from the label table instead of a hard-coded 18 so the
    # two cannot get out of sync.
    result = np.zeros(len(label2index), dtype=int)
    result[indx] = 1
    return result

def getLabelsData(data):
    """Build the inputs and one-hot targets for the technique classifier.

    Args:
        data: Iterable of article dicts with keys ``"data"`` and ``"labels"``;
            column 1 of each label row is the technique name and columns 2
            and 3 are the start and stop character offsets of the span.

    Returns:
        A ``(texts, targets)`` tuple with one entry per non-empty line of
        every labelled span; each target is ``labelOutcome`` of the span's
        technique name.
    """
    texts = []
    targets = []
    for article in data:
        for row in article["labels"]:
            span_text = article["data"][int(row[2]):int(row[3])]
            # A span crossing a line break yields several samples that share
            # the span's label.
            pieces = [piece for piece in span_text.split("\n") if piece != ""]
            piece_targets = [labelOutcome(row[1]) for _ in pieces]

            texts.extend(pieces)
            targets.extend(piece_targets)

    return texts, targets

In [None]:
# Rebind train_X / train_Y to the labelled-span texts and their one-hot targets.
train_X, train_Y = getLabelsData(train_data)

In [None]:
### tokenization
# Fit a fresh tokenizer on the labelled spans; this rebinds the module-level
# vocabulary names (tokenizer, word2idx, idx2word, num_words).
MAX_VOCAB_SIZE = 50000
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(train_X)
word2idx = tokenizer.word_index
idx2word = {index: token for token, index in word2idx.items()}
num_words = min(len(word2idx) + 1, MAX_VOCAB_SIZE)
with open('tokenizer_label.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, pickle.HIGHEST_PROTOCOL)

In [None]:
# Length of the longest tokenised labelled span.
max_sequence_length = max(map(len, tokenizer.texts_to_sequences(train_X)))
print('Max sequence length:', max_sequence_length)
print(num_words)

## Model Estimation

In [None]:
LATENT_DIM = 32
print('Building model...')

# The tokenizer was re-fitted on the labelled spans, so word indices and
# num_words no longer match the embedding matrix built for the span model.
# Build a matrix for the current vocabulary; words without a pre-trained
# vector keep an all-zero row.
label_embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for label_word, label_word_index in word2idx.items():
    if label_word_index < num_words:
        label_word_vector = word2vec.get(label_word)
        if label_word_vector is not None:
            label_embedding_matrix[label_word_index] = label_word_vector

# Technique classifier: frozen pre-trained embeddings, three convolutions,
# global max pooling over the sequence, then a dense head with an 18-way softmax.
modelCNN = Sequential()
modelCNN.add(Embedding(num_words, EMBEDDING_DIM, weights=[label_embedding_matrix], trainable=False))
modelCNN.add(Conv1D(filters=LATENT_DIM, kernel_size=5, padding="same"))
modelCNN.add(MaxPooling1D(pool_size=3, strides=1, padding="same"))
modelCNN.add(Conv1D(filters=LATENT_DIM, kernel_size=4, padding="same"))
modelCNN.add(MaxPooling1D(pool_size=4, strides=1, padding="same"))
modelCNN.add(Conv1D(filters=LATENT_DIM, kernel_size=3, padding="same"))
modelCNN.add(GlobalMaxPool1D())
modelCNN.add(Dropout(0.2))
modelCNN.add(Dense(128, activation="relu"))
modelCNN.add(Dropout(0.2))
modelCNN.add(Dense(18, activation="softmax"))
# One-hot targets with a softmax output call for categorical cross-entropy,
# the same loss used by the compile call that precedes training.
modelCNN.compile(
  loss='categorical_crossentropy',
  optimizer=Adam(lr=0.01),
  metrics=['accuracy']
)

# summary() prints the table itself and returns None, so it is not wrapped in
# print() (that would emit an extra "None" line).
modelCNN.summary()

In [None]:
# Index sequences padded at the end ('post') to max_sequence_length, and the
# one-hot targets stacked into a single array.
input_sequences = pad_sequences(tokenizer.texts_to_sequences(train_X),
                                maxlen=max_sequence_length, padding='post')
output_sequences = np.array(train_Y)
print('Shape of data tensor:', input_sequences.shape)
print('Shape of output tensor:', output_sequences.shape)

VALIDATION_SPLIT = 0.2
BATCH_SIZE = 128
EPOCHS = 5
print('Training model...')
# NOTE(review): `z` is not read anywhere in the visible source — confirm it is still needed.
z = np.zeros((len(input_sequences), LATENT_DIM))
# Re-seed both generators immediately before training.
seed(1)
set_random_seed(2)
modelCNN.compile(
  loss='categorical_crossentropy',
  optimizer=Adam(lr=0.01),
  metrics=['accuracy']
)
modelCNN.fit(
  input_sequences,
  output_sequences,
  batch_size=BATCH_SIZE,
  epochs=EPOCHS,
  validation_split=VALIDATION_SPLIT
)

# Score the classifier that was just trained (modelCNN), not the
# span-identification `model`, on its own training data.
# NOTE(review): rounding softmax outputs gives an all-zero row whenever no
# class exceeds 0.5 — confirm this is the intended decision rule.
pred_Y = modelCNN.predict(input_sequences).round()

f1 = f1_score(output_sequences, pred_Y, average='micro')
print(f1)

In [None]:
# Store the classifier's architecture as JSON and its weights in a separate file.
with open('model_label_cnn_v1.json', "w") as json_file:
    architecture_json = modelCNN.to_json()
    json_file.write(architecture_json)
modelCNN.save_weights('model_label_cnn_v1.h5')
print("Saved model to disk")

## Model Validation

In [None]:
# Development-set tensors for the technique classifier, built with the
# already-fitted tokenizer and the padded length of the training spans.
dev_X, dev_Y = getLabelsData(dev_data)
dev_token_ids = tokenizer.texts_to_sequences(dev_X)
input_sequences_dev = pad_sequences(dev_token_ids, maxlen=max_sequence_length, padding='post')
output_sequences_dev = np.array(dev_Y)

In [None]:
# Micro-averaged F1 of the technique classifier on the development set.
dev_scores = modelCNN.predict(input_sequences_dev)
pred_Y = np.round(dev_scores)

f1 = f1_score(output_sequences_dev, pred_Y, average='micro')
print(f1)