# PACKAGE IMPORT

In [None]:
import glob
import os
import numpy as np
import io
import pickle
import re
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense, Embedding, Input, LSTM
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam, SGD
from keras.callbacks import Callback
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from itertools import groupby
from keras.models import model_from_json

In [None]:
# Fix the random seeds before any model work so repeated runs are comparable.
# Seed NumPy's global random number generator.
from numpy.random import seed
seed(1)
# Seed TensorFlow's random ops.
# NOTE(review): the top-level `set_random_seed` name looks like the
# TensorFlow 1.x API — confirm the TensorFlow version this is run against.
from tensorflow import set_random_seed
set_random_seed(2)

# DATA LOAD

In [None]:
# Captures the numeric article id from a file stem such as "article123...".
regex = re.compile("article([0-9]+).*")


def loadDataTask3(folder):
    """Load every ``*.txt`` article found directly inside *folder*.

    Args:
        folder: Directory containing files named ``article<digits>...txt``.

    Returns:
        A list with one dict per article, in sorted file-name order:
            ``"id"``        -- digits captured from the file name (a string),
            ``"data"``      -- full text of the file,
            ``"sentences"`` -- the non-empty lines of the text,
            ``"split"``     -- ``(start, end)`` character offsets into
                               ``"data"`` for each run of non-newline
                               characters.
        ``"sentences"`` and ``"split"`` have the same length and order because
        both ignore empty lines.

    Files whose name does not match the ``article<digits>`` pattern are
    reported and skipped.
    """
    result = []
    # glob returns matches in arbitrary order; sort so the output order is
    # reproducible between runs and machines.
    for fileName in sorted(glob.glob(os.path.join(folder, "*.txt"))):
        # os.path.basename understands the platform's path separator, unlike
        # splitting on "/" by hand.
        article = os.path.basename(fileName).split(".")[0]
        match = regex.match(article)
        if match is None:
            print("Skipping file with unexpected name:", fileName)
            continue
        articleId = match.group(1)

        # The context manager closes the file even if reading fails.
        with open(fileName, "r", encoding="utf8") as f:
            data = f.read()

        sentences = [x for x in data.split("\n") if x != ""]
        split = list(splitWithIndices(data, "\n"))
        result.append({"id": articleId, "data": data, "sentences": sentences, "split": split})

    return result

def splitWithIndices(s, c=' '):
    """Yield ``(start, end)`` for every maximal run of items in *s* that differ from *c*.

    ``start`` is inclusive and ``end`` is exclusive, so ``s[start:end]`` is the
    run itself. Consecutive separators produce no empty runs. *s* may be a
    string or any other iterable of items comparable with *c*.
    """
    run_start = None  # index where the current run began, None while on separators
    length = 0
    for index, item in enumerate(s):
        length = index + 1
        if item == c:
            # A separator closes the run that was open, if any.
            if run_start is not None:
                yield run_start, index
                run_start = None
        elif run_start is None:
            run_start = index
    # A run that reaches the end of the input is closed by the input length.
    if run_start is not None:
        yield run_start, length

In [None]:
# Choose the article folder to run prediction on. Exactly one line should be
# active; the commented-out lines are the alternative folders.
#dev_data = loadDataTask3("train-split/tasks-2-3/train-dev/")
#dev_data = loadDataTask3("tasks-2-3/train/")
#dev_data = loadDataTask3("dev-INPUT/tasks-2-3/dev/")
dev_data = loadDataTask3("test-INPUT/tasks-2-3/test/")

# DATA PREP ROUTINES

In [None]:
### tokenization
# Restore the tokenizer from disk.
# NOTE: pickle.load can execute arbitrary code contained in the file, so only
# load a tokenizer.pickle that comes from a trusted source.
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)
# Word -> index mapping taken from the tokenizer, and its inverse.
word2idx = tokenizer.word_index
idx2word = {v:k for k, v in word2idx.items()}
# Length every sentence is padded to in predictModel.
# NOTE(review): presumably has to match the input length the saved model was
# built with — confirm against the training notebook.
max_sequence_length = 129

In [None]:
# Technique names in class-index order: position i holds the name of class i.
index2label = [
    "Appeal_to_Authority",
    "Appeal_to_fear-prejudice",
    "Bandwagon",
    "Black-and-White_Fallacy",
    "Causal_Oversimplification",
    "Doubt",
    "Exaggeration,Minimisation",
    "Flag-Waving",
    "Loaded_Language",
    "Name_Calling,Labeling",
    "Obfuscation,Intentional_Vagueness,Confusion",
    "Red_Herring",
    "Reductio_ad_hitlerum",
    "Repetition",
    "Slogans",
    "Straw_Men",
    "Thought-terminating_Cliches",
    "Whataboutism",
]

# Reverse lookup (name -> class index), derived from the list above so the two
# mappings cannot drift apart.
label2index = {label: position for position, label in enumerate(index2label)}

In [None]:
# NOTE(review): not referenced anywhere in the visible part of this file;
# presumably a layer size left over from the training notebook — confirm or remove.
LATENT_DIM = 25

def updateSentence(sentence, char, start=None, stop=None):
    """Mask the characters of *sentence* in ``[start, stop)`` with *char*.

    Inside the range, punctuation and space characters become a single space,
    newlines are left untouched and every other character is replaced by
    *char*. Characters outside the range are returned unchanged.

    Args:
        sentence: Text to mask.
        char: Replacement written over every non-separator character.
        start: First index to mask; defaults to 0.
        stop: Index one past the last character to mask; defaults to
            ``len(sentence)``. Values beyond the end of the text are clamped.

    Returns:
        The masked text as a new string.
    """
    if start is None:
        start = 0
    if stop is None:
        stop = len(sentence)

    # The backslash is escaped explicitly: the previous literal relied on the
    # invalid escape sequence "\]", which newer Python versions warn about.
    separators = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~ '

    s = list(sentence)
    # Clamp once instead of testing the bound on every iteration.
    for i in range(start, min(stop, len(s))):
        if s[i] in separators:
            s[i] = " "
        elif s[i] != "\n":
            s[i] = char
    return "".join(s)

def word2label(word):
    """Return 0 when *word* starts with the character "A", otherwise 1.

    NOTE(review): "A" is presumably the mask character written by
    updateSentence during training-data preparation — confirm with the caller.
    An empty *word* raises IndexError.
    """
    return 0 if word[0] == "A" else 1
    
def sen2label(sen):
    """Map every non-empty item of *sen* to its label via word2label.

    Empty strings are skipped; the order of the remaining items is kept.
    """
    labels = []
    for token in sen:
        if token != "":
            labels.append(word2label(token))
    return labels

def predictModel(tokenizer, data, version, result):
    """Run the span-detection model on every sentence and collect flagged spans.

    Args:
        tokenizer: Object whose ``texts_to_sequences`` turns sentences into
            index sequences.
        data: List of article dicts; the keys ``"id"``, ``"sentences"`` and
            ``"split"`` are read (see loadDataTask3).
        version: Suffix used to locate ``model_prob_cnn_<version>.json`` and
            ``model_prob_cnn_<version>.h5``.
        result: List that receives the output rows; it is extended in place.

    Returns:
        The same *result* list. Each appended row is
        ``[articleId, str(start), str(end), text]`` where ``start``/``end``
        are the offsets returned by extractIndices and ``text`` is the matching
        slice of the sentence.
    """
    pred_X = []    # every sentence of every article, flattened
    pred_loc = []  # (articleId, start, end) for the sentence at the same position
    for x in data:
        pred_X.extend(x["sentences"])
        pred_loc.extend((x["id"], y[0], y[1]) for y in x["split"])

    input_sequences = pad_sequences(tokenizer.texts_to_sequences(pred_X),
                                    maxlen=max_sequence_length, padding='post')

    print('Shape of data tensor:', input_sequences.shape)

    # Load the architecture from JSON; the context manager closes the file
    # even if reading fails.
    with open('model_prob_cnn_{:}.json'.format(version), 'r') as json_file:
        loaded_model_json = json_file.read()
    model = model_from_json(loaded_model_json)
    # load weights into new model
    model.load_weights('model_prob_cnn_{:}.h5'.format(version))
    print("Loaded model from disk")

    # Round the model output to integer flags and keep only the first two axes.
    # NOTE(review): assumes the model emits one value per token position with a
    # trailing axis of size 1 — confirm against the model definition.
    pred_Y = model.predict(input_sequences).round().astype(int)
    pred_Y = np.reshape(pred_Y, pred_Y.shape[:2])

    for i, x in enumerate(pred_Y):
        # Only sentences with at least one flagged position can produce spans.
        if sum(x) > 0:
            articleId = pred_loc[i][0]
            sen_start = pred_loc[i][1]
            sen = pred_X[i]
            pos = extractIndices(sen, x, sen_start)
            for p in pos:
                # The offsets in p include sen_start; subtract it to slice the sentence.
                result.append([articleId, str(p[0]), str(p[1]), sen[(p[0] - sen_start):(p[1] - sen_start)]])
    return result

In [None]:
def extractIndices(sentence, pred, sen_start):
    """Convert per-word prediction flags into character spans.

    The sentence is split into words on punctuation and spaces. Consecutive
    words whose flag equals 1 are merged into one span.

    Args:
        sentence: Text of one sentence.
        pred: Sequence of flags; ``pred[i]`` belongs to the i-th word. It may be
            longer than the number of words (extra entries are ignored) or
            shorter (words without a flag are treated as not flagged).
        sen_start: Offset added to every returned index.

    Returns:
        A list of ``(start, end)`` tuples, ``end`` exclusive, shifted by
        *sen_start*. Runs consisting of a single word are dropped, because a
        run is kept only when its first word index is strictly smaller than
        its last.
    """
    # The backslash is escaped explicitly: the previous literal relied on the
    # invalid escape sequence "\]", which newer Python versions warn about.
    separators = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~ '
    chars = [" " if ch in separators else ch for ch in sentence]

    # (start, end) character offsets of every word
    words = list(splitWithIndices(chars))

    # zip stops at the shorter input, so a sentence with more words than
    # prediction slots no longer raises IndexError.
    # NOTE(review): predictModel pads/truncates to a fixed length; check how
    # pad_sequences truncates over-long sentences before trusting the
    # word/flag alignment in that case.
    flagged = [i for i, (_, flag) in enumerate(zip(words, pred)) if flag == 1]
    if not flagged:
        return []

    # Merge consecutive word indices into (first, last) runs.
    runs = []
    run_start = prev = flagged[0]
    for i in flagged[1:]:
        if prev + 1 < i:
            runs.append((run_start, prev))
            run_start = i
        prev = i
    runs.append((run_start, prev))

    return [(words[first][0] + sen_start, words[last][1] + sen_start)
            for first, last in runs if first < last]
        
    
def predictLabel(data, version):
    """Assign a technique label to every span found by predictModel.

    Args:
        data: List of rows ``[articleId, start, end, text]``; the text in
            position 3 is what gets classified.
        version: Suffix used to locate ``model_label_cnn_<version>.json`` and
            ``model_label_cnn_<version>.h5``.

    Returns:
        A list of rows ``[articleId, label, start, end]`` in the same order as
        *data*, where ``label`` is the ``index2label`` entry with the highest
        predicted score. Empty *data* yields an empty list without loading
        the model.
    """
    # Nothing to classify: avoid calling the model on an empty batch.
    if not data:
        return []

    # NOTE: pickle.load can execute arbitrary code contained in the file, so
    # only load a tokenizer_label.pickle that comes from a trusted source.
    with open('tokenizer_label.pickle', 'rb') as handle:
        tokenizer_label = pickle.load(handle)
    pred_X = [x[3] for x in data]
    # Named differently from the module-level max_sequence_length so the
    # label model's own padding length does not shadow it.
    label_sequence_length = 81
    input_sequences = pad_sequences(tokenizer_label.texts_to_sequences(pred_X),
                                    maxlen=label_sequence_length, padding='post')
    print('Shape of data tensor:', input_sequences.shape)

    # Load the architecture from JSON; the context manager closes the file
    # even if reading fails.
    with open('model_label_cnn_{:}.json'.format(version), 'r') as json_file:
        loaded_model_json = json_file.read()
    model = model_from_json(loaded_model_json)
    # load weights into new model
    model.load_weights('model_label_cnn_{:}.h5'.format(version))
    print("Loaded model from disk")

    # Index of the highest score for each input row.
    pred_Y = [np.argmax(x) for x in model.predict(input_sequences)]

    return [[row[0], index2label[label_idx], row[1], row[2]]
            for row, label_idx in zip(data, pred_Y)]

def exportResult(result):
    """Write prediction rows to ``example-submission-task3-predictions_comb.txt``.

    Args:
        result: Iterable of rows; the first four items of each row must be
            strings. They are written tab-separated, one row per line, in the
            order given.

    The file is created in the current working directory and overwritten if it
    already exists.
    """
    with open("example-submission-task3-predictions_comb.txt", "w") as fout:
        # Rows are written in input order; the former groupby by article id
        # only re-iterated the same rows and is not needed.
        fout.writelines('\t'.join(row[0:4]) + "\n" for row in result)

# PREDICTION EXPORT

In [None]:
# Stage 1: locate candidate spans in every sentence, starting from an empty list.
result = predictModel(tokenizer, dev_data, "v1", [])
# Stage 2: attach a technique label to each span.
result = predictLabel(result, "v1")
# Keep only the techniques we predict well enough; rows carrying any of the
# labels below are dropped before export.
result = [
    row for row in result
    if row[1] not in {
        "Appeal_to_fear-prejudice",
        "Appeal_to_Authority",
        "Obfuscation,Intentional_Vagueness,Confusion",
        "Bandwagon",
        "Reductio_ad_hitlerum",
        "Straw_Men",
        "Whataboutism",
        "Red_Herring",
        "Repetition",
        "Thought-terminating_Cliches",
        "Slogans",
        "Causal_Oversimplification",
    }
]
exportResult(result)