In [1]:
import glob
import os
import numpy as np
import io
import pickle
import re
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense, Embedding, Input, LSTM
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam, SGD
from keras.callbacks import Callback
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from itertools import groupby
from keras.models import model_from_json

Using TensorFlow backend.


In [2]:
# Seed NumPy's global random number generator.
from numpy.random import seed
seed(1)
# Seed TensorFlow's random number generator.
# NOTE(review): `set_random_seed` is imported from the top-level tensorflow
# namespace; presumably this only exists in TensorFlow 1.x — confirm against
# the installed version.
from tensorflow import set_random_seed
set_random_seed(2)

In [35]:
# Matches names that start with "article" followed by digits; group 1 captures the digits.
regex = re.compile("article([0-9]+).*")

def loadDataTask3(folder):
    """Read every ``*.txt`` file in ``folder`` and split its text into lines.

    Parameters
    ----------
    folder : str
        Directory that is searched with the pattern ``folder + "/*.txt"``.

    Returns
    -------
    list of dict
        One dict per file, in sorted file-name order, with the keys

        * ``"id"``        -- digits captured from the file name by the
          module-level ``regex``,
        * ``"data"``      -- the complete file content,
        * ``"sentences"`` -- the non-empty lines of ``data``,
        * ``"split"``     -- ``(start, end)`` character offsets of those
          lines inside ``data`` (same order as ``"sentences"``).

        No ``"labels"`` key is produced; see the comment in the loop.

    Raises
    ------
    AttributeError
        If a file's base name does not match ``regex``.
    """
    result = []
    # glob returns files in filesystem order; sorting keeps runs reproducible.
    for fileName in sorted(glob.glob(folder + "/*.txt")):
        # os.path.basename handles both "/" and the platform's own separator,
        # which glob may emit for the final path component.
        article = os.path.basename(fileName).split(".")[0]
        articleId = regex.match(article).group(1)
        # Gold labels are not loaded. To restore them, read
        # folder + "/" + article + ".task3.labels" with readLabelTask3 and
        # store the rows under a "labels" key.
        with open(fileName, "r", encoding="utf8") as f:
            data = f.read()
        sentences = [x for x in data.split("\n") if x != ""]
        split = list(splitWithIndices(data, "\n"))
        result.append({"id": articleId, "data": data, "sentences": sentences, "split": split})

    return result

def readLabelTask3(fileName):
    """Read a tab-separated file into a list of rows.

    Parameters
    ----------
    fileName : str
        Path of the file to read. It is decoded as UTF-8, matching how
        ``loadDataTask3`` reads the article text.

    Returns
    -------
    list of list of str
        One entry per line; each line has its newline removed and is split
        on tab characters. An empty line yields ``[""]``.
    """
    # The context manager closes the file even if reading raises.
    with open(fileName, "r", encoding="utf8") as f:
        return [line.replace("\n", "").split("\t") for line in f]

def splitWithIndices(s, c=' '):
    """Yield ``(start, end)`` offsets of each maximal run of items not equal to ``c``.

    ``s`` may be a string or any iterable of items; ``end`` is exclusive, so
    ``s[start:end]`` is the run itself when ``s`` supports slicing.
    """
    offset = 0
    for is_separator, run in groupby(s, key=lambda item: item == c):
        run_end = offset + len(list(run))
        if not is_separator:
            # Half-open interval; use run_end - 1 for an inclusive end.
            yield offset, run_end
        offset = run_end

In [55]:
# Alternative input folder, currently disabled:
#dev_data = loadDataTask3("train-split/tasks-2-3/train-dev/")
# Articles that the cells below run predictions on.
dev_data = loadDataTask3("dev-INPUT/tasks-2-3/dev/")

In [37]:
### tokenization
# NOTE: pickle.load can execute arbitrary code; only load tokenizer.pickle
# from a trusted source.
# NOTE(review): presumably a fitted Keras Tokenizer saved by the training
# notebook — confirm.
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)
# Mapping exposed by the tokenizer as `word_index`, plus its inverse.
word2idx = tokenizer.word_index
idx2word = {v:k for k, v in word2idx.items()}
# Padded sequence length; predictModel reads this as a global.
max_sequence_length = 129

In [38]:
# Label names; the position in this list is the label's integer index.
index2label = [
    "Appeal_to_Authority",
    "Appeal_to_fear-prejudice",
    "Bandwagon",
    "Black-and-White_Fallacy",
    "Causal_Oversimplification",
    "Doubt",
    "Exaggeration,Minimisation",
    "Flag-Waving",
    "Loaded_Language",
    "Name_Calling,Labeling",
    "Obfuscation,Intentional_Vagueness,Confusion",
    "Red_Herring",
    "Reductio_ad_hitlerum",
    "Repetition",
    "Slogans",
    "Straw_Men",
    "Thought-terminating_Cliches",
    "Whataboutism",
]

# Reverse lookup (name -> index), derived from the list so the two cannot drift apart.
label2index = {label: position for position, label in enumerate(index2label)}

In [39]:
# NOTE(review): presumably the latent/state size of an earlier model variant;
# it is a module-level constant, not a parameter of any function in this
# notebook — confirm whether it is still needed.
LATENT_DIM = 25

def updateSentence(sentence, char, start=None, stop=None):
    """Mask the characters of ``sentence`` inside the range ``[start, stop)``.

    Within the range, punctuation characters and spaces become a single
    space, newlines are kept, and every other character is replaced by
    ``char``. Characters outside the range are returned unchanged.

    Parameters
    ----------
    sentence : str
        Text to mask.
    char : str
        Replacement written for every non-separator character.
    start : int, optional
        First index to mask; defaults to 0.
    stop : int, optional
        Index after the last one to mask; defaults to ``len(sentence)``.
        Values beyond the end of ``sentence`` are clamped.

    Returns
    -------
    str
        The masked text, the same length as ``sentence`` when ``char`` is a
        single character.
    """
    # The backslash is escaped explicitly: an unescaped "\]" is an invalid
    # escape sequence that newer Python versions warn about.
    punctuation = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~ '

    if start is None:
        start = 0
    if stop is None:
        stop = len(sentence)

    s = list(sentence)
    # Clamp the upper bound once instead of testing it on every iteration.
    for i in range(start, min(stop, len(s))):
        if s[i] in punctuation:
            s[i] = " "
        elif s[i] != "\n":
            s[i] = char
    return "".join(s)

def word2label(word):
    """Return 0 when ``word`` starts with ``"A"`` and 1 for any other first character."""
    return 0 if word[0] == "A" else 1
    
def sen2label(sen):
    """Map every non-empty word in ``sen`` to its ``word2label`` value, keeping order."""
    labels = []
    for word in sen:
        if word == "":
            # Empty strings (e.g. from consecutive separators) carry no label.
            continue
        labels.append(word2label(word))
    return labels

def predictModel(tokenizer, data, version, result, calcPerf=False):
    """Run the saved span model over every sentence and collect flagged spans.

    The architecture is read from ``model_prob_cnn_<version>.json`` and the
    weights from ``model_prob_cnn_<version>.h5``. Predictions are rounded to
    integers, and for every sentence with at least one non-zero position the
    runs of 1s are converted to character spans by ``extractIndices``.

    Parameters
    ----------
    tokenizer : object
        Must provide ``texts_to_sequences``.
    data : list of dict
        Article dicts with the keys ``"id"``, ``"sentences"`` and ``"split"``
        (as built by ``loadDataTask3``). When ``calcPerf`` is true, each dict
        must also provide ``"data"`` and ``"labels"``, where every label row
        holds a start offset at index 2 and an end offset at index 3.
    version : str
        Suffix used in the model file names.
    result : list
        Output accumulator; rows are appended to it in place.
    calcPerf : bool, optional
        When true, print a micro-averaged F1 score against ``"labels"``.

    Returns
    -------
    list
        ``result`` itself. Each appended row is
        ``[article id, start, end, text]`` with ``start`` and ``end`` as
        strings and ``text`` the slice of the sentence covered by the span.

    Notes
    -----
    Sequences are padded to the module-level ``max_sequence_length``.
    NOTE(review): position ``i`` of a prediction is treated as word ``i`` of
    the sentence. ``pad_sequences`` truncates from the front by default and
    the tokenizer may split or drop words differently from
    ``extractIndices`` — confirm that the positions stay aligned.
    """
    pred_X = []
    pred_loc = []
    for x in data:
        pred_X.extend(x["sentences"])
        pred_loc.extend((x["id"], y[0], y[1]) for y in x["split"])

    # Nothing to predict: skip loading the model and return the accumulator as is.
    if not pred_X:
        return result

    input_sequences = pad_sequences(tokenizer.texts_to_sequences(pred_X),
                                    maxlen=max_sequence_length, padding='post')
    print('Shape of data tensor:', input_sequences.shape)

    # Load the architecture from JSON; the context manager closes the file
    # even if reading raises.
    with open('model_prob_cnn_{:}.json'.format(version), 'r') as json_file:
        model = model_from_json(json_file.read())
    # Load the weights into the new model.
    model.load_weights('model_prob_cnn_{:}.h5'.format(version))
    print("Loaded model from disk")

    pred_Y = model.predict(input_sequences).round().astype(int)
    # Keep only the first two axes (presumably sentence x position; the
    # trailing axis is assumed to have size 1 — TODO confirm).
    pred_Y = np.reshape(pred_Y, pred_Y.shape[:2])

    if calcPerf:
        act_Y = []
        for x in data:
            # Mask every word as "A", then overwrite labelled ranges with "B";
            # sen2label turns the masked words into 0/1 targets.
            dataMask = updateSentence(x["data"], "A")
            for l in x["labels"]:
                dataMask = updateSentence(dataMask, "B", int(l[2]), int(l[3]))
            act_Y += [sen2label(y.split(" ")) for y in dataMask.split("\n") if y != ""]

        output_sequences = pad_sequences(act_Y, maxlen=max_sequence_length, padding='post')
        print(f1_score(output_sequences, pred_Y, average='micro'))

    for i, x in enumerate(pred_Y):
        if sum(x) > 0:
            articleId = pred_loc[i][0]
            sen_start = pred_loc[i][1]
            sen = pred_X[i]
            for p in extractIndices(sen, x, sen_start):
                # Span offsets are relative to the article; subtract the
                # sentence start to slice the sentence text.
                result.append([articleId, str(p[0]), str(p[1]),
                               sen[(p[0] - sen_start):(p[1] - sen_start)]])
    return result

In [52]:
def extractIndices(sentence, pred, sen_start):
    """Convert per-word 0/1 predictions into character spans.

    The sentence is split into words on spaces and punctuation. Consecutive
    words whose prediction equals 1 are merged into one span.

    Parameters
    ----------
    sentence : str
        Sentence text.
    pred : sequence of int
        Prediction per word position; position ``i`` is applied to word
        ``i``. It may be longer or shorter than the number of words; words
        beyond ``len(pred)`` are treated as not predicted.
    sen_start : int
        Offset added to every returned index.

    Returns
    -------
    list of tuple
        ``(start, end)`` pairs with ``end`` exclusive, shifted by
        ``sen_start``. Runs that consist of a single word are dropped.
    """
    # The backslash is escaped explicitly: an unescaped "\]" is an invalid
    # escape sequence that newer Python versions warn about.
    separators = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~ '
    s = [" " if ch in separators else ch for ch in sentence]

    # Word split: (start, end) offsets of every word in the sentence.
    split = list(splitWithIndices(s))

    # Indices of predicted words. Bounded by len(pred) so a sentence with
    # more words than prediction positions does not raise IndexError.
    usable = min(len(split), len(pred))
    idx = [i for i in range(usable) if pred[i] == 1]
    if not idx:
        return []

    # Group consecutive indices into (first_word, last_word) runs.
    runs = []
    run_start = prev = idx[0]
    for i in idx[1:]:
        if prev + 1 < i:
            runs.append((run_start, prev))
            run_start = i
        prev = i
    runs.append((run_start, prev))

    # Strict "<" keeps only runs of two or more words.
    return [(split[first][0] + sen_start, split[last][1] + sen_start)
            for first, last in runs if first < last]
        
    
def predictLabel(data, version):
    """Assign a label name to every span row using the saved label model.

    The tokenizer is read from ``tokenizer_label.pickle``, the architecture
    from ``model_label_cnn_<version>.json`` and the weights from
    ``model_label_cnn_<version>.h5``.

    Parameters
    ----------
    data : list of list
        Rows whose element 3 is the span text and whose elements 0, 1 and 2
        are copied to the output (``predictModel`` produces rows of the form
        ``[article id, start, end, text]``).
    version : str
        Suffix used in the model file names.

    Returns
    -------
    list of list
        One row ``[data[i][0], label name, data[i][1], data[i][2]]`` per
        input row, where the label name is ``index2label`` at the arg-max of
        the model output. An empty ``data`` returns ``[]`` without loading
        any file.
    """
    # Nothing to label: skip loading the tokenizer and model.
    if len(data) == 0:
        return []

    # NOTE: pickle.load can execute arbitrary code; only load this file from
    # a trusted source.
    with open('tokenizer_label.pickle', 'rb') as handle:
        tokenizer_label = pickle.load(handle)
    pred_X = [x[3] for x in data]
    # Local pad length for the label model; intentionally shadows the
    # module-level value of the same name.
    max_sequence_length = 81
    input_sequences = pad_sequences(tokenizer_label.texts_to_sequences(pred_X),
                                    maxlen=max_sequence_length, padding='post')
    print('Shape of data tensor:', input_sequences.shape)

    # Load the architecture from JSON; the context manager closes the file
    # even if reading raises.
    with open('model_label_cnn_{:}.json'.format(version), 'r') as json_file:
        model = model_from_json(json_file.read())
    # Load the weights into the new model.
    model.load_weights('model_label_cnn_{:}.h5'.format(version))
    print("Loaded model from disk")

    pred_Y = [np.argmax(x) for x in model.predict(input_sequences)]

    return [[row[0], index2label[label_idx], row[1], row[2]]
            for row, label_idx in zip(data, pred_Y)]

def exportResult(result, path="example-submission-task3-predictions_comb.txt"):
    """Write the first four fields of every row as one tab-separated line.

    Parameters
    ----------
    result : iterable of sequence of str
        Rows to write, in order. Only ``row[0:4]`` is written; every written
        field must already be a string.
    path : str, optional
        Output file. Defaults to the submission file name used by this
        notebook. An existing file is overwritten.
    """
    # Rows are written in input order. Grouping them by article id first
    # would not change the output, so no grouping is done.
    # UTF-8 is set explicitly so the file does not depend on the platform's
    # default encoding.
    with open(path, "w", encoding="utf8") as fout:
        for row in result:
            fout.write('\t'.join(row[0:4]) + "\n")

In [59]:
# End-to-end run: detect spans, label them, drop the excluded labels, export.
result = predictModel(tokenizer, dev_data, "v1", [])
result = predictLabel(result, "v1")
# Rows whose label (column 1) appears below are left out of the export.
result = [
    row for row in result
    if row[1] not in (
        "Appeal_to_fear-prejudice",
        "Appeal_to_Authority",
        "Obfuscation,Intentional_Vagueness,Confusion",
        "Bandwagon",
        "Reductio_ad_hitlerum",
        "Straw_Men",
        "Whataboutism",
        "Red_Herring",
        "Repetition",
        "Thought-terminating_Cliches",
        "Slogans",
        "Causal_Oversimplification",
    )
]
exportResult(result)

Shape of data tensor: (2034, 129)
Loaded model from disk
Shape of data tensor: (856, 81)
Loaded model from disk


In [13]:
# NOTE(review): `rrr` is not assigned in any cell visible in this notebook, so
# this cell only works with leftover kernel state — confirm where it comes from.
rrr

[['697444415',
  '168',
  '217',
  'shot in the pelvis during the horrific attack and'],
 ['697444415', '745', '777', 'shooter, every other person that'],
 ['697444415', '824', '831', 'as well'],
 ['697444415', '937', '943', 'he ran'],
 ['697444415', '1144', '1169', 'a sudden bullets are just'],
 ['697444415', '1411', '1424', 'just going to'],
 ['697444415', '2068', '2082', 'you get out of'],
 ['697444415', '2376', '2384', 'was just'],
 ['697444415', '2695', '2725', 'no, you can’t go out here, you'],
 ['697444415', '2805', '2834', 'obviously a huge bombshell in'],
 ['111111132', '147', '155', 'huge fan'],
 ['111111132',
  '246',
  '296',
  'play matching games about Bible stories and listen'],
 ['111111132', '1771', '1808', 'and violates the personal consciences'],
 ['111111132', '2262', '2292', 'beneficial and is a good thing'],
 ['111111132', '2437', '2445', 'for some'],
 ['111111132',
  '2671',
  '2738',
  'religion, but it teaches character and respect and how important it'],
 ['11

In [42]:
# Label the rows held in `rrr` with the "v1" label model.
# NOTE(review): `rrr` is not assigned in any cell visible in this notebook.
aaa = predictLabel(rrr, "v1")

Shape of data tensor: (856, 81)
Loaded model from disk


In [43]:
# Display the labelled rows returned by predictLabel.
aaa

[['710100700', 'Name_Calling,Labeling', '0', '11'],
 ['710100700', 'Doubt', '1295', '1336'],
 ['785801366', 'Doubt', '100', '105'],
 ['785801366', 'Appeal_to_fear-prejudice', '182', '255'],
 ['785801366', 'Doubt', '330', '409'],
 ['785801366', 'Exaggeration,Minimisation', '444', '525'],
 ['785801366', 'Black-and-White_Fallacy', '777', '850'],
 ['785801366', 'Loaded_Language', '1387', '1401'],
 ['785801366', 'Causal_Oversimplification', '1462', '1545'],
 ['785801366', 'Exaggeration,Minimisation', '1612', '1644'],
 ['785801366', 'Exaggeration,Minimisation', '1752', '1833'],
 ['785801366', 'Loaded_Language', '1973', '1997'],
 ['785801366', 'Loaded_Language', '2026', '2046'],
 ['785801366', 'Name_Calling,Labeling', '2140', '2161'],
 ['785801366', 'Loaded_Language', '2260', '2275'],
 ['785801366', 'Repetition', '2397', '2418'],
 ['785801366', 'Exaggeration,Minimisation', '2529', '2574'],
 ['785801366', 'Black-and-White_Fallacy', '2625', '2691'],
 ['785801366', 'Slogans', '3296', '3359'],
 [

In [249]:
# NOTE(review): the recorded output of this cell shows five-element rows, while
# the earlier `rrr` display shows four-element rows, so `rrr` was rebound
# between the two runs — confirm which value is intended.
rrr

[['698018235', 'Appeal_to_Authority', '1333', '1351', 'have been canceled'],
 ['703821117',
  'Appeal_to_Authority',
  '3523',
  '3557',
  'Guatemalan operation was brilliant'],
 ['111111131', 'Appeal_to_Authority', '4136', '4150', "I learned he's"],
 ['703698295', 'Appeal_to_Authority', '1182', '1190', 'Crux, he'],
 ['694327499',
  'Appeal_to_Authority',
  '1503',
  '1526',
  'Dr. Robert Fastiggi and'],
 ['700551604',
  'Appeal_to_Authority',
  '6721',
  '6743',
  'operation to establish'],
 ['701553469', 'Appeal_to_Authority', '1993', '2003', 'doubt from'],
 ['701553469',
  'Appeal_to_Authority',
  '2008',
  '2036',
  'information circulated about']]

In [45]:
# Write the labelled rows in `aaa` to the submission file.
exportResult(aaa)

In [46]:
# Display the labelled rows again after exporting them.
aaa

[['710100700', 'Name_Calling,Labeling', '0', '11'],
 ['710100700', 'Doubt', '1295', '1336'],
 ['785801366', 'Doubt', '100', '105'],
 ['785801366', 'Appeal_to_fear-prejudice', '182', '255'],
 ['785801366', 'Doubt', '330', '409'],
 ['785801366', 'Exaggeration,Minimisation', '444', '525'],
 ['785801366', 'Black-and-White_Fallacy', '777', '850'],
 ['785801366', 'Loaded_Language', '1387', '1401'],
 ['785801366', 'Causal_Oversimplification', '1462', '1545'],
 ['785801366', 'Exaggeration,Minimisation', '1612', '1644'],
 ['785801366', 'Exaggeration,Minimisation', '1752', '1833'],
 ['785801366', 'Loaded_Language', '1973', '1997'],
 ['785801366', 'Loaded_Language', '2026', '2046'],
 ['785801366', 'Name_Calling,Labeling', '2140', '2161'],
 ['785801366', 'Loaded_Language', '2260', '2275'],
 ['785801366', 'Repetition', '2397', '2418'],
 ['785801366', 'Exaggeration,Minimisation', '2529', '2574'],
 ['785801366', 'Black-and-White_Fallacy', '2625', '2691'],
 ['785801366', 'Slogans', '3296', '3359'],
 [

In [None]:
# Filter individual models: drop rows whose label (column 1) is listed.
# Each row is a list, so the label field must be tested; testing the whole
# row against the label strings is always true and filters nothing.
result = [x for x in result if x[1] not in ["Slogans", "Name_Calling,Labeling", "Loaded_Language"]]
    