In [44]:
import glob
import os
import numpy as np
import io
import pickle
from keras.models import Model, Sequential
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense, Embedding, Input, LSTM, Bidirectional, Dropout, MaxPooling1D, Conv1D, TimeDistributed
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam, SGD
from keras.callbacks import Callback
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

In [4]:
from numpy.random import seed
seed(1)
from tensorflow import set_random_seed
set_random_seed(2)


In [5]:
def loadDataTask3(folder):
    result = []
    fileNames = glob.glob(folder + "/*.txt")
    for fileName in fileNames:
        articleId = fileName.split("/")[-1].split(".")[0]
        f = open(fileName, "r", encoding="utf8")
        data = f.read()
        f.close()
        sentences = [x for x in data.split("\n") if x != ""]
        labels = readLabelTask3(folder + "/" + articleId + ".task3.labels")
        result.append({"id": articleId, "data": data, "sentences": sentences, "labels": labels})
        
    return result

def readLabelTask3(fileName):
    result = []
    f = open(fileName, "r")
    result = f.readlines()
    f.close()
    result = [x.replace("\n", "").split("\t") for x in result]
    return result

In [6]:
train_data = loadDataTask3("train-split/tasks-2-3/train-train/")

In [7]:
def updateSentence(sentence, char, start=None, stop=None):
    if start is None:
        start = 0;
        
    if stop is None:
        stop = len(sentence)
        
    s = list(sentence)
    for i in range(start, stop):
        if i < len(s):
            if s[i] in '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~ ':
                s[i] = " "
            if s[i] not in [" ", "\n"]:
                s[i] = char
    return "".join(s)

def word2label(word):
    if word[0] == "A":
        return 0
    else:
        return 1
    
def sen2label(sen):
    result = [word2label(x) for x in sen if x != ""]
    return result

def getTrainData(data):
    trainX = []
    trainY = []
    for x in data:
        trainX += x["sentences"]
        
        dataMask = updateSentence(x["data"], "A")
        for l in x["labels"]:
                dataMask = updateSentence(dataMask, "B", int(l[2]), int(l[3]))
        outcome = [sen2label(y.split(" ")) for y in dataMask.split("\n") if y != ""]
        trainY += outcome

        
    return trainX, trainY

In [8]:
train_X, train_Y = getTrainData(train_data)

In [9]:
### tokenization
MAX_VOCAB_SIZE = 50000
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(train_X)
word2idx = tokenizer.word_index
idx2word = {v:k for k, v in word2idx.items()}
num_words = min(MAX_VOCAB_SIZE, len(word2idx) + 1)
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [18]:
max_sequence_length = max(len(s) for s in tokenizer.texts_to_sequences(train_X))
print('Max sequence length:', max_sequence_length)
print(num_words)

Max sequence length: 129
18931


In [35]:
EMBEDDING_DIM = 200
# load in pre-trained word vectors
print('Loading word vectors...')
word2vec = {}
with open(os.path.join('glove.6B/glove.6B.%sd.txt' % EMBEDDING_DIM)) as f:
    # is just a space-separated text file in the format:
    # word vec[0] vec[1] vec[2] ...
    for line in f:
        values = line.split()
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        word2vec[word] = vec
print('Found %s word vectors.' % len(word2vec))

Loading word vectors...
Found 400000 word vectors.


In [36]:
# prepare embedding matrix
print('Filling pre-trained embeddings...')

embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word2idx.items():
    if i < MAX_VOCAB_SIZE:
        embedding_vector = word2vec.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all zeros.
            embedding_matrix[i] = embedding_vector

Filling pre-trained embeddings...


In [64]:
LATENT_DIM = 32

print('Building model...')

model = Sequential()
model.add(Embedding(num_words, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False))
#model.add(Bidirectional(LSTM(LATENT_DIM, return_sequences=True, recurrent_dropout=0.1)))
model.add(Conv1D(filters=LATENT_DIM, kernel_size=5, padding="same"))
model.add(MaxPooling1D(pool_size=3, strides=1, padding="same"))
model.add(Conv1D(filters=LATENT_DIM, kernel_size=4, padding="same"))
model.add(MaxPooling1D(pool_size=4, strides=1, padding="same"))
model.add(Conv1D(filters=LATENT_DIM, kernel_size=3, padding="same"))
model.add(MaxPooling1D(pool_size=5, strides=1, padding="same"))
#model.add(Dropout(0.1))
model.add(TimeDistributed(Dense(20, activation="relu")))
#model.add(Dropout(0.1))
model.add(Dense(1, activation="sigmoid"))
model.compile(
  loss='binary_crossentropy',
  # optimizer='rmsprop',
  optimizer=Adam(lr=0.01),
  # optimizer=SGD(lr=0.01, momentum=0.9),
  metrics=['accuracy']
)

print(model.summary())

Building model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, None, 200)         3786200   
_________________________________________________________________
conv1d_19 (Conv1D)           (None, None, 32)          32032     
_________________________________________________________________
max_pooling1d_15 (MaxPooling (None, None, 32)          0         
_________________________________________________________________
conv1d_20 (Conv1D)           (None, None, 32)          4128      
_________________________________________________________________
max_pooling1d_16 (MaxPooling (None, None, 32)          0         
_________________________________________________________________
conv1d_21 (Conv1D)           (None, None, 32)          3104      
_________________________________________________________________
max_pooling1d_17 (MaxPooling (None, None, 32)          0  

In [19]:
label2index = {
    "Appeal_to_Authority": 0,
    "Appeal_to_fear-prejudice": 1,
    "Bandwagon": 2,
    "Black-and-White_Fallacy": 3,
    "Causal_Oversimplification": 4,
    "Doubt": 5,
    "Exaggeration,Minimisation": 6,
    "Flag-Waving": 7,
    "Loaded_Language": 8,
    "Name_Calling,Labeling": 9,
    "Obfuscation,Intentional_Vagueness,Confusion": 10,
    "Red_Herring": 11,
    "Reductio_ad_hitlerum": 12,
    "Repetition": 13,
    "Slogans": 14,
    "Straw_Men": 15,
    "Thought-terminating_Cliches": 16,
    "Whataboutism": 17
}

index2label = [
    "Appeal_to_Authority",
    "Appeal_to_fear-prejudice",
    "Bandwagon",
    "Black-and-White_Fallacy",
    "Causal_Oversimplification",
    "Doubt",
    "Exaggeration,Minimisation",
    "Flag-Waving",
    "Loaded_Language",
    "Name_Calling,Labeling",
    "Obfuscation,Intentional_Vagueness,Confusion",
    "Red_Herring",
    "Reductio_ad_hitlerum",
    "Repetition",
    "Slogans",
    "Straw_Men",
    "Thought-terminating_Cliches",
    "Whataboutism"
              ]

In [67]:
input_sequences = pad_sequences(tokenizer.texts_to_sequences(train_X),
                                maxlen=max_sequence_length, padding='post')
output_sequences = pad_sequences(train_Y, maxlen=max_sequence_length, padding='post')
output_sequences = np.reshape(output_sequences, output_sequences.shape + (1,))

print('Shape of data tensor:', input_sequences.shape)
print('Shape of output tensor:', output_sequences.shape)

VALIDATION_SPLIT = 0.2
BATCH_SIZE = 128
EPOCHS = 5
print('Training model...')
z = np.zeros((len(input_sequences), LATENT_DIM))
seed(1)
set_random_seed(2)
model.compile(
  loss='binary_crossentropy',
  # optimizer='rmsprop',
  optimizer=Adam(lr=0.01),
  # optimizer=SGD(lr=0.01, momentum=0.9),
  metrics=['accuracy']
)
model.fit(
  input_sequences,
  output_sequences,
  batch_size=BATCH_SIZE,
  epochs=EPOCHS,
  validation_split=VALIDATION_SPLIT
)
# with open('model_{:}_{:}.json'.format(label2index[label], version), "w") as json_file:
#    json_file.write(model.to_json())
# model.save_weights('model_{:}_{:}.h5'.format(label2index[label], version))
# print("Saved model to disk")
pred_Y = model.predict(input_sequences).round()
pred_Y = np.reshape(pred_Y, pred_Y.shape[:2])
act_Y = np.reshape(output_sequences, output_sequences.shape[:2])

f1 = f1_score(act_Y, pred_Y, average='micro')

print(f1)

Shape of data tensor: (12342, 129)
Shape of output tensor: (12342, 129, 1)
Training model...
Train on 9873 samples, validate on 2469 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
0.0


  'precision', 'predicted', average, warn_for)


In [56]:
with open('model_prob_cnn_v1.json', "w") as json_file:
        json_file.write(model.to_json())
model.save_weights('model_prob_cnn_v1.h5')
print("Saved model to disk")

Saved model to disk


In [66]:


def trainModel(model, tokenizer, data, label, version):
    print('Training for:', label)
    train_X, train_Y = getTrainData(data, label)
    
    input_sequences = pad_sequences(tokenizer.texts_to_sequences(train_X),
                                    maxlen=max_sequence_length, padding='post')
    output_sequences = pad_sequences(train_Y, maxlen=max_sequence_length, padding='post')
    output_sequences = np.reshape(output_sequences, output_sequences.shape + (1,))
    print('Shape of data tensor:', input_sequences.shape)
    print('Shape of output tensor:', output_sequences.shape)
    
    VALIDATION_SPLIT = 0.2
    BATCH_SIZE = 128
    EPOCHS = 5
    print('Training model...')
    z = np.zeros((len(input_sequences), LATENT_DIM))
    seed(1)
    set_random_seed(2)
    model.compile(
      loss='binary_crossentropy',
      # optimizer='rmsprop',
      optimizer=Adam(lr=0.01),
      # optimizer=SGD(lr=0.01, momentum=0.9),
      metrics=['accuracy']
    )
    model.fit(
      input_sequences,
      output_sequences,
      batch_size=BATCH_SIZE,
      epochs=EPOCHS,
      validation_split=VALIDATION_SPLIT
    )
    with open('model_{:}_{:}.json'.format(label2index[label], version), "w") as json_file:
        json_file.write(model.to_json())
    model.save_weights('model_{:}_{:}.h5'.format(label2index[label], version))
    print("Saved model to disk")
    pred_Y = model.predict(input_sequences).round()
    pred_Y = np.reshape(pred_Y, pred_Y.shape[:2])
    act_Y = np.reshape(output_sequences, output_sequences.shape[:2])

    f1 = f1_score(act_Y, pred_Y, average='micro')
    print(f1)
    return f1
    
    #with open('model_{:}_{:}.pickle'.format(label2index[label], version), 'wb') as handle:
    #    pickle.dump(r, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [23]:
dev_data = loadDataTask3("train-split/tasks-2-3/train-dev/")

In [26]:
dev_X, dev_Y = getTrainData(dev_data)
input_sequences_dev = pad_sequences(tokenizer.texts_to_sequences(dev_X),
                                maxlen=max_sequence_length, padding='post')
output_sequences_dev = pad_sequences(dev_Y, maxlen=max_sequence_length, padding='post')
output_sequences_dev = np.reshape(output_sequences_dev, output_sequences_dev.shape)

In [66]:
pred_Y = model.predict(input_sequences_dev).round()
pred_Y = np.reshape(pred_Y, pred_Y.shape[:2])
#act_Y = np.reshape(output_sequences_dev, output_sequences.shape[:2])

f1 = f1_score(output_sequences_dev, pred_Y, average='micro')
print(f1)

0.027375201288244767


In [71]:
trainModel(model, tokenizer, train_data, "Straw_Men", "v005")

Training for: Straw_Men
Shape of data tensor: (12342, 129)
Shape of output tensor: (12342, 129, 1)
Training model...
Train on 9873 samples, validate on 2469 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Saved model to disk
0.0


  'precision', 'predicted', average, warn_for)


0.0

In [33]:
x = [y for y in train_data if y["id"] == "article776368676"]

In [34]:
x[0]

{'data': "Trump To Sessions In Series Of Tweets: ‘Stop The Rigged Witch Hunt NOW!’\n\nPresident Donald Trump has taken to Twitter in order to call upon Attorney General Jeff Sessions to end the investigation into his alleged Russian collusion.\nTrump wants the Justice Department to “stop the rigged witch hunt” before it can “stain our country and further.”\nIn his Twitter post, Trump also blasted the 17 angry Democrats that are doing a conflicted Mueller’s dirty work.\n..This is a terrible situation and Attorney General Jeff Sessions should stop this Rigged Witch Hunt right now, before it continues to stain our country any further.\nBob Mueller is totally conflicted, and his 17 Angry Democrats that are doing his dirty work are a disgrace to USA!\n— Donald J. Trump (@realDonaldTrump) August 1, 2018\n“..This is a terrible situation and Attorney General Jeff Sessions should stop this Rigged Witch Hunt right now, before it continues to stain our country any further.\nBon Mueller is totally

In [31]:
len(x[0]["data"])

4185