In [1]:
from itertools import chain

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm import tqdm


import torch
import spacy
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
# Fix the random seeds of PyTorch and NumPy so that repeated runs give the same results
SEED = 420
torch.manual_seed(SEED)
np.random.seed(SEED)

In [3]:
import gensim
import pickle

In [4]:
#!wget https://conceptnet.s3.amazonaws.com/downloads/2019/numberbatch/numberbatch-en-19.08.txt.gz
#!gunzip numberbatch-en-19.08.txt.gz

In [5]:
# Load the ConceptNet Numberbatch English embeddings as gensim KeyedVectors.
# NOTE(review): the path ends in .txt (the commented-out download above fetches a gzipped
# text file), yet the file is parsed with binary=True and decoding errors are ignored.
# If the file really is the plain-text release this would load garbage silently, which
# could explain why the saved embedding tensor further down has only 110 rows.
# Confirm the on-disk format before trusting the vectors.
numberbatch_from_bin = gensim.models.KeyedVectors.load_word2vec_format(
    '../data/numberbatch-en-19.08.txt', 
    binary=True, 
    unicode_errors='ignore'
)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


# Load data

In [6]:
# Read the two labelled datasets; index_col=0 uses the first CSV column as the row index
fallacies_df = pd.read_csv('../data/fallacies.csv', index_col=0)
approved_df = pd.read_csv('../data/approved.csv', index_col=0)

In [7]:
# Approved arguments are only kept when they have at least 5 supporters; the
# supporter count itself is dropped before stacking them under the fallacy rows.
well_supported = approved_df.loc[approved_df["n_supporters"] >= 5].drop(columns="n_supporters")
df = pd.concat([fallacies_df, well_supported])

# A missing fallacy reason becomes an empty string; rows without premise text are removed.
df["fallacy_reason"] = df["fallacy_reason"].fillna('')
df = df[df["premise_content"].notna()]

In [8]:
# Number of rows per fallacy type (value_counts orders them by descending frequency)
vc = df.fallacy_type.value_counts()

In [9]:
# Keep only the rows whose fallacy type is one of the 10 most frequent types
df = df[df.fallacy_type.isin(vc.head(10).index)]

# Text preprocessing

In [10]:
# spaCy's small English pipeline; used below to tokenise the premises
nlp = spacy.load('en_core_web_sm')

In [33]:
def preprocess_sentence(sent):
    """Lower-case `sent`, run it through the spaCy pipeline `nlp` and return the token texts as a list."""
    doc = nlp(sent.lower())
    # Surface forms are used; token.lemma_ is the commented-out alternative from the original cell.
    return [token.text for token in doc]

In [34]:
# Tokenise every premise; each cell of the new column holds a list of lower-cased tokens
df['premise_content_preprocessed'] = df.premise_content.apply(preprocess_sentence)

In [35]:
# frac=1 samples every row, i.e. a seeded permutation; the old index is discarded afterwards
df = df.sample(frac=1, random_state=420).reset_index(drop=True) # shuffle the data

In [36]:
# Hold out 10% of the rows as the test set (seeded, so the split is reproducible)
train_df, test_df = train_test_split(df, test_size=0.1, random_state=420)

In [37]:
# Vocabulary = every distinct token seen in the training premises (test tokens are excluded)
word_vocab = {
    token
    for tokens in train_df["premise_content_preprocessed"]
    for token in tokens
}
# Sorting gives the words a stable order, so the indices assigned later are deterministic
sorted_words = sorted(word_vocab)

Skip the cell below if you have already computed the needed vectors and the word map (this notebook saves them and then reloads them; skipping is useful if you are trying to replicate our work and only need the vectors we provide in our ../data directory).

In [38]:
# Build the embedding matrix restricted to the training vocabulary, plus the word -> row map.
# Row layout of the matrix:
#   0        -> <oov> (shared by every vocabulary word that has no Numberbatch vector)
#   1 .. k   -> the k vocabulary words found in Numberbatch, in sorted order
#   k + 1    -> <pad>
# Taking the dimension from the model (instead of from the first found vector) keeps the
# cell working even when no vocabulary word is found at all.
embedding_dim = numberbatch_from_bin.vector_size

word_to_ix = {}
found_vectors = []
for word in sorted_words:
    try:
        vector = numberbatch_from_bin[word]
    except KeyError:
        # Unknown to Numberbatch: point the word at the shared OOV row
        word_to_ix[word] = 0
    else:
        found_vectors.append(vector)
        # Row 0 is reserved for OOV, so the n-th found word lives in row n
        word_to_ix[word] = len(found_vectors)

# OOV and PAD vectors are both random (OOV is drawn first, then PAD)
oov_vector = np.random.randn(embedding_dim)
pad_vector = np.random.randn(embedding_dim)
all_vectors = [oov_vector] + found_vectors + [pad_vector]
word_to_ix['<oov>'] = 0
word_to_ix['<pad>'] = len(all_vectors) - 1

# Stack into one array before handing it to torch: converting a Python list of
# ndarrays element by element is very slow.
needed_vectors = torch.from_numpy(np.stack(all_vectors).astype(np.float32))

# L2-normalise every row; F.normalize guards against division by a zero norm
needed_vectors = F.normalize(needed_vectors, p=2, dim=1)

torch.save(needed_vectors,'../data/numberbatch_needed_vectors.pt')
print(needed_vectors.shape)
# Also save the word map; the context manager closes the file even if dumping fails
with open("../data/numberbatch_word_to_ix.pkl", "wb") as f:
    pickle.dump(word_to_ix, f)

torch.Size([110, 300])


In [39]:
# Reload the cached embedding matrix and word -> row map from disk.
needed_vectors = torch.load('../data/numberbatch_needed_vectors.pt')
# SECURITY: unpickling executes arbitrary code; only load files written by this notebook
# or obtained from a source you trust.
with open("../data/numberbatch_word_to_ix.pkl", 'rb') as fr:
    word_map = pickle.load(fr)

# The cells further down look words up through `word_to_ix`. Binding that name here lets
# the notebook run when the vector-building cell was skipped; `word_map` is kept as an alias.
word_to_ix = word_map

In [40]:
# Class labels in alphabetical order, and the label -> class index lookup derived from them
fallacy_vocab = sorted(df["fallacy_type"].unique())
fallacy_to_ix = dict(zip(fallacy_vocab, range(len(fallacy_vocab))))

# Neural Net

In [41]:
class Net(nn.Module):
    """Feed-forward fallacy classifier on top of pretrained word embeddings.

    Every token index is embedded, passed through a shared linear layer with 100
    units and a ReLU; the per-token activations of the whole sentence are then
    concatenated and mapped to one score per fallacy class.

    Args:
        word_vocab_size: accepted for interface compatibility; not used by the model.
        word_embedding_dim: size of one embedding vector (input width of `fc1`).
        fallacy_vocab_size: number of output classes.
        max_sent_size: number of token positions per input row; fixes the input width of `fc2`.
        numberbatch_tensor: 2-D float tensor of pretrained embeddings, one row per word index.
    """

    def __init__(self, word_vocab_size, word_embedding_dim, fallacy_vocab_size, max_sent_size, numberbatch_tensor):
        super().__init__()

        # from_pretrained is called with its defaults, which keep the embedding weights frozen
        self.word_embeddings = nn.Embedding.from_pretrained(numberbatch_tensor)

        hidden = 100
        self.fc1 = nn.Linear(in_features=word_embedding_dim, out_features=hidden)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(in_features=hidden * max_sent_size, out_features=fallacy_vocab_size)

    def forward(self, word_inputs):
        """Return the unnormalised class scores for a batch of padded index sequences."""
        per_token = self.relu(self.fc1(self.word_embeddings(word_inputs)))
        # Collapse the (position, hidden) axes into one feature vector per example
        flattened = per_token.view(per_token.shape[0], -1)
        return self.fc2(flattened)

In [42]:
# Fixed number of token positions per sentence: width of the padded input matrix and the
# max_sent_size handed to Net
MAX_SENT_SIZE = 100

In [43]:
def pad_and_convert_to_ints(data, max_len=None, vocab=None):
    """Encode tokenised sentences as a fixed-width integer matrix of word indices.

    Args:
        data: DataFrame with a "premise_content_preprocessed" column holding one
            list of tokens per row.
        max_len: number of columns of the result. Defaults to the global MAX_SENT_SIZE.
        vocab: mapping from token to index that also contains the "<pad>" and
            "<oov>" entries. Defaults to the global word_to_ix.

    Returns:
        Integer ndarray of shape (len(data), max_len). Tokens missing from `vocab`
        get the "<oov>" index, short sentences are right-padded with the "<pad>"
        index, and sentences longer than `max_len` are truncated (previously they
        raised a broadcasting ValueError).
    """
    if max_len is None:
        max_len = MAX_SENT_SIZE
    if vocab is None:
        vocab = word_to_ix

    pad_ix = vocab["<pad>"]
    oov_ix = vocab["<oov>"]

    X = np.full((len(data), max_len), pad_ix)
    for row, tokens in enumerate(data["premise_content_preprocessed"]):
        # Truncate first so the id list never exceeds the row width
        ids = [vocab.get(token, oov_ix) for token in tokens[:max_len]]
        if ids:
            X[row, :len(ids)] = ids

    return X

In [44]:
# Padded word-index matrices for the train and test splits
trainX = pad_and_convert_to_ints(train_df)
testX = pad_and_convert_to_ints(test_df)

In [45]:
# Class indices for each row, looked up through fallacy_to_ix (an unknown label raises KeyError)
trainY = train_df.fallacy_type.apply(lambda x: fallacy_to_ix[x]).values
testY = test_df.fallacy_type.apply(lambda x: fallacy_to_ix[x]).values

In [46]:
# Instantiate the classifier. The first argument (vocabulary size) is accepted by Net but
# never used by it; 300 is the embedding width and has to equal needed_vectors.shape[1].
net = Net(
    len(word_vocab),
    300,
    len(fallacy_vocab),
    MAX_SENT_SIZE,
    needed_vectors
)
# Adam over the model's parameters, and cross-entropy on the raw class scores
opt = optim.Adam(net.parameters(), lr=0.001, betas=(0.9, 0.999))
criterion = nn.CrossEntropyLoss()

In [47]:
def train_epoch(X, Y, model, opt, criterion, batch_size=50):
    """Run one pass over the training data in consecutive mini-batches.

    Args:
        X: array of inputs; batches are taken as slices along the first axis.
        Y: array of targets aligned with `X` along the first axis.
        model: the torch module to train (put into training mode here).
        opt: optimizer stepping the model's parameters.
        criterion: loss callable invoked as criterion(model(x_batch), y_batch).
        batch_size: number of rows per mini-batch; the last batch may be smaller.

    Returns:
        A one-element list holding the mean of the per-batch losses as a Python
        float, so callers can extend a running list of epoch losses with it.

    Raises:
        ValueError: if `X` has no rows (previously an opaque ZeroDivisionError).
    """
    num_rows = X.shape[0]
    if num_rows == 0:
        raise ValueError("train_epoch received no training examples")

    model.train()
    batch_losses = []
    for start in range(0, num_rows, batch_size):
        stop = start + batch_size
        x_batch = torch.as_tensor(X[start:stop])
        y_batch = torch.as_tensor(Y[start:stop])

        opt.zero_grad()
        loss = criterion(model(x_batch), y_batch)
        loss.backward()
        opt.step()

        # .item() extracts a plain Python float and works regardless of the loss's device
        batch_losses.append(loss.item())

    return [sum(batch_losses) / len(batch_losses)]

In [48]:
# Train for a fixed number of epochs, collecting the mean loss of each epoch
num_epochs = 50
e_losses = []
for epoch in tqdm(range(num_epochs)):
    epoch_loss = train_epoch(trainX, trainY, net, opt, criterion, batch_size=100)
    e_losses.extend(epoch_loss)

100%|██████████| 50/50 [00:15<00:00,  3.49it/s]


In [49]:
# Display the mean training loss of each epoch
e_losses

[1.6239029659944422,
 1.4320020745782291,
 1.3878648491466747,
 1.3392051037620096,
 1.2825453912510592,
 1.2250224772621603,
 1.1703543417594011,
 1.1202354396090788,
 1.078411407330457,
 1.0414888280279495,
 1.0083690986913794,
 0.9761710359769709,
 0.9445623229531681,
 0.9234212373985964,
 0.898720977937474,
 0.8773404500063728,
 0.8553482381736531,
 0.8343695647576276,
 0.8156078142278335,
 0.7992306001046124,
 0.7831716274513918,
 0.769335678395103,
 0.7552255129112917,
 0.74031402258312,
 0.7267584818250993,
 0.7142733002410215,
 0.7066754295545465,
 0.6931656967191135,
 0.6847570205436033,
 0.6752560436725616,
 0.6656898242585799,
 0.6573495952522054,
 0.6491793236311745,
 0.6416953942354988,
 0.6349883658044478,
 0.6279487574801725,
 0.6213392545195187,
 0.6162620993221507,
 0.6106918278862449,
 0.6053871126735911,
 0.6002449042656842,
 0.5942175195497625,
 0.585710932226742,
 0.5816615784869474,
 0.5749589949846268,
 0.5699315947644851,
 0.5587790801244623,
 0.5572669199284386

# Metrics (Macro F1 Score)

In [50]:
# Score the held-out set: evaluation mode, and no gradient tracking during the forward pass
net.eval()
with torch.no_grad():
    x = torch.tensor(testX)
    y_pred = net(x)

In [51]:
# Macro-averaged F1 on the test set; argmax over axis 1 picks the highest-scoring class per example
score = f1_score(testY, y_pred.numpy().argmax(axis=1), average='macro')

  'recall', 'true', average, warn_for)


In [52]:
# Display the model's macro F1 score
score

0.13829441974882395

In [53]:
# Baseline: macro F1 obtained by predicting one single class for every test example.
# Iterating over fallacy_vocab itself (instead of a hard-coded range(10)) avoids an
# IndexError when fewer than 10 classes are present.
for class_ix, fallacy_name in enumerate(fallacy_vocab):
    constant_guess = np.full(testY.shape[0], class_ix)
    print("Guessing " + fallacy_name + "\t for all:", end='\t')
    print(f1_score(testY, constant_guess, average='macro'))

Guessing Appeal To Authority	 for all:	0.002442002442002442
Guessing Appeal To Belief	 for all:	0.007168458781362007
Guessing Begging The Question	 for all:	0.010582010582010581
Guessing Fallacy Of False Cause	 for all:	0.007168458781362007
Guessing Fallacy Of Red Herring	 for all:	0.012798138452588714
Guessing Irrelevant Conclusion	 for all:	0.03442879499217527
Guessing None	 for all:	0.081377151799687
Guessing Poisoning The Well	 for all:	0.002442002442002442
Guessing Prejudicial Language	 for all:	0.0
Guessing Wrong Direction	 for all:	0.008318478906714201


In [54]:
# Baseline: macro F1 of uniformly random class guesses. The upper bound follows the
# actual number of classes instead of a hard-coded 10.
print("Random Guessing:", end='\t')
random_guess = np.random.randint(0, len(fallacy_vocab), testY.shape[0])
print(f1_score(testY, random_guess, average='macro'))

Random Guessing:	0.07239885972537836
