In [164]:
from itertools import chain

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.model_selection import train_test_split
from tqdm import tqdm

import torch
import spacy
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [122]:
fallacies_df = pd.read_csv('../data/fallacies.csv', index_col=0)
approved_df = pd.read_csv('../data/approved.csv', index_col=0)

In [123]:
df.fallacy_reason = df.fallacy_reason.fillna('')

df = pd.concat([
    fallacies_df,
    approved_df[approved_df.n_supporters >= 5].drop("n_supporters", axis=1)
])

df = df[~df.premise_content.isna()]

In [124]:
vc = df.fallacy_type.value_counts()

In [125]:
df = df[df.fallacy_type.isin(vc.head(10).index)]

# Text preprocessing

In [126]:
nlp = spacy.load('en_core_web_sm')

In [127]:
def preprocess_sentence(sent):
    sent = sent.lower()
    sent = nlp(sent)
    words = map(lambda x: x.text, sent)
    return list(words)

In [128]:
df['premise_content_preprocessed'] = df.premise_content.apply(preprocess_sentence)

In [129]:
df = df.sample(frac=1).reset_index(drop=True) # shuffle the data

In [130]:
train_df, test_df = train_test_split(df, test_size=0.1)

In [135]:
word_vocab = {"<oov>", "<pad>"}
word_vocab = word_vocab.union(
    set(chain.from_iterable(map(lambda x: x[1]["premise_content_preprocessed"], train_df.iterrows())))
)
word_to_ix = {word: i for i, word in enumerate(word_vocab)}

In [136]:
fallacy_vocab = set(df.fallacy_type.unique())
fallacy_to_ix = {word: i for i, word in enumerate(fallacy_vocab)}

# Neural Net

In [188]:
class Net(nn.Module):
    def __init__(self, word_vocab_size, word_embedding_dim, fallacy_vocab_size):
        super(Net, self).__init__()
        
        self.word_embeddings = nn.Embedding(
            word_vocab_size, word_embedding_dim
        )  # random init

        hidden = 100
        self.fc1 = nn.Linear(word_embedding_dim, hidden)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden, fallacy_vocab_size)
        
    def forward(self, word_inputs):
        word_embeds = self.word_embeddings(word_inputs)
        h1 = self.fc1(word_embeds)
        a1 = self.relu(h1)
        h2 = self.fc2(a1)
        return h2

In [189]:
def pad_and_convert_to_ints(data):
    max_size = len(
        max(data.iterrows(),
            key=lambda x: len(x[1]["premise_content_preprocessed"]))[1]
        ["premise_content_preprocessed"])
    X = np.full((len(data), max_size), word_to_ix["<pad>"])

    for i, (_, x) in enumerate(data.iterrows()):
        X[i, :len(x["premise_content_preprocessed"])] = [
            (word_to_ix[word] if word in word_to_ix else word_to_ix['<oov>'])
            for word in x["premise_content_preprocessed"]
        ]

    return X

In [190]:
trainX = pad_and_convert_to_ints(train_df)
testX = pad_and_convert_to_ints(test_df)

In [196]:
trainY = train_df.fallacy_type.apply(lambda x: fallacy_to_ix[x]).values
testY = test_df.fallacy_type.apply(lambda x: fallacy_to_ix[x]).values

In [197]:
net = Net(
    len(word_vocab),
    10,
    len(fallacy_vocab)
)
opt = optim.Adam(net.parameters(), lr=0.001, betas=(0.9, 0.999))
criterion = nn.CrossEntropyLoss()

In [206]:
def train_epoch(X, Y, model, opt, criterion, batch_size=50):
    model.train()
    losses = []
    for beg_i in range(0, X.shape[0], batch_size):
        x_batch = X[beg_i : beg_i + batch_size, :]
        y_batch = Y[beg_i : beg_i + batch_size]
        x_batch = torch.tensor(x_batch)
        
        y_onehot = (np.arange(len(fallacy_vocab)) == y_batch[:,None]).astype(np.long)
        y_onehot = torch.from_numpy(y_onehot)

        opt.zero_grad()

        y_pred = model(x_batch)

        loss = criterion(y_pred, y_onehot)

        loss.backward()
        
        opt.step()

        losses.append(loss.data.numpy())

    return [sum(losses) / float(len(losses))]

In [207]:
e_losses = []
num_epochs = 100
for e in tqdm(range(num_epochs)):
    e_losses += train_epoch(trainX, trainY, net, opt, criterion, batch_size=100)

100%|██████████| 100/100 [00:58<00:00,  1.72it/s]


In [208]:
e_losses

[4.079296280356014,
 3.612408203237197,
 3.250465883928187,
 3.0295575787039364,
 2.897688262602862,
 2.8013391775243424,
 2.7209101845236385,
 2.650587558746338,
 2.5873507331399357,
 2.530113556805779,
 2.4774994569666244,
 2.42874215630924,
 2.3832123700310204,
 2.340719741933486,
 2.3008682938183056,
 2.2635538928649006,
 2.228869957082412,
 2.196645673583536,
 2.1667637264027313,
 2.1389953879749073,
 2.113375250030966,
 2.0897367000579834,
 2.0678575599894806,
 2.0475393744076,
 2.0284380141426537,
 2.0104459664400887,
 1.9934797216864193,
 1.977383487364825,
 1.9620820985120886,
 1.9475790051852955,
 1.9336818737142227,
 1.9203567645129036,
 1.9074766355402328,
 1.8950139073764576,
 1.8830180168151855,
 1.8714232444763184,
 1.8601855320089005,
 1.8493060364442713,
 1.8387943436117733,
 1.8285879457698149,
 1.8187457954182344,
 1.8092001816805672,
 1.7999028808930342,
 1.7908845017938053,
 1.7821017223245956,
 1.7735902351491593,
 1.7653251816244686,
 1.7573269114774817,
 1.74957