In [164]:
from itertools import chain

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.model_selection import train_test_split
from tqdm import tqdm

import torch
import spacy
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [122]:
fallacies_df = pd.read_csv('../data/fallacies.csv', index_col=0)
approved_df = pd.read_csv('../data/approved.csv', index_col=0)

In [123]:
df.fallacy_reason = df.fallacy_reason.fillna('')

df = pd.concat([
    fallacies_df,
    approved_df[approved_df.n_supporters >= 5].drop("n_supporters", axis=1)
])

df = df[~df.premise_content.isna()]

In [124]:
vc = df.fallacy_type.value_counts()

In [125]:
df = df[df.fallacy_type.isin(vc.head(10).index)]

# Text preprocessing

In [126]:
nlp = spacy.load('en_core_web_sm')

In [127]:
def preprocess_sentence(sent):
    sent = sent.lower()
    sent = nlp(sent)
    words = map(lambda x: x.text, sent)
    return list(words)

In [128]:
df['premise_content_preprocessed'] = df.premise_content.apply(preprocess_sentence)

In [129]:
df = df.sample(frac=1).reset_index(drop=True) # shuffle the data

In [130]:
train_df, test_df = train_test_split(df, test_size=0.1)

In [135]:
word_vocab = {"<oov>", "<pad>"}
word_vocab = word_vocab.union(
    set(chain.from_iterable(map(lambda x: x[1]["premise_content_preprocessed"], train_df.iterrows())))
)
word_to_ix = {word: i for i, word in enumerate(word_vocab)}

In [136]:
fallacy_vocab = set(df.fallacy_type.unique())
fallacy_to_ix = {word: i for i, word in enumerate(fallacy_vocab)}

# Neural Net

In [175]:
class Net(nn.Module):
    def __init__(self, word_vocab_size, word_embedding_dim):
        super(Net, self).__init__()
        
        self.word_embeddings = nn.Embedding(
            word_vocab_size, word_embedding_dim
        )  # random init

        hidden = 100
        self.fc1 = nn.Linear(word_embedding_dim, hidden)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden, 1)
        self.log_softmax = nn.LogSoftmax(dim=2)
        
    def forward(self, word_inputs):
        word_embeds = self.word_embeddings(word_inputs)
        h1 = self.fc1(word_embeds)
        a1 = self.relu(h1)
        h2 = self.fc2(a1)
        return h2

In [151]:
def pad_and_convert_to_ints(data):
    max_size = len(
        max(data.iterrows(),
            key=lambda x: len(x[1]["premise_content_preprocessed"]))[1]
        ["premise_content_preprocessed"])
    X = np.full((len(data), max_size), word_to_ix["<pad>"])

    for i, (_, x) in enumerate(data.iterrows()):
        X[i, :len(x["premise_content_preprocessed"])] = [
            (word_to_ix[word] if word in word_to_ix else word_to_ix['<oov>'])
            for word in x["premise_content_preprocessed"]
        ]

    return X

In [152]:
trainX = pad_and_convert_to_ints(train_df)
testX = pad_and_convert_to_ints(test_df)

In [166]:
trainY = train_df.fallacy_type.apply(lambda x: fallacy_to_ix[x]).values.reshape(-1, 1)
testY = test_df.fallacy_type.apply(lambda x: fallacy_to_ix[x]).values.reshape(-1, 1)

In [181]:
net = Net(
    len(word_vocab),
    10,
)
opt = optim.Adam(net.parameters(), lr=0.001, betas=(0.9, 0.999))
criterion = nn.CrossEntropyLoss()

In [182]:
def train_epoch(X, Y, model, opt, criterion, batch_size=50):
    model.train()
    losses = []
    for beg_i in range(0, X.shape[0], batch_size):
        x_batch = X[beg_i : beg_i + batch_size, :]
        y_batch = Y[beg_i : beg_i + batch_size, :]
        x_batch = torch.tensor(x_batch)
        y_batch = torch.tensor(y_batch)

        opt.zero_grad()

        y_pred = model(x_batch)

        loss = criterion(y_pred, y_batch)

        loss.backward()
        
        opt.step()

        losses.append(loss.data.numpy())

    return [sum(losses) / float(len(losses))]

In [186]:
e_losses = []
num_epochs = 100
for e in tqdm(range(num_epochs)):
    e_losses += train_epoch(trainX, trainY, net, opt, criterion, batch_size=100)

100%|██████████| 100/100 [00:57<00:00,  1.75it/s]


In [187]:
e_losses

[3.0380179741803337,
 3.0063493672539208,
 2.9753541525672462,
 2.944806505652035,
 2.915020185358384,
 2.885999258826761,
 2.857938892701093,
 2.8311156525331387,
 2.8055960991803337,
 2.781282424926758,
 2.7582375722772934,
 2.736333173864028,
 2.715488686281092,
 2.6957156517926384,
 2.6770255004658416,
 2.659271899391623,
 2.642465717652265,
 2.626415224636302,
 2.611290693283081,
 2.5968673088971306,
 2.5831929936128506,
 2.570176699582268,
 2.557802480809829,
 2.5459998775930965,
 2.5346695395076977,
 2.5238923605750587,
 2.5136453965130974,
 2.503760113435633,
 2.494337095933802,
 2.485317650963278,
 2.476711637833539,
 2.4684326929204605,
 2.4605423422420727,
 2.4529862544115852,
 2.445765130660113,
 2.4388511461370133,
 2.432255282121546,
 2.4258906140046963,
 2.419803437064676,
 2.4139573714312386,
 2.408359681858736,
 2.4029767934013817,
 2.3978470353519215,
 2.3928763024947224,
 2.3881373405456543,
 2.383571554632748,
 2.379203614066629,
 2.374945233849918,
 2.3708842642167