In [299]:
from itertools import chain

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm import tqdm


import torch
import spacy
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [209]:
# Seed the PyTorch and NumPy random number generators with one shared value.
SEED = 420
torch.manual_seed(SEED)
np.random.seed(SEED)

-------------------

# Load data

In [122]:
# Both CSV files keep the DataFrame index in their first column.
read_options = {"index_col": 0}
fallacies_df = pd.read_csv('../data/fallacies.csv', **read_options)
approved_df = pd.read_csv('../data/approved.csv', **read_options)

In [123]:
# Combine the fallacy examples with the approved examples that have at least
# five supporters; `n_supporters` is dropped from the approved rows before
# concatenating.
df = pd.concat([
    fallacies_df,
    approved_df[approved_df.n_supporters >= 5].drop("n_supporters", axis=1),
])

# Fill missing reasons AFTER `df` exists. Doing it before the concat (as the
# cell originally did) fails on a fresh kernel and, on a stale kernel, is
# discarded by the reassignment above.
# NOTE(review): assumes the combined frame has a `fallacy_reason` column — confirm.
df["fallacy_reason"] = df["fallacy_reason"].fillna('')

# Rows without any premise text cannot be tokenized, so drop them.
df = df[df.premise_content.notna()]

In [124]:
# Number of rows per fallacy type, most frequent first.
vc = df["fallacy_type"].value_counts()

In [125]:
# Keep only the rows whose label is one of the ten most frequent fallacy types.
top_fallacy_types = vc.head(10).index
keep_mask = df.fallacy_type.isin(top_fallacy_types)
df = df[keep_mask]

# Text preprocessing

In [126]:
# Load the spaCy pipeline named 'en_core_web_sm'; `preprocess_sentence` uses it
# to split text into tokens.
nlp = spacy.load('en_core_web_sm')

In [127]:
def preprocess_sentence(sent):
    """Lower-case a sentence and split it into a list of token strings.

    Only the token text is needed, so the sentence is passed through
    ``nlp.make_doc`` (tokenization only) instead of the full pipeline; the
    remaining pipeline components would be computed and then thrown away.

    Args:
        sent: The raw sentence as a string.

    Returns:
        A list with the text of every token, in order.
    """
    doc = nlp.make_doc(sent.lower())
    return [token.text for token in doc]

In [128]:
# Tokenize every premise. `df` is the result of boolean filtering, so the new
# column is attached with `assign` (which returns a new frame) rather than by
# item assignment, which can raise SettingWithCopyWarning on such a frame.
df = df.assign(
    premise_content_preprocessed=df["premise_content"].apply(preprocess_sentence)
)

In [129]:
# Shuffle the rows (a 100% sample without replacement), then renumber the index.
shuffled_rows = df.sample(frac=1)
df = shuffled_rows.reset_index(drop=True)

In [130]:
# Hold out a tenth of the rows for evaluation; the rest is used for training.
TEST_FRACTION = 0.1
train_df, test_df = train_test_split(df, test_size=TEST_FRACTION)

In [135]:
# Vocabulary = the two special tokens plus every token seen in the TRAINING
# split only, so unseen test tokens fall back to "<oov>".
word_vocab = {"<oov>", "<pad>"}
word_vocab = word_vocab.union(
    chain.from_iterable(train_df["premise_content_preprocessed"])
)
# Enumerate in sorted order: iterating a set of strings directly yields an
# order that can change between interpreter runs, which would make the
# word -> index mapping (and therefore the trained model) irreproducible.
word_to_ix = {word: i for i, word in enumerate(sorted(word_vocab))}

In [136]:
# Label vocabulary built from every fallacy type present in `df`.
fallacy_vocab = set(df.fallacy_type.unique())
# Sort before enumerating so each class keeps the same index on every run;
# plain set iteration order is not stable across interpreter runs.
# NOTE(review): assumes all labels are mutually comparable (e.g. all strings).
fallacy_to_ix = {word: i for i, word in enumerate(sorted(fallacy_vocab))}

# Neural Net

In [294]:
class Net(nn.Module):
    """Feed-forward sentence classifier over fixed-length token-index inputs.

    Each token index is embedded, passed through one linear layer shared by
    all positions followed by ReLU, and the per-position activations are then
    flattened and mapped to one score per fallacy class.

    Args:
        word_vocab_size: Number of rows in the embedding table.
        word_embedding_dim: Size of each token embedding.
        fallacy_vocab_size: Number of output classes.
        max_sent_size: Number of token positions per input row. The final
            layer is sized for exactly this many positions.
        hidden_dim: Width of the per-token hidden layer (defaults to 100,
            the value that was previously hard-coded).
    """

    def __init__(self, word_vocab_size, word_embedding_dim, fallacy_vocab_size,
                 max_sent_size, hidden_dim=100):
        super().__init__()

        # Randomly initialised embeddings, learned during training.
        self.word_embeddings = nn.Embedding(word_vocab_size, word_embedding_dim)

        self.fc1 = nn.Linear(word_embedding_dim, hidden_dim)
        self.relu = nn.ReLU()
        # Consumes the hidden activations of all positions concatenated together.
        self.fc2 = nn.Linear(hidden_dim * max_sent_size, fallacy_vocab_size)

    def forward(self, word_inputs):
        """Return unnormalised class scores of shape (batch, fallacy_vocab_size).

        Args:
            word_inputs: Integer tensor of token indices with one row per
                sentence and ``max_sent_size`` columns.
        """
        word_embeds = self.word_embeddings(word_inputs)
        hidden = self.relu(self.fc1(word_embeds))
        # Collapse every dimension after the batch dimension into one vector.
        flat = hidden.flatten(start_dim=1)
        return self.fc2(flat)

In [286]:
# Fixed number of token positions per sentence: used as the width of the padded
# index matrices and as the `max_sent_size` passed to `Net`.
MAX_SENT_SIZE = 100

In [287]:
def pad_and_convert_to_ints(data, max_len=None, vocab=None):
    """Encode tokenized sentences as a fixed-width matrix of word indices.

    Every row of ``data`` becomes one row of the result. Tokens missing from
    the vocabulary are replaced by the ``"<oov>"`` index, short sentences are
    right-padded with the ``"<pad>"`` index, and sentences longer than
    ``max_len`` are truncated (previously they raised a shape-mismatch
    ``ValueError``).

    Args:
        data: DataFrame with a ``"premise_content_preprocessed"`` column
            holding a list of token strings per row.
        max_len: Number of columns in the result. Defaults to the
            module-level ``MAX_SENT_SIZE``.
        vocab: Mapping from token to integer index that contains ``"<pad>"``
            and ``"<oov>"``. Defaults to the module-level ``word_to_ix``.

    Returns:
        An ``int64`` NumPy array of shape ``(len(data), max_len)``.
    """
    if max_len is None:
        max_len = MAX_SENT_SIZE
    if vocab is None:
        vocab = word_to_ix

    pad_ix = vocab["<pad>"]
    oov_ix = vocab["<oov>"]
    X = np.full((len(data), max_len), pad_ix, dtype=np.int64)

    for row, tokens in enumerate(data["premise_content_preprocessed"]):
        # Truncate first so the slice assignment below always fits the row.
        ids = [vocab.get(word, oov_ix) for word in tokens[:max_len]]
        if ids:
            X[row, :len(ids)] = ids

    return X

In [288]:
# Encode both splits (train first, then test) as padded index matrices.
trainX, testX = (
    pad_and_convert_to_ints(split_df) for split_df in (train_df, test_df)
)

In [289]:
# Translate each fallacy label into its integer class index.
trainY = train_df["fallacy_type"].map(fallacy_to_ix.__getitem__).values
testY = test_df["fallacy_type"].map(fallacy_to_ix.__getitem__).values

In [290]:
# Model sized from the two vocabularies, with 10-dimensional word embeddings.
net = Net(
    word_vocab_size=len(word_vocab),
    word_embedding_dim=10,
    fallacy_vocab_size=len(fallacy_vocab),
    max_sent_size=MAX_SENT_SIZE,
)
# Adam over all model parameters; cross-entropy over the class scores.
opt = optim.Adam(params=net.parameters(), lr=0.001, betas=(0.9, 0.999))
criterion = nn.CrossEntropyLoss()

In [291]:
def train_epoch(X, Y, model, opt, criterion, batch_size=50):
    """Run one pass over the data in consecutive mini-batches.

    The data is visited in its given order (no shuffling here). For each
    batch the gradients are reset, the loss is computed and back-propagated,
    and the optimizer takes one step.

    Args:
        X: Array of inputs, sliced along its first axis.
        Y: Array of targets, aligned with ``X`` along the first axis.
        model: The module to train; it is switched to training mode.
        opt: Optimizer that updates ``model``'s parameters.
        criterion: Callable ``criterion(predictions, targets)`` returning a
            scalar loss tensor.
        batch_size: Maximum number of rows per batch.

    Returns:
        A single-element list with the unweighted mean of the per-batch
        losses as a Python float.

    Raises:
        ZeroDivisionError: If ``X`` has no rows, so no batch is processed.
    """
    model.train()
    batch_losses = []
    for start in range(0, X.shape[0], batch_size):
        stop = start + batch_size
        x_batch = torch.as_tensor(X[start:stop])
        y_batch = torch.as_tensor(Y[start:stop])

        opt.zero_grad()
        loss = criterion(model(x_batch), y_batch)
        loss.backward()
        opt.step()

        # `.item()` extracts the scalar on any device without touching `.data`
        # and without holding on to the autograd graph.
        batch_losses.append(loss.item())

    return [sum(batch_losses) / float(len(batch_losses))]

In [293]:
# Train for a fixed number of epochs, collecting the mean loss of each one.
num_epochs = 50
e_losses = []
for e in tqdm(range(num_epochs)):
    epoch_loss = train_epoch(trainX, trainY, net, opt, criterion, batch_size=100)
    e_losses.extend(epoch_loss)

100%|██████████| 50/50 [00:39<00:00,  1.26it/s]


In [295]:
# Display the mean training loss recorded for each epoch.
e_losses

[1.4183097586912268,
 1.1480253514121561,
 1.024930929436403,
 0.9076031130902907,
 0.8023893307237064,
 0.7116362101891461,
 0.6356460557264441,
 0.5704033146886265,
 0.515100326608209,
 0.46868685589117165,
 0.4294133799917558,
 0.3954954576842925,
 0.36588358703781576,
 0.3408045558368458,
 0.3159698274205713,
 0.2963291012188968,
 0.2752674335942549,
 0.25655636980253105,
 0.2408735480378656,
 0.227424838087138,
 0.21599343112286398,
 0.2048915024189388,
 0.19506009317496242,
 0.1863756074624903,
 0.17796269541277604,
 0.17184165120124817,
 0.16443879376439488,
 0.159304859883645,
 0.15317970777259154,
 0.1492533192915075,
 0.14785107553881757,
 0.14126613319796674,
 0.13789781489792993,
 0.14035865028991418,
 0.13469565265318928,
 0.1271090104299433,
 0.1250345479039585,
 0.12755161852521055,
 0.12169939695912249,
 0.11814483386628769,
 0.11970089529367055,
 0.11575667472446666,
 0.11257321694317986,
 0.11678184788016711,
 0.11259010105448611,
 0.10699103465851616,
 0.107236308867

# Metrics

In [305]:
# Score the held-out inputs in evaluation mode with gradient tracking disabled.
net.eval()
with torch.no_grad():
    x = torch.tensor(testX)
    y_pred = net(x)

In [306]:
# The predicted class of each row is the index of its highest score.
predicted_classes = y_pred.numpy().argmax(axis=1)
score = f1_score(testY, predicted_classes, average='micro')

In [307]:
# Display the micro-averaged F1 score on the held-out split.
score

0.6055555555555555