In [1]:
from itertools import chain

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm import tqdm


import torch
import spacy
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
# Fix the random seeds of PyTorch and NumPy so that runs are repeatable.
SEED = 420
torch.manual_seed(SEED)
np.random.seed(SEED)

-------------------

# Load data

In [3]:
# Read both CSV files, using their first column as the row index.
fallacies_df = pd.read_csv(
    '../data/fallacies.csv',
    index_col=0,
)
approved_df = pd.read_csv(
    '../data/approved.csv',
    index_col=0,
)

In [4]:
# Approved rows are only kept when they have at least five supporters;
# the supporter count itself is dropped before stacking the two frames.
well_supported = approved_df[approved_df["n_supporters"] >= 5].drop(columns="n_supporters")
df = pd.concat([fallacies_df, well_supported])

# A missing fallacy reason becomes an empty string.
df["fallacy_reason"] = df["fallacy_reason"].fillna('')

# Rows without premise text are discarded.
df = df[df["premise_content"].notna()]

In [5]:
# Number of rows per fallacy type, most frequent first.
vc = df["fallacy_type"].value_counts()

In [6]:
# Keep only rows whose fallacy type is among the ten most frequent ones.
# `.copy()` makes `df` an independent frame, so the column that is assigned to it
# later does not trigger pandas' SettingWithCopyWarning.
df = df[df.fallacy_type.isin(vc.head(10).index)].copy()

# Text preprocessing

In [7]:
# Load spaCy's small English pipeline; `preprocess_sentence` uses it to split text into tokens.
nlp = spacy.load('en_core_web_sm')

In [8]:
def preprocess_sentence(sent):
    """Lower-case a sentence and split it into a list of token strings.

    Only the token texts are used, so the sentence is passed through
    ``nlp.make_doc`` (tokenizer only) rather than the full pipeline; the
    remaining pipeline components would add run time without affecting the
    returned strings.

    Args:
        sent: The raw sentence as a ``str``.

    Returns:
        A ``list`` of ``str``, one entry per spaCy token, in order.
    """
    return [token.text for token in nlp.make_doc(sent.lower())]

In [9]:
# Tokenize every premise; each cell of the new column holds a list of token strings.
df['premise_content_preprocessed'] = df['premise_content'].map(preprocess_sentence)

In [10]:
# Shuffle all rows with a fixed seed, then renumber the index from zero.
shuffled = df.sample(frac=1, random_state=420)
df = shuffled.reset_index(drop=True)

In [11]:
# Hold out 10% of the rows for testing; the seed keeps the split repeatable.
train_df, test_df = train_test_split(
    df,
    test_size=0.1,
    random_state=420,
)

In [12]:
# Vocabulary = the two special tokens plus every token seen in the training split.
word_vocab = {"<oov>", "<pad>"}
word_vocab = word_vocab.union(
    chain.from_iterable(train_df["premise_content_preprocessed"])
)
# Enumerate in sorted order: iterating a set of strings directly gives an order that
# can change between interpreter sessions (string hash randomization), which would
# change every word id and make results differ from run to run despite the seeds.
word_to_ix = {word: i for i, word in enumerate(sorted(word_vocab))}

In [13]:
# Distinct fallacy labels in sorted order, and a lookup from label to its position.
fallacy_vocab = sorted(df["fallacy_type"].unique())
fallacy_to_ix = dict(zip(fallacy_vocab, range(len(fallacy_vocab))))

# Neural Net

In [14]:
class Net(nn.Module):
    """Feed-forward classifier over a fixed-length sequence of word ids.

    Each word id is embedded, passed through a shared linear layer with ReLU,
    and the per-word activations are flattened and mapped to one score per
    fallacy class.

    Args:
        word_vocab_size: Number of rows in the embedding table.
        word_embedding_dim: Size of each word embedding.
        fallacy_vocab_size: Number of output classes.
        max_sent_size: Number of word ids per input row; it fixes the input
            width of the final linear layer.
        hidden_dim: Width of the per-word hidden layer (default 100, the
            value that used to be hard-coded).
    """

    def __init__(self, word_vocab_size, word_embedding_dim, fallacy_vocab_size,
                 max_sent_size, hidden_dim=100):
        super().__init__()

        # Embeddings are randomly initialised and learned during training.
        self.word_embeddings = nn.Embedding(word_vocab_size, word_embedding_dim)

        self.fc1 = nn.Linear(word_embedding_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim * max_sent_size, fallacy_vocab_size)

    def forward(self, word_inputs):
        """Return unnormalised class scores for a batch of word-id rows.

        Args:
            word_inputs: Integer tensor of word ids; the first dimension is
                the batch.

        Returns:
            Tensor with one row per batch element and one column per class.
        """
        word_embeds = self.word_embeddings(word_inputs)
        hidden = self.relu(self.fc1(word_embeds))
        # Merge everything after the batch dimension into one feature vector.
        return self.fc2(hidden.flatten(start_dim=1))

In [15]:
# Fixed number of token ids per sentence: the width of the padded input matrix
# and the `max_sent_size` passed to `Net`.
MAX_SENT_SIZE = 100

In [16]:
def pad_and_convert_to_ints(data, max_sent_size=None, vocab=None):
    """Turn tokenized sentences into a fixed-width matrix of word ids.

    Sentences shorter than ``max_sent_size`` are right-padded with the
    ``"<pad>"`` id; longer sentences are truncated (previously they raised a
    ``ValueError`` on assignment). Tokens missing from the vocabulary map to
    the ``"<oov>"`` id.

    Args:
        data: DataFrame with a ``"premise_content_preprocessed"`` column whose
            cells are lists of token strings.
        max_sent_size: Number of columns in the output; defaults to the
            module-level ``MAX_SENT_SIZE``.
        vocab: Mapping from token to integer id containing ``"<pad>"`` and
            ``"<oov>"``; defaults to the module-level ``word_to_ix``.

    Returns:
        ``numpy.ndarray`` of dtype ``int64`` and shape
        ``(len(data), max_sent_size)``.
    """
    if max_sent_size is None:
        max_sent_size = MAX_SENT_SIZE
    if vocab is None:
        vocab = word_to_ix

    pad_ix = vocab["<pad>"]
    oov_ix = vocab["<oov>"]

    # Explicit int64 so the ids are valid embedding indices on every platform.
    X = np.full((len(data), max_sent_size), pad_ix, dtype=np.int64)

    for i, tokens in enumerate(data["premise_content_preprocessed"]):
        tokens = tokens[:max_sent_size]  # over-long sentences are cut to fit
        X[i, :len(tokens)] = [vocab.get(word, oov_ix) for word in tokens]

    return X

In [17]:
# Encode both splits as padded matrices of word ids.
trainX, testX = (
    pad_and_convert_to_ints(split) for split in (train_df, test_df)
)

In [18]:
# Encode the fallacy label of every row as its integer class id.
trainY, testY = (
    split["fallacy_type"].apply(lambda label: fallacy_to_ix[label]).values
    for split in (train_df, test_df)
)

In [19]:
# Classifier with 300-dimensional word embeddings, sized to the two vocabularies.
net = Net(
    word_vocab_size=len(word_vocab),
    word_embedding_dim=300,
    fallacy_vocab_size=len(fallacy_vocab),
    max_sent_size=MAX_SENT_SIZE,
)

# Cross-entropy over the class scores, optimised with Adam.
criterion = nn.CrossEntropyLoss()
opt = optim.Adam(
    net.parameters(),
    lr=0.001,
    betas=(0.9, 0.999),
)

In [20]:
def train_epoch(X, Y, model, opt, criterion, batch_size=50):
    """Run one pass over the data in consecutive mini-batches.

    Args:
        X: Array of inputs; rows are samples.
        Y: Array of targets aligned with the rows of ``X``.
        model: The ``nn.Module`` to train.
        opt: Optimizer that updates ``model``'s parameters.
        criterion: Loss function called as ``criterion(predictions, targets)``.
        batch_size: Number of samples per optimisation step.

    Returns:
        A one-element list holding the mean of the per-batch losses as a
        Python ``float``.

    Raises:
        ValueError: If ``X`` contains no samples.
    """
    n_samples = X.shape[0]
    if n_samples == 0:
        raise ValueError("train_epoch needs at least one sample")

    model.train()
    losses = []
    for start in range(0, n_samples, batch_size):
        stop = start + batch_size
        x_batch = torch.as_tensor(X[start:stop])
        y_batch = torch.as_tensor(Y[start:stop])

        opt.zero_grad()
        loss = criterion(model(x_batch), y_batch)
        loss.backward()
        opt.step()

        # .item() gives a detached Python float and works on any device,
        # unlike the legacy `.data.numpy()` access.
        losses.append(loss.item())

    return [sum(losses) / len(losses)]

In [None]:
# Train for a fixed number of epochs, recording the mean loss of each one.
num_epochs = 50
e_losses = []
for e in tqdm(range(num_epochs)):
    epoch_loss = train_epoch(trainX, trainY, net, opt, criterion, batch_size=100)
    e_losses.extend(epoch_loss)

 14%|█▍        | 7/50 [00:07<00:46,  1.09s/it]

In [None]:
# Mean training loss recorded for each epoch.
e_losses

# Metrics (macro f1 score)

In [None]:
# Switch to evaluation mode and score the whole test set without tracking gradients.
net.eval()
with torch.no_grad():
    x = torch.tensor(testX)
    y_pred = net(x)

In [None]:
# The predicted class is the index of the highest score in each row.
predicted_labels = np.argmax(y_pred.numpy(), axis=1)
score = f1_score(testY, predicted_labels, average='macro')

In [None]:
# Macro-averaged F1 of the network's predictions on the test split.
score

In [None]:
# Baseline: macro F1 obtained by predicting one fixed class for every test row.
# Iterating over `fallacy_vocab` (instead of a hard-coded range(10)) keeps this
# correct when the data contains fewer than ten fallacy types.
for i, fallacy in enumerate(fallacy_vocab):
    print("Guessing " + fallacy + "\t for all:", end='\t')
    print(f1_score(testY, np.full(testY.shape[0], i), average='macro'))

In [None]:
# Baseline: macro F1 of uniformly random class guesses. The upper bound follows
# the actual number of classes rather than a hard-coded 10.
random_guesses = np.random.randint(0, len(fallacy_vocab), testY.shape[0])
print("Random Guessing:", end='\t')
print(f1_score(testY, random_guesses, average='macro'))