In [1]:
from itertools import chain

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm import tqdm


import torch
import spacy
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

  return f(*args, **kwds)


In [2]:
# initialize hyper hyper parameters
torch.manual_seed(420)
np.random.seed(420)

-------------------

# Load data

In [3]:
fallacies_df = pd.read_csv('../data/fallacies.csv', index_col=0)
approved_df = pd.read_csv('../data/approved.csv', index_col=0)

In [4]:
df = pd.concat([
    fallacies_df,
    approved_df[approved_df.n_supporters >= 5].drop("n_supporters", axis=1)
])
df.fallacy_reason = df.fallacy_reason.fillna('')
df = df[~df.premise_content.isna()]

In [5]:
vc = df.fallacy_type.value_counts()

In [6]:
df = df[df.fallacy_type.isin(vc.head(10).index)]

# Text preprocessing

In [7]:
# make sure to run
# pip install spacy-transformers
# python -m spacy download en_trf_bertbaseuncased_lg

nlp = spacy.load('en_trf_bertbaseuncased_lg') # use bert model for the embeddings

I0509 20:51:01.732623 4545457600 file_utils.py:39] PyTorch version 1.0.1.post2 available.
I0509 20:51:01.787950 4545457600 modeling_xlnet.py:194] Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .


In [8]:
def preprocess_sentence(sent):
    sent = sent.lower()
    sent = nlp(sent)
    words = map(lambda x: x.vector, sent)
    return list(words)

In [9]:
df['premise_content_preprocessed'] = df.premise_content.apply(preprocess_sentence)

In [10]:
df = df.sample(frac=1).reset_index(drop=True) # shuffle the data

In [11]:
train_df, test_df = train_test_split(df, test_size=0.1)

In [13]:
fallacy_vocab = set(df.fallacy_type.unique())
fallacy_to_ix = {word: i for i, word in enumerate(fallacy_vocab)}

In [20]:
bert_embed_size = df.premise_content_preprocessed[0][0].shape[0]
pad_vector = np.random.rand(bert_embed_size)

# Neural Net

In [84]:
class Net(nn.Module):
    def __init__(self, word_embedding_size, fallacy_vocab_size, max_sent_size):
        super(Net, self).__init__()

        hidden = 100
        self.fc1 = nn.Linear(word_embedding_size, hidden)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden * max_sent_size, fallacy_vocab_size)
        
    def forward(self, word_embed_inputs):
        print(word_embed_inputs.dtype)
        print(word_embed_inputs.shape)
        h1 = self.fc1(word_embed_inputs)
        a1 = self.relu(h1)
        h2 = self.fc2(a1.view(a1.shape[0], -1))
        return h2

In [85]:
MAX_SENT_SIZE = 100

In [86]:
def pad_and_convert_to_ints(data):
    X = np.zeros((len(data), MAX_SENT_SIZE, bert_embed_size))
    X[:, :] = pad_vector

    for i, (_, x) in enumerate(data.iterrows()):
        X[i, :len(x["premise_content_preprocessed"])] = x["premise_content_preprocessed"]

    return X

In [87]:
trainX = pad_and_convert_to_ints(train_df)
testX = pad_and_convert_to_ints(test_df)

In [88]:
trainY = train_df.fallacy_type.apply(lambda x: fallacy_to_ix[x]).values
testY = test_df.fallacy_type.apply(lambda x: fallacy_to_ix[x]).values

In [89]:
net = Net(
    bert_embed_size,
    len(fallacy_vocab),
    MAX_SENT_SIZE,
)
opt = optim.Adam(net.parameters(), lr=0.001, betas=(0.9, 0.999))
criterion = nn.CrossEntropyLoss()

In [90]:
def train_epoch(X, Y, model, opt, criterion, batch_size=50):
    model.train()
    losses = []
    for beg_i in range(0, X.shape[0], batch_size):
        x_batch = X[beg_i : beg_i + batch_size, :]
        y_batch = Y[beg_i : beg_i + batch_size]
        x_batch = torch.tensor(x_batch).float()
        y_batch = torch.tensor(y_batch).float()

        opt.zero_grad()

        y_pred = model(x_batch)

        loss = criterion(y_pred, y_batch)

        loss.backward()
        
        opt.step()

        losses.append(loss.data.numpy())

    return [sum(losses) / float(len(losses))]

In [91]:
e_losses = []
num_epochs = 50
for e in tqdm(range(num_epochs)):
    e_losses += train_epoch(trainX, trainY, net, opt, criterion, batch_size=100)

  0%|          | 0/50 [00:00<?, ?it/s]

torch.float32
torch.Size([100, 100, 768])





RuntimeError: Expected object of scalar type Long but got scalar type Float for argument #2 'target'

In [None]:
e_losses

# Metrics

In [None]:
with torch.no_grad():
    net.eval()
    x = torch.tensor(testX)
    y_pred = net(x)

In [None]:
score = f1_score(testY, y_pred.numpy().argmax(axis=1), average='micro')

In [None]:
score