In [1]:
from itertools import chain

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm import tqdm


import torch
import spacy
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
# Fix the random seeds of NumPy and PyTorch up front so runs are repeatable.
np.random.seed(420)
torch.manual_seed(420)

In [3]:
import gensim
import pickle

In [4]:
# Pretrained word2vec vectors stored in binary format. Point the path below at
# your own w2v binary if you don't have this one (ours came from the second
# homework assignment of our NLP course).
wv_from_bin = gensim.models.KeyedVectors.load_word2vec_format(
    "../data/w2v.bin",
    binary=True,
)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


# Load data

In [5]:
# Both CSV files use their first column as the row index.
approved_df = pd.read_csv(
    '../data/approved.csv',
    index_col=0,
)
fallacies_df = pd.read_csv(
    '../data/fallacies.csv',
    index_col=0,
)

In [6]:
# Stack the fallacy rows on top of the approved rows that have at least five
# supporters; the supporter count column is dropped before concatenating.
df = pd.concat([
    fallacies_df,
    approved_df[approved_df["n_supporters"] >= 5].drop(columns="n_supporters"),
])
# Missing reasons become empty strings; rows without premise text are removed.
df["fallacy_reason"] = df["fallacy_reason"].fillna('')
df = df[df["premise_content"].notna()]

In [7]:
# Row count per fallacy type.
vc = df["fallacy_type"].value_counts()

In [8]:
# Keep only the rows whose fallacy type is among the first ten entries of `vc`.
df = df.loc[df["fallacy_type"].isin(vc.index[:10])]

# Text preprocessing

In [9]:
# Load the `en_core_web_sm` spaCy pipeline; `preprocess_sentence` below calls
# it to split each premise into tokens.
nlp = spacy.load('en_core_web_sm')

In [10]:
def preprocess_sentence(sent):
    """Lower-case ``sent``, run it through ``nlp`` and return the token texts as a list."""
    doc = nlp(sent.lower())
    return [token.text for token in doc]

In [11]:
# Tokenize every premise; each cell of the new column holds a list of token strings.
df["premise_content_preprocessed"] = df["premise_content"].apply(preprocess_sentence)

In [12]:
# Shuffle all rows with a fixed seed, then renumber the index from zero.
df = (
    df
    .sample(frac=1, random_state=420)
    .reset_index(drop=True)
)

In [13]:
# Hold out 10% of the rows as the test split (fixed seed for repeatability).
train_df, test_df = train_test_split(
    df,
    test_size=0.1,
    random_state=420,
)

In [14]:
# The word vocabulary is collected from the training split only.
word_vocab = set(chain.from_iterable(train_df["premise_content_preprocessed"]))
sorted_words = sorted(word_vocab)

Skip this whole part below if you have already calculated the needed vectors and word map. (We save and reload them in this notebook, but this is also useful if you are trying to replicate our work and only need the vectors we keep in our `../data` directory!)

In [15]:
# Build the word -> row-index map together with the embedding matrix.
# Row 0 is the shared out-of-vocabulary (OOV) vector, the following rows hold
# the pretrained vectors of the training words found in the w2v model, and the
# last row is the padding vector.
word_to_ix = {}
needed_vectors = []
for word in sorted_words:
    try:
        vector = wv_from_bin[word]
    except KeyError:
        # Not in the pretrained model: this word shares the OOV row.
        word_to_ix[word] = 0
        continue
    needed_vectors.append(vector)
    # Order is important: row 0 is reserved for OOV, so the vector that was
    # just appended ends up at row len(needed_vectors) once OOV is put in front.
    word_to_ix[word] = len(needed_vectors)

if not needed_vectors:
    raise ValueError("none of the training words were found in the word2vec model")

# OOV and PAD vectors are both random (OOV is drawn first, then PAD).
embedding_shape = needed_vectors[0].shape
oov_vector = np.random.randn(*embedding_shape)
pad_vector = np.random.randn(*embedding_shape)
needed_vectors = [oov_vector] + needed_vectors + [pad_vector]
word_to_ix['<oov>'] = 0
word_to_ix['<pad>'] = len(needed_vectors) - 1

# Stack into a single array first: building a tensor directly from a Python
# list of arrays is very slow.
needed_vectors = torch.tensor(np.stack(needed_vectors), dtype=torch.float32)
torch.save(needed_vectors, '../data/w2v_needed_vectors.pt')
print(needed_vectors.shape)
# Also save the word map; `with` closes the file even if pickling fails.
with open("../data/w2v_word_to_ix.pkl", "wb") as f:
    pickle.dump(word_to_ix, f)

torch.Size([3526, 300])


In [16]:
# Reload the cached embedding matrix and word map from disk.
needed_vectors = torch.load('../data/w2v_needed_vectors.pt')
# NOTE: unpickling can execute arbitrary code -- only load a file you created
# yourself.
with open("../data/w2v_word_to_ix.pkl", 'rb') as fr:
    word_map = pickle.load(fr)
# Later cells look words up through `word_to_ix`. Bind it here as well, so the
# notebook still runs on a fresh kernel when the cell that builds the vectors
# is skipped (otherwise those cells raise NameError).
word_to_ix = word_map

In [17]:
# Sorted list of the distinct fallacy types, and each type's position in it.
fallacy_vocab = sorted(set(df["fallacy_type"].unique()))
fallacy_to_ix = dict(zip(fallacy_vocab, range(len(fallacy_vocab))))

# Neural Net

In [18]:
class Net(nn.Module):
    """Feed-forward fallacy classifier on top of pretrained word embeddings.

    Every token embedding goes through one shared linear layer and a ReLU; the
    per-token activations are then flattened into a single vector per sentence
    and mapped to one score per fallacy class.

    Args:
        word_vocab_size: accepted for interface compatibility; not used.
        word_embedding_dim: input width of the first linear layer.
        fallacy_vocab_size: number of output scores.
        max_sent_size: number of token positions in every input row.
        word2vec_tensor: embedding matrix passed to
            ``nn.Embedding.from_pretrained`` with its default settings.
    """

    def __init__(self, word_vocab_size, word_embedding_dim, fallacy_vocab_size, max_sent_size, word2vec_tensor):
        super().__init__()

        hidden = 100
        self.word_embeddings = nn.Embedding.from_pretrained(word2vec_tensor)
        self.fc1 = nn.Linear(in_features=word_embedding_dim, out_features=hidden)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(in_features=hidden * max_sent_size, out_features=fallacy_vocab_size)

    def forward(self, word_inputs):
        """Return the class scores for a batch of token-index rows."""
        per_token = self.relu(self.fc1(self.word_embeddings(word_inputs)))
        batch_size = per_token.shape[0]
        # Flatten (positions, hidden) into one feature vector per sentence.
        return self.fc2(per_token.view(batch_size, -1))

In [19]:
# Number of token positions per sentence row: it is the width of the padded
# index matrix and is passed to Net as `max_sent_size`.
MAX_SENT_SIZE = 100

In [20]:
def pad_and_convert_to_ints(data, vocab=None, max_len=None):
    """Turn tokenized sentences into a fixed-width matrix of word indices.

    Args:
        data: DataFrame with a "premise_content_preprocessed" column holding
            one list of token strings per row.
        vocab: mapping from token to index; must contain "<pad>" and "<oov>".
            Defaults to the notebook-level ``word_to_ix``.
        max_len: number of columns of the result. Defaults to the
            notebook-level ``MAX_SENT_SIZE``.

    Returns:
        An int64 array of shape (len(data), max_len). Tokens missing from
        ``vocab`` map to the "<oov>" index, short sentences are right-padded
        with the "<pad>" index, and sentences longer than ``max_len`` are
        truncated to their first ``max_len`` tokens.
    """
    if vocab is None:
        vocab = word_to_ix
    if max_len is None:
        max_len = MAX_SENT_SIZE

    pad_ix = vocab["<pad>"]
    oov_ix = vocab["<oov>"]
    X = np.full((len(data), max_len), pad_ix, dtype=np.int64)

    for row, tokens in enumerate(data["premise_content_preprocessed"]):
        # Truncate first: writing more than max_len values into the row slice
        # would raise a shape-mismatch error.
        ids = [vocab.get(token, oov_ix) for token in tokens[:max_len]]
        if ids:
            X[row, :len(ids)] = ids

    return X

In [21]:
# Padded integer inputs for the training and test splits.
trainX, testX = (pad_and_convert_to_ints(split) for split in (train_df, test_df))

In [22]:
# Integer class labels for both splits, looked up in `fallacy_to_ix`.
trainY = train_df["fallacy_type"].apply(lambda name: fallacy_to_ix[name]).values
testY = test_df["fallacy_type"].apply(lambda name: fallacy_to_ix[name]).values

In [23]:
# Model, loss and optimizer used by the training loop.
net = Net(
    word_vocab_size=len(word_vocab),
    word_embedding_dim=300,
    fallacy_vocab_size=len(fallacy_vocab),
    max_sent_size=MAX_SENT_SIZE,
    word2vec_tensor=needed_vectors,
)
criterion = nn.CrossEntropyLoss()
opt = optim.Adam(net.parameters(), lr=0.001, betas=(0.9, 0.999))

In [24]:
def train_epoch(X, Y, model, opt, criterion, batch_size=50):
    """Run one pass over (X, Y) in consecutive mini-batches, updating ``model``.

    Args:
        X: inputs, sliced along the first axis (array or tensor).
        Y: targets aligned with ``X`` along the first axis.
        model: module to train; it is switched to training mode.
        opt: optimizer holding the model's parameters.
        criterion: callable ``criterion(predictions, targets)`` returning a
            scalar loss tensor.
        batch_size: number of rows per mini-batch.

    Returns:
        A one-element list containing the mean of the per-batch losses as a
        Python float.

    Raises:
        ValueError: if ``X`` has no rows (there would be nothing to average).
    """
    n_rows = X.shape[0]
    if n_rows == 0:
        raise ValueError("train_epoch needs at least one training example")

    model.train()
    batch_losses = []
    for start in range(0, n_rows, batch_size):
        stop = start + batch_size
        # as_tensor accepts both NumPy arrays and tensors.
        x_batch = torch.as_tensor(X[start:stop])
        y_batch = torch.as_tensor(Y[start:stop])

        opt.zero_grad()
        loss = criterion(model(x_batch), y_batch)
        loss.backward()
        opt.step()

        # .item() returns the scalar as a plain Python float, detached from
        # the graph and independent of the device the loss lives on.
        batch_losses.append(loss.item())

    return [sum(batch_losses) / len(batch_losses)]

In [None]:
# Train for a fixed number of epochs, collecting the mean loss of each one.
num_epochs = 50
e_losses = []
for e in tqdm(range(num_epochs)):
    e_losses.extend(train_epoch(trainX, trainY, net, opt, criterion, batch_size=100))

 64%|██████▍   | 32/50 [00:09<00:05,  3.49it/s]

In [None]:
# Display the mean training loss recorded for each epoch.
e_losses

# Metrics (Macro F1 Score)

In [None]:
# Score the test split in evaluation mode, without tracking gradients.
net.eval()
with torch.no_grad():
    x = torch.tensor(testX)
    y_pred = net(x)

In [None]:
# Macro F1 of the highest-scoring class per test example.
score = f1_score(
    y_true=testY,
    y_pred=np.argmax(y_pred.numpy(), axis=1),
    average='macro',
)

In [None]:
# Display the model's macro F1 score on the test split.
score

In [None]:
# Baseline: macro F1 obtained by predicting one fixed class for every test
# example. Iterate over the classes that actually exist instead of assuming
# there are exactly ten, which would raise IndexError with fewer classes.
for i, fallacy_name in enumerate(fallacy_vocab):
    print("Guessing " + fallacy_name + "\t for all:", end='\t')
    print(f1_score(testY, np.full(testY.shape[0], i), average='macro'))

In [None]:
# Baseline: macro F1 of uniformly random predictions. The upper bound follows
# the real number of classes rather than a hard-coded 10.
print("Random Guessing:", end='\t')
print(f1_score(testY, np.random.randint(0, len(fallacy_vocab), testY.shape[0]), average='macro'))