In [1]:
import json
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, confusion_matrix

import pprint
from aita.datasets import AITADatasetJoint
import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import random_split
from transformers import BertTokenizer
import pickle
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
pp = pprint.PrettyPrinter(indent=4)


# Device Config

In [2]:

# If there's a GPU available...
if torch.cuda.is_available():

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce GTX 1050 Ti with Max-Q Design


# Prepare Dataset

## Load Config File

In [3]:
config_file = open('training_joint_bert_config.json')
config = json.load(config_file)
pp.pprint(config)

{   'comments_weights': 'model_weights/model_comments.pt',
    'dataset_path': 'data/merged.csv',
    'gaussian_model_path': 'model_weights/gaussian_params.pickle',
    'mlp_model_path': 'model_weights/gaussian_params.pickle',
    'model_name': 'model_comments.pt',
    'model_path': 'model_weights',
    'posts_weights': 'model_weights/last_model_posts.pt',
    'token_length_comments': 512,
    'token_length_posts': 512,
    'undersample': True}


In [4]:

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
dataset = AITADatasetJoint(dataset_path=config['dataset_path'],
                           tokenizer=tokenizer,
                           max_token_length_posts=config['token_length_posts'],
                           max_token_length_comments=config['token_length_comments'],
                           undersample=config['undersample'])

train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_dataloader = DataLoader(train_dataset,  # The training samples.
                              shuffle=True,
                              batch_size=1)

validation_dataloader = DataLoader(val_dataset, shuffle=True, batch_size=1)
print(f"Training Dataset Size: {len(train_dataset)}, Validation Dataset Size: {len(val_dataset)}")

Training Dataset Size: 3837, Validation Dataset Size: 427


# Prepare Model

In [8]:
from aita.models import BERTJoint

model = BERTJoint(posts_weights=config["posts_weights"], comments_weights=config["comments_weights"], device=device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

# Train
We aim to train the header of the model so we collect the obtained embeddings and the true labels.
In this notebook we train two layers on top of the joint bert model
- Naive Baise
- Multi-Layer Perceptron

In [9]:
y_true = []
embeddings_list = []
for batch in train_dataloader:
    post_id = batch[0].to(device)
    post_attention = batch[1].to(device)
    comment_id = batch[2].to(device)
    comment_attention = batch[3].to(device)
    label = batch[4].to(device)
    y_true.append(label.squeeze().cpu().numpy())
    # returns a concatenated embedding of the posts and average embedding of the comments
    embeddings = model(post_id=post_id,
     post_attention=post_attention,
      comments_ids=comment_id,
       comments_attentions=comment_attention)
    embeddings_list.append(embeddings)



## Fit top layer

In [10]:
embeddings_list = np.array(embeddings_list)
embeddings_list = np.squeeze(embeddings_list)
y_true = np.array(y_true)
y_true = y_true[:, 1]
naive_baise = GaussianNB()
naive_baise.fit(embeddings_list, y_true)
mlp = MLPClassifier([500, 500]).fit(embeddings_list, y_true)

## Validate Naive Baise

In [11]:
y_true_eval = []
y_pred = []
for batch in validation_dataloader:
    post_id = batch[0].to(device)
    post_attention = batch[1].to(device)
    comment_id = batch[2].to(device)
    comment_attention = batch[3].to(device)
    label = batch[4].to(device)
    y_true_eval.append(label.squeeze().cpu().numpy())
    embeddings = model(post_id=post_id,
                       post_attention=post_attention,
                       comments_ids=comment_id,
                       comments_attentions=comment_attention)

    y_pred.append(naive_baise.predict(np.expand_dims(embeddings,0)))

y_true_eval = np.array(y_true_eval)
y_pred = np.array(y_pred).squeeze()
y_true_eval = y_true_eval[:, 1]



IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed

In [14]:
acc = accuracy_score(y_true_eval, y_pred)
precision = precision_score(y_true=y_true_eval, y_pred=y_pred)
recall = recall_score(y_true=y_true_eval, y_pred=y_pred)
f1 = f1_score(y_true=y_true_eval, y_pred=y_pred)
matthews = matthews_corrcoef(y_true=y_true_eval, y_pred=y_pred)
cm = confusion_matrix(y_true_eval, y_pred)
metrics = {'metric':["Accuracy", "precision", "recall", "f1-score", "Matthews Correlation Coefficient (MCC)"], "value":[acc, precision, recall, f1, matthews]}

In [15]:
metrics_table = pd.DataFrame(metrics)
metrics_table

Unnamed: 0,metric,value
0,Accuracy,0.627635
1,precision,0.608295
2,recall,0.640777
3,f1-score,0.624113
4,Matthews Correlation Coefficient (MCC),0.256038


## Save Model

In [24]:

params = naive_baise.get_params()
with open("model_weights/naive_baise.pickle", "wb") as f:
    pickle.dump(naive_baise, f)

## Validate MLP Baise

In [17]:
y_true_eval = []
y_pred = []
for batch in validation_dataloader:
    post_id = batch[0].to(device)
    post_attention = batch[1].to(device)
    comment_id = batch[2].to(device)
    comment_attention = batch[3].to(device)
    label = batch[4].to(device)
    y_true_eval.append(label.squeeze().cpu().numpy())
    embeddings = model(post_id=post_id,
                       post_attention=post_attention,
                       comments_ids=comment_id,
                       comments_attentions=comment_attention)

    y_pred.append(mlp.predict(np.expand_dims(embeddings,0)))

y_true_eval = np.array(y_true_eval)
y_pred = np.array(y_pred).squeeze()
y_true_eval = y_true_eval[:, 1]



IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed

In [19]:

acc = accuracy_score(y_true_eval, y_pred)
precision = precision_score(y_true=y_true_eval, y_pred=y_pred)
recall = recall_score(y_true=y_true_eval, y_pred=y_pred)
f1 = f1_score(y_true=y_true_eval, y_pred=y_pred)
matthews = matthews_corrcoef(y_true=y_true_eval, y_pred=y_pred)
cm = confusion_matrix(y_true_eval, y_pred)
metrics = {'metric':["Accuracy", "precision", "recall", "f1-score", "Matthews Correlation Coefficient (MCC)"], "value":[acc, precision, recall, f1, matthews]}

In [20]:
metrics_table = pd.DataFrame(metrics)
metrics_table

Unnamed: 0,metric,value
0,Accuracy,0.740047
1,precision,0.711111
2,recall,0.776699
3,f1-score,0.742459
4,Matthews Correlation Coefficient (MCC),0.482985


## Save Model

In [23]:
params = mlp.get_params()
with open("model_weights/mlp.pickle", "wb") as f:
    pickle.dump(mlp, f)