In [None]:
! pip install transformers

In [1]:
# Delete any existing `space-model` directory, then clone the repository afresh
# so the cell can be re-run without `git clone` failing on an existing path.
! rm -rf space-model
! git clone https://github.com/StepanTita/space-model.git

In [3]:
import sys

# Add the cloned checkout to the module search path
# (presumably so the `space_model` imports further down resolve).
sys.path.append('space-model')

In [4]:
import math
import json
from collections import Counter
import random
import os

import pandas as pd
import numpy as np

import torch
import torch.nn.functional as F

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

from tqdm import tqdm

import matplotlib.pyplot as plt
import plotly.graph_objects as go

from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding

from datasets import load_dataset, Dataset, DatasetDict

from space_model.model import *
from space_model.loss import *

In [5]:
# Index of the CUDA device to use; it is interpolated into the `cuda:<id>` device string.
device_id = 1

In [6]:
# Pick the configured GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device(f'cuda:{device_id}')
else:
    device = torch.device('cpu')
device

In [7]:
# `torch.cuda.set_device` only accepts CUDA devices, so it must be skipped when
# `device` is the CPU fallback; otherwise a CPU-only run crashes here.
if device.type == 'cuda':
    torch.cuda.set_device(device)

In [8]:
# Load the raw posts: a JSON object keyed by post id (each entry is indexed by
# 'post_tokens' and 'annotators' in later cells).
# The encoding is stated explicitly so decoding does not depend on the platform default.
with open('space-model/data/hate_dataset.json', encoding='utf-8') as f:
    data = json.load(f)

In [9]:
# Collect every distinct token that appears in any post.
# The loop variable is deliberately not called `id`: at notebook top level it would
# stay bound after the loop and shadow the builtin `id` for every later cell.
vocab = set()
for post in data.values():
    vocab.update(post['post_tokens'])

In [10]:
# Load the official split definition: a JSON object whose 'train', 'val' and 'test'
# entries hold the post ids that belong to each split.
# The encoding is stated explicitly so decoding does not depend on the platform default.
with open('space-model/data/hate_post_id_divisions.json', encoding='utf-8') as f:
    ids_split = json.load(f)

In [11]:
def encode_label(label):
    """Map a dataset label string to a binary class id.

    'hatespeech' and 'offensive' are merged into class 0; 'normal' is class 1.

    Raises:
        Exception: if `label` is not one of the three known label strings.
    """
    if label in ('hatespeech', 'offensive'):
        return 0  # 'offensive' is folded into the same class as 'hatespeech' (was: 2)
    if label == 'normal':
        return 1
    raise Exception(f'Unknown Label: {label}!')

In [12]:
# Names used to load the tokenizer / pretrained weights and to build checkpoint paths
# of the form f'{MODELS_PATH}/{DATASET_NAME}_..._{NUM_EPOCHS}.bin'.
MODEL_NAME = 'distilbert-base-cased'
MODELS_PATH = 'models'
# NOTE(review): the checkpoints are named after 'imdb' while the data evaluated in this
# notebook is the hate-speech dataset — presumably a cross-dataset evaluation; confirm.
DATASET_NAME = 'imdb'

# NUM_EPOCHS is only used here as part of the checkpoint file name.
NUM_EPOCHS = 15
# The evaluation DataLoader uses 2 * BATCH_SIZE.
BATCH_SIZE = 256
# Token length every example is truncated / padded to.
MAX_SEQ_LEN = 256

In [13]:
# Load the tokenizer that matches MODEL_NAME and display it.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer

In [14]:
def get_label(annotators):
    """Return the most frequent 'label' value among the annotator records.

    Each element of `annotators` is indexed by the key 'label'. An empty
    sequence raises IndexError because there is no most common label.
    """
    label_counts = Counter()
    for annotation in annotators:
        label_counts[annotation['label']] += 1
    top_label, _count = label_counts.most_common(1)[0]
    return top_label

In [15]:
def _build_split(split_name, desc):
    """Build the Dataset for one split of the hate-speech data.

    Args:
        split_name: key into `ids_split` ('train', 'test' or 'val').
        desc: label shown on the tqdm progress bar.

    Returns:
        A Dataset with one record per post whose id is listed for the split:
        'text' is the post tokens joined by spaces, 'label' is the encoded
        majority label of the post's annotators. Records keep the order of `data`.
    """
    # A set makes each membership test O(1); testing against the raw id list
    # would rescan that list for every post.
    split_ids = set(ids_split[split_name])
    return Dataset.from_list([
        {
            'text': ' '.join(data[post_id]['post_tokens']),
            'label': encode_label(get_label(data[post_id]['annotators'])),
        }
        for post_id in tqdm(data, desc=desc)
        if post_id in split_ids
    ])


val_dataset = DatasetDict({
    'train': _build_split('train', 'Train'),
    'test': _build_split('test', 'Test'),
    'val': _build_split('val', 'Val'),
})

In [16]:
# Display the DatasetDict built above.
val_dataset

In [17]:
def _tokenize_batch(batch):
    """Tokenize a batch of texts, truncating / padding each one to MAX_SEQ_LEN tokens."""
    return tokenizer(
        batch['text'],
        truncation=True,
        padding='max_length',
        max_length=MAX_SEQ_LEN,
        return_tensors='pt',
    )


# Tokenize every split in batches, then have the dataset return torch tensors on `device`.
val_dataset = val_dataset.map(_tokenize_batch, batched=True)
val_dataset.set_format('torch', device=device)
val_dataset

## Base Model

In [18]:
# Load the fine-tuned classifier weights. `map_location` remaps the stored tensors
# onto the device selected for this run, so a checkpoint saved on a different GPU
# (or on a GPU when running on CPU) still loads.
state_dict = torch.load(
    f'{MODELS_PATH}/{DATASET_NAME}_{MODEL_NAME}_{NUM_EPOCHS}.bin',
    map_location=device,
)

In [19]:
# Build the sequence classifier from the loaded checkpoint weights, then move it to `device`.
base_model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    state_dict=state_dict,
)
base_model = base_model.to(device)
base_model

In [20]:
def eval(f):
    """Decorator that switches the model to evaluation mode before calling `f`.

    NOTE: this name shadows the builtin `eval`; it is kept unchanged because
    other cells apply the decorator as `@eval`.

    Args:
        f: callable whose first positional argument is a model exposing `.eval()`.

    Returns:
        A wrapper that calls `model.eval()` and then returns
        `f(model, *args, **kwargs)`. The model's mode is not restored afterwards.
    """
    from functools import wraps  # local import keeps this cell self-contained

    @wraps(f)  # preserve f's __name__ / __doc__ on the decorated function
    def wrapper(model, *args, **kwargs):
        model.eval()
        return f(model, *args, **kwargs)
    return wrapper

In [21]:
@eval
def eval_epoch(model, val_dataloader):
    """Run one gradient-free pass over `val_dataloader` and collect loss and predictions.

    Every batch is indexed by 'input_ids', 'attention_mask' and 'label'. The model
    is called with `labels=` and its output must expose `.loss` and `.logits`.

    Returns:
        A tuple `(val_loss, val_preds, val_labels)`: the loss summed over all
        batches (not averaged), the argmax class id for each example, and the
        matching target ids.
    """
    total_loss = 0.0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(val_dataloader, total=len(val_dataloader)):
            input_ids = batch['input_ids'].to(device, dtype=torch.long)
            attention_mask = batch['attention_mask'].to(device, dtype=torch.long)
            targets = batch['label'].to(device, dtype=torch.long)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=targets)

            # Predicted class = index of the highest softmax probability.
            class_probs = F.softmax(outputs.logits, dim=-1).cpu()
            batch_preds = torch.argmax(class_probs, dim=-1)  # (B)
            all_preds.extend(batch_preds.detach().tolist())
            all_labels.extend(target.item() for target in targets.cpu())

            total_loss += outputs.loss.item()
    return total_loss, all_preds, all_labels

In [22]:
# Batches of 2 * BATCH_SIZE examples drawn from the 'test' split.
val_dataloader = torch.utils.data.DataLoader(val_dataset['test'], batch_size=2 * BATCH_SIZE)

In [23]:
# Evaluate the fine-tuned classifier; `val_loss` is the loss summed over batches.
val_loss, val_preds, val_labels = eval_epoch(base_model, val_dataloader)

In [24]:
# Accuracy and macro-averaged F1, plus precision / recall with scikit-learn's defaults.
# NOTE(review): with the defaults (binary averaging, pos_label=1) precision and recall
# score label 1, which encode_label assigns to 'normal' — confirm this is the intended
# positive class.
val_acc = accuracy_score(val_labels, val_preds)
val_f1 = f1_score(val_labels, val_preds, average='macro')
val_precision = precision_score(val_labels, val_preds)
val_recall = recall_score(val_labels, val_preds)

In [25]:
# Report the mean loss per batch (summed loss / number of batches) and the metrics.
# The dataloader wraps the 'test' split, so these "Val" numbers are test-split results.
print(f'Val loss: {val_loss / len(val_dataloader)}')
print(f'Val acc: {val_acc}')
print(f'Val f1: {val_f1}')
print(f'Val precision: {val_precision}')
print(f'Val recall: {val_recall}')

## Space Model

In [36]:
# Load the pretrained MODEL_NAME weights via AutoModel to serve as the backbone passed
# to the space model. This rebinds `base_model`, replacing the classifier evaluated above.
base_model = AutoModel.from_pretrained(MODEL_NAME)
base_model

In [37]:
# Wrap the backbone in the space-model classification head and move it to `device`.
space_model = SpaceModelForSequenceClassification(
    base_model,
    n_embed=768,
    n_latent=3,
    n_concept_spaces=2,
    l1=0,
    l2=0,
).to(device)
# `map_location` remaps the stored tensors onto the device selected for this run, so a
# checkpoint saved on a different GPU (or on a GPU when running on CPU) still loads.
space_model.load_state_dict(
    torch.load(
        f'{MODELS_PATH}/{DATASET_NAME}_space-{MODEL_NAME}_{NUM_EPOCHS}.bin',
        map_location=device,
    )
)
space_model

In [38]:
# Evaluate the space model on the same dataloader; this overwrites the base-model results.
val_loss, val_preds, val_labels = eval_epoch(space_model, val_dataloader)

In [39]:
# Accuracy and macro-averaged F1, plus precision / recall with scikit-learn's defaults.
# NOTE(review): with the defaults (binary averaging, pos_label=1) precision and recall
# score label 1, which encode_label assigns to 'normal' — confirm this is the intended
# positive class.
val_acc = accuracy_score(val_labels, val_preds)
val_f1 = f1_score(val_labels, val_preds, average='macro')
val_precision = precision_score(val_labels, val_preds)
val_recall = recall_score(val_labels, val_preds)

In [40]:
# Report the mean loss per batch (summed loss / number of batches) and the metrics.
# The dataloader wraps the 'test' split, so these "Val" numbers are test-split results.
print(f'Val loss: {val_loss / len(val_dataloader)}')
print(f'Val acc: {val_acc}')
print(f'Val f1: {val_f1}')
print(f'Val precision: {val_precision}')
print(f'Val recall: {val_recall}')