In [12]:
# LIBRARY IMPORTS
import os
import gc
import copy
import time
import random
import string
import joblib

# Import Manipulation
import numpy as np
import pandas as pd

# PyTorch Imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset
from torch.utils.checkpoint import checkpoint
from torch.autograd import Variable

# Utils
from tqdm import tqdm
from collections import defaultdict

# SkLearn Imports
import sklearn as sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold, StratifiedGroupKFold
from sklearn.metrics import f1_score

# Transformers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig, DataCollatorWithPadding, DebertaTokenizer, DebertaModel
from transformers.models.deberta_v2.modeling_deberta_v2 import ContextPooler

# Hugging Face Imports
from huggingface_hub import InferenceClient
"""
StableDropout is no longer supported in the latest transformers library
"""
# from transformers.models.deberta_v2.modeling_deberta_v2 import StableDropout
from torch.optim import AdamW

# BitsAndBytes for 8-bit optimizers
import bitsandbytes as bnb

#  For coloured terminal text
from colorama import Fore, Back, Style
b_ = Fore.BLUE
y_ = Fore.YELLOW
sr_ = Style.RESET_ALL

# Supress Warnings
import warnings
warnings.filterwarnings("ignore")

# Descriptive error messages
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"


In [None]:
# HUGGING FACE CONNECTION

token = [insert your token here]

from huggingface_hub import login
login(token)


In [14]:
# FILE PATHS AND DATA SET PREVIEW

"""
File Directories
"""
ROOT_DIR = '../DeBERTa Toxicity Detection Model/'

TRAIN_CSV = os.path.join(ROOT_DIR, 'INPUTS/train.csv')
dfTRAIN = pd.read_csv(TRAIN_CSV)

VAL_CSV = os.path.join(ROOT_DIR, 'INPUTS/val.csv')
# dfVAL = pd.read_csv(VAL_CSV)

TEST_CSV = os.path.join(ROOT_DIR, 'INPUTS/test.csv')
dfTEST = pd.read_csv(TEST_CSV)

"""
Preview CSV Data
"""
df = pd.read_csv(TRAIN_CSV)
df.columns

Index(['message', 'target'], dtype='object')

In [15]:
# TRAINING CONFIGURATON

class CFG:
    seed = 2022
    max_length = 512
    epoch = 4
    train_batch_size = 16
    valid_batch_size = 32

    model_name = "microsoft/deberta-v2-xlarge-mnli"
    token_name = "microsoft/deberta-v2-xlarge-mnli"

    scheduler = "CosineAnnealingLR"
    learning_rate = 1e-5
    min_lr = 1e-6
    T_max = 500
    weight_decay = 0.005
    dropout = 0.1

    num_classes = 3
    n_fold = 3
    n_acumulate = 2

    """
    Freezing the model layers for first few epochs
    Reduces computational power required and speeds up training
    """
    freezing = True
    gradient_checkpoint = True
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # WandB ID
    wandb_id = f"PL{round(time.time())}"
    group = f'{wandb_id}-Baseline'
    competition = "FeedBack"
    _wandb_kernel = "starf"

CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.token_name, use_fast = False)
CFG.tokenizer.model_max_length = CFG.max_length
CFG.tokenizer.is_fast

AutoConfig.from_pretrained(CFG.model_name)

DebertaV2Config {
  "architectures": [
    "DebertaV2ForSequenceClassification"
  ],
  "attention_head_size": 64,
  "attention_probs_dropout_prob": 0.1,
  "conv_act": "gelu",
  "conv_kernel_size": 3,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1536,
  "id2label": {
    "0": "CONTRADICTION",
    "1": "NEUTRAL",
    "2": "ENTAILMENT"
  },
  "initializer_range": 0.02,
  "intermediate_size": 6144,
  "label2id": {
    "CONTRADICTION": 0,
    "ENTAILMENT": 2,
    "NEUTRAL": 1
  },
  "layer_norm_eps": 1e-07,
  "legacy": true,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 24,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1536,
  "pooling": {
    "dropout": 0,
    "hidden_act": "gelu"
  },
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buck

In [16]:
# DATASET EXPLORATION

df.describe()
df.dtypes
df['message'].str.split(" ").apply(len).describe()
df['message']

0                               can t win alone
1                          buy more hearts rofl
2                          no one ever gives up
3                                        tower 
4                                too much brain
                         ...                   
1717                       said the useless mid
1718                                  yeah why 
1719                             fuck this game
1720                   hc eeuu shit player taco
1721    cumbackkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkk
Name: message, Length: 1722, dtype: object

In [17]:
# SET SEED FOR REPODUCIBILITY AND CRITERION/SCORING FUNCTIONS

def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=CFG.seed)

# OUTPUTS AND PARAMS

def criterion(outputs, labels):
    """
    Calculate Cross Entropy Loss
    """
    return nn.CrossEntropyLoss()(outputs, labels)

def get_score(outputs, labels):
    """
    Calculate Log Loss from softmax output
    """
    outputs = F.softmax(torch.tensor(outputs)).numpy()
    return log_loss(labels, outputs)

def freeze(module):
    """
    Freezes module's parameters.
    """
    for parameter in module.parameters():
        parameter.requires_grad = False

def get_freezed_parameters(module):
    """
    Returns names of freezed parameters of the given module.
    """
    freezed_parameters = []
    for name, parameter in module.named_parameters():
        if not parameter.requires_grad:
            freezed_parameters.append(name)
            
    return freezed_parameters

# 8-bits optimizer
def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930
    """
    embedding_types = ("word", "position", "token_type")
    for embedding_type in embedding_types:
        attr_name = f"{embedding_type}_embeddings"
        
        if hasattr(embeddings_path, attr_name): 
            bnb.optim.GlobalOptimManager.get_instance().register_module_override(
                getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
            )

In [18]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average='weighted')
    return {'f1_score': f1}

In [19]:
from transformers import TrainingArguments

logging_steps = len(dfTRAIN) // CFG.train_batch_size
output_dir = '/OUTPUTS/'
training_args = TrainingArguments(output_dir=output_dir,
                                  num_train_epochs=CFG.epoch,
                                  learning_rate=CFG.learning_rate,
                                    per_device_train_batch_size=CFG.train_batch_size,
                                    per_device_eval_batch_size=CFG.valid_batch_size,
                                    weight_decay=CFG.weight_decay,
                                    logging_steps=logging_steps,
                                    fp16=True,
                                    push_to_hub=True
)

In [20]:
from transformers import Trainer

model = AutoModel.from_pretrained(CFG.model_name)
tokenizer = AutoTokenizer.from_pretrained(CFG.token_name)

trainer = Trainer(model=model,
                  args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=dfTRAIN,
                  eval_dataset=dfTEST,
                  tokenizer=tokenizer
                  )

In [21]:
# TRAINING [RUN THIS]

trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


KeyError: 246