# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spacing: 3px; color:chartreuse; border-style: inset; border-color: limegreen;border-radius:30px; text-align:center; border-width:8px; padding:20px;"> Huggingface + Deepspeed Starter</h1></span>

> * This notebook is inspired by [Pytorch Jigsaw Starter](https://www.kaggle.com/debarshichanda/pytorch-w-b-jigsaw-starter) and [$#!++Y patterns in NLP data](https://www.kaggle.com/samarthagarwal23/y-patterns-in-nlp-data).
> * It uses DeepSpeed together with the Hugging Face Trainer to train the model, so larger models can be fine-tuned with this method.
> * feel free to upvote my first ever well produced notebook if you like it ツ

## <h1 style = "font-family: garamond; font-size: 35px; font-style: normal; letter-spacing: 3px; border-width:5px; color:#90afc5; border-style: inset; border-radius: 100px 100px; border-color: #2a3132; padding:5px; text-align:center"> Installing Libraries</h1>

In [None]:
!pip install deepspeed
!pip install nltk
!pip install contractions
!pip install wandb

## <h1 style = "font-family: garamond; font-size: 35px; font-style: normal; letter-spacing: 3px; border-width:5px; color:#90afc5; border-style: inset; border-radius: 100px 100px; border-color: #2a3132; padding:5px; text-align:center">  Creating a config file for training the model </h1>

In [None]:
%%bash
# Write the DeepSpeed JSON config that TrainingArguments(deepspeed=...) points at
# later in the notebook. The quoted 'EOT' delimiter keeps the shell from expanding
# anything inside the heredoc, so the JSON is written verbatim.
# NOTE(review): despite the "zero3" file name, the config below selects ZeRO
# stage 2 with the optimizer offloaded to CPU — confirm which stage is intended.
cat <<'EOT' > ds_config_zero3.json
{
    "fp16": {
        "enabled": true,
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": 1e-4,
            "betas": [0.9, 0.999],
            "eps": 1.0e-8,
            "weight_decay": 1e-6
        }
    },

    "scheduler": {
        "type": "WarmupDecayLR",
        "params": {
            "warmup_min_lr": 3e-6,
            "warmup_max_lr": 3e-5,
            "warmup_num_steps": 20
        }
    },

    "zero_optimization": {
     "stage": 2,
     "offload_optimizer": {
         "device": "cpu",
         "pin_memory": false
     },
     "allgather_partitions": true,
     "allgather_bucket_size": 2e6,
     "reduce_scatter": true,
     "reduce_bucket_size": 2e6,
     "overlap_comm": true,
     "contiguous_gradients": true
  }
}
EOT

## <h1 style = "font-family: garamond; font-size: 35px; font-style: normal; letter-spacing: 3px; border-width:5px; color:#90afc5; border-style: inset; border-radius: 100px 100px; border-color: #2a3132; padding:5px; text-align:center">  Setting up environment variables for deepspeed </h1>

In [None]:
from transformers import RobertaTokenizer, RobertaModel
import os
import numpy as np
import torch


# Describe a single-process "distributed" job: one worker (rank 0 of a world of
# size 1) whose rendezvous address is this machine.
os.environ.update({
    'MASTER_ADDR': 'localhost',
    'MASTER_PORT': '9994',  # modify if RuntimeError: Address already in use
    'RANK': "0",
    'LOCAL_RANK': "0",
    'WORLD_SIZE': "1",
})

## <h1 style = "font-family: garamond; font-size: 35px; font-style: normal; letter-spacing: 3px; border-width:5px; color:#90afc5; border-style: inset; border-radius: 100px 100px; border-color: #2a3132; padding:5px; text-align:center"> Setting seeds </h1>

In [None]:
def set_seed(seed=42):
    '''Seed every random number generator used by the notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and every CUDA
    device), switches cuDNN to deterministic mode, and records the seed in
    ``PYTHONHASHSEED``. This is for REPRODUCIBILITY.

    Args:
        seed: Integer seed applied to all generators.
    '''
    # Local import: this cell runs before the notebook imports `random`.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all CUDA devices explicitly, not only the currently selected one.
    torch.cuda.manual_seed_all(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed. Hash randomisation of the running
    # interpreter is fixed at start-up, so this only affects child processes.
    os.environ['PYTHONHASHSEED'] = str(seed)

set_seed()

## <h1 style = "font-family: garamond; font-size: 35px; font-style: normal; letter-spacing: 3px; border-width:5px; color:#90afc5; border-style: inset; border-radius: 100px 100px; border-color: #2a3132; padding:5px; text-align:center">  Loading data </h1>

In [None]:
import pandas as pd
# Read the competition's pairwise validation file; every later cell works on `df`.
df = pd.read_csv(
    filepath_or_buffer="../input/jigsaw-toxic-severity-rating/validation_data.csv"
)
# Preview the first five rows.
df.head(n=5)

## <h1 style = "font-family: garamond; font-size: 35px; font-style: normal; letter-spacing: 3px; border-width:5px; color:#90afc5; border-style: inset; border-radius: 100px 100px; border-color: #2a3132; padding:5px; text-align:center">  Creating cross-validation folds </h1>

In [None]:
import string
import random
from sklearn.model_selection import StratifiedKFold

# Five shuffled folds, stratified on the `worker` column, with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Tag every row with the number of the split in which it is held out.
fold_splits = skf.split(X=df, y=df.worker)
for fold_number, (_train_rows, holdout_rows) in enumerate(fold_splits):
    df.loc[holdout_rows, "kfold"] = int(fold_number)

# Store the fold labels with an integer dtype.
df["kfold"] = df["kfold"].astype(int)
df.head()

## <h1 style = "font-family: garamond; font-size: 35px; font-style: normal; letter-spacing: 3px; border-width:5px; color:#90afc5; border-style: inset; border-radius: 100px 100px; border-color: #2a3132; padding:5px; text-align:center"> Datasets </h1>

In [None]:
import nltk
from nltk.tokenize import word_tokenize
import re
import contractions

# Ordered clean-up rules (regex pattern -> replacement) applied one after another
# with re.sub. Dict insertion order is the application order, so the link rule
# runs before the rules that blank out '/', '.' and '-'.
#
# Every pattern and every replacement containing a back-reference is a raw
# string: in a plain literal '\1' is the control character chr(1), so the
# repeated-letter and repeated-symbol rules could never match.
RE_COMBINATIONS = {
    r'\n': ' ',                                   # newlines -> space
    r'https?://\S+|www\.\S+': ' link ',           # URLs -> placeholder token
    r'[/.]': ' ',                                 # slashes and dots -> space
    r'[ .-]': ' ',                                # spaces, dots and hyphens -> space
    r'([A-Za-z])\1{2,}': r'\1',                   # letter repeated 3+ times -> one letter
    # symbol repeated 3+ times between letters -> one symbol
    r"([A-Za-z]{1,})([*!?'])\2{2,}([A-Za-z]{1,})": r'\1\2\3',
}


class JigsawDataset(torch.utils.data.Dataset):
    """Pairwise dataset built from the `more_toxic` / `less_toxic` columns.

    Each item carries the token ids and attention masks of both cleaned
    comments plus a constant target of 1.
    """

    def __init__(self, df, tokenizer, max_length):
        """Store the frame, the tokenizer and the padded sequence length.

        Args:
            df: DataFrame with `more_toxic` and `less_toxic` columns.
            tokenizer: Object exposing an `encode_plus` method.
            max_length: Length every sequence is truncated/padded to.
        """
        self.tokenizer = tokenizer
        self.max_len = max_length
        self.df = df
        self.more_toxic = df['more_toxic'].values
        self.less_toxic = df['less_toxic'].values

    def __len__(self):
        """Number of comment pairs."""
        return len(self.more_toxic)

    def transform_text(self, text):
        """Clean one comment and return it as space-joined alphabetic tokens."""
        # Dataset-specific regex fixes, applied in the dict's insertion order.
        for pattern, replacement in RE_COMBINATIONS.items():
            text = re.sub(pattern, replacement, text)

        # Lower-case, then expand contractions word by word.
        expanded = ' '.join(
            [contractions.fix(word) for word in text.lower().split()]
        )

        # Keep only purely alphabetic tokens (drops punctuation and digits).
        alphabetic = [tok for tok in word_tokenize(expanded) if tok.isalpha()]
        return ' '.join(alphabetic)

    def _encode(self, text):
        """Tokenize `text`, truncated and padded to `self.max_len`."""
        return self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
        )

    def __getitem__(self, index):
        """Return the tensors for the comment pair at `index`."""
        cleaned_more = self.transform_text(self.more_toxic[index])
        cleaned_less = self.transform_text(self.less_toxic[index])

        encoded_more = self._encode(cleaned_more)
        encoded_less = self._encode(cleaned_less)

        def as_long(values):
            return torch.tensor(values, dtype=torch.long)

        return {
            'more_toxic_ids': as_long(encoded_more['input_ids']),
            'more_toxic_mask': as_long(encoded_more['attention_mask']),
            'less_toxic_ids': as_long(encoded_less['input_ids']),
            'less_toxic_mask': as_long(encoded_less['attention_mask']),
            # The first comment of a pair is always the more toxic one.
            'target': as_long(1),
        }



In [None]:
# Token length every comment is truncated/padded to.
MAX_LENGTH = 190


def prepare_datasets(fold):
    """Build the train and validation datasets for one cross-validation fold.

    Reads the notebook-level `df` and `tokenizer`.

    Args:
        fold: Value of the `kfold` column that selects the validation rows;
            all remaining rows are used for training.

    Returns:
        Tuple `(train_dataset, valid_dataset)` of `JigsawDataset` objects.
    """
    def _as_dataset(frame):
        # Re-number rows from zero before wrapping the subset.
        return JigsawDataset(
            frame.reset_index(drop=True),
            tokenizer=tokenizer,
            max_length=MAX_LENGTH,
        )

    is_holdout = df.kfold == fold
    train_dataset = _as_dataset(df[~is_holdout])
    valid_dataset = _as_dataset(df[is_holdout])
    return train_dataset, valid_dataset

## <h1 style = "font-family: garamond; font-size: 35px; font-style: normal; letter-spacing: 3px; border-width:5px; color:#90afc5; border-style: inset; border-radius: 100px 100px; border-color: #2a3132; padding:5px; text-align:center"> Defining the model </h1>

In [None]:
# Notebook-level tokenizer for the 'roberta-large' checkpoint; prepare_datasets
# reads this global when it builds the datasets.
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')

class JigsawModel(torch.nn.Module):
    """RoBERTa encoder followed by a linear head that emits one score per text."""

    def __init__(self, model_name='roberta-large'):
        """Load the pretrained encoder and create the scoring head.

        Args:
            model_name: Checkpoint passed to `RobertaModel.from_pretrained`.
                Defaults to 'roberta-large', the checkpoint originally
                hard-coded here.
        """
        super().__init__()
        self.model = RobertaModel.from_pretrained(model_name)
        # Size the head from the checkpoint's config instead of hard-coding 1024,
        # so other checkpoints work too.
        # (LazyLinear won't work with huggingface trainer.)
        self.fc = torch.nn.Linear(self.model.config.hidden_size, 1)

    def forward(self, ids, mask):
        """Score a batch of tokenized texts.

        Args:
            ids: Token ids, passed to the encoder as `input_ids`.
            mask: Attention mask, passed to the encoder as `attention_mask`.

        Returns:
            Output of the linear head applied to the encoder's `pooler_output`.
        """
        out = self.model(input_ids=ids, attention_mask=mask,
                         output_hidden_states=False)
        return self.fc(out.pooler_output)

## <h1 style = "font-family: garamond; font-size: 35px; font-style: normal; letter-spacing: 3px; border-width:5px; color:#90afc5; border-style: inset; border-radius: 100px 100px; border-color: #2a3132; padding:5px; text-align:center">  Training the model using deepspeed </h1>

In [None]:
from torch import nn
import os
from kaggle_secrets import UserSecretsClient
import wandb
from transformers import Trainer, TrainingArguments

class MarginRankingLossTrainer(Trainer):
    """Trainer that optimises a pairwise margin ranking loss.

    Each batch holds a "more toxic" and a "less toxic" comment per row; the
    model scores both and the loss pushes the first score above the second.
    """

    def criterion(self, outputs1, outputs2, targets):
        """Margin ranking loss (margin 0.5) between two score tensors."""
        return nn.MarginRankingLoss(margin=0.5)(outputs1, outputs2, targets)

    def compute_loss(self, model, data, return_outputs=False, **kwargs):
        """Score both comments of every pair and return the ranking loss.

        Args:
            model: Callable taking `(ids, mask)` and returning one score per row.
            data: Batch dict with the keys `more_toxic_ids`, `more_toxic_mask`,
                `less_toxic_ids`, `less_toxic_mask` and `target`.
            return_outputs: When True, return `(loss, outputs)` instead of
                only the loss.
            **kwargs: Extra keyword arguments are accepted and ignored, so
                callers that pass additional options do not raise a TypeError.

        Returns:
            The scalar loss, or `(loss, outputs)` when `return_outputs` is True.
        """
        # Flatten the (batch, 1) scores to (batch,) so they line up with the
        # (batch,) targets. Left unflattened, the loss broadcasts to a
        # (batch, batch) matrix.
        more_toxic_outputs = model(data['more_toxic_ids'], data['more_toxic_mask']).reshape(-1)
        less_toxic_outputs = model(data['less_toxic_ids'], data['less_toxic_mask']).reshape(-1)
        # The dataset stores targets as integers; match the score dtype.
        targets = data['target'].reshape(-1).to(more_toxic_outputs.dtype)

        loss = self.criterion(more_toxic_outputs, less_toxic_outputs, targets)

        if return_outputs:
            outputs = {'more_toxic': more_toxic_outputs,
                       'less_toxic': less_toxic_outputs}
            return loss, outputs
        return loss

    def prediction_step(self, model, inputs, prediction_loss_only=True, ignore_keys=None, **kwargs):
        """Evaluate one batch and report only its loss (no logits, no labels)."""
        inputs = self._prepare_inputs(inputs)
        with torch.no_grad():
            loss = self.compute_loss(model, inputs)
        return (loss, None, None)
        

# Weights & Biases credentials come from Kaggle secrets, never from the notebook.
user_secrets = UserSecretsClient()
os.environ["WANDB_API_KEY"] = user_secrets.get_secret("wandb_key")
os.environ["WANDB_PROJECT"] = "Jigsaw-Kaggle"

training_arguments = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    prediction_loss_only=True,
    per_device_train_batch_size=24,
    eval_accumulation_steps=4,
    gradient_accumulation_steps=4,
    warmup_steps=20,
    weight_decay=0.01,
    logging_strategy="steps",
    logging_steps=1,
    # The batch keys (more_toxic_ids, ...) are not parameters of
    # JigsawModel.forward. Without this flag, Trainer versions that filter
    # batch columns by the model signature would strip them before
    # compute_loss reads them.
    remove_unused_columns=False,
    deepspeed="ds_config_zero3.json",
    report_to="wandb"
)

# Train on every fold except fold 0, which is held out for evaluation.
train_dataset,valid_dataset = prepare_datasets(0)
trainer = MarginRankingLossTrainer(
    model=JigsawModel(),
    args=training_arguments,
    train_dataset = train_dataset,
    eval_dataset = valid_dataset
)

## <h1 style = "font-family: garamond; font-size: 35px; font-style: normal; letter-spacing: 3px; border-width:5px; color:#90afc5; border-style: inset; border-radius: 100px 100px; border-color: #2a3132; padding:5px; text-align:center">  Live wandb monitoring </h1>

In [None]:
%%wandb --height=800
# Start the W&B run named "deepspeed_hf_trainer" from inside the %%wandb cell.
# NOTE(review): presumably the magic embeds the run's live dashboard (800 px
# high) under this cell — confirm against the wandb docs.
import wandb
wandb.init(name="deepspeed_hf_trainer")

In [None]:
import wandb
import torch
import gc
trainer.train()
# Collect garbage first so tensors kept alive only by unreachable objects are
# released, then hand the now-unused cached CUDA memory back.
gc.collect()
torch.cuda.empty_cache()
wandb.run

## <h1 style = "font-family: garamond; font-size: 35px; font-style: normal; letter-spacing: 3px; border-width:5px; color:#90afc5; border-style: inset; border-radius: 100px 100px; border-color: #2a3132; padding:5px; text-align:center"> Evaluating </h1>

In [None]:
import torch
import gc
# Collect garbage before emptying the CUDA cache so memory released by the
# collector can be returned as well.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Keep and show the metrics returned by evaluate(); as a bare non-final
# expression the result was silently discarded.
eval_metrics = trainer.evaluate()
print(eval_metrics)
wandb.run

In [None]:
# Close the W&B run that was started with wandb.init earlier in the notebook.
wandb.finish()

In [None]:
import torch
import gc
# Collect garbage before emptying the CUDA cache so memory released by the
# collector can be returned as well.
gc.collect()
torch.cuda.empty_cache()
# Persist the trained model via the trainer (its output_dir is './results').
trainer.save_model()

<img style="display: block;margin-left: auto; margin-right: auto; width: 50%;" src="https://img.shields.io/badge/Upvote-If%20you%20like%20my%20work-07b3c8?style=for-the-badge&logo=kaggle">