In [3]:
!pip install iterative-stratification

Collecting iterative-stratification
  Downloading iterative_stratification-0.1.9-py3-none-any.whl.metadata (1.3 kB)
Downloading iterative_stratification-0.1.9-py3-none-any.whl (8.5 kB)
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.9


In [4]:
%%writefile config.py
from transformers import AutoTokenizer

TRAIN_BATCH_SIZE = 2
VALID_BATCH_SIZE = 2
EPOCHS = 5
MAX_LEN = 512
TRAINING_FILE = "/kaggle/working/train_folds.csv"
MODEL_NAME = "microsoft/deberta-v3-large"
MODEL_PATH = "deberta_v3_large.bin"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Writing config.py


In [5]:
%%writefile early_stopping.py

import config
import torch

class EarlyStopping:
    def __init__(self, patience = 5, delta = 0.001, mode = "min"):
        self.patience = patience
        self.delta = delta
        self.mode = mode
        self.best_score = None
        self.counter = 0
        self.early_stop = False
        self.save_path = config.MODEL_PATH

    def __call__(self, val_metric, model):
        score = -val_metric if self.mode == "min" else val_metric
        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_metric, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_metric, model)
            self.counter = 0

    def save_checkpoint(self, val_metric, model):
        torch.save(model.state_dict(), self.save_path)
        print(f"Model save with {val_metric: .4f} performance")

Writing early_stopping.py


In [6]:
%%writefile utils.py
import pandas as pd
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

def create_folds(input_file_path, output_file_path, num_folds = 5):
    df = pd.read_csv(input_file_path)
    df["FOLD"] = -1
    target_cols = ["cohesion","syntax","vocabulary","phraseology","grammar","conventions"]
    skf = MultilabelStratifiedKFold(n_splits = num_folds, shuffle = True, random_state = 42)
    for i, (train_idx, val_idx) in enumerate(skf.split(df, df[target_cols])):
        df.loc[val_idx, "FOLD"] = i+1

    df.to_csv(output_file_path, index = False)


Writing utils.py


In [7]:
import utils

utils.create_folds("/kaggle/input/feedback-prize-english-language-learning/train.csv",
                  "train_folds.csv")

In [8]:
import pandas as pd
df = pd.read_csv("/kaggle/working/train_folds.csv")
df.head()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions,FOLD
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0,2
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5,1
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5,5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0,4
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5,2


In [9]:
df["full_text"].apply(lambda x: len(x.split())).values

array([261, 533, 320, ..., 257, 510, 638])

In [10]:
df.shape

(3911, 9)

In [11]:
# import importlib
# import utils  # Import the script
# importlib.reload(utils)  # Reload the updated script

# input_file_path = "/kaggle/input/feedback-prize-english-language-learning/train.csv"
# output_file_path = "train_folds.csv"
# utils.create_folds(input_file_path, output_file_path)

In [12]:
%%writefile dataset.py
import torch

class FeedbackDataset:
    def __init__(self, samples, max_len, tokenizer):
        self.samples = samples
        self.max_len = max_len
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        ids = self.samples[idx]["input_ids"]
        input_labels = self.samples[idx]["input_labels"]
        input_ids = [self.tokenizer.cls_token_id] + ids

        if len(input_ids) > self.max_len - 1:
            input_ids = input_ids[: self.max_len - 1]

        input_ids = input_ids + [self.tokenizer.sep_token_id]
        attention_mask = [1] * len(input_ids)

        res = {
            "ids": input_ids,
            "mask": attention_mask,
            "targets": input_labels,
        }
        return res


class Collate:
    def __init__(self, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __call__(self, batch):
        output = dict()
        output["ids"] = [sample["ids"] for sample in batch]
        output["mask"] = [sample["mask"] for sample in batch]
        output["targets"] = [sample["targets"] for sample in batch]

        batch_max_len = max([len(ids) for ids in output["ids"]])
        if batch_max_len > self.max_len:
            batch_max_len = self.max_len

        if self.tokenizer.padding_side == "right":
            output["ids"] = [s + (batch_max_len - len(s))*[self.tokenizer.pad_token_id]
                            for s in output["ids"]]
            output["mask"] = [s + (batch_max_len - len(s))*[self.tokenizer.pad_token_id]
                            for s in output["ids"]]
        else:
            output["ids"] = [(batch_max_len - len(s)) * [self.tokenizer.pad_token_id] + s 
                             for s in output["ids"]]
            output["mask"] = [(batch_max_len - len(s)) * [0] + s 
                              for s in output["mask"]]

        output["ids"] = torch.tensor(output["ids"], dtype=torch.long)
        output["mask"] = torch.tensor(output["mask"], dtype=torch.long)
        output["targets"] = torch.tensor(output["targets"], dtype=torch.float)
        return output

Writing dataset.py


In [13]:
%%writefile engine.py
from sklearn.metrics import mean_squared_error
from torch import nn
from tqdm import tqdm
import torch
import numpy as np
import torch

def check_gpu_status():
    num_gpus = torch.cuda.device_count()
    for gpu_id in range(num_gpus):
        device = torch.device(f"cuda:{gpu_id}")
        total_memory = torch.cuda.get_device_properties(device).total_memory / 1024**2
        allocated_memory = torch.cuda.memory_allocated(device) / 1024**2
        reserved_memory = torch.cuda.memory_reserved(device) / 1024**2
        free_memory = total_memory - reserved_memory
        
        print(f"GPU {gpu_id}:")
        print(f"  Total Memory:    {total_memory:.2f} MB")
        print(f"  Allocated Memory:{allocated_memory:.2f} MB")
        print(f"  Cached Memory:   {reserved_memory:.2f} MB")
        print(f"  Free Memory:     {free_memory:.2f} MB")
        print("-" * 30)

def loss_fn(outputs, targets):
    loss_fct = nn.MSELoss()
    loss = loss_fct(outputs, targets)
    return loss

def monitor_metrics(outputs, targets):
    device = targets.get_device()
    outputs = outputs.detach().cpu().numpy()
    targets = targets.detach().cpu().numpy()
    num_labels = 6
    mcrmse = []
    for i in range(num_labels):
        mcrmse.append(
            mean_squared_error(
                targets[:, i],
                outputs[:, i],
                squared = False
            ),
        )
    mcrmse = np.mean(mcrmse)
    return {"mcrmse": torch.tensor(mcrmse, device = device)}

def train_fn(data_loader, model, optimizer, device, scheduler):
    model.train()
    final_loss = 0
    print("In train_fn: ", len(data_loader))
    for data in tqdm(data_loader, total = len(data_loader)):
        
        # torch.cuda.empty_cache()
        for k, v in data.items():
            data[k] = v.to(device)
        optimizer.zero_grad()
        _, loss, _ = model(**data)
        loss.backward()
        optimizer.step()
        scheduler.step()
        final_loss += loss.item()
        # check_gpu_status()
    return final_loss / len(data_loader)

# def train_fn(data_loader, model, optimizer, device, scheduler):
#     model.train()
#     final_loss = 0
#     print("In train_fn: ", len(data_loader))
#     for data in tqdm(data_loader, total = len(data_loader)):
        
#         # torch.cuda.empty_cache()
#         for k, v in data.items():
#             data[k] = v.to(device)
#         optimizer.zero_grad()
#         outputs = model(**data)
#         if isinstance(outputs, tuple):
#             loss = outputs[1]
#         else:
#             loss = outputs
#         # Ensure loss is scalar
#         if loss.dim() > 0:
#             loss = loss.mean()
        
#         loss.backward()
#         optimizer.step()
#         scheduler.step()
#         final_loss += loss.item()
#     return final_loss / len(data_loader)

def eval_fn(data_loader, model, device):
    model.eval()
    final_loss = 0
    with torch.no_grad():  
        for data in tqdm(data_loader, total = len(data_loader)):
            for k, v in data.items():
                data[k] = v.to(device)
            _, loss, _ = model(**data)
            final_loss += loss.item()
    return final_loss / len(data_loader)


# def eval_fn(data_loader, model, device):
#     model.eval()
#     final_loss = 0
#     for data in tqdm(data_loader, total=len(data_loader)):
#         for k, v in data.items():
#             data[k] = v.to(device)
            
#         with torch.no_grad():
#             outputs = model(**data)
#             if isinstance(outputs, tuple):
#                 loss = outputs[1]  # Assuming loss is the second element
#             else:
#                 loss = outputs
                
#             if loss.dim() > 0:
#                 loss = loss.mean()
                
#         final_loss += loss.item()
#     return final_loss / len(data_loader)

Writing engine.py


In [14]:
%%writefile model.py
import config
import engine
from torch import nn
from transformers import AutoConfig, AutoModel

class FeedbackModel(nn.Module):
    def __init__(self, num_labels):
        super().__init__()
        self.num_labels = num_labels
        hidden_dropout_prob: float = 0.0
        model_config = AutoConfig.from_pretrained(config.MODEL_NAME)
        model_config.update({
            "output_hidden_states": True,
            "hidden_dropout_prob": hidden_dropout_prob,
            "add_pooling_layer": False,
            "num_labels": self.num_labels
        })
        self.transformer = AutoModel.from_pretrained(config.MODEL_NAME,
                                                    config = model_config)
        self.dropout = nn.Dropout(model_config.hidden_dropout_prob)
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.2)
        self.dropout3 = nn.Dropout(0.3)
        self.dropout4 = nn.Dropout(0.4)
        self.dropout5 = nn.Dropout(0.5)
        self.output = nn.Linear(model_config.hidden_size, self.num_labels)

    def forward(self, ids, mask, targets = None):
        transformer_out = self.transformer(input_ids = ids, 
                                          attention_mask = mask)
        sequence_output = transformer_out.last_hidden_state[:, 0, :]
        logits1 = self.output(self.dropout1(sequence_output))
        logits2 = self.output(self.dropout2(sequence_output))
        logits3 = self.output(self.dropout3(sequence_output))
        logits4 = self.output(self.dropout4(sequence_output))
        logits5 = self.output(self.dropout5(sequence_output))
        logits = (logits1 + logits2 + logits3 + logits4 + logits5) / 5

        loss = 0

        if targets is not None:
            loss1 = engine.loss_fn(logits1, targets)
            loss2 = engine.loss_fn(logits2, targets)
            loss3 = engine.loss_fn(logits3, targets)
            loss4 = engine.loss_fn(logits4, targets)
            loss5 = engine.loss_fn(logits5, targets)
            loss = (loss1 + loss2 + loss3 + loss4 + loss5) / 5
            mcrmse = engine.monitor_metrics(logits, targets)
            return logits, loss, mcrmse
        return logits, loss, {}
            
        
        

Writing model.py


In [15]:
%%writefile train.py

import config
import pandas as pd
import numpy as np
import torch
import engine
from model import FeedbackModel
from early_stopping import EarlyStopping
from torch.utils.data import RandomSampler
from tqdm import tqdm
from joblib import Parallel, delayed
from dataset import Collate, FeedbackDataset
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
import subprocess

# def check_gpu_status():
#     result = subprocess.run(['nvidia-smi'], capture_output = True, text = True)
#     print(result.stdout)


def _prepare_data_helper(tokenizer, df, text_ids):
    samples = []
    lbls = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]
    for idx in tqdm(text_ids):
        full_text = df[df.text_id == idx].reset_index(drop=True).full_text.values[0]
        encoded_text = tokenizer.encode_plus(
            full_text,
            None,
            add_special_tokens=False,
        )
        input_ids = encoded_text["input_ids"]
        sample = {
            "input_ids": input_ids,
            "text_id": idx,
            "full_text": full_text,
            "attention_mask": encoded_text["attention_mask"],
            "input_labels": df[df.text_id == idx].reset_index(drop=True)[lbls].values[0, :].tolist(),
        }
        samples.append(sample)

    return samples
    

def prepare_data(df, tokenizer, num_jobs):
    samples = []
    text_ids = df["text_id"].unique()
    text_ids_splits = np.array_split(text_ids, num_jobs)
    results = Parallel(n_jobs = num_jobs, backend="multiprocessing")(
        delayed(_prepare_data_helper)(tokenizer, df, idx)
        for idx in text_ids_splits
    )
    for result in results:
        samples.extend(result)
    return samples

def run(fold):
    NUM_JOBS = 12
    df = pd.read_csv(config.TRAINING_FILE)
    target_columns = ["cohesion","syntax","vocabulary","phraseology","grammar","conventions"]
    train_dataset = df[df["FOLD"] != fold].reset_index(drop = True)
    valid_dataset = df[df["FOLD"] == fold].reset_index(drop = True)
    training_samples = prepare_data(train_dataset, config.tokenizer, num_jobs = NUM_JOBS)
    valid_samples = prepare_data(valid_dataset, config.tokenizer, num_jobs = NUM_JOBS)
    print(len(valid_samples))
    num_train_steps = int(len(train_dataset) / config.TRAIN_BATCH_SIZE *  config.EPOCHS)
    n_gpu = torch.cuda.device_count()
    if n_gpu > 0:
        num_train_steps /= n_gpu

    train_dataset = FeedbackDataset(training_samples, config.MAX_LEN, config.tokenizer)
    valid_dataset = FeedbackDataset(valid_samples, config.MAX_LEN, config.tokenizer)

    train_dataloader = torch.utils.data.DataLoader(train_dataset, 
                                                   batch_size=config.TRAIN_BATCH_SIZE,
                                                   collate_fn=Collate(config.tokenizer, config.MAX_LEN)
                                                  )
    valid_dataloader = torch.utils.data.DataLoader(valid_dataset,
                                                   batch_size=config.VALID_BATCH_SIZE,
                                                   collate_fn=Collate(config.tokenizer, config.MAX_LEN)
                                                  )
    print("Training Length: ", len(train_dataloader))
    print("Validation Length: ", len(valid_dataloader))

    # model =  torch.nn.parallel.DistributedDataParallel(FeedbackModel(num_labels=len(target_columns)))
    model =  FeedbackModel(num_labels=len(target_columns))
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    model.to(device)

    # check_gpu_status()
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0
        }
    ]

    optimizer = AdamW(optimizer_parameters, lr = 3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
    )

    early_stopping = EarlyStopping()
    for epoch in range(config.EPOCHS):
        # check_gpu_status()
        train_loss = engine.train_fn(train_dataloader, model, optimizer, device, scheduler)
        val_loss = engine.eval_fn(valid_dataloader, model, device)
        print(f"Train loss = {train_loss} Valid loss = {val_loss}")

        early_stopping(val_loss, model)

        # Check if we should stop training
        if early_stopping.early_stop:
            print("Early stopping triggered")
            break

Writing train.py


In [16]:
import importlib
import config 
import train
import model
import dataset
import early_stopping
import engine
import utils
importlib.reload(config)  
importlib.reload(train)
importlib.reload(model)
importlib.reload(dataset)  
importlib.reload(early_stopping)
importlib.reload(engine)
importlib.reload(utils)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



<module 'utils' from '/kaggle/working/utils.py'>

In [17]:
# torch.cuda.memory_reserved()

In [18]:
import train
fold = 5
train.run(fold)

100%|██████████| 261/261 [00:00<00:00, 302.37it/s]
100%|██████████| 261/261 [00:01<00:00, 209.05it/s]
100%|██████████| 261/261 [00:01<00:00, 204.80it/s]
100%|██████████| 261/261 [00:01<00:00, 202.60it/s]
100%|██████████| 261/261 [00:01<00:00, 182.09it/s]
100%|██████████| 261/261 [00:01<00:00, 194.22it/s]
100%|██████████| 261/261 [00:01<00:00, 196.30it/s]
100%|██████████| 261/261 [00:01<00:00, 200.11it/s]
100%|██████████| 261/261 [00:01<00:00, 185.64it/s]
100%|██████████| 260/260 [00:01<00:00, 224.41it/s]
100%|██████████| 260/260 [00:01<00:00, 247.55it/s]
100%|██████████| 260/260 [00:00<00:00, 279.24it/s]
100%|██████████| 66/66 [00:00<00:00, 301.38it/s]
100%|██████████| 66/66 [00:00<00:00, 340.50it/s]
100%|██████████| 65/65 [00:00<00:00, 374.23it/s]
100%|██████████| 65/65 [00:00<00:00, 353.92it/s]
100%|██████████| 65/65 [00:00<00:00, 350.16it/s]
100%|██████████| 65/65 [00:00<00:00, 339.84it/s]
100%|██████████| 65/65 [00:00<00:00, 331.42it/s]
100%|██████████| 65/65 [00:00<00:00, 333.59it

782
Training Length:  1565
Validation Length:  391


pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]



In train_fn:  1565


100%|██████████| 1565/1565 [14:26<00:00,  1.81it/s]
100%|██████████| 391/391 [01:06<00:00,  5.90it/s]


Train loss = 0.5681609327277055 Valid loss = 0.48474608740919384
Model save with  0.4847 performance
In train_fn:  1565


100%|██████████| 1565/1565 [14:26<00:00,  1.81it/s]
100%|██████████| 391/391 [01:06<00:00,  5.91it/s]


Train loss = 0.4937841117334442 Valid loss = 0.455505867424371
Model save with  0.4555 performance
In train_fn:  1565


100%|██████████| 1565/1565 [14:26<00:00,  1.81it/s]
100%|██████████| 391/391 [01:06<00:00,  5.91it/s]


Train loss = 0.4754387193903946 Valid loss = 0.4483734165387385
Model save with  0.4484 performance
In train_fn:  1565


100%|██████████| 1565/1565 [14:26<00:00,  1.81it/s]
100%|██████████| 391/391 [01:06<00:00,  5.91it/s]


Train loss = 0.4668452476041195 Valid loss = 0.4413993208647689
Model save with  0.4414 performance
In train_fn:  1565


100%|██████████| 1565/1565 [14:26<00:00,  1.81it/s]
100%|██████████| 391/391 [01:06<00:00,  5.91it/s]


Train loss = 0.46140148656341595 Valid loss = 0.43996008207349824
Model save with  0.4400 performance


In [31]:
!zip large_file.zip /kaggle/working/deberta_v3_large.bin

  adding: kaggle/working/deberta_v3_large.bin (deflated 16%)


In [34]:
# Compress the model file
!zip -r model.zip /kaggle/working/deberta_v3_large.bin
# Then create download link for the zip
FileLink('model.zip')

  adding: kaggle/working/deberta_v3_large.bin (deflated 16%)
