In [None]:

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install accelerate
!pip install datasets

Collecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/309.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━[0m [32m204.8/309.4 kB[0m [31m6.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from 

In [None]:
!pip install peft
!pip install bitsandbytes


Collecting peft
  Downloading peft-0.11.1-py3-none-any.whl (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: peft
Successfully installed peft-0.11.1
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.43.1


In [None]:
import bitsandbytes
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split
from huggingface_hub import HfFolder, Repository
from sklearn.metrics import cohen_kappa_score
from datasets import Dataset, get_dataset_config_names
import polars as pl
from sklearn.preprocessing import KBinsDiscretizer
import numpy as np
from transformers import (AutoTokenizer,
                          TrainingArguments,
                          DataCollatorWithPadding,
                          Trainer,
                          AutoModelForSequenceClassification,
                           EarlyStoppingCallback,
                          BitsAndBytesConfig
                          )

from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
import regex as re
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
from google.colab import userdata
import ctypes, torch, gc, yaml,  joblib, spacy, multiprocessing, os, random


In [None]:
key_ = userdata.get('HF_TOKEN')
cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
train_path = '/content/drive/MyDrive/essay/data/train.csv'
libc = ctypes.CDLL("libc.so.6")

cuda


In [None]:
args_no = 4
args = f'/content/drive/MyDrive/essay/args/args_{args_no}.yaml'

with open(args, 'r') as file:
  config = yaml.safe_load(file)


In [None]:

def clear_memory():
    libc.malloc_trim(0)
    torch.cuda.empty_cache()
    gc.collect()



In [None]:
lbl_mapping = { j: i for i, j in enumerate(config['lbl'])}


df_full = (pl.scan_csv(train_path)
        .with_columns(
            score = pl.col('score')
            .replace(lbl_mapping)
        )
      )


In [None]:
def initialize_seeds(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

initialize_seeds(config['seed'])

In [None]:


class text_prep():
  def __init__(self,
               df,
               tokenizer,
               max_length = config['max_length'],
               batch_size = config['batch_size'],
               stride = config['stride']
               ):

    self.set_df(df)
    self.tokenizer = tokenizer
    self.max_length = max_length
    self.batch_size  = batch_size
    self.stride = stride

  def tokenize_function(self, batch):
    try:
        input_ids = []
        essay_id = []
        attention_mask = []
        full_text = []
        score = []
        txts = [self.preprocess_text(i) for i in batch['full_text']]

        for count_id , txt in enumerate(txts):
          tokenized = self.tokenizer(txt,
                                    max_length=self.max_length,
                                    return_overflowing_tokens = True,
                                    stride = self.stride,
                                    truncation=True
                                    )


          for count, input in enumerate(tokenized['input_ids']):
            full_text.append(txt)
            score.append(batch['score'][count_id])
            essay_id.append(batch['essay_id'][count_id])
            input_ids.append(input)
            attention_mask.append(tokenized['attention_mask'][count])

        return {
            'essay_id': essay_id,
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'full_text': full_text,
            'score': score
        }
    except Exception as e:
        print(f"Error tokenizing batch: {batch}")
        raise e

  def preprocess_text(self, text):
    text = text.replace('\xa0', ' ')
    text = re.sub(r"[^a-zA-Z0-9\s,.'!?;:-]", '', text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

  def set_df(self, val):
    if isinstance(val, Dataset):
      self.df = val
    elif isinstance(val, pl.DataFrame):
      arrow_table = val.to_arrow()
      self.df = Dataset(arrow_table)
    else:
      raise TypeError("Unsupported data")



  def fit_transform(self):
     return (self.df.map(self.tokenize_function,
                         num_proc=multiprocessing.cpu_count() -1,
                         batched = True
                         )
                    .rename_column('score', 'labels')
     )


In [None]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
)




In [None]:


tokenizer = AutoTokenizer.from_pretrained(config['model_ckpt_berta'])
def token_df(tokenizer, df_full):
  df_sample = df_full.collect()
  tp = text_prep(df_sample, tokenizer = tokenizer)
  return tp.fit_transform()


tokenizer_config.json:   0%|          | 0.00/33.6k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

In [None]:


def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    qwk = cohen_kappa_score(p.label_ids, preds, weights="quadratic")
    return {
        'qwk': float(qwk),
    }


In [None]:
df = token_df(tokenizer, df_full)



  self.pid = os.fork()


Map (num_proc=11):   0%|          | 0/17307 [00:00<?, ? examples/s]

In [None]:

essay_id = list(set(df['essay_id']))

df_pd = (
        df_full
        .filter(pl.col('essay_id').is_in(essay_id))
        .collect()
        .to_pandas()
)


df_dic = {}
stratified_kfold = StratifiedKFold(n_splits=config['num_folds'], shuffle=True, random_state=config['seed'])
for fold, (train_index, test_index) in enumerate(stratified_kfold.split(df_pd, df_pd['score'])):
    train_index, val_index = train_test_split(train_index, test_size=0.2, random_state=config['seed'])
    df_dic[fold] = {
      'train': df.filter(lambda f: f['essay_id'] in df_pd.iloc[train_index].loc[:, 'essay_id'].to_list()),
      'vali': df.filter(lambda f: f['essay_id'] in df_pd.iloc[val_index].loc[:, 'essay_id'].to_list()),
      'test':  df.filter(lambda f: f['essay_id'] in df_pd.iloc[test_index].loc[:, 'essay_id'].to_list())
    }


Filter:   0%|          | 0/22250 [00:00<?, ? examples/s]

Filter:   0%|          | 0/22250 [00:00<?, ? examples/s]

Filter:   0%|          | 0/22250 [00:00<?, ? examples/s]

Filter:   0%|          | 0/22250 [00:00<?, ? examples/s]

Filter:   0%|          | 0/22250 [00:00<?, ? examples/s]

Filter:   0%|          | 0/22250 [00:00<?, ? examples/s]

Filter:   0%|          | 0/22250 [00:00<?, ? examples/s]

Filter:   0%|          | 0/22250 [00:00<?, ? examples/s]

Filter:   0%|          | 0/22250 [00:00<?, ? examples/s]

Filter:   0%|          | 0/22250 [00:00<?, ? examples/s]

Filter:   0%|          | 0/22250 [00:00<?, ? examples/s]

Filter:   0%|          | 0/22250 [00:00<?, ? examples/s]

Filter:   0%|          | 0/22250 [00:00<?, ? examples/s]

Filter:   0%|          | 0/22250 [00:00<?, ? examples/s]

Filter:   0%|          | 0/22250 [00:00<?, ? examples/s]

In [None]:
df_dic[0]


{'train': Dataset({
     features: ['essay_id', 'full_text', 'labels', 'input_ids', 'attention_mask'],
     num_rows: 14254
 }),
 'vali': Dataset({
     features: ['essay_id', 'full_text', 'labels', 'input_ids', 'attention_mask'],
     num_rows: 3560
 }),
 'test': Dataset({
     features: ['essay_id', 'full_text', 'labels', 'input_ids', 'attention_mask'],
     num_rows: 4436
 })}

In [None]:
def print_trainable_params(model):
    trainable_params = 0
    all_params = 0
    for _, param in model.named_parameters():
        all_params += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()

    print(f"trainable parameters: {trainable_params}, all parameters: {all_params}, ratio: {100 * trainable_params / all_params}%")



In [None]:
tokenizer.pad_token_id = tokenizer.eos_token_id

data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=config['pad_to_multiple_of'])




def do_trainer(train, vali, fold, args):

    model = AutoModelForSequenceClassification.from_pretrained(
          config['model_ckpt_berta'],
          num_labels=len(config['lbl']),
          quantization_config=quantization_config,
          device_map="auto"
    )


    for param in model.parameters():
        param.requires_grad = False



    config_ = LoraConfig(
      r=16,
      lora_alpha=32,
      lora_dropout=0.05,
      target_modules=['q_proj', 'k_proj', 'k_proj', 'o_proj'],
      bias="none",
      task_type="CAUSAL_LM"
    )


    model = get_peft_model(model, config_)


    # Define the training arguments
    training_args = TrainingArguments(
        output_dir=f"{config['model_dir']}{args}_gemma_2_{fold}",
        learning_rate=1e-5,
        num_train_epochs=config['epoch'],
        per_device_train_batch_size=config['batch_size'],
        warmup_steps=config['warmup_steps'],
        per_device_eval_batch_size=config['batch_size'],
        weight_decay=0.01,
        save_total_limit=1,
        logging_steps=len(train) // (config['batch_size'] * config['gradient_accumulation_steps']),
        evaluation_strategy="epoch",
        save_strategy="epoch",
        metric_for_best_model="qwk",
        greater_is_better=True,
        load_best_model_at_end=True,
        hub_token=key_,
        push_to_hub=True,
        lr_scheduler_type='cosine',
        fp16=True,
        gradient_accumulation_steps=config['gradient_accumulation_steps']
    )

    # Initialize the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train,
        eval_dataset=vali,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(config['early_stopping_patience'])]
    )

    return trainer


In [None]:
# clear_memory()


In [22]:

for fold, data in df_dic.items():
  train = data['train']
  vali = data['vali']
  test = data['test']

  train = train.remove_columns(['essay_id','full_text' ])
  vali = vali.remove_columns(['essay_id','full_text'])
  trainer_ = do_trainer(train, vali, fold, args_no)
  print(f'fold_{fold}')
  try:
      trainer_.train()
  except Exception as e:
      print("Error during training: %s", e)


  try:
      trainer_.evaluate()
  except Exception as e:
      print("Error during evaluation: %s", e)

  clear_memory()


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of GemmaForSequenceClassification were not initialized from the model checkpoint at google/gemma-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


fold_0




Epoch,Training Loss,Validation Loss,Qwk
1,2.1027,1.449283,0.327079
2,1.2828,1.161178,0.627082
3,1.071,1.070743,0.686923
4,0.9899,1.037912,0.712372
5,0.9466,1.002734,0.740654
6,0.9137,0.980883,0.745498
7,0.8879,0.987176,0.751443
8,0.8703,0.975001,0.754631
9,0.8615,0.979605,0.755412
10,0.8562,0.977126,0.754669




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of GemmaForSequenceClassification were not initialized from the model checkpoint at google/gemma-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


fold_1


Epoch,Training Loss,Validation Loss,Qwk
0,2.2036,1.427464,0.315501
1,1.2469,1.125472,0.614421
2,1.0624,1.047944,0.672405
4,0.9878,1.012579,0.719431




Epoch,Training Loss,Validation Loss,Qwk
0,2.2036,1.427464,0.315501
1,1.2469,1.125472,0.614421
2,1.0624,1.047944,0.672405
4,0.9878,1.012579,0.719431


KeyboardInterrupt: 