In [1]:

# Mount Google Drive so the training CSV and YAML args files under
# /content/drive are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install accelerate
!pip install datasets

Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/302.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━[0m [32m194.6/302.6 kB[0m [31m5.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from 

In [3]:

import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split
from huggingface_hub import HfFolder, Repository
from sklearn.metrics import cohen_kappa_score
from datasets import Dataset, get_dataset_config_names
import polars as pl
from sklearn.preprocessing import KBinsDiscretizer
import numpy as np
from transformers import (AutoTokenizer,
                          TrainingArguments,
                          DataCollatorWithPadding,
                          Trainer,
                          AutoModelForSequenceClassification,
                           EarlyStoppingCallback
                          )
import regex as re
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
from google.colab import userdata
import ctypes, torch, gc, yaml,  joblib, spacy, multiprocessing, os, random


In [4]:
# Handles and paths that do not depend on the accelerator.
train_path = '/content/drive/MyDrive/essay/data/train.csv'
libc = ctypes.CDLL("libc.so.6")  # clear_memory() calls malloc_trim through this handle
key_ = userdata.get('HF_TOKEN')  # later passed to TrainingArguments as hub_token

# Use the GPU when one is visible, otherwise fall back to the CPU.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda") if cuda_available else torch.device("cpu")
print(device)

cuda


In [5]:
# Which numbered hyper-parameter file to load; the same number is later passed
# to do_trainer() and ends up in the training output directory name.
args_no = 2
args = f'/content/drive/MyDrive/essay/args/args_{args_no}.yaml'

# Parse the YAML into the global `config` dict read by the rest of the notebook.
with open(args, 'r') as file:
  config = yaml.safe_load(file.read())


In [6]:

def clear_memory():
    """Release as much host and GPU memory as possible.

    Order matters: the garbage collector runs first so that objects kept alive
    only by reference cycles are actually freed; only then can the CUDA caching
    allocator and glibc hand the freed memory back.
    """
    # Free unreachable Python objects (and any tensors they hold).
    gc.collect()
    # Return now-unused cached CUDA blocks to the driver.
    torch.cuda.empty_cache()
    # Ask glibc to give freed heap pages back to the OS.
    libc.malloc_trim(0)



In [7]:
# Map every raw score listed in config['lbl'] to its position in that list.
lbl_mapping = dict(zip(config['lbl'], range(len(config['lbl']))))

# Lazily scan the training CSV and replace each raw score with its mapped index.
df_full = pl.scan_csv(train_path).with_columns(
    pl.col('score').replace(lbl_mapping).alias('score')
)


In [8]:
def initialize_seeds(seed):
    """Seed Python, NumPy and PyTorch (CPU and all CUDA devices) with `seed`.

    Also switches cuDNN to deterministic mode and disables its autotuner.
    """
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed_all,
        np.random.seed,
        random.seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Deterministic kernels, no benchmark-driven algorithm selection.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Seed all RNGs once, using the value from the loaded config, before any
# tokenization, splitting or model creation happens below.
initialize_seeds(config['seed'])

In [9]:


class text_prep():
  """Clean essay text and tokenize it into fixed-length, overlapping windows.

  An essay longer than `max_length` tokens produces several rows (one per
  window). Every window row repeats the essay's id, cleaned text and score.
  """

  def __init__(self,
               df,
               tokenizer,
               max_length = config['max_length'],
               batch_size = config['batch_size'],
               stride = config['stride']
               ):
    """Store the dataset and tokenization settings.

    Args:
      df: a `datasets.Dataset` or a `polars.DataFrame`; the methods below read
        its `essay_id`, `full_text` and `score` columns.
      tokenizer: callable tokenizer, invoked once per essay.
      max_length: maximum number of tokens per window.
      batch_size: stored on the instance; not read by any method of this class.
      stride: passed to the tokenizer as `stride` for overflowing windows.

    Note: the defaults are read from the global `config` once, when the class
    is defined, not at each instantiation.
    """
    self.set_df(df)
    self.tokenizer = tokenizer
    self.max_length = max_length
    self.batch_size  = batch_size
    self.stride = stride

  def tokenize_function(self, batch):
    """Tokenize one batch of essays, emitting one output row per window.

    Args:
      batch: mapping with parallel lists under `full_text`, `score` and
        `essay_id`.

    Returns:
      dict of equally long lists keyed by `essay_id`, `input_ids`,
      `attention_mask`, `full_text` (the cleaned text) and `score`.

    Raises:
      Re-raises any exception after printing the offending batch.
    """
    try:
      essay_id = []
      input_ids = []
      attention_mask = []
      full_text = []
      score = []
      txts = [self.preprocess_text(i) for i in batch['full_text']]

      for count_id, txt in enumerate(txts):
        tokenized = self.tokenizer(txt,
                                   max_length=self.max_length,
                                   return_overflowing_tokens=True,
                                   stride=self.stride,
                                   truncation=True
                                   )

        # One tokenizer call yields one entry per window; copy the essay-level
        # fields onto every window so all returned columns stay aligned.
        for window_ids, window_mask in zip(tokenized['input_ids'],
                                           tokenized['attention_mask']):
          full_text.append(txt)
          score.append(batch['score'][count_id])
          essay_id.append(batch['essay_id'][count_id])
          input_ids.append(window_ids)
          attention_mask.append(window_mask)

      return {
          'essay_id': essay_id,
          'input_ids': input_ids,
          'attention_mask': attention_mask,
          'full_text': full_text,
          'score': score
      }
    except Exception:
      print(f"Error tokenizing batch: {batch}")
      # Bare raise keeps the original traceback intact.
      raise

  def preprocess_text(self, text):
    """Normalize one essay string.

    Replaces non-breaking spaces with plain spaces, removes every character
    that is not an ASCII letter, digit, whitespace or one of , . ' ! ? ; : -,
    collapses whitespace runs to a single space and trims both ends.
    """
    text = text.replace('\xa0', ' ')
    text = re.sub(r"[^a-zA-Z0-9\s,.'!?;:-]", '', text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

  def set_df(self, val):
    """Store `val` as `self.df`, converting a polars DataFrame via Arrow.

    Raises:
      TypeError: if `val` is neither a `Dataset` nor a `pl.DataFrame`.
    """
    if isinstance(val, Dataset):
      self.df = val
    elif isinstance(val, pl.DataFrame):
      arrow_table = val.to_arrow()
      self.df = Dataset(arrow_table)
    else:
      raise TypeError("Unsupported data")

  def fit_transform(self):
    """Tokenize the stored dataset and rename `score` to `labels`.

    Returns:
      The mapped dataset with one row per token window.
    """
    # Leave one core free, but never ask for fewer than one worker:
    # cpu_count() - 1 is 0 on a single-core host and map() rejects num_proc <= 0.
    workers = max(1, multiprocessing.cpu_count() - 1)
    return (self.df.map(self.tokenize_function,
                        num_proc=workers,
                        batched=True
                        )
                   .rename_column('score', 'labels')
    )


In [10]:
# Tokenizer matching the checkpoint named in the config.
tokenizer = AutoTokenizer.from_pretrained(config['model_ckpt_berta'])


def token_df(tokenizer, df_full):
  """Materialize the lazy frame `df_full` and return it tokenized by text_prep."""
  collected = df_full.collect()
  return text_prep(collected, tokenizer = tokenizer).fit_transform()


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [11]:


def compute_metrics(p):
    """Return the quadratic weighted kappa of one evaluation pass.

    `p` must expose `predictions` (the argmax is taken along axis 1) and
    `label_ids`. The result is a dict with the single key 'qwk'.
    """
    predicted = np.argmax(p.predictions, axis=1)
    kappa = cohen_kappa_score(p.label_ids, predicted, weights="quadratic")
    return dict(qwk=float(kappa))


In [12]:
# Tokenize the full training frame; `df` holds one row per token window,
# so it can have more rows than there are essays.
df = token_df(tokenizer, df_full)



  self.pid = os.fork()


Map (num_proc=11):   0%|          | 0/17307 [00:00<?, ? examples/s]

In [13]:

# Unique essay ids present in the tokenized dataset (one essay may span
# several window rows in `df`).
essay_id = list(set(df['essay_id']))

# One row per essay, used only to decide which essay goes to which split.
df_pd = (
        df_full
        .filter(pl.col('essay_id').is_in(essay_id))
        .collect()
        .to_pandas()
)


# Folds are drawn at essay level and then projected onto the window-level
# dataset, so all windows of an essay land in the same split.
df_dic = {}
stratified_kfold = StratifiedKFold(n_splits=config['num_folds'], shuffle=True, random_state=config['seed'])
for fold, (train_index, test_index) in enumerate(stratified_kfold.split(df_pd, df_pd['score'])):
    # Carve a validation subset (20%) out of this fold's training essays.
    train_index, val_index = train_test_split(train_index, test_size=0.2, random_state=config['seed'])
    fold_splits = {}
    for split_name, row_index in (('train', train_index), ('vali', val_index), ('test', test_index)):
        # Build the id set once per split; the per-row predicate is then an
        # O(1) set lookup instead of rebuilding and scanning a list per row.
        split_ids = set(df_pd['essay_id'].iloc[row_index].to_list())
        fold_splits[split_name] = df.filter(lambda f: f['essay_id'] in split_ids)
    df_dic[fold] = fold_splits



Filter:   0%|          | 0/22406 [00:00<?, ? examples/s]

Filter:   0%|          | 0/22406 [00:00<?, ? examples/s]

Filter:   0%|          | 0/22406 [00:00<?, ? examples/s]

Filter:   0%|          | 0/22406 [00:00<?, ? examples/s]

Filter:   0%|          | 0/22406 [00:00<?, ? examples/s]

Filter:   0%|          | 0/22406 [00:00<?, ? examples/s]

Filter:   0%|          | 0/22406 [00:00<?, ? examples/s]

Filter:   0%|          | 0/22406 [00:00<?, ? examples/s]

Filter:   0%|          | 0/22406 [00:00<?, ? examples/s]

Filter:   0%|          | 0/22406 [00:00<?, ? examples/s]

Filter:   0%|          | 0/22406 [00:00<?, ? examples/s]

Filter:   0%|          | 0/22406 [00:00<?, ? examples/s]

Filter:   0%|          | 0/22406 [00:00<?, ? examples/s]

Filter:   0%|          | 0/22406 [00:00<?, ? examples/s]

Filter:   0%|          | 0/22406 [00:00<?, ? examples/s]

In [14]:
# Inspect the train / vali / test datasets built for the first fold.
df_dic[0]


{'train': Dataset({
     features: ['essay_id', 'full_text', 'labels', 'input_ids', 'attention_mask'],
     num_rows: 14342
 }),
 'vali': Dataset({
     features: ['essay_id', 'full_text', 'labels', 'input_ids', 'attention_mask'],
     num_rows: 3601
 }),
 'test': Dataset({
     features: ['essay_id', 'full_text', 'labels', 'input_ids', 'attention_mask'],
     num_rows: 4463
 })}

In [15]:
# Collator that pads each batch with the tokenizer; pad_to_multiple_of comes
# from the loaded config.
data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=config['pad_to_multiple_of'])


def do_trainer(train, vali, fold, args):
  """Build a `Trainer` for one cross-validation fold.

  Args:
    train: training dataset for this fold (its length sets the logging interval).
    vali: validation dataset, evaluated at the end of every epoch.
    fold: fold identifier, embedded in the output directory name.
    args: run identifier, embedded in the output directory name. Note that
      this parameter shadows the module-level `args` path inside the function.

  Returns:
    A configured, not yet trained `Trainer`. The best checkpoint by 'qwk' is
    reloaded at the end of training, and `push_to_hub=True` with the
    module-level `key_` token is set.
  """
  model = (AutoModelForSequenceClassification.from_pretrained(config['model_ckpt_berta'],
                                                            num_labels=len(config['lbl']))
         .to(device)
         )

  # Log about once per epoch. The floor division is 0 when the split is
  # smaller than one effective batch, and step-based logging rejects
  # logging_steps=0, so clamp to at least one step.
  effective_batch = config['batch_size'] * config['gradient_accumulation_steps']
  logging_steps = max(1, len(train) // effective_batch)

  training_args = TrainingArguments(
      output_dir=f"{config['model_dir']}{args}_electra_base_prod_{fold}",
      learning_rate=1e-5,
      num_train_epochs=config['epoch'],
      per_device_train_batch_size=config['batch_size'],
      warmup_steps=config['warmup_steps'],
      per_device_eval_batch_size=config['batch_size'],
      weight_decay=0.01,
      save_total_limit=1,
      logging_steps=logging_steps,
      # Evaluation and saving share the same cadence, as required for
      # load_best_model_at_end.
      evaluation_strategy="epoch",
      save_strategy="epoch",
      metric_for_best_model="qwk",
      greater_is_better=True,
      load_best_model_at_end=True,
      hub_token=key_,
      push_to_hub=True,
      lr_scheduler_type='cosine',
      fp16=True,
      gradient_accumulation_steps = config['gradient_accumulation_steps']
  )

  trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=train,
      eval_dataset= vali,
      tokenizer=tokenizer,
      data_collator=data_collator,
      compute_metrics=compute_metrics,
      callbacks=[EarlyStoppingCallback(early_stopping_patience=config['early_stopping_patience'])]
  )
  return trainer



In [16]:
# clear_memory()


In [17]:

# Train and evaluate one model per fold. Failures are reported and the loop
# moves on, so one bad fold does not abort the remaining ones.
for fold, data in df_dic.items():
  train = data['train']
  vali = data['vali']
  test = data['test']  # held-out split of this fold; not used inside this loop

  # Drop the id and raw-text columns before handing the splits to the Trainer.
  train = train.remove_columns(['essay_id','full_text' ])
  vali = vali.remove_columns(['essay_id','full_text'])
  trainer_ = do_trainer(train, vali, fold, args_no)
  print(f'fold_{fold}')
  try:
      trainer_.train()
  except Exception as e:
      # print() does not apply %-formatting to its arguments, so interpolate
      # the error into the message explicitly.
      print(f"Error during training: {e}")


  try:
      trainer_.evaluate()
  except Exception as e:
      print(f"Error during evaluation: {e}")

  # Release host/GPU memory before the next fold builds a new model.
  clear_memory()


pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


fold_0


Epoch,Training Loss,Validation Loss,Qwk
0,1.4147,1.201744,0.602444
1,1.0547,1.023469,0.718971
2,0.9165,0.976541,0.750712
3,0.8094,1.022079,0.762375
4,0.7136,1.016842,0.763664
5,0.6215,1.15519,0.743714
6,0.5357,1.262597,0.740617


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


fold_1


Epoch,Training Loss,Validation Loss,Qwk
0,1.4124,1.159668,0.597523
1,1.0514,1.021816,0.738453
2,0.9148,0.982653,0.730877
4,0.7196,1.027448,0.750836
5,0.6252,1.059058,0.757034
6,0.5519,1.171316,0.755454
8,0.4891,1.202972,0.754147


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


fold_2


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Qwk
0,1.4145,1.120978,0.642241
1,1.0479,1.090123,0.717248
2,0.9011,1.151883,0.710778
3,0.803,1.108078,0.72612
4,0.7081,1.090661,0.747299
5,0.6097,1.126855,0.748785
6,0.5207,1.23401,0.75261
8,0.4119,1.320729,0.74652


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


fold_3


Epoch,Training Loss,Validation Loss,Qwk
0,1.4131,1.10208,0.662052
1,1.03,0.958106,0.744731
2,0.9016,1.007081,0.76009
3,0.802,0.962824,0.78058
4,0.7087,1.045486,0.771596
5,0.617,1.062157,0.777398


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


fold_4


Epoch,Training Loss,Validation Loss,Qwk
0,1.4252,1.145398,0.622176
1,1.0689,0.99421,0.727251
2,0.9167,0.996332,0.759365
3,0.8107,0.971168,0.760726
4,0.7159,1.049637,0.752933
5,0.6309,1.06811,0.756098
