In [1]:

# Mount Google Drive at /content/drive; the data and args paths used by later
# cells all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Install `accelerate` and `datasets` into the runtime (no versions are pinned here).
!pip install accelerate
!pip install datasets

Collecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/309.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.9/309.4 kB[0m [31m3.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from 

In [3]:

from collections import Counter
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split
from huggingface_hub import HfFolder, Repository
from sklearn.metrics import cohen_kappa_score
from datasets import Dataset, get_dataset_config_names
import polars as pl
from sklearn.preprocessing import KBinsDiscretizer
import numpy as np
from transformers import (AutoTokenizer,
                          TrainingArguments,
                          DataCollatorWithPadding,
                          Trainer,
                          AutoModelForSequenceClassification,
                           EarlyStoppingCallback,
                          Pipeline
                          )
import regex as re
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
from google.colab import userdata
import ctypes, torch, gc, yaml,  joblib, spacy, multiprocessing, os, random, functools
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report


In [12]:
# Hugging Face access token, read from the Colab secrets store.
key_ = userdata.get('HF_TOKEN')

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda") if cuda_available else torch.device("cpu")
print(device)

# Location of the training CSV on the mounted Drive.
train_path = '/content/drive/MyDrive/essay/data/train.csv'

# Handle to the C library; clear_memory() calls malloc_trim through it.
libc = ctypes.CDLL("libc.so.6")

cuda


In [41]:
# Name of the argument set to load; it is interpolated into the YAML path below.
args_no = "combine"
args = f'/content/drive/MyDrive/essay/args/args_{args_no}.yaml'

# Parse the YAML file into the `config` object that the rest of the notebook reads.
with open(args, 'r') as config_file:
    config = yaml.safe_load(config_file)


In [14]:

def clear_memory():
    """Free unreferenced Python objects, cached CUDA memory and spare heap pages.

    The steps run from the highest level down so that each one can release
    what the previous step just freed:

    1. ``gc.collect()`` destroys unreachable Python objects (tensors included).
    2. ``torch.cuda.empty_cache()`` releases torch's unused cached CUDA memory.
    3. ``libc.malloc_trim(0)`` asks the C allocator to release free heap memory.

    Relies on the module-level ``libc`` handle.
    """
    # Collect first: memory still owned by garbage objects cannot be released
    # by the two allocator-level calls below.
    gc.collect()
    torch.cuda.empty_cache()
    libc.malloc_trim(0)




In [15]:
# Encode each label listed in config['lbl'] as its position in that list.
lbl_mapping = {label: position for position, label in enumerate(config['lbl'])}

# Lazily scan the training CSV and replace the raw `score` values with their
# encoded positions; nothing is read until `.collect()` is called.
df_full = pl.scan_csv(train_path).with_columns(
    score=pl.col('score').replace(lbl_mapping)
)


In [16]:
def initialize_seeds(seed):
    """Seed the Python, NumPy and torch generators and make cuDNN deterministic.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Global generators of the standard library and NumPy.
    random.seed(seed)
    np.random.seed(seed)

    # torch CPU generator, then the generators of all CUDA devices.
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Request deterministic cuDNN behaviour and switch benchmark mode off.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Seed every generator once, using the seed stored in the loaded config.
initialize_seeds(config['seed'])

In [17]:


class text_prep():
  """Clean essay text and split each essay into token windows.

  Every essay is normalised with `preprocess_text` and tokenized with
  `return_overflowing_tokens=True`, so one input row can produce several
  output rows (one per window). Each output row repeats the essay's
  `essay_id`, cleaned `full_text` and `score`.

  Args:
    df: A `datasets.Dataset` or a polars `DataFrame` with the columns
      `essay_id`, `full_text` and `score`.
    tokenizer: Callable tokenizer; it is invoked with `max_length`,
      `stride`, `truncation` and `return_overflowing_tokens`.
    max_length: Maximum window length passed to the tokenizer.
    batch_size: Stored on the instance; not used by any method of this class.
    stride: Stride passed to the tokenizer for overflowing windows.

  Raises:
    TypeError: If `df` is neither a `Dataset` nor a polars `DataFrame`.
  """

  def __init__(self,
               df,
               tokenizer,
               max_length = config['max_length'],
               batch_size = config['batch_size'],
               stride = config['stride']
               ):

    self.set_df(df)
    self.tokenizer = tokenizer
    self.max_length = max_length
    self.batch_size  = batch_size
    self.stride = stride

  def tokenize_function(self, batch):
    """Tokenize one batch of rows into per-window rows.

    Args:
      batch: Mapping of column name to list with the keys `full_text`,
        `score` and `essay_id`.

    Returns:
      dict of equally long lists with the keys `essay_id`, `input_ids`,
      `attention_mask`, `full_text` (cleaned text) and `score`; one entry
      per produced window.

    Raises:
      Exception: Any error raised while cleaning or tokenizing is re-raised
        after the offending batch has been printed.
    """
    essay_id = []
    input_ids = []
    attention_mask = []
    full_text = []
    score = []

    try:
        for row, raw_text in enumerate(batch['full_text']):
          txt = self.preprocess_text(raw_text)
          tokenized = self.tokenizer(txt,
                                    max_length=self.max_length,
                                    return_overflowing_tokens = True,
                                    stride = self.stride,
                                    truncation=True
                                    )

          # One output row per window; the essay-level fields are repeated so
          # that the windows can be grouped back by essay_id later.
          for window_ids, window_mask in zip(tokenized['input_ids'],
                                             tokenized['attention_mask']):
            essay_id.append(batch['essay_id'][row])
            input_ids.append(window_ids)
            attention_mask.append(window_mask)
            full_text.append(txt)
            score.append(batch['score'][row])
    except Exception:
        print(f"Error tokenizing batch: {batch}")
        # Bare raise keeps the original traceback intact.
        raise

    return {
        'essay_id': essay_id,
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'full_text': full_text,
        'score': score
    }

  def preprocess_text(self, text):
    """Return `text` restricted to a small character set with tidy spacing.

    Non-breaking spaces become plain spaces, every character other than
    ASCII letters, digits, whitespace and `,.'!?;:-` is removed, runs of
    whitespace collapse to a single space and the result is stripped.
    """
    text = text.replace('\xa0', ' ')
    text = re.sub(r"[^a-zA-Z0-9\s,.'!?;:-]", '', text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

  def set_df(self, val):
    """Store `val` as `self.df`, converting a polars frame via Arrow.

    Raises:
      TypeError: If `val` is neither a `Dataset` nor a polars `DataFrame`.
    """
    if isinstance(val, Dataset):
      self.df = val
    elif isinstance(val, pl.DataFrame):
      arrow_table = val.to_arrow()
      self.df = Dataset(arrow_table)
    else:
      raise TypeError("Unsupported data")

  def fit_transform(self):
     """Tokenize the stored dataset and rename `score` to `labels`.

     Returns:
       The mapped dataset with one row per token window.
     """
     # Leave one core free, but never ask for fewer than one worker:
     # on a single-core machine `cpu_count() - 1` would be 0.
     workers = max(1, multiprocessing.cpu_count() - 1)
     return (self.df.map(self.tokenize_function,
                         num_proc=workers,
                         batched = True
                         )
                    .rename_column('score', 'labels')
     )


In [18]:

def token_df(tokenizer, df_full):
  """Collect the lazy frame and tokenize it with `text_prep`.

  Args:
    tokenizer: Tokenizer handed to `text_prep`.
    df_full: Object exposing `.collect()` (the lazy polars frame).

  Returns:
    The result of `text_prep(...).fit_transform()`.
  """
  collected = df_full.collect()
  return text_prep(collected, tokenizer = tokenizer).fit_transform()


In [21]:
def most_common(lst):
    """Return every element of `lst` that occurs with the highest frequency.

    Ties are all returned, in order of first appearance in `lst`.
    """
    frequency = Counter(lst)
    highest = max(frequency.values())

    winners = []
    for element, occurrences in frequency.items():
        if occurrences == highest:
            winners.append(element)
    return winners


def vote_pred(r):
  """Reduce several (label, confidence) pairs to a single prediction.

  Args:
    r: Mapping with `lbl_agg` (predicted labels) and `max_value_agg`
      (the confidence that goes with each label, same order).

  Returns:
    dict with `new_lbl` (the chosen label) and `val` (its confidence).
    With more than one entry, the most frequent label wins; a tie between
    equally frequent labels goes to the one with the highest confidence.
  """
  labels = r['lbl_agg']
  confidences = r['max_value_agg']

  # A single entry needs no vote.
  if len(labels) <= 1:
    return {'new_lbl': labels[0], 'val': confidences[0]}

  # Best confidence seen for each of the most frequent labels.
  pairs = list(zip(labels, confidences))
  best_by_label = {}
  for candidate in most_common(labels):
    best_by_label[candidate] = max(conf for lbl, conf in pairs if lbl == candidate)

  winner = max(best_by_label, key=best_by_label.get)
  return {'new_lbl': winner, 'val': best_by_label[winner]}


class PredictionProcess:
    """Score tokenized windows with a saved classifier and vote per essay.

    Args:
        best_model: Identifier or path given to
            `AutoModelForSequenceClassification.from_pretrained`.
        pad_to_multiple_of: Forwarded to `DataCollatorWithPadding`.
        device: Device the model and the input tensors are moved to.
        batch_size: Batch size used when mapping over the dataset.
        tokenizer: Tokenizer used by the padding collator.
    """

    def __init__(self, best_model, pad_to_multiple_of, device, batch_size, tokenizer):
        self.device = device
        self.batch_size = batch_size
        self.data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=pad_to_multiple_of)
        self.model = AutoModelForSequenceClassification.from_pretrained(best_model).to(device)

    @staticmethod
    def predict(batch, model, data_collator, device):
        """Run the model on one batch and return labels and probabilities.

        Only the `input_ids` and `attention_mask` entries of `batch` are
        collated and sent to the model.

        Returns:
            dict with `pred_labels` (argmax of the logits per row) and
            `probabilities` (softmax of the logits per row), both as lists.
        """
        model_inputs = {}
        for key in batch:
            if key in ['input_ids', 'attention_mask']:
                model_inputs[key] = batch[key]
        padded = data_collator(model_inputs)

        ids = padded['input_ids'].to(device)
        mask = padded['attention_mask'].to(device)

        # Forward pass without gradient tracking.
        with torch.no_grad():
            logits = model(input_ids=ids, attention_mask=mask).logits

        return {
            'pred_labels': logits.argmax(dim=-1).cpu().tolist(),
            'probabilities': torch.nn.functional.softmax(logits, dim=-1).cpu().tolist(),
        }

    def predict_dataset(self, dataset):
        """Predict every window of `dataset` and combine them per essay.

        Args:
            dataset: Tokenized dataset with `input_ids`, `attention_mask`,
                `essay_id`, `labels` and `full_text` columns.

        Returns:
            polars DataFrame with the voted `new_lbl` and `val` per
            `essay_id`, joined with the distinct `labels` and `full_text`.
        """
        self.model.eval()
        run_batch = functools.partial(
            self.predict,
            model=self.model,
            data_collator=self.data_collator,
            device=self.device,
        )
        pred = dataset.map(
            run_batch,
            batched=True,
            batch_size=self.batch_size,
            remove_columns=['input_ids', 'attention_mask'],
        )

        raw_pred = pred.to_polars()

        clear_memory()

        # Per window: confidence of the predicted class. Per essay: gather the
        # window labels/confidences and let vote_pred pick one.
        voted = (
            raw_pred
            .with_columns(max_value=pl.col('probabilities').list.max())
            .group_by('essay_id')
            .agg(
                lbl_agg=pl.col('pred_labels'),
                max_value_agg=pl.col('max_value'),
            )
            .with_columns(
                temp=pl.struct(['lbl_agg', 'max_value_agg'])
                       .map_elements(lambda g: vote_pred(g))
            )
            .unnest('temp')
            .drop(['max_value_agg', 'lbl_agg'])
        )

        # Essay-level columns are repeated on every window; keep one copy.
        essay_info = raw_pred.select(['essay_id', 'labels', 'full_text']).unique()

        return voted.join(essay_info, on='essay_id')


In [22]:

# Distinct essay ids present in the (collected) training frame.
essay_id = list(set(df_full.collect()['essay_id']))

# pandas copy of the training frame, used for the stratified fold split.
# NOTE(review): the id list is built from the same frame, so this filter
# appears to keep every row (assuming no null ids) — confirm whether it is a
# leftover from sub-sampling.
df_pd = (
        df_full
        .filter(pl.col('essay_id').is_in(essay_id))
        .collect()
        .to_pandas()
       )


In [42]:

# Tokenize the whole training set once with the first configured model's tokenizer.
tokenizer = AutoTokenizer.from_pretrained(config['models'][0])
df = token_df(tokenizer, df_full)


# Held-out predictions per fold, keyed by fold number.
df_dic = {}
stratified_kfold = StratifiedKFold(n_splits=config['num_folds'], shuffle=True, random_state=config['seed'])
for fold, (train_index, test_index) in enumerate(stratified_kfold.split(df_pd, df_pd['score'])):

    print(f'fold number: {fold}')
    model_path = f"prl90777/1_{config['base_model']}_{fold}"

    # The train/validation split is recomputed here, but only the test
    # indices are used by the prediction below.
    train_index, val_index = train_test_split(train_index, test_size=0.2, random_state=config['seed'])

    # Build the held-out id set once per fold. Doing the pandas lookup inside
    # the filter lambda rebuilt the full id list for every tokenized row and
    # then searched it linearly.
    test_ids = set(df_pd.iloc[test_index].loc[:, 'essay_id'].to_list())
    test_df = df.filter(lambda f: f['essay_id'] in test_ids)

    predict = PredictionProcess(
                                best_model = model_path,
                                pad_to_multiple_of = config['pad_to_multiple_of'],
                                device = device,
                                batch_size = config['batch_size'],
                                tokenizer = tokenizer
                                )
    df_dic[fold] = predict.predict_dataset(test_df)






  self.pid = os.fork()


Map (num_proc=7):   0%|          | 0/17307 [00:00<?, ? examples/s]

fold number: 0


Filter:   0%|          | 0/17307 [00:00<?, ? examples/s]

Map:   0%|          | 0/3462 [00:00<?, ? examples/s]

fold number: 1


Filter:   0%|          | 0/17307 [00:00<?, ? examples/s]

Map:   0%|          | 0/3462 [00:00<?, ? examples/s]

fold number: 2


Filter:   0%|          | 0/17307 [00:00<?, ? examples/s]

Map:   0%|          | 0/3461 [00:00<?, ? examples/s]

fold number: 3


Filter:   0%|          | 0/17307 [00:00<?, ? examples/s]

Map:   0%|          | 0/3461 [00:00<?, ? examples/s]

fold number: 4


Filter:   0%|          | 0/17307 [00:00<?, ? examples/s]

Map:   0%|          | 0/3461 [00:00<?, ? examples/s]

In [43]:
# Position in config['lbl'] -> label value (reverse direction of lbl_mapping).
true_lbl = dict(enumerate(config['lbl']))



#  .agg(
#                     lbl_agg = pl.col('pred_labels'),
#                     max_value_agg = pl.col('max_value')
#                 )
#                   .with_columns(
#                       temp = pl.struct(['lbl_agg', 'max_value_agg'])
#                                .map_elements(lambda g: vote_pred(g))
#                )
#                 .unnest('temp')
#                 .drop(['max_value_agg', 'lbl_agg'])
#              ).join(
#                  (raw_pred
#                  .select(['essay_id', 'labels', 'full_text'])
#                  .unique()
#                  )
#                  , on = 'essay_id'
#              )


# .with_columns(
#     score_value = pl.col('max_value_agg ').list.max(),
#     score_idx = pl.col('max_value_agg ').list.arg_max()
# )
# .with_columns(
#     score = pl.col('new_lbl_lst')
#               .list
#               .get(pl.col('score_idx'))
#               .replace(true_lbl)
# )
# .select(['essay_id', 'score'])



# Stack the per-fold prediction frames into a single frame.
final_pred = pl.concat(list(df_dic.values()))

# Gather the predictions per essay, vote once more with vote_pred, then attach
# the true label and the essay text again.
all_pred = (
    final_pred
    .group_by('essay_id')
    .agg(
        max_value_agg=pl.col('val'),
        lbl_agg=pl.col('new_lbl'),
    )
    .with_columns(
        temp=pl.struct(['lbl_agg', 'max_value_agg'])
               .map_elements(lambda row: vote_pred(row))
    )
    .unnest('temp')
    .drop(['max_value_agg', 'lbl_agg'])
    .join(
        final_pred.select(['essay_id', 'labels', 'full_text']).unique(),
        on='essay_id',
    )
)
all_pred

essay_id,new_lbl,val,labels,full_text
str,i64,f64,i64,str
"""0613e60""",2,0.959286,2,"""My position on…"
"""0d1dfcb""",1,0.642843,1,"""Faces in...scr…"
"""1acb118""",2,0.861184,2,"""Should we limi…"
"""2444077""",2,0.939269,2,"""Driving or any…"
"""29273d7""",1,0.878912,1,"""To the fellow …"
"""29a0567""",1,0.895328,2,"""Would you like…"
"""30ff0f0""",3,0.714879,4,"""The year is 20…"
"""32303e8""",3,0.735185,2,"""Electoral Coll…"
"""362d18f""",3,0.499411,3,"""The author sup…"
"""39ed025""",2,0.93618,2,"""Do you think t…"


In [44]:

# Quadratic-weighted Cohen's kappa between the true labels and the voted predictions.
kappa = cohen_kappa_score(all_pred["labels"], all_pred["new_lbl"], weights="quadratic")
kappa

0.8059339712787852