In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install -q pyarrow==19.0.0 datasets==2.18.0 evaluate==0.4.1 peft==0.11.1 transformers==4.42.0 scikit-learn==1.4.2 tqdm


Importing libraries

In [None]:
import os, re, unicodedata, random, numpy as np, pandas as pd, torch, torch.nn as nn
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, accuracy_score
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments


Important configurations : 

    Model Name : Distilbert Base Multilingual Cased
    Sequence Length : 256
    Batch Size : 16
    Epoch : 10
    Seed : 42


In [None]:
# Seed shared by the RNG setup, the train/validation split and the Trainer.
SEED = 42

# Run settings gathered in one place.
# Alternative checkpoint that was tried: "Geotrend/distilbert-base-multilingual-cased".
CFG = dict(
    model_name="distilbert/distilbert-base-multilingual-cased",
    max_length=256,   # forwarded to the tokenizer as `max_length`
    train_bs=16,      # per-device training batch size
    eval_bs=32,       # per-device evaluation batch size
    epochs=10,
    lr=3e-5,
    fp16=True,
    val_size=0.04,    # fraction of the training data held out for validation
)
def set_seed(s=SEED):
    """Seed Python's `random`, NumPy and PyTorch (CPU and all CUDA devices) with `s`."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(s)


set_seed()

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device, "| GPUs:", torch.cuda.device_count())


Set up the input and output directories and preprocess the text. After that, encode the labels and split the dataset for training and validation.

    Preprocessing : 
        Normalizing Unicode characters
        Removing control characters and extra whitespace
        Lowercasing all text

    Label Encoding : 
        LabelEncoder
    
    Stratified Split : 
        Train : 96%
        Val : 4%


In [None]:
# Location of the competition files on Kaggle.
INPUT_DIR = "/kaggle/input/the-ancient-texts-provenance-challenge"

# Read the three competition CSVs: training data, test data, submission template.
train, test, sample_sub = (
    pd.read_csv(f"{INPUT_DIR}/{fname}")
    for fname in ("train.csv", "test.csv", "sample_submission.csv")
)

def clean_text(t):
    """Normalise one raw text value before tokenisation.

    Steps, in order: NFKC Unicode normalisation, replacement of ASCII control
    characters (U+0000-U+001F and U+007F) by spaces, collapsing of every
    whitespace run to a single space, stripping, and lowercasing.

    Args:
        t: The raw value. Missing values (``None`` or a float NaN) yield
            ``""``; any other non-string value is converted with ``str``.

    Returns:
        The cleaned, lowercased string.
    """
    # Without this guard a missing cell would be stringified into the literal
    # word "none" / "nan" and fed to the model as if it were real text.
    # `t != t` is true only for NaN.
    if t is None or (isinstance(t, float) and t != t):
        return ""
    if not isinstance(t, str):
        t = str(t)
    t = unicodedata.normalize("NFKC", t)
    t = re.sub(r"[\u0000-\u001F\u007F]", " ", t)
    return re.sub(r"\s+", " ", t).strip().lower()


# Apply the same text normalisation to the training and the test frame.
for frame in (train, test):
    frame["text"] = frame["text"].map(clean_text)

# Encode the string labels as integer ids; `le` is kept to map predictions back.
le = LabelEncoder()
train["label_enc"] = le.fit_transform(train["label"])

num_labels = train["label_enc"].nunique()
print("Labels:", num_labels)

# Hold out a validation set, stratified on the encoded label.
trn_df, val_df = train_test_split(
    train,
    test_size=CFG["val_size"],
    stratify=train["label_enc"],
    random_state=SEED,
)


Tokenization : 

    We use the tokenizer of the selected Transformer model to convert text into tokens, and we define a custom `TxtDataset` class for use with the Hugging Face Trainer API.
    

In [None]:
# Load the fast tokenizer that belongs to the configured checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    CFG["model_name"],
    use_fast=True,
)

class TxtDataset(Dataset):
    """Map-style dataset that tokenises one text per item, on access.

    Args:
        df: Frame with a ``text`` column and, when ``with_labels`` is true,
            a ``label_enc`` column.
        tok: Tokenizer, called as
            ``tok(text, max_length=..., truncation=True, padding=False)``.
        max_len: Value forwarded to the tokenizer as ``max_length``.
        with_labels: Whether each item should carry a ``labels`` entry.
    """

    def __init__(self, df, tok, max_len, with_labels=True):
        self.with_labels = with_labels
        self.tok = tok
        self.m = max_len
        self.texts = df["text"].tolist()
        # Labels are only read from the frame when they were asked for.
        self.labels = None
        if with_labels:
            self.labels = df["label_enc"].tolist()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, i):
        # No padding here: items come back at their own (truncated) length.
        encoded = self.tok(
            self.texts[i],
            max_length=self.m,
            truncation=True,
            padding=False,
        )
        if not self.with_labels:
            return encoded
        encoded["labels"] = torch.tensor(self.labels[i], dtype=torch.long)
        return encoded

# Wrap each split; only the test split has no labels.
train_ds = TxtDataset(df=trn_df, tok=tokenizer, max_len=CFG["max_length"], with_labels=True)
val_ds = TxtDataset(df=val_df, tok=tokenizer, max_len=CFG["max_length"], with_labels=True)
test_ds = TxtDataset(df=test, tok=tokenizer, max_len=CFG["max_length"], with_labels=False)

# Items are produced unpadded, so padding is done per batch by the collator.
collator = DataCollatorWithPadding(tokenizer=tokenizer)


Handling imbalanced classes : 

    To handle label imbalance, we compute inverse frequency class weights , which are applied to the CrossEntropy loss function during training .

In [None]:
# Inverse-frequency class weights, rescaled so that they sum to the number of classes.

# Number of training rows per class, ordered by encoded label id.
cls_counts = (
    train["label_enc"]
    .value_counts()
    .sort_index()
    .values
)

weights = 1.0 / torch.tensor(cls_counts, dtype=torch.float)
weights = weights.div(weights.sum()).mul(len(weights))

print("Class weights:", weights[:5])


Custom Trainer and Evaluation Metrics

    We subclass the Hugging Face Trainer to use a weighted cross-entropy loss and define macro-F1 and accuracy metrics.


In [None]:

class WeightedTrainer(Trainer):
    """`Trainer` whose loss is cross-entropy weighted by the module-level `weights`."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: The model to run.
            inputs: Batch mapping; its ``"labels"`` entry is removed here so
                that the model is called without labels.
            return_outputs: When true, return ``(loss, outputs)``.
            num_items_in_batch: Unused. Accepted because newer transformers
                releases pass this argument to ``compute_loss``; the default
                keeps the method callable by releases that do not.

        Returns:
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # The weights are created on CPU; move them next to the logits.
        loss_fct = nn.CrossEntropyLoss(weight=weights.to(logits.device))
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

def compute_metrics(eval_pred):
    """Return macro-F1 and accuracy for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of the logits over the last axis.

    Returns:
        Dict with the keys ``"macro_f1"`` and ``"accuracy"``.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)

    scores = {}
    scores["macro_f1"] = f1_score(labels, predicted, average="macro")
    scores["accuracy"] = accuracy_score(labels, predicted)
    return scores


Training configuration and execution: initialize and train the model.


In [None]:
# Checkpoints and trainer state are written below this directory.
out_dir = "/kaggle/working/distilbert-base-multilingual-cased"
os.makedirs(out_dir, exist_ok=True)

args = TrainingArguments(
    output_dir=out_dir,
    per_device_train_batch_size=CFG["train_bs"],
    per_device_eval_batch_size=CFG["eval_bs"],
    num_train_epochs=CFG["epochs"],
    learning_rate=CFG["lr"],
    # Mixed precision needs a CUDA device; requesting it on a CPU-only session
    # makes TrainingArguments raise, so fall back to full precision there.
    fp16=bool(CFG["fp16"]) and torch.cuda.is_available(),
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`.
    eval_strategy="epoch",
    save_strategy="epoch",
    # One checkpoint is written per epoch; cap how many are kept so that the
    # working directory stays well inside its size limit. The best checkpoint
    # is still retained because `load_best_model_at_end` is set.
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    greater_is_better=True,
    logging_steps=100,
    dataloader_num_workers=2,
    seed=SEED,
    report_to="none",
)

# Load the configured checkpoint with a classification head for `num_labels` classes.
model = AutoModelForSequenceClassification.from_pretrained(
    CFG["model_name"],
    num_labels=num_labels,
)

# Trainer with the class-weighted loss; validation runs on `val_ds`.
trainer = WeightedTrainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
)

train_result = trainer.train()
print("Tr completed and best modle Loaded")
print(train_result.metrics)

# ---- Evaluate the model on the validation set ----
val_pred = trainer.predict(val_ds)
val_logits = val_pred.predictions

# Predicted class = index of the largest logit.
val_preds = np.argmax(val_logits, axis=-1)

f1, acc = (
    f1_score(val_df["label_enc"], val_preds, average="macro"),
    accuracy_score(val_df["label_enc"], val_preds),
)
print(f"Validation F1={f1:.4f} | Acc={acc:.4f}")

# ---- Predict on the test data and write the submission file ----
test_logits = trainer.predict(test_ds).predictions
test_labels = np.argmax(test_logits, axis=-1)
# Map the integer class ids back to the original label values.
test_labels = le.inverse_transform(test_labels)

# Predictions are matched to the submission rows by position, so both must
# have the same number of rows.
if len(test_labels) != len(sample_sub):
    raise ValueError(
        f"Got {len(test_labels)} predictions for {len(sample_sub)} submission rows"
    )

# Replace the second column outright rather than writing into it with
# `.iloc`: a positional write keeps the placeholder column's dtype, which
# need not match the dtype of the decoded labels.
sample_sub[sample_sub.columns[1]] = test_labels

submission_path = "/kaggle/working/Submission.csv"
sample_sub.to_csv(submission_path, index=False)
print("Saved:", submission_path)
