In [None]:
%pip install datasets transformers wandb

Note: you may need to restart the kernel to use updated packages.


In [None]:
from datasets import load_dataset
import os

# Load the Hugging Face token from Kaggle secrets
from kaggle_secrets import UserSecretsClient
# Open the Kaggle secrets client and read the secret stored under "HF_TOKEN".
# The value is kept in `hf_token` for the next cell, which exports it.
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HF_TOKEN")

In [None]:
# Export the token through the HF_TOKEN environment variable. The token is not
# passed to load_dataset explicitly below, so authentication presumably relies
# on the Hugging Face libraries reading this variable — confirm.
os.environ["HF_TOKEN"] = hf_token

# Load the "lner" configuration of the Exploration-Lab/IL-TUR dataset.
NER = load_dataset("Exploration-Lab/IL-TUR", "lner")

In [None]:
# Display the loaded DatasetDict: fold names, column names and row counts.
NER

DatasetDict({
    fold_1: Dataset({
        features: ['id', 'text', 'spans'],
        num_rows: 35
    })
    fold_2: Dataset({
        features: ['id', 'text', 'spans'],
        num_rows: 35
    })
    fold_3: Dataset({
        features: ['id', 'text', 'spans'],
        num_rows: 35
    })
})

**Data Fields:**

'id': string → IndianKanoon Case ID

'text': string → Full document text

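'spans': list of entity spans, where each span has:

* 'start': int → starting char index
* 'end': int → ending char index + 1 (i.e. an exclusive end, so `text[start:end]` is the entity text)
* 'label': class_label → NER label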

The labels for NER in the dataset are as follows:

* "APP"
* "RESP"
* "A.COUNSEL"
* "R.COUNSEL"
* "JUDGE"
* "WIT"
* "AUTH"
* "COURT"
* "STAT"
* "PREC"
* "DATE"
* "CASENO"

In [None]:
# Preview the first rows of fold_1 as a pandas DataFrame (raw text and spans).
NER['fold_1'].to_pandas().head()

Unnamed: 0,id,text,spans
0,115651329,REPORTABLE IN THE SUPREME COURT OF INDIA CRIMI...,"[{'start': 137, 'end': 153, 'label': 1}, {'sta..."
1,37849282,1 REPORTABLE IN THE SUPREME COURT OF INDIA CIV...,"[{'start': 20, 'end': 42, 'label': 7}, {'start..."
2,975074,PETITIONER: PARMAR KANAKSINH BHAGWANSINH (DEAD...,"[{'start': 12, 'end': 40, 'label': 0}, {'start..."
3,189525449,Non-Reportable IN THE SUPREME COURT OF INDIA C...,"[{'start': 22, 'end': 44, 'label': 7}, {'start..."
4,736324,CASE NO.: Special Leave Petition (civil) 14656...,"[{'start': 10, 'end': 54, 'label': 11}, {'star..."


In [None]:
from datasets import DatasetDict, Dataset

# Lookup from the integer class index stored in each span's 'label' field to
# the tag name used when annotating text. Position in the list is the index.
label_map = dict(enumerate([
    "APP",
    "RESP",
    "A.COUNSEL",
    "R.COUNSEL",
    "JUDGE",
    "WIT",
    "AUTH",
    "COURT",
    "STAT",
    "PREC",
    "DATE",
    "CASENO",
]))

def annotate_entities(example):
    """Wrap every labelled span of a document in inline ``<LABEL>…</LABEL>`` tags.

    Args:
        example: a dataset row with a 'text' string and a 'spans' list whose
            items carry 'start', 'end' and an integer 'label' (a key of
            ``label_map``).

    Returns:
        A dict ``{'text': tagged_text}``. When used with ``Dataset.map`` this
        replaces the row's original 'text' column with the tagged version.
    """
    annotated = example['text']

    # Process spans from the rightmost start backwards: inserting tags changes
    # the string length, and working right-to-left keeps the character offsets
    # of the spans that are still to be processed valid.
    for span in sorted(example['spans'], key=lambda item: item['start'], reverse=True):
        begin, stop = span['start'], span['end']
        tag = label_map[span['label']]
        annotated = f"{annotated[:begin]}<{tag}>{annotated[begin:stop]}</{tag}>{annotated[stop:]}"

    return {'text': annotated}

# Annotate every fold, keeping the original fold names as keys
processed_data = DatasetDict({
    fold_name: fold_dataset.map(annotate_entities)
    for fold_name, fold_dataset in NER.items()
})

In [None]:
# Preview fold_1 after annotation: the 'text' column now holds the tagged text.
processed_data['fold_1'].to_pandas().head()

Unnamed: 0,id,text,spans
0,115651329,REPORTABLE IN THE SUPREME COURT OF INDIA CRIMI...,"[{'start': 137, 'end': 153, 'label': 1}, {'sta..."
1,37849282,1 REPORTABLE IN THE <COURT>SUPREME COURT OF IN...,"[{'start': 20, 'end': 42, 'label': 7}, {'start..."
2,975074,PETITIONER: <APP>PARMAR KANAKSINH BHAGWANSINH<...,"[{'start': 12, 'end': 40, 'label': 0}, {'start..."
3,189525449,Non-Reportable IN THE <COURT>SUPREME COURT OF ...,"[{'start': 22, 'end': 44, 'label': 7}, {'start..."
4,736324,CASE NO.: <CASENO>Special Leave Petition (civi...,"[{'start': 10, 'end': 54, 'label': 11}, {'star..."


In [None]:
def create_input_output(example):
    """Build the (prompt, target) text pair for seq2seq NER training.

    ``example['text']`` holds the entity-tagged document at this point (the
    annotation step overwrote the raw text). The target keeps the tags. The
    prompt must not contain them, otherwise the model is shown the answer and
    only has to copy its input, so the tags are stripped to recover the plain
    text for the prompt.

    Args:
        example: a dataset row whose 'text' is the tagged document.

    Returns:
        A dict with 'input_text' ("Perform NER: " + untagged text) and
        'output_text' (the tagged text, unchanged).
    """
    import re

    annotated_text = example['text']  # Already annotated by annotate_entities

    # Remove only the opening/closing tags of the twelve known entity labels;
    # any other angle-bracket content in the document is left untouched.
    plain_text = re.sub(
        r"</?(?:APP|RESP|A\.COUNSEL|R\.COUNSEL|JUDGE|WIT|AUTH|COURT|STAT|PREC|DATE|CASENO)>",
        "",
        annotated_text,
    )

    return {
        'input_text': f"Perform NER: {plain_text}",
        'output_text': annotated_text,
    }

# Add the input_text / output_text columns to every fold
processed_data = DatasetDict({
    fold_name: fold_dataset.map(create_input_output)
    for fold_name, fold_dataset in processed_data.items()
})

In [None]:
from transformers import ByT5Tokenizer

tokenizer = ByT5Tokenizer.from_pretrained("google/byt5-small")

def tokenize_data(example):
    """Tokenize the prompt and target text into model inputs and labels.

    Both sides are truncated to at most 512 tokens, so longer documents are
    cut off.

    Args:
        example: a row (or, when mapped with ``batched=True``, a batch) with
            'input_text' and 'output_text'.

    Returns:
        The tokenizer output for 'input_text' with an added 'labels' entry
        holding the token ids of 'output_text'.
    """
    encoded = tokenizer(example['input_text'], max_length=512, truncation=True)
    target_ids = tokenizer(example['output_text'], max_length=512, truncation=True)['input_ids']
    encoded['labels'] = target_ids
    return encoded

# Tokenize every fold in batched mode, keeping the fold names as keys
tokenized_data = DatasetDict({
    fold_name: fold_dataset.map(tokenize_data, batched=True)
    for fold_name, fold_dataset in processed_data.items()
})

In [None]:
# Prints every column of the example, including the full document text, so the
# output is long.
print(tokenized_data['fold_1'][0])  # Inspect the first example in fold_1

{'id': '115651329', 'text': 'REPORTABLE IN THE SUPREME COURT OF INDIA CRIMINAL APPELLATE JURISDICTION CRIMINAL APPEAL NO. 92/2015 JAGE RAM & ORS. ..Appellants Versus <RESP>STATE OF HARYANA</RESP> ..Respondent J U D G M E N T R. BANUMATHI, J. This appeal is preferred against the judgment dated <DATE>19.8.2011</DATE> passed by the <COURT>High Court of Punjab and Haryana</COURT> in Criminal Appeal No.181 SB of 2000, whereby the <COURT>High Court</COURT> partly allowed the appeal filed by the appellants thereby confirming the conviction of the appellants with certain modifications. 2. Briefly stated, case of the prosecution is that on the fateful day i.e. <DATE>18.11.1994</DATE>, at about 8.00 A.M. in the morning the complainant <WIT>Jagdish</WIT> (PW-5) along with his two sons namely Sukhbir and <WIT>Mange Ram</WIT> (PW-6) were busy in cutting pullas (reeds) from the dola of their field. At that time, <APP>Jage Ram</APP> (A-1) and his sons Rajbir Singh @ Raju (A-2), Rakesh (A-3) and Madan

In [None]:
from kaggle_secrets import UserSecretsClient
# Read the Weights & Biases API key from Kaggle secrets.
# NOTE(review): `secret_value_1` is not referenced by any cell shown here, and
# the following cell fetches the same secret again into `key` — this cell
# looks redundant; confirm before removing.
user_secrets = UserSecretsClient()
secret_value_1 = user_secrets.get_secret("WANDB_API_KEY")

In [None]:
import wandb
from kaggle_secrets import UserSecretsClient
# Read the Weights & Biases API key from Kaggle secrets
user_secrets = UserSecretsClient()
key = user_secrets.get_secret("WANDB_API_KEY")

# Authenticate through the Python API instead of `!wandb login $key`, so the
# secret is never interpolated into a shell command line.
# The trailing semicolon suppresses the call's return value in the cell output.
wandb.login(key=key);

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
from transformers import T5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments
import os

# Set PyTorch's CUDA allocator configuration to use expandable segments
# (presumably added in response to the OutOfMemory error mentioned in the
# training arguments below — confirm).
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
# Load the pretrained ByT5-small checkpoint; use_cache=False is passed through
# from_pretrained (presumably applied to the model config — confirm).
model = T5ForConditionalGeneration.from_pretrained("google/byt5-small",
                                                   use_cache=False)

# Define training arguments for the first, loss-only training run.
# `eval_strategy` is used instead of the deprecated `evaluation_strategy`
# spelling, matching the later training cell in this notebook.
training_args = Seq2SeqTrainingArguments(
    output_dir="./byt5-ner",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    eval_strategy="epoch",  # evaluate at the end of every epoch
    gradient_accumulation_steps=2,  # Added this after OutOfMemory error
    save_strategy="epoch",
    logging_dir="./logs",
    learning_rate=5e-5,
    fp16=True,  # Use mixed precision if you have a GPU
)

# Initialize the trainer: train on fold_1, evaluate on fold_2.
# No compute_metrics is passed here, so the per-epoch evaluation reports only
# the validation loss.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data['fold_1'],
    eval_dataset=tokenized_data['fold_2'],
)

# Train the model. As the last expression of the cell, the returned
# TrainOutput is displayed as the cell result.
trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,4.873067
2,No log,2.801598




TrainOutput(global_step=6, training_loss=8.49232546488444, metrics={'train_runtime': 39.9167, 'train_samples_per_second': 2.63, 'train_steps_per_second': 0.15, 'total_flos': 64312470405120.0, 'train_loss': 8.49232546488444, 'epoch': 2.0})

In [None]:
%pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
Note: you may need to restart the kernel to use updated packages.


In [None]:
from transformers import T5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments
import evaluate
import numpy as np

import os

In [None]:
# Memory optimization: set PyTorch's CUDA allocator configuration to use
# expandable segments.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Reload the pretrained checkpoint. This rebinds `model` to fresh pretrained
# weights, so whatever an earlier cell trained into the previous `model`
# object is not carried over into the run below.
model = T5ForConditionalGeneration.from_pretrained(
    "google/byt5-small",
    use_cache=False
)

# Custom compute_metrics function for NER
def compute_metrics(eval_preds):
    """Score decoded predictions against decoded references.

    Fixes over the previous version:
      * predictions that are not plain token ids (a tuple of outputs and/or a
        3-D array, presumably logits — the earlier run failed inside
        ``batch_decode`` with "int() argument must be ... not 'list'") are
        reduced to token ids before decoding;
      * negative ids (e.g. the -100 ignore index used for padded labels) are
        replaced by the pad id so decoding cannot fail on them;
      * besides exact-match accuracy, entity-level precision/recall/F1 are
        returned, overall ("precision", "recall", "f1") and per label
        ("<LABEL>_precision", "<LABEL>_recall", "<LABEL>_f1"), which is what
        ``metric_for_best_model="f1"`` and the per-entity report in this
        notebook look for;
      * an empty evaluation set no longer divides by zero.

    Args:
        eval_preds: the (predictions, labels) pair supplied by the trainer.

    Returns:
        dict mapping metric name to a float.
    """
    import re
    from collections import Counter

    entity_labels = ["APP", "RESP", "A.COUNSEL", "R.COUNSEL", "JUDGE", "WIT",
                     "AUTH", "COURT", "STAT", "PREC", "DATE", "CASENO"]

    predictions, labels = eval_preds

    # The model may hand back several outputs; the first one carries the
    # predictions.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # A 3-D array means one score vector per position rather than one id per
    # position; take the highest-scoring id. NOTE: these are then teacher-forced
    # predictions, not free generation — enable predict_with_generate in the
    # training arguments to score generated text instead.
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)

    # Negative ids are padding / ignore markers and cannot be decoded.
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
    predictions = np.where(predictions < 0, pad_id, predictions)
    labels = np.where(labels < 0, pad_id, labels)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Sequence-level exact match
    exact_matches = sum(pred == label for pred, label in zip(decoded_preds, decoded_labels))
    accuracy = exact_matches / len(decoded_preds) if len(decoded_preds) else 0.0

    # Entity-level scoring: an entity is a (label, surface text) pair taken
    # from a well-formed <LABEL>...</LABEL> region; pairs are compared as
    # multisets per document.
    tag_pattern = re.compile(
        "<(" + "|".join(re.escape(name) for name in entity_labels) + r")>(.*?)</\1>",
        re.DOTALL,
    )

    def _entities(text):
        return Counter((m.group(1), m.group(2).strip()) for m in tag_pattern.finditer(text))

    def _prf(tp, fp, fn):
        precision = tp / (tp + fp) if (tp + fp) else 0.0
        recall = tp / (tp + fn) if (tp + fn) else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
        return precision, recall, f1

    counts = {name: [0, 0, 0] for name in entity_labels}  # [tp, fp, fn]
    for pred_text, gold_text in zip(decoded_preds, decoded_labels):
        pred_entities = _entities(pred_text)
        gold_entities = _entities(gold_text)
        for (name, _), n in (pred_entities & gold_entities).items():
            counts[name][0] += n
        for (name, _), n in (pred_entities - gold_entities).items():
            counts[name][1] += n
        for (name, _), n in (gold_entities - pred_entities).items():
            counts[name][2] += n

    metrics = {"accuracy": accuracy}

    # Micro-averaged scores over all labels
    total_tp = sum(c[0] for c in counts.values())
    total_fp = sum(c[1] for c in counts.values())
    total_fn = sum(c[2] for c in counts.values())
    metrics["precision"], metrics["recall"], metrics["f1"] = _prf(total_tp, total_fp, total_fn)

    for name in entity_labels:
        precision, recall, f1 = _prf(*counts[name])
        metrics[f"{name}_precision"] = precision
        metrics[f"{name}_recall"] = recall
        metrics[f"{name}_f1"] = f1

    return metrics
# Define training arguments with detailed logging
training_args = Seq2SeqTrainingArguments(
    output_dir="./byt5-ner",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    num_train_epochs=3,
    eval_strategy="epoch",        # Changed from evaluation_strategy
    # (eval_steps dropped: it only applies to the "steps" strategy, and
    #  evaluation here runs once per epoch.)
    save_strategy="epoch",
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=1,             # The earlier run took only a few optimizer steps per
                                 # epoch, so an interval of 50 would never log a loss
    learning_rate=5e-5,
    fp16=True,
    predict_with_generate=True,  # Hand generated token ids (not raw model outputs)
                                 # to compute_metrics so they can be decoded
    generation_max_length=512,   # Match the 512-token truncation used for the labels
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # Must name a metric compute_metrics returns;
                                       # it reports "accuracy"
    greater_is_better=True,
    report_to=["wandb"],         # Log to Weights & Biases
)

# Initialize trainer with compute_metrics: train on fold_1, validate on fold_2
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data['fold_1'],
    eval_dataset=tokenized_data['fold_2'],
    compute_metrics=compute_metrics,
)

# Train and evaluate. Failures are reported and then re-raised, so the cell is
# marked as failed (with a traceback) instead of silently "succeeding".
try:
    train_result = trainer.train()

    # Print detailed training metrics
    print("\nTraining metrics:")
    print(f"Total training loss: {train_result.training_loss:.4f}")
    print(f"Training runtime: {train_result.metrics['train_runtime']:.2f} seconds")
    print(f"Samples per second: {train_result.metrics['train_samples_per_second']:.2f}")

    # Evaluate on fold 3
    print("\nEvaluating on hold-out set (fold 3):")
    eval_results = trainer.evaluate(tokenized_data['fold_3'])

    def _metric(name):
        # trainer.evaluate() prefixes metric names with "eval_"; fall back to
        # the bare name, and to 'N/A' when compute_metrics does not report it.
        return eval_results.get(f'eval_{name}', eval_results.get(name, 'N/A'))

    # Print entity-specific metrics
    for entity in ["APP", "RESP", "A.COUNSEL", "R.COUNSEL", "JUDGE", "WIT",
                   "AUTH", "COURT", "STAT", "PREC", "DATE", "CASENO"]:
        print(f"\n{entity} metrics:")
        print(f"Precision: {_metric(f'{entity}_precision')}")
        print(f"Recall: {_metric(f'{entity}_recall')}")
        print(f"F1: {_metric(f'{entity}_f1')}")

except Exception as e:
    print(f"An error occurred during training: {str(e)}")
    raise



Epoch,Training Loss,Validation Loss


An error occurred during training: int() argument must be a string, a bytes-like object or a real number, not 'list'
