In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the working directory (/kaggle/working)

import os
# Print the full path of every file found anywhere below /kaggle/working.
for root, _subdirs, files in os.walk('/kaggle/working'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/working/__notebook__.ipynb


In [2]:
from datasets import load_dataset, Dataset
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
import torch
from sklearn.metrics import accuracy_score
import os
# Set WANDB_DISABLED=true in this process's environment before any training objects are built.
os.environ.update(WANDB_DISABLED="true")

# Accuracy metric passed to the Trainer
def compute_metrics(pred):
    """Return the accuracy of the arg-max class predictions.

    `pred` must expose `label_ids` (reference labels) and `predictions`
    (per-example scores); each row of `predictions` is reduced to the index
    of its largest value along axis 1 before being compared to the labels.

    Returns a dict with the single key "accuracy".
    """
    predicted_classes = np.argmax(pred.predictions, axis=1)
    return {"accuracy": accuracy_score(pred.label_ids, predicted_classes)}

# Fetch the judgement/summary corpus and work with its "train" split as a DataFrame
ds = load_dataset(path="rishiai/indian-court-judgements-and-its-summaries")
df = ds["train"].to_pandas()

# Weak labeling function
def infer_outcome(text):
    """Infer a weak outcome label from the text of a judgment.

    The match is a case-insensitive substring search:
      * "petition allowed" or "appeal allowed"     -> "allowed"
      * "petition dismissed" or "appeal dismissed" -> "dismissed"
    The "allowed" phrases are checked first, so a text containing both kinds
    of phrase is labelled "allowed".

    Returns None when no phrase matches, and also when `text` is not a string
    (e.g. None or NaN from a missing cell) instead of raising AttributeError.
    """
    if not isinstance(text, str):
        return None

    lowered = text.lower()
    if "petition allowed" in lowered or "appeal allowed" in lowered:
        return "allowed"
    if "petition dismissed" in lowered or "appeal dismissed" in lowered:
        return "dismissed"
    return None

# Apply labeling and clean
# NOTE(review): labels are derived from 'Judgment' while the model is trained
# on 'Summary'; if summaries repeat the same outcome phrases the classifier
# can simply key on them — confirm this is intended.
df['outcome'] = df['Judgment'].apply(infer_outcome)
# Take an explicit copy of the filtered rows so the column assignments below
# write to an independent frame rather than to the result of a filter
# (the pattern that triggers pandas' SettingWithCopyWarning).
df = df.dropna(subset=['outcome']).copy()
df['outcome'] = df['outcome'].str.lower()

# Encode labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['outcome'])

# Use only necessary columns, dropping rows that have no summary text
df = df[['Summary', 'label']].dropna(subset=['Summary'])

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df.reset_index(drop=True))

# Checkpoint name used for the tokenizer here and for each fold's classifier below
model_name = "nlpaueb/legal-bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

def tokenize(batch):
    """Tokenize the 'Summary' texts of one batch.

    Every sequence is truncated and padded to exactly 512 tokens.
    `padding=True` would pad only to the longest sequence inside the batch
    passed in, so separate `map` batches could end up with different widths
    and later fail to stack into one tensor; a fixed width avoids that.
    """
    return tokenizer(
        batch['Summary'],
        padding='max_length',
        truncation=True,
        max_length=512,
    )

# Tokenize the whole dataset in batches, then expose only the model inputs
# and the label as torch tensors.
model_input_columns = ['input_ids', 'attention_mask', 'label']
dataset = dataset.map(tokenize, batched=True)
dataset.set_format(type='torch', columns=model_input_columns)

# Cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
results = []  # one metrics dict per fold, in fold order

for fold, (train_idx, val_idx) in enumerate(kf.split(dataset)):
    print(f"Training fold {fold + 1}...")

    # Drop the references to the previous fold's model and trainer before
    # loading fresh weights, and release cached CUDA blocks, so the old
    # fold's state can be freed instead of lingering next to the new one.
    # Done at the top of the loop (not the bottom) so the last fold's
    # `model` and `trainer` are still available after the loop finishes.
    model = trainer = None
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    train_dataset = dataset.select(train_idx)
    val_dataset = dataset.select(val_idx)

    # Each fold writes its checkpoints and logs to its own directory
    output_dir = f"./legal_outcome_model_fold_{fold + 1}"

    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=10,
        logging_dir=output_dir,
        logging_steps=50,
        save_steps=500,  # Save every 500 steps
        save_total_limit=2,  # Keep only last 2 checkpoints
        do_eval=True,
        logging_first_step=True,
        report_to=[],
    )

    # Fresh model each fold
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        compute_metrics=compute_metrics,
        eval_dataset=val_dataset,
    )

    trainer.train()
    # Evaluate on this fold's held-out split after training has finished
    eval_results = trainer.evaluate()
    print(f"Fold {fold + 1} results:", eval_results)
    results.append(eval_results)

# Average results: mean of every reported metric across the folds
average_results = {metric: np.mean([result[metric] for result in results]) for metric in results[0]}
print("\nAverage cross-validation results:", average_results)


2025-04-21 17:45:37.132994: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745257537.349468      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745257537.413907      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


README.md:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

pair_dataset.csv:   0%|          | 0.00/209M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6944 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/222k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/4125 [00:00<?, ? examples/s]

Training fold 1...


pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]



Step,Training Loss
1,0.6566
50,0.6842
100,0.5481
150,0.4415
200,0.439
250,0.4162
300,0.4084
350,0.4291
400,0.4119
450,0.3954




Fold 1 results: {'eval_loss': 1.4986298084259033, 'eval_accuracy': 0.7806060606060606, 'eval_runtime': 15.2201, 'eval_samples_per_second': 54.205, 'eval_steps_per_second': 3.417, 'epoch': 10.0}
Training fold 2...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
1,0.665
50,0.6972
100,0.6362
150,0.4541
200,0.4775
250,0.444
300,0.402
350,0.4249
400,0.4181
450,0.3797




Fold 2 results: {'eval_loss': 1.112125039100647, 'eval_accuracy': 0.8121212121212121, 'eval_runtime': 15.1633, 'eval_samples_per_second': 54.408, 'eval_steps_per_second': 3.429, 'epoch': 10.0}
Training fold 3...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
1,0.7155
50,0.6773
100,0.4792
150,0.4494
200,0.4358
250,0.3684
300,0.3741
350,0.3624
400,0.385
450,0.3101




Fold 3 results: {'eval_loss': 1.4257022142410278, 'eval_accuracy': 0.8072727272727273, 'eval_runtime': 15.141, 'eval_samples_per_second': 54.488, 'eval_steps_per_second': 3.434, 'epoch': 10.0}
Training fold 4...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
1,0.745
50,0.7103
100,0.5377
150,0.4413
200,0.4413
250,0.3974
300,0.3839
350,0.4263
400,0.4094
450,0.3497




Fold 4 results: {'eval_loss': 1.4837357997894287, 'eval_accuracy': 0.7951515151515152, 'eval_runtime': 15.1426, 'eval_samples_per_second': 54.482, 'eval_steps_per_second': 3.434, 'epoch': 10.0}
Training fold 5...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
1,0.6712
50,0.7187
100,0.7007
150,0.5446
200,0.4739
250,0.4202
300,0.4254
350,0.416
400,0.4345
450,0.3566




Fold 5 results: {'eval_loss': 1.2566680908203125, 'eval_accuracy': 0.8024242424242424, 'eval_runtime': 15.1842, 'eval_samples_per_second': 54.333, 'eval_steps_per_second': 3.425, 'epoch': 10.0}

Average cross-validation results: {'eval_loss': 1.3553721904754639, 'eval_accuracy': 0.7995151515151515, 'eval_runtime': 15.170240000000002, 'eval_samples_per_second': 54.3832, 'eval_steps_per_second': 3.4278000000000004, 'epoch': 10.0}


In [3]:
import os
# Show what the Trainer left in fold 2's output directory
fold_2_dir = './legal_outcome_model_fold_2'
print(os.listdir(fold_2_dir))


['checkpoint-2000', 'checkpoint-2070']


In [4]:
import json

# Save fold 1's evaluation results.
# After the cross-validation loop `eval_results` holds only the metrics of the
# last fold trained, so read fold 1's entry from the per-fold `results` list
# (appended in fold order) instead.
with open("./legal_outcome_model_fold_1/eval_results_fold1.json", "w") as f:
    json.dump(results[0], f, indent=4)


In [5]:
import json

# Save fold 2's evaluation results.
# After the cross-validation loop `eval_results` holds only the metrics of the
# last fold trained, so read fold 2's entry from the per-fold `results` list
# (appended in fold order) instead.
with open("./legal_outcome_model_fold_2/eval_results_fold2.json", "w") as f:
    json.dump(results[1], f, indent=4)


In [6]:
import shutil

# Zip the contents of ./legal_outcome_model_fold_2 into
# legal_outcome_model_fold_2.zip in the current working directory.
shutil.make_archive(
    base_name="legal_outcome_model_fold_2",
    format="zip",
    root_dir="./legal_outcome_model_fold_2",
)


'/kaggle/working/legal_outcome_model_fold_2.zip'