In [None]:
!pip install transformers datasets torch accelerate


Collecting datasets
  Using cached datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Using cached xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Using cached multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Using cached nvidia_cudnn

In [None]:
pip install wandb

Collecting wandb
  Downloading wandb-0.17.9-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl.metadata (1.8 kB)
Collecting gitpython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-2.14.0-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.9 kB)
Collecting gitdb<5,>=4.0.1 (from gitpython!=3.1.29,>=1.0.0->wandb)
  Downloading gitdb-4.0.11-py3-none-any.whl.metadata (1.2 kB)
Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->gitpython!=3.1.29,>=1.0.0->wandb)
  Downloading smmap-5.0.1-py3-none-any.whl.metadata (4.3 kB)
Downloading wandb-0.17.9-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_

In [None]:
import requests
import json
import pandas as pd
from datasets import Dataset

# Step 1: Download the JSON file
url = "https://storage.googleapis.com/indianlegalbert/OPEN_SOURCED_FILES/Rhetorical_Role_Benchmark/Data/train.json"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=60)
# Fail loudly on an HTTP error instead of trying to JSON-decode an error page.
response.raise_for_status()
json_data = response.json()


In [None]:
def json_to_dataframe(json_data):
    """Flatten annotated documents into one row per labelled text segment.

    Args:
        json_data: Iterable of document dicts. Each document may carry an
            ``"id"`` and an ``"annotations"`` list; each annotation may carry
            a ``"result"`` list whose items hold a ``"value"`` dict with
            optional ``"text"`` and ``"labels"`` entries.

    Returns:
        A ``pandas.DataFrame`` with the columns ``doc_id``, ``text`` and
        ``label`` (always present, even when no segment was found). ``label``
        is the first entry of ``"labels"``, or a missing value when the list
        is absent or empty.

    Raises:
        KeyError: If a result item has no ``"value"`` entry.
    """
    columns = ['doc_id', 'text', 'label']
    rows = []
    for document in json_data:
        doc_id = document.get("id")
        for annotation in document.get("annotations", []):
            for result in annotation.get("result", []):
                value = result['value']
                # "labels" can be missing *or* an empty list; `or [None]`
                # covers both so that indexing [0] can never raise IndexError.
                labels = value.get('labels') or [None]
                rows.append({
                    'doc_id': doc_id,
                    'text': value.get('text'),
                    'label': labels[0],
                })
    # Passing the columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(rows, columns=columns)

In [None]:
# Convert JSON to DataFrame: one row per annotated segment, carrying the
# document id, the segment text and its label.
df = json_to_dataframe(json_data)
# Plain-text preview of the first 100 rows.
print(df.head(100))

    doc_id                                               text     label
0     1735        IN THE HIGH COURT OF KARNATAKA,\n       ...  PREAMBLE
1     1735  \n\n      BEFORE\n\nTHE HON'BLE MR.JUSTICE ANA...  PREAMBLE
2     1735  This Criminal Appeal is filed under Section 37...  PREAMBLE
3     1735  \n\n       This appeal coming on for hearing t...  PREAMBLE
4     1735  \n       Heard the learned Counsel for the app...      NONE
..     ...                                                ...       ...
95    4183                                          \nJUSTICE  PREAMBLE
96    4183  .\n\n         THIS I.T.A. COMING ON FOR HEARIN...  PREAMBLE
97    4183          \nS. SUJATHA J. DELIVERED THE FOLLOWING:-  PREAMBLE
98    4183       \n\n                                JUDGMENT  PREAMBLE
99    4183  \nMr. E.I.Sanmathi Adv. for Appellants- Revenu...      NONE

[100 rows x 3 columns]


In [None]:
from sklearn.preprocessing import LabelEncoder

# Keep the original string labels in a separate column. Encoding `label` in
# place would make this cell non-idempotent: on a second run the encoder would
# be fitted on the integers produced by the first run and the class names
# would be lost.
if 'label_name' not in df.columns:
    df['label_name'] = df['label']

# Initialize the label encoder
label_encoder = LabelEncoder()

# Fit on the string labels and store the integer ids in `label`
df['label'] = label_encoder.fit_transform(df['label_name'])

# `classes_` is sorted and the encoder assigns ids by position, so enumerating
# it reproduces the name -> id mapping with plain Python types.
label_mapping = {str(name): idx for idx, name in enumerate(label_encoder.classes_)}
print(label_mapping)

# Convert DataFrame to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

{'ANALYSIS': 0, 'ARG_PETITIONER': 1, 'ARG_RESPONDENT': 2, 'FAC': 3, 'ISSUE': 4, 'NONE': 5, 'PREAMBLE': 6, 'PRE_NOT_RELIED': 7, 'PRE_RELIED': 8, 'RATIO': 9, 'RLC': 10, 'RPC': 11, 'STA': 12}


In [None]:
import torch
import torch.nn as nn
from transformers import RobertaModel

class LegalRoBERTaWithPositionalEmbeddings(nn.Module):
    """Sequence classifier: a pretrained transformer encoder plus a linear head.

    The encoder's pooled output is passed through dropout and a linear layer
    that produces one logit per label.

    Args:
        model_name: Hub id or local path of the pretrained encoder checkpoint.
        num_labels: Number of output classes.
    """

    def __init__(self, model_name, num_labels):
        super().__init__()
        # Imported here so the class only needs `torch` at definition time.
        from transformers import AutoModel

        # AutoModel instantiates the architecture recorded in the checkpoint's
        # config. Forcing RobertaModel onto a BERT checkpoint (as was done with
        # "nlpaueb/legal-bert-base-uncased") loads none of the pretrained
        # weights: every encoder parameter is newly initialised.
        # The attribute keeps the name `roberta` so state_dict keys are stable.
        self.roberta = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.roberta.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, labels=None, absolute_pos=None, normalized_pos=None, k_quantile_pos=None):
        """Run the encoder and classify its pooled output.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Optional attention mask passed to the encoder.
            token_type_ids: Optional segment ids; forwarded to the encoder
                only when provided.
            labels: Optional class ids; when given, cross-entropy loss is
                computed against the logits.
            absolute_pos: Optional tensor; see the NOTE in the body.
            normalized_pos: Accepted but unused.
            k_quantile_pos: Accepted but unused.

        Returns:
            ``(loss, logits)`` when ``labels`` is given, otherwise ``logits``.
        """
        encoder_inputs = {"input_ids": input_ids, "attention_mask": attention_mask}
        # Forward segment ids only when the caller supplies them, so encoders
        # that do not take this argument keep working.
        if token_type_ids is not None:
            encoder_inputs["token_type_ids"] = token_type_ids
        outputs = self.roberta(**encoder_inputs)
        sequence_output = outputs[0]  # Last hidden state

        if absolute_pos is not None:
            absolute_pos_embeds = self._get_positional_embeddings(absolute_pos, sequence_output.size())
            # NOTE(review): the sum below is never read again. The classifier
            # consumes outputs[1], so positional information currently has no
            # effect on the logits. Left as-is pending a real design.
            sequence_output = sequence_output + absolute_pos_embeds

        # Assumes the encoder returns a pooled output at index 1 (true for the
        # BERT/RoBERTa base models) - TODO confirm for other architectures.
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        if labels is None:
            return logits

        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.classifier.out_features), labels.view(-1))
        return loss, logits

    def _get_positional_embeddings(self, pos, size):
        """Return placeholder positional embeddings of shape ``size``.

        The values are random noise drawn on every call and do not depend on
        the contents of ``pos``; only ``pos.device`` is used.

        Args:
            pos: Tensor whose device the result is moved to.
            size: ``(batch_size, seq_len, hidden_dim)`` of the target tensor.
        """
        batch_size, seq_len, hidden_dim = size
        positional_embeds = torch.randn(batch_size, seq_len, hidden_dim).to(pos.device)
        return positional_embeds


# Finetuning and Dataset-splitting

In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer
from datasets import DatasetDict

# Checkpoint shared by the encoder and the tokenizer. Note that this is a
# Legal-BERT checkpoint, despite the "RoBERTa" in the wrapper class name.
MODEL_NAME = "nlpaueb/legal-bert-base-uncased"
# Fixed seed so the train/eval/test partition is the same on every run.
SPLIT_SEED = 42

# Initialize the model
model = LegalRoBERTaWithPositionalEmbeddings(MODEL_NAME, num_labels=len(label_encoder.classes_))
# Load the tokenizer that matches the checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Step 1: Build the Hugging Face dataset from the encoded DataFrame
dataset = Dataset.from_pandas(df)

# Tokenization function
def preprocess_function(examples):
    """Tokenize a batch of segments, padded/truncated to 128 tokens."""
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

# Step 2: Tokenize the dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Step 3: Split the dataset into train (80%), eval (10%) and test (10%) sets.
# The held-out 20% is cut in half to obtain the eval and test parts.
split_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_test_dataset = split_dataset['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)

datasets = DatasetDict({
    'train': split_dataset['train'],
    'test': train_test_dataset['test'],
    'eval': train_test_dataset['train']
})





You are using a model of type bert to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.
Some weights of RobertaModel were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.laye

Map:   0%|          | 0/28986 [00:00<?, ? examples/s]

In [None]:

# Fine-tuning configuration: where artefacts go, then schedule, then optimiser.
training_args = TrainingArguments(
    # Output and reporting
    output_dir="./results_legalroberta_pos",
    logging_dir="./logs",
    report_to="wandb",
    # Schedule: evaluate once per epoch, checkpoint every 500 steps
    evaluation_strategy="epoch",
    num_train_epochs=3,
    save_steps=500,
    save_total_limit=1,
    # Optimisation: 8 samples per device, gradients accumulated over 4 steps
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    learning_rate=2e-5,
    weight_decay=0.01,
)

trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=datasets['eval'],
    train_dataset=datasets['train'],
)

# Fine-tune on the train split.
trainer.train()

# Score the held-out test split; as the cell's last expression the metric
# dict is displayed.
trainer.evaluate(datasets['test'])

# Precision, Recall, F1 score, Accuracy (Evaluating Model)

In [None]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from transformers import Trainer

# Define a compute_metrics function
def compute_metrics(p):
    """Compute accuracy and support-weighted precision/recall/F1.

    Args:
        p: Object exposing ``predictions`` (per-class scores, classes on the
            last axis) and ``label_ids`` (true class ids).

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    preds = p.predictions.argmax(-1)
    labels = p.label_ids

    # zero_division=0 scores a class with no predicted (or no true) samples as
    # 0 explicitly. This is the value the default already falls back to, but
    # without emitting an undefined-metric warning for every such class.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    accuracy = accuracy_score(labels, preds)

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

# Wrap the in-memory model in a fresh Trainer that knows about
# compute_metrics. No `args` are passed, so the Trainer falls back to its
# default TrainingArguments rather than `training_args`.
trainer = Trainer(
    compute_metrics=compute_metrics,
    eval_dataset=datasets['eval'],
    model=model,
)

# Score the eval split and show the resulting metric dict.
results = trainer.evaluate()
print(results)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mengineersaloni159[0m ([33msalonijnu[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


{'eval_loss': 2.69576096534729, 'eval_model_preparation_time': 0.0118, 'eval_accuracy': 0.012418075198344257, 'eval_precision': 0.0019636028983896553, 'eval_recall': 0.012418075198344257, 'eval_f1': 0.002906543491838534, 'eval_runtime': 20.7954, 'eval_samples_per_second': 139.406, 'eval_steps_per_second': 17.456}


In [None]:
# Display the module tree of the classifier (encoder, dropout, linear head).
model

LegalRoBERTaWithPositionalEmbeddings(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768, padding_idx=0)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [None]:
# Make predictions over the full tokenized dataset (i.e. before it was split,
# so train, eval and test rows are all included).
predictions = trainer.predict(tokenized_dataset)
# Pick the highest-scoring class id for each row.
predicted_labels = predictions.predictions.argmax(-1)

# Print predicted labels (integer class ids)
print(predicted_labels)

# Save the model to the Hugging Face Hub

In [None]:
pip install huggingface_hub




In [None]:
# Authenticate with the Hugging Face Hub; the CLI prompts for an access token.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: write).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your ter

In [None]:
from huggingface_hub import HfApi

# Define repository name and create the repo on the Hub.
repo_name = "Pos_RoBERt_for_rhetorical_role_labeling"  # Choose a unique name
hub_api = HfApi()
# exist_ok=True makes the cell re-runnable: without it a second run fails with
# "409 Conflict" because the repo already exists.
repo_url = hub_api.create_repo(repo_name, exist_ok=True)

# Upload the model. `model` is a plain torch.nn.Module, which has no
# `push_to_hub` method, so the weights are serialised to a file and that file
# is uploaded instead.
weights_file = "pytorch_model.bin"
torch.save(model.state_dict(), weights_file)
hub_api.upload_file(
    path_or_fileobj=weights_file,
    path_in_repo=weights_file,
    repo_id=repo_url.repo_id,  # fully qualified "<namespace>/<name>"
)
tokenizer.push_to_hub(repo_url.repo_id)

HfHubHTTPError: 409 Client Error: Conflict for url: https://huggingface.co/api/repos/create (Request ID: Root=1-66e04a33-610b3b7f6a0d253c3d11a0cd;178c80d2-c023-4c76-b50c-a8191ff366cc)

You already created this model repo

In [None]:
import os

from huggingface_hub import HfApi

# `model` is a plain torch.nn.Module and therefore has no `save_pretrained`.
# Write the weights and the tokenizer files into a local folder named after
# the repo, then push that folder to the Hub in one commit.
os.makedirs(repo_name, exist_ok=True)
torch.save(model.state_dict(), os.path.join(repo_name, "pytorch_model.bin"))
tokenizer.save_pretrained(repo_name)

folder_api = HfApi()
# exist_ok=True: reuse the repo when it was already created.
folder_repo_id = folder_api.create_repo(repo_name, exist_ok=True).repo_id
folder_api.upload_folder(folder_path=repo_name, repo_id=folder_repo_id)

('LegalRo-BERt_for_rhetorical_role_labeling/tokenizer_config.json',
 'LegalRo-BERt_for_rhetorical_role_labeling/special_tokens_map.json',
 'LegalRo-BERt_for_rhetorical_role_labeling/vocab.json',
 'LegalRo-BERt_for_rhetorical_role_labeling/merges.txt',
 'LegalRo-BERt_for_rhetorical_role_labeling/added_tokens.json',
 'LegalRo-BERt_for_rhetorical_role_labeling/tokenizer.json')

In [None]:
import torch

# Save the model's state dictionary (weights) to a local file. Only the
# parameters are written, not the class definition.
torch.save(model.state_dict(), "legal_roberta_positional.pth")

# Save the tokenizer files next to it so the same vocabulary can be reloaded.
tokenizer.save_pretrained("./legal_roberta_tokenizer")


('./legal_roberta_tokenizer/tokenizer_config.json',
 './legal_roberta_tokenizer/special_tokens_map.json',
 './legal_roberta_tokenizer/vocab.txt',
 './legal_roberta_tokenizer/added_tokens.json',
 './legal_roberta_tokenizer/tokenizer.json')