In [None]:
pip install datasets

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading x

In [None]:
import requests
import json
import pandas as pd
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder

# Download the dataset
url = "https://storage.googleapis.com/indianlegalbert/OPEN_SOURCED_FILES/Rhetorical_Role_Benchmark/Data/train.json"
response = requests.get(url)
json_data = response.json()

# Convert JSON to DataFrame
def json_to_dataframe(json_data):
    data = []
    for document in json_data:
        doc_id = document.get("id")
        for annotation in document.get("annotations", []):
            for result in annotation.get("result", []):
                segment = {
                    'doc_id': doc_id,
                    'text': result['value'].get('text'),
                    'label': result['value'].get('labels', [None])[0]  # Get the first label if available
                }
                data.append(segment)
    return pd.DataFrame(data)

df = json_to_dataframe(json_data)

# Encode the labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])


def add_positional_info(df):
    # Sort the dataframe by document ID to group sentences from the same document
    df = df.sort_values(by=['doc_id'])

    # Add absolute position, normalized position, and k-quantile position
    df['absolute_pos'] = df.groupby('doc_id').cumcount() + 1  # Starts from 1 for each document
    df['doc_length'] = df.groupby('doc_id')['text'].transform('count')
    df['normalized_pos'] = df['absolute_pos'] / df['doc_length']
    df['k_quantile_pos'] = pd.qcut(df['absolute_pos'], q=4, labels=False)  # 4-quantile position

    return df

# Apply positional information to the dataframe
df = add_positional_info(df)

# Drop the 'doc_length' column as it's not needed for tokenization
df = df.drop(columns=['doc_length'])

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df)


# Split the dataset into train, test, and eval sets
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
test_eval_split = train_test_split['test'].train_test_split(test_size=0.5, seed=42)

# Combine into a DatasetDict
split_dataset = {
    'train': train_test_split['train'],
    'test': test_eval_split['test'],
    'eval': test_eval_split['train']
}

# Verify the split
print("Train size:", len(split_dataset['train']))
print("Test size:", len(split_dataset['test']))
print("Eval size:", len(split_dataset['eval']))


Train size: 23188
Test size: 2899
Eval size: 2899


In [None]:
dataset.shape

(28986, 7)

In [None]:
# Add sentence position embeddings to the dataset
def add_position_embeddings(df):
    df['absolute_pos'] = df.groupby('doc_id').cumcount() + 1
    df['normalized_pos'] = df['absolute_pos'] / df.groupby('doc_id')['absolute_pos'].transform('max')

    def quantile_position(group):
        group['quantile_pos'] = pd.qcut(group['absolute_pos'], q=4, labels=False) + 1  # k=4 quantiles
        return group

    df = df.groupby('doc_id').apply(quantile_position)
    return df

df = add_position_embeddings(df)


  df = df.groupby('doc_id').apply(quantile_position)


In [None]:
df.head(28986)

Unnamed: 0_level_0,Unnamed: 1_level_0,doc_id,text,label,absolute_pos,normalized_pos,k_quantile_pos,quantile_pos
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1711,5130,1711,It would be even more anomalous to have an app...,0,1,0.005319,0,1
1711,5168,1711,As the Punjab Courts Act does not contemplate ...,0,2,0.010638,0,1
1711,5169,1711,The Court contemplated is the Court of the Add...,0,3,0.015957,0,1
1711,5170,1711,We hold therefore that the Court of the Additi...,0,4,0.021277,0,1
1711,5171,1711,"\n Now, as we have seen, when the original Cou...",0,5,0.026596,0,1
...,...,...,...,...,...,...,...,...
13658,9485,13658,The accused pleaded not guilty and claims to b...,0,69,0.945205,2,4
13658,9486,13658,"Hence, case has been posted for recording of e...",0,70,0.958904,2,4
13658,9487,13658,\n 7. In order to substantiate the case made o...,0,71,0.972603,2,4
13658,9489,13658,No defence evidence led.,5,72,0.986301,2,4


In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import torch.nn as nn
from transformers import BertModel

class BERTWithPositionalEmbeddings(nn.Module):
    def __init__(self, model_name, num_labels):
        super(BERTWithPositionalEmbeddings, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, labels=None, absolute_pos=None, normalized_pos=None, k_quantile_pos=None):
        # Pass through BERT
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        sequence_output = outputs[0]  # Last hidden state

        # Here you can incorporate positional information (absolute_pos, normalized_pos, k_quantile_pos) if necessary

        # Pass through classifier
        pooled_output = outputs[1]  # CLS token's output
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        # If labels are provided, calculate loss
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.classifier.out_features), labels.view(-1))

        # Return the loss and logits
        if loss is not None:
            return loss, logits
        else:
            return logits

# Load the tokenizer and model
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Initialize the model
model = BERTWithPositionalEmbeddings(model_name=model_name, num_labels=len(label_encoder.classes_))



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
model

BERTWithPositionalEmbeddings(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-

In [None]:

def preprocess_function(examples):
    tokenized_inputs = tokenizer(examples['text'], padding="max_length", truncation=True)
    tokenized_inputs['absolute_pos'] = examples['absolute_pos']
    return tokenized_inputs

# Tokenize the datasets
tokenized_datasets = {}
for split in ['train', 'test', 'eval']:
    tokenized_datasets[split] = split_dataset[split].map(preprocess_function, batched=True)



Map:   0%|          | 0/23188 [00:00<?, ? examples/s]

Map:   0%|          | 0/2899 [00:00<?, ? examples/s]

Map:   0%|          | 0/2899 [00:00<?, ? examples/s]

In [None]:
pip install wandb

Collecting wandb
  Downloading wandb-0.18.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl.metadata (1.8 kB)
Collecting gitpython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-2.14.0-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.9 kB)
Collecting gitdb<5,>=4.0.1 (from gitpython!=3.1.29,>=1.0.0->wandb)
  Downloading gitdb-4.0.11-py3-none-any.whl.metadata (1.2 kB)
Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->gitpython!=3.1.29,>=1.0.0->wandb)
  Downloading smmap-5.0.1-py3-none-any.whl.metadata (4.3 kB)
Downloading wandb-0.18.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_

In [None]:
from transformers import Trainer, TrainingArguments
import wandb

# Initialize wandb
#wandb.init(project="rhetorical-role-bert", entity="your_wandb_username")  # Replace with your WandB username and project

# Step 3: Setup Training Arguments
training_args = TrainingArguments(
    output_dir="./resultsLR_Pos_BERTagain",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    save_total_limit=1,  # Keep only the most recent model
    save_steps=500,  # Save checkpoint every 500 steps
    logging_dir="./logs",
    learning_rate=2e-5,#new add
    weight_decay=0.01,#new add
    warmup_steps=100,#new add
    report_to="wandb",  # Report to Weights & Biases
    logging_steps=100,  # Log every 100 steps
    evaluation_strategy="epoch",  # Evaluate at the end of each epoch
)

# Initialize the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['eval'],
)

# Train the model
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
0,1.1556,1.173184
1,0.9647,1.086361
2,0.8523,1.076046


TrainOutput(global_step=2172, training_loss=1.0977826425822839, metrics={'train_runtime': 6621.3699, 'train_samples_per_second': 10.506, 'train_steps_per_second': 0.328, 'total_flos': 0.0, 'train_loss': 1.0977826425822839, 'epoch': 2.996895481200414})

# Precision, Recall, F1 score, Accuracy (Evaluating Model)

In [None]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from transformers import Trainer


# Define a compute_metrics function
def compute_metrics(p):
    preds = p.predictions.argmax(-1)
    labels = p.label_ids

    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    accuracy = accuracy_score(labels, preds)

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

# If the model is already trained, just initialize the Trainer with the same model
trainer = Trainer(
    model=model,  # Use your already trained model
    #args=training_args,
    eval_dataset=tokenized_datasets['eval'],  # Use your evaluation dataset
    compute_metrics=compute_metrics  # Include the metrics function
)

# Evaluate the model
results = trainer.evaluate()

# Print the evaluation results
print(results)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mengineersaloni159[0m ([33msalonijnu[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


{'eval_loss': 2.6256320476531982, 'eval_model_preparation_time': 0.0038, 'eval_accuracy': 0.09451535012073128, 'eval_precision': 0.42194022216665367, 'eval_recall': 0.09451535012073128, 'eval_f1': 0.04571065162758925, 'eval_runtime': 79.6703, 'eval_samples_per_second': 36.387, 'eval_steps_per_second': 4.556}


In [None]:
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [None]:
# Make predictions
predictions = trainer.predict(tokenized_dataset)
predicted_labels = predictions.predictions.argmax(-1)

# Print predicted labels
print(predicted_labels)

KeyboardInterrupt: 

# Manually Evaluation of model

In [None]:
!pip uninstall torch torchvision torchaudio -y

In [None]:
!pip install torch torchvision torchaudio



In [None]:
pip install transformers



In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from datasets import load_dataset, Dataset
import pandas as pd
# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("engineersaloni159/Positional_BERT_for_rhetorical_role_labeling")
model = AutoModelForSequenceClassification.from_pretrained("engineersaloni159/Positional_BERT_for_rhetorical_role_labeling")


config.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [None]:
# Ensure model is in evaluation mode
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
#run this cell if model is directly called from  huggingface
from sklearn.preprocessing import LabelEncoder

# Assuming you already have this
label_encoder = LabelEncoder()

# Fit and transform the labels
df['label'] = label_encoder.fit_transform(df['label'])

# Print the mapping of labels to numbers
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print(label_mapping)

NameError: name 'df' is not defined

In [None]:
# Assuming label_encoder is the one you used for encoding labels earlier

# Example sentence
sentence =  "The High Court was then moved under s. 66 (2) of the Indian income-tax Act, 1922 and the High, Court heard the two applications together and directed the Tribunal to state a case on the following two questions which, in the opinion of the High Court, arose out of the Tribunal's orders.\n          (1) Whether the claim of loss in this case is governed by the provisions of S. 10(1) or 24(1) proviso read with s. 14(2)(c), or by the provisions of s. 42 ?\n (2) Whether on the facts of the case a loss of Rs. 22,981/- is allowable in computing the income of the assessee chargeable to the Excess Profits Tax ?"

# Tokenize the sentence
inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)

# Make predictions
with torch.no_grad():
    outputs = model(**inputs)

# The predictions are in the form of logits, you can convert them to probabilities
logits = outputs.logits
predicted_class = torch.argmax(logits, dim=1)

# Convert the predicted class number to the original label
predicted_label = label_encoder.inverse_transform(predicted_class.cpu().numpy())[0]

# Print the predicted label
print(f"Predicted label: {predicted_label}")


Predicted label: RLC


In [None]:
print("Logits:", logits)


Logits: tensor([[-0.4151, -1.8779, -1.5828,  2.4168,  2.6604, -0.4153,  0.8213, -0.6055,
         -0.4555, -1.2325,  2.7832, -0.7909, -0.9192]])
