In [1]:
# pip install transformers datasets



In [1]:
import torch
import pandas as pd
from datasets import load_dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [7]:
# loading dataset scotus which is "Supreme Court of United States"
dataset = load_dataset("lex_glue", "scotus")

In [8]:
train_dataset = dataset['train']
test_dataset = dataset['test']
validation_dataset = dataset['validation']

In [10]:
# Let's inspect it to see the features and labels ('issueArea')
print(train_dataset)
print(f"Example text: {train_dataset[0]['text']}")
print(f"Example label: {train_dataset[0]['label']}")

Dataset({
    features: ['text', 'label'],
    num_rows: 5000
})
Example text: 329 U.S. 29
67 S.Ct. 1
91 L.Ed. 22
CHAMPLIN REFINING COv.UNITED STATES et al.
No. 21.
Argued Oct. 18, 21, 1946.
Decided Nov. 18, 1946.
Rehearing Denied Dec. 16, 1946.

See 329 U.S. 831, 67 S.Ct. 363.
Appeal from the District Court of the United States for the Western District of Oklahoma.
Messrs.Dan Moody, of Austin, Tex., and Harry O. Glasser, of Enid, Okla., for appellant.
Mr. Edward Dumbauld, of Washington, D.C., for appel-
[Argument of Counsel from page 30 intentionally omitted]
lees. Mr. Justice JACKSON delivered the opinion of the Court.


1
The Interstate Commerce Commission, acting under § 19a of the Interstate Commerce Act,1 ordered the appellant to furnish certain inventories, schedules, maps and charts of its pipe line property.2 Champlin's objections that the Act does not authorize the order, or if it be construed to do so is unconstitutional, were overruled by the Commission and again by the Dis

In [12]:
# preprocessing with transformer tokenizer

# tokenizer loading
model_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [16]:
# mapping from label names to integer
labels = train_dataset.features['label'].names
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for i, label in enumerate(labels)}

In [21]:
# tokenization function
def tokenize_function(examples):
  # this tokenizer handles padding and truncates for us
  tokenized_inputs = tokenizer(examples['text'], padding='max_length', truncation=True, max_length=512)
  tokenized_inputs['labels'] = examples["label"] # Use integer labels directly
  return tokenized_inputs

In [22]:
# applying the tokenization to entire dataset
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1400 [00:00<?, ? examples/s]

In [23]:
# fine tunning the model

# computing metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [24]:
# loading the pretrainned model with correct number of labels
model = DistilBertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',          # Directory for model checkpoints
    num_train_epochs=1,              # A single epoch is often enough for fine-tuning
    per_device_train_batch_size=8,   # Adjust based on your GPU memory
    per_device_eval_batch_size=8,
    warmup_steps=500,                # Number of steps for learning rate warmup
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    eval_strategy="steps",     # Evaluate at each logging_steps
    save_strategy="steps",
    load_best_model_at_end=True,
)

In [27]:
# creating a trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    compute_metrics=compute_metrics,
)

In [28]:
# starting fine tunning
trainer.train()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdhruvmali999[0m ([33mdhruvmali999-adani-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
100,2.4103,2.157635,0.361429,0.226488,0.165206,0.361429
200,1.996,1.74537,0.470714,0.345331,0.356256,0.470714
300,1.5441,1.460418,0.549286,0.460531,0.454744,0.549286
400,1.4092,1.424471,0.57,0.485751,0.471823,0.57
500,1.1584,1.368591,0.595,0.544844,0.520083,0.595
600,1.143,1.229659,0.627143,0.576774,0.548417,0.627143


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=625, training_loss=1.5868224487304687, metrics={'train_runtime': 548.7569, 'train_samples_per_second': 9.112, 'train_steps_per_second': 1.139, 'total_flos': 662466923520000.0, 'train_loss': 1.5868224487304687, 'epoch': 1.0})

In [29]:
# training part over
# now nlp part

In [30]:
from transformers import pipeline

# loading pre trained questions answering pipeline
qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Device set to use cuda:0


In [31]:
# Let's use a sample legal text as our context
# This is a simplified excerpt related to the Fourth Amendment
legal_context = """
The right of the people to be secure in their persons, houses, papers, and effects,
against unreasonable searches and seizures, shall not be violated, and no Warrants shall issue,
but upon probable cause, supported by Oath or affirmation, and particularly describing the
place to be searched, and the persons or things to be seized. The primary purpose of this
amendment is to protect citizens from arbitrary governmental intrusions. A search or seizure is
generally considered to be unreasonable without a warrant.
"""

In [32]:
# Question 1
question1 = "What right is protected by the amendment?"
result1 = qa_pipeline(question=question1, context=legal_context)
print(f"Q: {question1}")
print(f"A: {result1['answer']} (Score: {result1['score']:.4f})")
#

# Question 2
question2 = "What is required for a warrant to be issued?"
result2 = qa_pipeline(question=question2, context=legal_context)
print(f"\nQ: {question2}")
print(f"A: {result2['answer']} (Score: {result2['score']:.4f})")

# Question 3
question3 = "When is a search generally considered unreasonable?"
result3 = qa_pipeline(question=question3, context=legal_context)
print(f"\nQ: {question3}")
print(f"A: {result3['answer']} (Score: {result3['score']:.4f})")

Q: What right is protected by the amendment?
A: citizens from arbitrary governmental intrusions (Score: 0.1047)

Q: What is required for a warrant to be issued?
A: A search or seizure (Score: 0.1447)

Q: When is a search generally considered unreasonable?
A: without a warrant (Score: 0.8234)
