In [30]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
import numpy as np
import evaluate

# Load the SCOTUS task from the LexGLUE benchmark.
dataset = load_dataset("coastalcph/lex_glue", "scotus")

# Load the LegalBERT tokenizer.
model_name = "nlpaueb/legal-bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Size the classification head from the dataset's own label schema instead of a
# hard-coded constant, so the head can never disagree with the label ids.
label_feature = dataset["train"].features["label"]
num_labels = getattr(label_feature, "num_classes", None)
if num_labels is None:
    # Fallback for plain integer labels (no ClassLabel metadata available).
    num_labels = max(dataset["train"]["label"]) + 1

# The classifier layer is freshly initialised on top of the pretrained encoder.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# Preprocessing function for tokenization
def preprocess_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Uses the module-level ``tokenizer``; every sequence is truncated and
    padded to the tokenizer's maximum length.

    Args:
        examples: mapping that contains a "text" entry.

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    texts = examples["text"]
    tokenized = tokenizer(texts, truncation=True, padding="max_length")
    return tokenized

# Tokenize the dataset; batched=True hands preprocess_function batches of
# examples instead of one example at a time.
encoded_dataset = dataset.map(preprocess_function, batched=True)

# Load the F1 metric once so compute_metrics can reuse it on every evaluation.
f1_metric = evaluate.load("f1")

# Compute metrics function
def compute_metrics(eval_pred):
    """Compute micro- and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: pair ``(logits, labels)``. The predicted class is the
            argmax of ``logits`` along axis 1 (assumes logits are laid out
            as examples x classes — TODO confirm).

    Returns:
        dict with the keys "micro-F1" and "macro-F1".
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=1)

    # (result key, averaging mode) pairs, evaluated in the same order as before.
    averaging_modes = (("micro-F1", "micro"), ("macro-F1", "macro"))

    scores = {}
    for key, mode in averaging_modes:
        result = f1_metric.compute(
            predictions=predicted_classes,
            references=labels,
            average=mode,
        )
        scores[key] = result["f1"]
    return scores

# Training configuration for the Trainer below.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
# NOTE(review): 1e-4 is above the 2e-5 to 5e-5 range commonly used when
# fine-tuning BERT-sized encoders — confirm this is intentional.
training_args = TrainingArguments(
    output_dir="./results",  # output directory for the run
    evaluation_strategy="epoch",  # evaluate once per epoch
    learning_rate=1e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",  # same cadence as evaluation
    logging_dir="./logs",
    logging_steps=50  # log every 50 steps
)

# Build the Trainer: fit on the "train" split, evaluate on the "validation"
# split, and report the F1 scores produced by compute_metrics.
# No tokenizer or data collator is passed here; preprocess_function already
# pads every example with padding="max_length".
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    compute_metrics=compute_metrics
)

# Fine-tune the model (long-running cell).
trainer.train()

# Evaluate the trained model on the "test" split and print the result.
test_results = trainer.evaluate(encoded_dataset["test"])
print(test_results)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1400 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [31]:
# Exploratory: list every attribute name available on the tokenizer object.
dir(tokenizer)

['SPECIAL_TOKENS_ATTRIBUTES',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_add_tokens',
 '_auto_class',
 '_batch_encode_plus',
 '_call_one',
 '_convert_encoding',
 '_convert_id_to_token',
 '_convert_token_to_id_with_added_voc',
 '_create_repo',
 '_decode',
 '_decode_use_source_tokenizer',
 '_encode_plus',
 '_eventual_warn_about_too_long_sequence',
 '_eventually_correct_t5_max_length',
 '_from_pretrained',
 '_get_files_timestamps',
 '_get_padding_truncation_strategies',
 '_in_target_context_manager',
 '_pad',
 '_pad_token_type_id',
 '_processor_class',
 '_save_pretrained',
 '_set_model_sp

In [40]:
# Print only the tokenizer attribute names that contain "max".
for attr_name in (name for name in dir(tokenizer) if 'max' in name):
    print(attr_name)

_eventually_correct_t5_max_length
max_len_sentences_pair
max_len_single_sentence
model_max_length


In [41]:
# Display the tokenizer's model_max_length attribute.
tokenizer.model_max_length

512

In [44]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# NOTE: this cell rebinds the global `tokenizer` and `model` to the LED
# summarization checkpoint, replacing the LegalBERT objects created earlier.
# Re-run the LegalBERT setup cell before calling preprocess_function again.
tokenizer = AutoTokenizer.from_pretrained("nsi319/legal-led-base-16384")
model = AutoModelForSeq2SeqLM.from_pretrained("nsi319/legal-led-base-16384")

padding = "max_length"

# Index the row first, then the column: dataset['train']['text'][0] would
# materialise the entire text column just to read a single document.
text = dataset['train'][0]['text']

# Call the tokenizer directly (rather than .encode) so the attention mask is
# returned alongside the input ids. The deprecated `pad_to_max_length` flag is
# dropped; `padding="max_length"` already requests the same padding.
encoded_input = tokenizer(text, return_tensors='pt', padding=padding, max_length=6144, truncation=True)
input_tokenized = encoded_input["input_ids"]

# Pass the attention mask explicitly so padded positions are ignored instead of
# relying on generate() to infer them from the pad token id.
summary_ids = model.generate(input_tokenized,
                             attention_mask=encoded_input["attention_mask"],
                             num_beams=4,
                             no_repeat_ngram_size=3,
                             length_penalty=2,
                             min_length=350,
                             max_length=500)

# A single document was submitted, so decode the first generated sequence.
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
print('-'*50 , f'Length of the original text : {len(text)}\n','-'*50 , f'Length of the summarized text : {len(summary)}\n')
### Summary Output

tokenizer_config.json:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/648M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/648M [00:00<?, ?B/s]

-------------------------------------------------- Length of the original text : 19208
 -------------------------------------------------- Length of the summarized text : 2382



In [45]:
# Display the generated summary text.
summary

'The U.S. District Court for the Western District of Oklahoma today entered an order granting the SEC\'s request for an asset freeze and other emergency relief.  The SEC\'s complaint, filed in federal court in Oklahoma City, alleges that Oklahoma-based Champlin Refining Company violated the antifraud provisions of Section 19a of the Interstate Commerce Act of 1940.  According to the complaint, Champlin is engaged in the transportation of oil or other similar commodities by pipe line from its refinery in Oklahoma to its own refinery in Iowa.  Champlin\'s complaint alleges that the federal agency issued an order requiring Champlin to furnish certain inventories, schedules, maps and charts of its pipe line property.  As alleged, the order did not apply to Champlin, and Champlin has never been asked to carry the products of another company or person.  In fact, the complaint alleges, the company has never carried any oil of its own, and the Commission has never deemed Champlin a "common car