<a href="https://colab.research.google.com/github/Thoran37/Multi-Agent-Law-Framework/blob/main/mlc/T5_model_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q transformers datasets accelerate tensorboard sentencepiece
!pip install -q torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/75.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.1/75.1 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m124.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.7/119.7 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.3/150.3 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m172.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
from datasets import load_dataset

dataset = load_dataset("csv", data_files={
    "train": "/content/LaCour_merged.csv"
})
dataset


Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['case_id', 'speaker_role', 'prev_speaker_role', 'start_time', 'end_time', 'duration', 'text', 'turn', 'role_turn', 'file'],
        num_rows: 1215
    })
})

In [4]:
def format_example(example):
    example["input_text"] = f"{example['speaker_role']}: {example['text']}"
    example["target_text"] = example["text"]
    return example

dataset = dataset.map(format_example)


Map:   0%|          | 0/1215 [00:00<?, ? examples/s]

In [5]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

model_name = "google/flan-t5-small"

tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)


  * **h_n**: tensor of shape :math:`(D * \text{num\_layers}, H_{out})` or
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [6]:
def tokenize(batch):
    model_inputs = tokenizer(
        batch["input_text"],
        padding="max_length",
        truncation=True,
        max_length=256
    )
    labels = tokenizer(
        batch["target_text"],
        padding="max_length",
        truncation=True,
        max_length=256
    )["input_ids"]

    model_inputs["labels"] = labels
    return model_inputs

tokenized = dataset.map(tokenize, batched=True, remove_columns=dataset["train"].column_names)
tokenized


Map:   0%|          | 0/1215 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1215
    })
})

In [7]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="t5_court_model",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    learning_rate=3e-4,
    num_train_epochs=3,
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,
    report_to="tensorboard",
    optim="adafactor",
    tpu_num_cores=8,     # TPU
)


In [8]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
)


In [9]:
trainer.train()




Step,Training Loss
50,2.7141
100,0.3717
150,0.1477
200,0.1062


TrainOutput(global_step=228, training_loss=0.744507516685285, metrics={'train_runtime': 262.7001, 'train_samples_per_second': 13.887, 'train_steps_per_second': 0.868, 'total_flos': 338785444823040.0, 'train_loss': 0.744507516685285, 'epoch': 3.0})

In [10]:
trainer.save_model("t5_court_finetuned")
tokenizer.save_pretrained("t5_court_finetuned")


('t5_court_finetuned/tokenizer_config.json',
 't5_court_finetuned/special_tokens_map.json',
 't5_court_finetuned/spiece.model',
 't5_court_finetuned/added_tokens.json')

In [11]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
model_path = "t5_court_finetuned"
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

In [12]:
text = "judge: State your point clearly."

inputs = tokenizer(text, return_tensors="pt")
outputs = model.generate(**inputs, max_length=50)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))


Judge: State your point clearly.
