## **Fine Tuning the pre-trained T5-Base Model**

### **Data Pre-processing**

In [None]:
# Import PyTorch and report how many CUDA devices it can see
import torch
print("Num GPUs Available: ", torch.cuda.device_count())


Num GPUs Available:  1


In [None]:
# Print the CUDA version string exposed by this PyTorch install
# (torch is already imported in the first cell)
print(torch.version.cuda)

11.8


In [None]:
# Ask PyTorch whether CUDA is usable; the boolean is shown as the cell's output
# (torch is already imported in the first cell)
torch.cuda.is_available()

True

In [None]:
# Report which GPU PyTorch will use, or say so when no CUDA device is usable
if not torch.cuda.is_available():
    print("No GPU available.")
else:
    # Device index 0 is the first visible CUDA device
    gpu_name = torch.cuda.get_device_name(0)
    print("GPU in use:", gpu_name)


GPU in use: NVIDIA GeForce RTX 3050 Laptop GPU


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import torch

In [None]:
#torch.cuda.empty_cache()

In [None]:
# Read the labelled law dataset (law text, domain and reference summary) from disk
csv_path = "Law_Dataset.csv"
df = pd.read_csv(csv_path)

# Preview the first five rows (head() defaults to n=5)
df.head()

Unnamed: 0,Key,Law,Domain,Law_Summary,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25
0,1,"(1) Where any person having sufficient means, ...",Maintenance,If an individual with sufficient means neglect...,,,,,,,...,,,,,,,,,,
1,2,No Order for an allowance for the maintenance ...,Maintenance,"Any maintenance allowance order for a child, a...",,,,,,,...,,,,,,,,,,
2,3,An application for maintenance may be made:\n(...,Maintenance,"A child or disabled offspring, along with the ...",,,,,,,...,,,,,,,,,,
3,4,An application for maintenance may be made to ...,Maintenance,Applications for maintenance must be submitted...,,,,,,,...,,,,,,,,,,
4,5,(1) Where any person against whom neglects to ...,Maintenance,The consequences for non-compliance with a mai...,,,,,,,...,,,,,,,,,,


In [None]:
# Intended to rename the third column to "Legal_Domain".
# NOTE(review): the frame displayed above has no "Unnamed: 2" column (its third
# column is "Domain"), and this cell's output still shows "Domain", so the rename
# appears to be a silent no-op. Confirm whether {"Domain": "Legal_Domain"} was
# meant before relying on a "Legal_Domain" column downstream.
df = df.rename(columns={"Unnamed: 2": "Legal_Domain"})
df.head(5)

Unnamed: 0,Key,Law,Domain,Law_Summary,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25
0,1,"(1) Where any person having sufficient means, ...",Maintenance,If an individual with sufficient means neglect...,,,,,,,...,,,,,,,,,,
1,2,No Order for an allowance for the maintenance ...,Maintenance,"Any maintenance allowance order for a child, a...",,,,,,,...,,,,,,,,,,
2,3,An application for maintenance may be made:\n(...,Maintenance,"A child or disabled offspring, along with the ...",,,,,,,...,,,,,,,,,,
3,4,An application for maintenance may be made to ...,Maintenance,Applications for maintenance must be submitted...,,,,,,,...,,,,,,,,,,
4,5,(1) Where any person against whom neglects to ...,Maintenance,The consequences for non-compliance with a mai...,,,,,,,...,,,,,,,,,,


In [None]:
# Remove every column whose values are all missing (how="all" keeps partially filled columns)
df = df.dropna(axis="columns", how="all")

# Preview the trimmed frame
df.head()

Unnamed: 0,Key,Law,Domain,Law_Summary,Unnamed: 25
0,1,"(1) Where any person having sufficient means, ...",Maintenance,If an individual with sufficient means neglect...,
1,2,No Order for an allowance for the maintenance ...,Maintenance,"Any maintenance allowance order for a child, a...",
2,3,An application for maintenance may be made:\n(...,Maintenance,"A child or disabled offspring, along with the ...",
3,4,An application for maintenance may be made to ...,Maintenance,Applications for maintenance must be submitted...,
4,5,(1) Where any person against whom neglects to ...,Maintenance,The consequences for non-compliance with a mai...,


In [None]:
# Drop the leftover "Unnamed: 25" column, which survived the all-NaN filter above.
# errors="ignore" makes this a no-op if the column is already gone, so the cell can be re-run.
df = df.drop(columns=["Unnamed: 25"], errors="ignore")

df.head()

Unnamed: 0,Key,Law,Domain,Law_Summary
0,1,"(1) Where any person having sufficient means, ...",Maintenance,If an individual with sufficient means neglect...
1,2,No Order for an allowance for the maintenance ...,Maintenance,"Any maintenance allowance order for a child, a..."
2,3,An application for maintenance may be made:\n(...,Maintenance,"A child or disabled offspring, along with the ..."
3,4,An application for maintenance may be made to ...,Maintenance,Applications for maintenance must be submitted...
4,5,(1) Where any person against whom neglects to ...,Maintenance,The consequences for non-compliance with a mai...


In [None]:
# Count missing values per column; all zeros means every row has law text and a summary
null_values = df.isna().sum()
null_values

Key            0
Law            0
Domain         0
Law_Summary    0
dtype: int64

### **Train the Model**

# Split the dataset into training and validation sets:
# test_size=0.1 holds out 10% of the rows for validation, and the fixed
# random_state=42 seeds the shuffle so the split is the same on every run.
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

# Define labeled law dataset class
class LawDataset(Dataset):
    """Map-style dataset that turns (law text, summary) rows into T5 training tensors.

    Args:
        dataframe: Frame with a "Law" column (model input) and a "Law_Summary"
            column (target text).
        tokenizer: Tokenizer providing ``encode_plus``, ``encode`` and
            ``pad_token_id`` (a ``T5Tokenizer`` in this notebook).
        max_input_length: Inputs are truncated/padded to exactly this many tokens.
        max_target_length: Targets are truncated/padded to exactly this many tokens.
    """

    # Label value the model's loss skips; used to blank out target padding.
    IGNORE_INDEX = -100

    def __init__(self, dataframe, tokenizer, max_input_length=512, max_target_length=150):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length

    def __len__(self):
        """Return the number of (law, summary) pairs."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` (positional) and return 1-D tensors.

        Returns:
            dict with "input_ids" and "attention_mask" of length
            ``max_input_length`` and "labels" of length ``max_target_length``,
            where padding positions in "labels" are set to ``IGNORE_INDEX``.
        """
        # Fetch the row once instead of once per column.
        row = self.dataframe.iloc[idx]

        # Tokenize and prepare inputs for the model
        inputs = self.tokenizer.encode_plus(
            row["Law"],
            max_length=self.max_input_length,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
        )

        # Tokenize and prepare targets for the model
        targets = self.tokenizer.encode(
            row["Law_Summary"],
            max_length=self.max_target_length,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
        )

        # Padding must not contribute to the loss: without this, most of each
        # 150-token target is padding and the model is mainly trained to emit it.
        labels = targets.flatten()
        labels = labels.masked_fill(labels == self.tokenizer.pad_token_id, self.IGNORE_INDEX)

        return {
            "input_ids": inputs["input_ids"].flatten(),
            "attention_mask": inputs["attention_mask"].flatten(),
            "labels": labels,
        }

# Load the pretrained "t5-base" tokenizer and sequence-to-sequence model
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")

# Wrap each split in a LawDataset and batch it four examples at a time;
# only the training split is shuffled
train_dataset = LawDataset(train_df, tokenizer)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=4)

val_dataset = LawDataset(val_df, tokenizer)
val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=4)

# Training hyperparameters: number of epochs and the optimizer over this model's parameters
epochs = 3
optimizer = torch.optim.AdamW(params=model.parameters(), lr=5e-5)

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)
# Move the model's parameters to the selected device (batches are not moved here)
model.to(device)

# Training loop: one pass over the training set, then one over the validation set, per epoch
for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}"):
        optimizer.zero_grad()
        # The model was moved to `device` above, while the DataLoader yields CPU
        # tensors; every batch tensor must be moved too or the forward pass
        # fails with a device-mismatch error when `device` is CUDA.
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Passing `labels` makes the model compute the loss itself
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch
    average_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}, Training Average Loss: {average_loss}")

    # Validation loop: eval mode and no gradient tracking
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc="Validation"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

    average_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch + 1}, Validation Average Loss: {average_val_loss}")

# Save the fine-tuned model
# NOTE(review): "/content/drive/..." looks like a Colab Drive mount path — confirm it exists where this runs.
model.save_pretrained("/content/drive/fine_tuned_T5_law_model")


In [None]:
# Split the dataset into training and validation sets:
# test_size=0.1 holds out 10% of the rows for validation, and the fixed
# random_state=42 seeds the shuffle so the split is the same on every run.
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

In [None]:
# Define your labeled law dataset class
class LawDataset(Dataset):
    """Dataset of (law text, reference summary) pairs tokenized for T5 fine-tuning.

    Args:
        dataframe: Frame with a "Law" column (model input) and a "Law_Summary"
            column (target text).
        tokenizer: Tokenizer providing ``encode_plus``, ``encode`` and
            ``pad_token_id`` (a ``T5Tokenizer`` in this notebook).
        max_input_length: Fixed token length inputs are truncated/padded to.
        max_target_length: Fixed token length targets are truncated/padded to.
    """

    # Label value the model's loss skips; used to blank out target padding.
    IGNORE_INDEX = -100

    def __init__(self, dataframe, tokenizer, max_input_length=512, max_target_length=150):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Tokenize the row at position ``idx`` and return flat tensors.

        Returns:
            dict with "input_ids" and "attention_mask" (length
            ``max_input_length``) and "labels" (length ``max_target_length``)
            in which padding positions are replaced by ``IGNORE_INDEX``.
        """
        # One positional lookup serves both columns.
        record = self.dataframe.iloc[idx]
        law_text = record["Law"]
        summary = record["Law_Summary"]

        # Tokenize and prepare inputs for the model
        inputs = self.tokenizer.encode_plus(
            law_text,
            max_length=self.max_input_length,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
        )

        # Tokenize and prepare targets for the model
        targets = self.tokenizer.encode(
            summary,
            max_length=self.max_target_length,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
        )

        # Exclude target padding from the loss; otherwise the padded tail of
        # every summary is scored as if it were real text to be predicted.
        labels = targets.flatten()
        labels = labels.masked_fill(labels == self.tokenizer.pad_token_id, self.IGNORE_INDEX)

        return {
            "input_ids": inputs["input_ids"].flatten(),
            "attention_mask": inputs["attention_mask"].flatten(),
            "labels": labels,
        }


In [None]:
# Initialize tokenizer and model from the pretrained "t5-base" checkpoint.
# The optimizer in the "Define training parameters" cell is built from this
# `model` object's parameters, so this is the instance that must be trained.
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Training hyperparameters: three passes over the training set, and an AdamW
# optimizer over the parameters of the `model` object that exists at this point
epochs = 3
optimizer = torch.optim.AdamW(params=model.parameters(), lr=5e-5)

In [None]:
# Reuse the `tokenizer` and `model` created in the "Initialize tokenizer and model" cell.
# They were previously re-loaded here, which replaced `model` with a new object
# *after* the optimizer had been built from the old one's parameters: the optimizer
# then stepped weights that were never trained, and t5-base was loaded twice.

# Create datasets and dataloaders for training and validation
train_dataset = LawDataset(train_df, tokenizer)
val_dataset = LawDataset(val_df, tokenizer)

# Batches of 4; only the training split is shuffled
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=4, shuffle=False)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Move model to GPU if available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

# Bind the optimizer to the model that is actually trained below. An optimizer only
# updates the parameter tensors it was constructed with; if `model` was re-created
# after the optimizer was defined, step() silently changes nothing. The recorded run
# showed exactly that: the validation loss was identical in all three epochs.
# Same learning rate as the "Define training parameters" cell.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Training loop: one pass over the training set, then one over the validation set, per epoch
for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}"):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)  # Move input to the same device as the model
        attention_mask = batch["attention_mask"].to(device)  # Move attention mask to the same device as the model
        labels = batch["labels"].to(device)  # Move labels to the same device as the model

        # Passing `labels` makes the model compute the loss itself
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch
    average_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}, Training Average Loss: {average_loss}")

    # Validation loop: eval mode and no gradient tracking
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc="Validation"):
            input_ids = batch["input_ids"].to(device)  # Move input to the same device as the model
            attention_mask = batch["attention_mask"].to(device)  # Move attention mask to the same device as the model
            labels = batch["labels"].to(device)  # Move labels to the same device as the model

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

    average_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch + 1}, Validation Average Loss: {average_val_loss}")

# Save the fine-tuned model
# NOTE(review): absolute, user-specific Windows path — this only works on the original
# author's machine; consider a path relative to the notebook. Left unchanged in case
# later cells load the model from this exact location.
model.save_pretrained(r"C:\Users\lafri\Shamini DSGP\fine_tuned_T5_law_model")


Epoch 1: 100%|██████████| 86/86 [04:46<00:00,  3.33s/it]


Epoch 1, Training Average Loss: 9.217517841693967


Validation: 100%|██████████| 10/10 [00:07<00:00,  1.27it/s]


Epoch 1, Validation Average Loss: 9.779259538650512


Epoch 2: 100%|██████████| 86/86 [04:45<00:00,  3.32s/it]


Epoch 2, Training Average Loss: 9.216380596160889


Validation: 100%|██████████| 10/10 [00:08<00:00,  1.16it/s]


Epoch 2, Validation Average Loss: 9.779259538650512


Epoch 3: 100%|██████████| 86/86 [04:45<00:00,  3.32s/it]


Epoch 3, Training Average Loss: 9.201401649519454


Validation: 100%|██████████| 10/10 [00:10<00:00,  1.09s/it]


Epoch 3, Validation Average Loss: 9.779259538650512


import pandas as pd

csv_path = "Law_DataSet_utf8.csv"  # Replace with your actual CSV file path

# Try different encodings (e.g., 'utf-8', 'latin1', 'ISO-8859-1') in order and keep
# the first one that decodes the file.
# Note: 'latin1' maps every byte value to a character, so it is expected to succeed
# whenever it is reached; 'ISO-8859-1' is another name for the same codec.
encodings = ['utf-8', 'latin1', 'ISO-8859-1']

for encoding in encodings:
    try:
        df = pd.read_csv(csv_path, encoding=encoding)
    except UnicodeDecodeError:
        # This encoding cannot decode the file; try the next candidate
        continue
    # Report which encoding worked
    print(encoding)
    break
else:
    # No candidate worked: fail loudly instead of silently showing a stale `df`
    raise ValueError(f"Could not decode {csv_path!r} with any of {encodings}")

df.head(10)
