## **Fine Tuning the pre-trained T5-Base Model**

### **Data Pre-processing**

In [None]:
import torch
# Report how many CUDA devices PyTorch can see; 0 means no GPU is visible to this kernel.
print("Num GPUs Available: ", torch.cuda.device_count())


In [None]:
# Print the CUDA version reported by this PyTorch build (torch is imported in the first cell).
print(torch.version.cuda)

In [None]:
# Display whether PyTorch can use CUDA in this session (torch is imported in the first cell).
torch.cuda.is_available()

In [None]:
# Report which GPU PyTorch will use, or say that none is available
if not torch.cuda.is_available():
    print("No GPU available.")
else:
    # Device index 0 is the first CUDA device PyTorch can see
    gpu_name = torch.cuda.get_device_name(0)
    print("GPU in use:", gpu_name)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import torch

In [None]:
#torch.cuda.empty_cache()

In [None]:
# Load labeled law dataset from CSV
# The path is relative, so the CSV has to sit in the notebook's working directory.
csv_path = "Law_Dataset.csv"
df = pd.read_csv(csv_path)
# Preview the first rows; the file also carries many "Unnamed: N" columns, handled in the next cells.
df.head(5)

Unnamed: 0,Key,Law,Domain,Law_Summary,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25
0,1,"(1) Where any person having sufficient means, ...",Maintenance,If an individual with sufficient means neglect...,,,,,,,...,,,,,,,,,,
1,2,No Order for an allowance for the maintenance ...,Maintenance,"Any maintenance allowance order for a child, a...",,,,,,,...,,,,,,,,,,
2,3,An application for maintenance may be made:\n(...,Maintenance,"A child or disabled offspring, along with the ...",,,,,,,...,,,,,,,,,,
3,4,An application for maintenance may be made to ...,Maintenance,Applications for maintenance must be submitted...,,,,,,,...,,,,,,,,,,
4,5,(1) Where any person against whom neglects to ...,Maintenance,The consequences for non-compliance with a mai...,,,,,,,...,,,,,,,,,,


In [None]:
# Rename a column called "Unnamed: 2" to "Legal_Domain", if the frame has one.
# NOTE(review): the preview of the loaded CSV lists Key, Law, Domain, Law_Summary and
# "Unnamed: 4" onwards — there is no "Unnamed: 2" — so this rename matches nothing and
# the frame is unchanged (the third column stays "Domain"). Confirm whether the rename
# is still wanted.
df = df.rename(columns={"Unnamed: 2": "Legal_Domain"})
df.head(5)

Unnamed: 0,Key,Law,Domain,Law_Summary,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25
0,1,"(1) Where any person having sufficient means, ...",Maintenance,If an individual with sufficient means neglect...,,,,,,,...,,,,,,,,,,
1,2,No Order for an allowance for the maintenance ...,Maintenance,"Any maintenance allowance order for a child, a...",,,,,,,...,,,,,,,,,,
2,3,An application for maintenance may be made:\n(...,Maintenance,"A child or disabled offspring, along with the ...",,,,,,,...,,,,,,,,,,
3,4,An application for maintenance may be made to ...,Maintenance,Applications for maintenance must be submitted...,,,,,,,...,,,,,,,,,,
4,5,(1) Where any person against whom neglects to ...,Maintenance,The consequences for non-compliance with a mai...,,,,,,,...,,,,,,,,,,


In [None]:
# Drop all empty columns
# how='all' removes a column only when every one of its values is missing.
df = df.dropna(axis=1, how='all')
# Display the modified DataFrame
df.head(5)

Unnamed: 0,Key,Law,Domain,Law_Summary,Unnamed: 25
0,1,"(1) Where any person having sufficient means, ...",Maintenance,If an individual with sufficient means neglect...,
1,2,No Order for an allowance for the maintenance ...,Maintenance,"Any maintenance allowance order for a child, a...",
2,3,An application for maintenance may be made:\n(...,Maintenance,"A child or disabled offspring, along with the ...",
3,4,An application for maintenance may be made to ...,Maintenance,Applications for maintenance must be submitted...,
4,5,(1) Where any person against whom neglects to ...,Maintenance,The consequences for non-compliance with a mai...,


In [None]:
# Drop the leftover "Unnamed: 25" column, which survived the all-empty filter above
# (so it presumably holds a stray value somewhere outside the previewed rows).
# errors="ignore" keeps this cell safe to re-run once the column is already gone.
df = df.drop("Unnamed: 25", axis=1, errors="ignore")

df.head()

Unnamed: 0,Key,Law,Domain,Law_Summary
0,1,"(1) Where any person having sufficient means, ...",Maintenance,If an individual with sufficient means neglect...
1,2,No Order for an allowance for the maintenance ...,Maintenance,"Any maintenance allowance order for a child, a..."
2,3,An application for maintenance may be made:\n(...,Maintenance,"A child or disabled offspring, along with the ..."
3,4,An application for maintenance may be made to ...,Maintenance,Applications for maintenance must be submitted...
4,5,(1) Where any person against whom neglects to ...,Maintenance,The consequences for non-compliance with a mai...


In [None]:
# Count missing values per remaining column before training on them.
null_values = df.isnull().sum()
null_values

Key            0
Law            0
Domain         0
Law_Summary    0
dtype: int64

### **Train the Model**

In [None]:
# Split the dataset into training and validation sets
# 10% of the rows are held out for validation; the fixed random_state makes the split repeatable.
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

In [None]:
# Define your labeled law dataset class
class LawDataset(Dataset):
    """Pairs each law text with its reference summary as fixed-length training tensors.

    Args:
        dataframe: pandas DataFrame with a "Law" column (model input) and a
            "Law_Summary" column (target text).
        tokenizer: tokenizer exposing ``encode_plus`` and ``encode`` (the notebook
            passes a ``T5Tokenizer``).
        max_input_length: inputs are padded/truncated to this many tokens.
        max_target_length: targets are padded/truncated to this many tokens.

    Each item is a dict of 1-D tensors: "input_ids", "attention_mask" and "labels".
    Padding positions in "labels" are set to ``IGNORE_INDEX`` so that they do not
    contribute to the training loss.
    """

    # Label value that the loss used by T5ForConditionalGeneration skips.
    IGNORE_INDEX = -100

    def __init__(self, dataframe, tokenizer, max_input_length=512, max_target_length=150):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length

    def __len__(self):
        """Number of (law, summary) pairs."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Tokenize the row at positional index ``idx`` and return its tensors."""
        # Look the row up once instead of once per column.
        row = self.dataframe.iloc[idx]
        law_text = row["Law"]
        summary = row["Law_Summary"]

        # Tokenize and prepare inputs for the model
        inputs = self.tokenizer.encode_plus(
            law_text,
            max_length=self.max_input_length,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
        )

        # Tokenize and prepare targets for the model
        targets = self.tokenizer.encode(
            summary,
            max_length=self.max_target_length,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
        )

        # Targets are padded to a fixed length; without masking, the model would be
        # trained (and its loss averaged) mostly on predicting padding tokens.
        labels = targets.flatten().clone()
        pad_token_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_token_id is not None:
            labels[labels == pad_token_id] = self.IGNORE_INDEX

        return {
            "input_ids": inputs["input_ids"].flatten(),
            "attention_mask": inputs["attention_mask"].flatten(),
            "labels": labels,
        }


In [None]:
# Initialize tokenizer and model
# Both are loaded from the pretrained "t5-base" checkpoint; fine-tuning starts from these weights.
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")

In [None]:
# Define training parameters
# AdamW over the parameters of the `model` object that exists at this point.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
epochs = 3

# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Move the model to the selected device (only the model is moved here, not the data)
model.to(device)

In [None]:
# Initialize tokenizer and model
# NOTE(review): this loads a second T5 instance and rebinds `model` to it. The optimizer
# built in the previous cell holds the first instance's parameters, and only that first
# instance was moved to `device`; anything that trains this `model` has to account for
# both. Confirm whether this re-initialization is intended.
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")

# Create datasets and dataloaders for training and validation
train_dataset = LawDataset(train_df, tokenizer)
val_dataset = LawDataset(val_df, tokenizer)

# Only the training batches are shuffled; validation order stays fixed.
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=4, shuffle=False)

In [None]:
# Training loop
# `model` may have been re-created after `optimizer` and `device` were set up. Make sure
# the instance trained here lives on `device` and that the optimizer really holds its
# parameters; otherwise optimizer.step() silently updates nothing.
model.to(device)
optimizer_param_ids = {id(p) for group in optimizer.param_groups for p in group["params"]}
if any(id(p) not in optimizer_param_ids for p in model.parameters()):
    # Rebuild with the learning rate the stale optimizer was configured with.
    optimizer = torch.optim.AdamW(model.parameters(), lr=optimizer.defaults["lr"])

for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}"):
        optimizer.zero_grad()
        # The DataLoader yields CPU tensors; they must sit on the same device as the model.
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Passing `labels` makes the model return the loss directly.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch
    average_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}, Training Average Loss: {average_loss}")

    # Validation loop (no gradient tracking, dropout disabled via eval mode)
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc="Validation"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

    average_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch + 1}, Validation Average Loss: {average_val_loss}")

# Save the fine-tuned model
# NOTE(review): absolute, machine-specific Windows path — adjust when running elsewhere.
# Only the model is saved; later cells load the tokenizer from "t5-base".
model.save_pretrained(r"C:\Users\lafri\Shamini DSGP\fine_tuned_T5_law_model")


### **Testing the Fine-tuned model**

In [None]:
from google.colab import files

# Upload the zip file through Colab's upload helper (this cell only works inside Google Colab).
# NOTE(review): `uploaded` is not read by any later cell shown here; the archive is
# opened from Google Drive instead — confirm whether this upload step is still needed.
uploaded = files.upload()

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the model archive stored there can be read (Colab only).
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import zipfile
import os

# Archive of the fine-tuned model, stored in Google Drive
zip_path = '/content/drive/MyDrive/fine_tuned_T5_law_model-20240320T093755Z-001.zip'

# Folder that receives the extracted files
extract_path = '/content/model/'

# Extract every member; the context manager closes the archive afterwards
with zipfile.ZipFile(zip_path, 'r') as archive:
    archive.extractall(extract_path)


In [None]:
# Import required libraries
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the fine-tuned T5 model
model_path = '/content/model/fine_tuned_T5_law_model'  # Path to the directory containing the model files
model = T5ForConditionalGeneration.from_pretrained(model_path)

# Prepare input text (a single law passage; the leading/trailing newlines are part of the input)
input_text = """
Every application for an order of maintenance or to enforce such an order, shall be in writing and shall be signed by the applicant or the person making the application on his behalf and shall be free of any stamp duty. Every summons to a respondent or a witness shall also be free of stamp duty.
"""

# Tokenization
# The tokenizer comes from the base "t5-base" checkpoint; the training cell saved only the model.
tokenizer = T5Tokenizer.from_pretrained('t5-base')
# Inputs longer than 512 tokens are truncated, matching the input length used for training.
inputs = tokenizer.encode_plus(input_text, return_tensors="pt", max_length=512, truncation=True)

# Model Inference
# Beam search with 4 beams, generating at most 400 tokens.
outputs = model.generate(inputs.input_ids, max_length=400, num_beams=4, early_stopping=True)

# Decode Output (special tokens are stripped from the text)
generated_summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated Summary:", generated_summary)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Generated Summary: or to enforce such an order, shall be in writing and shall be signed by the applicant or the person making the application on his behalf and shall be free of stamp duty.


**The law:**
Every application for an order of maintenance or to enforce such an order, shall be in writing and shall be signed by the applicant or the person making the application on his behalf and shall be free of any stamp duty. Every summons to a respondent or a witness shall also be free of stamp duty.

---

**Generated summary:**
or to enforce such an order, shall be in writing and shall be signed by the applicant or the person making the application on his behalf and shall be free of stamp duty.



---


**Reference summary in the training dataset:**
Every application for a maintenance order or to enforce such an order must be in writing, signed by the applicant or their representative, and exempt from stamp duty. Additionally, summonses issued to respondents or witnesses in relation to such applications are also exempt from stamp duty.

### **Generating summaries for the validation dataset**

In [None]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
from torch.utils.data import Dataset, DataLoader

# Load the fine-tuned model
# Same directory the single-example test above loads from.
model_path = "/content/model/fine_tuned_T5_law_model"
model = T5ForConditionalGeneration.from_pretrained(model_path)

# Define your labeled law dataset class for testing
class TestLawDataset(Dataset):
    """Inference-only dataset: tokenizes the "Law" column, with no target summaries.

    Every item is a dict holding the 1-D "input_ids" and "attention_mask" tensors,
    padded or truncated to ``max_input_length`` tokens.
    """

    def __init__(self, dataframe, tokenizer, max_input_length=512):
        self.dataframe, self.tokenizer = dataframe, tokenizer
        self.max_input_length = max_input_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        # Positional lookup of the row, then the law text inside it
        row = self.dataframe.iloc[idx]

        encoded = self.tokenizer.encode_plus(
            row["Law"],
            truncation=True,
            padding="max_length",
            max_length=self.max_input_length,
            return_tensors="pt",
        )

        # Drop the leading batch dimension the tokenizer adds
        return {key: encoded[key].flatten() for key in ("input_ids", "attention_mask")}

# Initialize tokenizer for testing
tokenizer = T5Tokenizer.from_pretrained("t5-base")

# Create test dataset and dataloader
test_dataset = TestLawDataset(val_df, tokenizer)
test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=False)

# Generate on the GPU when one is available; model and batches must share a device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Evaluate the model on the test dataset
model.eval()
predictions = []
with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # Without an explicit limit generate() falls back to the generation config's
        # default length (20 tokens in transformers), which cuts summaries short.
        # 150 matches the target length the training dataset was built with.
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=150)
        # Decode the generated output and add to the predictions list
        decoded_outputs = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
        predictions.extend(decoded_outputs)

# Add the predictions to the testing DataFrame
# .assign builds a new frame rather than writing into a slice of `df`, and raises if
# the number of predictions does not match the number of rows.
val_df = val_df.assign(Generated_summaries=predictions)



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Display the validation rows together with their generated summaries
val_df.head()

Unnamed: 0,Key,Law,Domain,Law_Summary,Generated_summaries
266,267,In addition to the powers and functions expres...,Civil Law,"The President, in addition to explicitly defin...",the President shall have the power: (a) to mak...
192,193,Every public authority shall submit annual rep...,Civil Law,All public authorities are required to submit ...,the Commission (the Commission) and the Commis...
46,47,Causing hurt to a victim of crime or witness\n...,Criminal Law,The act of voluntarily causing hurt or grievou...,"commits an offense under Criminal Law, and sha..."
55,56,Application for protection\n(1) A victim of cr...,Criminal Law,The procedure for a victim of crime or witness...,Protection. ( d) any court or Commission or (e...
57,58,"(1) Where, in the case of an offense not speci...",Criminal Law,When dealing with an offense not specified in ...,the Protection Officer assigned to such police...


### **Evaluating the generated summaries**

In [None]:
!pip install rouge nltk

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [None]:
from rouge import Rouge
from nltk.translate.bleu_score import corpus_bleu

# Function to calculate ROUGE scores
def calculate_rouge_scores(generated_summaries, reference_summaries):
    """Return the ROUGE scores of the generated summaries, averaged over all pairs.

    Both arguments are parallel collections of strings (hypotheses first,
    references second).
    """
    return Rouge().get_scores(generated_summaries, reference_summaries, avg=True)

# Function to calculate BLEU scores
def calculate_bleu_scores(generated_summaries, reference_summaries):
    """Return the corpus-level BLEU score of the generated summaries.

    Texts are tokenized by whitespace; every hypothesis is scored against
    exactly one reference.
    """
    references = []
    for reference in reference_summaries:
        # corpus_bleu expects a list of references per hypothesis, hence the extra list
        references.append([reference.split()])
    hypotheses = [hypothesis.split() for hypothesis in generated_summaries]
    return corpus_bleu(references, hypotheses)

# Score the validation-set generations against their reference summaries
generated_summaries = val_df["Generated_summaries"]
reference_summaries = val_df["Law_Summary"]

# Calculate ROUGE scores
rouge_scores = calculate_rouge_scores(generated_summaries, reference_summaries)
print("ROUGE Scores:", rouge_scores)

# Calculate BLEU score
bleu_score = calculate_bleu_scores(generated_summaries, reference_summaries)
print("BLEU Score:", bleu_score)


ROUGE Scores: {'rouge-1': {'r': 0.1883866778300221, 'p': 0.6159524885203619, 'f': 0.26633092668790614}, 'rouge-2': {'r': 0.07725293194595914, 'p': 0.2694956250386114, 'f': 0.10726923757756399}, 'rouge-l': {'r': 0.16928072364131497, 'p': 0.5619215019101897, 'f': 0.2397176560444206}}
BLEU Score: 0.002110505488540374


1. **ROUGE Scores**:
   - ROUGE-1 (Recall-Oriented Understudy for Gisting Evaluation with unigrams):
     - Recall (R): 0.188
     - Precision (P): 0.616
     - F1-score (F): 0.266
   - ROUGE-2 (with bigrams):
     - Recall (R): 0.077
     - Precision (P): 0.269
     - F1-score (F): 0.107
   - ROUGE-L (using longest common subsequence):
     - Recall (R): 0.169
     - Precision (P): 0.562
     - F1-score (F): 0.240


   These scores indicate how well the generated summaries overlap with the reference summaries. Higher values indicate better performance.

2. **BLEU Score**:
   - BLEU (Bilingual Evaluation Understudy) Score: 0.0021

   The BLEU score measures the similarity between the generated and reference summaries based on n-gram overlaps. A higher BLEU score generally indicates better performance, but the interpretation can vary depending on the specific task and context. In this case, the BLEU score is very low, suggesting that there is minimal overlap between the generated and reference summaries.

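Overall, these scores suggest that the generated summaries are not yet accurate or high-quality compared to the reference summaries. For every ROUGE variant, precision is much higher than recall (e.g. 0.616 vs 0.188 for ROUGE-1), which means that most generated words do occur in the reference summary but the generated text covers only a small part of it. Further analysis and improvements are needed to enhance the summarization model's performance.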

### **Generating summaries for the training dataset**

In [None]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
from torch.utils.data import Dataset, DataLoader

# Load the fine-tuned model
# Reloads the same directory used in the validation section above.
model_path = "/content/model/fine_tuned_T5_law_model"
model = T5ForConditionalGeneration.from_pretrained(model_path)

# Define your labeled law dataset class for testing
class TestLawDataset(Dataset):
    """Inference-only dataset: tokenizes the "Law" column, with no target summaries.

    Every item is a dict holding the 1-D "input_ids" and "attention_mask" tensors,
    padded or truncated to ``max_input_length`` tokens.
    """

    def __init__(self, dataframe, tokenizer, max_input_length=512):
        self.dataframe, self.tokenizer = dataframe, tokenizer
        self.max_input_length = max_input_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        # Positional lookup of the row, then the law text inside it
        row = self.dataframe.iloc[idx]

        encoded = self.tokenizer.encode_plus(
            row["Law"],
            truncation=True,
            padding="max_length",
            max_length=self.max_input_length,
            return_tensors="pt",
        )

        # Drop the leading batch dimension the tokenizer adds
        return {key: encoded[key].flatten() for key in ("input_ids", "attention_mask")}

# Initialize tokenizer for testing
tokenizer = T5Tokenizer.from_pretrained("t5-base")

# Create test dataset and dataloader (this time over the training rows)
test_dataset = TestLawDataset(train_df, tokenizer)
test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=False)

# Generate on the GPU when one is available; model and batches must share a device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Evaluate the model on the test dataset
model.eval()
predictions = []
with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # Without an explicit limit generate() falls back to the generation config's
        # default length (20 tokens in transformers), which cuts summaries short.
        # 150 matches the target length the training dataset was built with.
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=150)
        # Decode the generated output and add to the predictions list
        decoded_outputs = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
        predictions.extend(decoded_outputs)

# Add the predictions to the training DataFrame
# .assign builds a new frame rather than writing into a slice of `df`, and raises if
# the number of predictions does not match the number of rows.
train_df = train_df.assign(Generated_summaries=predictions)



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Display the training rows together with their generated summaries
train_df.head()

Unnamed: 0,Key,Law,Domain,Law_Summary,Generated_summaries
322,323,(1) Subject to the provisions of the Constitut...,Civil Law,Parliament can decide on the election and reti...,"(ii) the regulation of its business, the prese..."
248,249,(1) The exercise and operation of the fundamen...,Civil Law,The restrictions on fundamental rights in the ...,shall be subject to such restrictions as may b...
110,111,"A court or Commission shall, before granting p...",Criminal Law,"Before permitting a victim of crime, witness, ...","a witness, or law enforcement authority, to gi..."
305,306,"Until the Commission otherwise provides, all r...",Civil Law,"Until the Commission makes new rules, existing...","otherwise provides, all rules, regulations and..."
370,371,(1) A single amber flashing light or a pair of...,Motor Traffic Law,"Amber flashing lights, either a single light o...",not more than two point four (2.4) meters and ...


### **Evaluating the generated summaries (training set)**

In [None]:
from rouge import Rouge
from nltk.translate.bleu_score import corpus_bleu

# Function to calculate ROUGE scores
def calculate_rouge_scores(generated_summaries, reference_summaries):
    """Return the ROUGE scores of the generated summaries, averaged over all pairs.

    Both arguments are parallel collections of strings (hypotheses first,
    references second).
    """
    return Rouge().get_scores(generated_summaries, reference_summaries, avg=True)

# Function to calculate BLEU scores
def calculate_bleu_scores(generated_summaries, reference_summaries):
    """Return the corpus-level BLEU score of the generated summaries.

    Texts are tokenized by whitespace; every hypothesis is scored against
    exactly one reference.
    """
    references = []
    for reference in reference_summaries:
        # corpus_bleu expects a list of references per hypothesis, hence the extra list
        references.append([reference.split()])
    hypotheses = [hypothesis.split() for hypothesis in generated_summaries]
    return corpus_bleu(references, hypotheses)

# Score the training-set generations against their reference summaries
generated_summaries = train_df["Generated_summaries"]
reference_summaries = train_df["Law_Summary"]

# Calculate ROUGE scores
rouge_scores = calculate_rouge_scores(generated_summaries, reference_summaries)
print("ROUGE Scores:", rouge_scores)

# Calculate BLEU score
bleu_score = calculate_bleu_scores(generated_summaries, reference_summaries)
print("BLEU Score:", bleu_score)


ROUGE Scores: {'rouge-1': {'r': 0.16061525672796015, 'p': 0.5760230586597972, 'f': 0.23717443294573362}, 'rouge-2': {'r': 0.0669067241495678, 'p': 0.2619917306062817, 'f': 0.09923623302810115}, 'rouge-l': {'r': 0.14531197774551324, 'p': 0.5262221479262242, 'f': 0.21509504830070325}}
BLEU Score: 0.0021388672771499016
