## **Fine-Tuning the Pre-trained T5-Base Model**

### **Data Pre-processing**

In [None]:
import torch
# Report how many CUDA devices PyTorch can see.
num_gpus = torch.cuda.device_count()
print("Num GPUs Available: ", num_gpus)


In [None]:
# Print the CUDA version string reported by this PyTorch build.
cuda_version = torch.version.cuda
print(cuda_version)

In [None]:
# Display whether PyTorch can use a CUDA device (relies on `torch` imported in an earlier cell).
torch.cuda.is_available()

In [None]:
# Report the name of CUDA device 0 when GPU support is present.
if not torch.cuda.is_available():
    print("No GPU available.")
else:
    gpu_name = torch.cuda.get_device_name(0)
    print("GPU in use:", gpu_name)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import torch

In [None]:
#torch.cuda.empty_cache()

In [None]:
# Read the labeled law dataset from disk and preview its first five rows.
csv_path = "Law_Dataset.csv"
df = pd.read_csv(filepath_or_buffer=csv_path)
df.head(n=5)

In [None]:
# Give the auto-named "Unnamed: 2" column its real name, then preview.
column_renames = {"Unnamed: 2": "Legal_Domain"}
df = df.rename(column_renames, axis="columns")
df.head(n=5)

In [None]:
# Remove every column whose values are all missing, then preview the result.
df = df.dropna(how="all", axis="columns")
df.head(n=5)

In [None]:
# Drop the leftover "Unnamed: 25" column; a missing column is silently ignored.
df = df.drop(columns="Unnamed: 25", errors="ignore")

df.head()

In [None]:
# Count the missing values in each column and display the totals.
missing_mask = df.isna()
null_values = missing_mask.sum()
null_values

### **Train the Model**

# Hold out 10% of the rows for validation; the fixed seed keeps the split repeatable.
train_df, val_df = train_test_split(
    df,
    random_state=42,
    test_size=0.1,
)

# Define labeled law dataset class
class LawDataset(Dataset):
    """Serve (law text, summary) rows of a DataFrame as tokenized T5 examples.

    Args:
        dataframe: Table with a "Law" column (source text) and a
            "Law_Summary" column (target summary).
        tokenizer: Callable Hugging Face-style tokenizer that exposes
            ``pad_token_id``.
        max_input_length: Fixed token length the source text is truncated
            or padded to.
        max_target_length: Fixed token length the summary is truncated or
            padded to.
    """

    def __init__(self, dataframe, tokenizer, max_input_length=512, max_target_length=150):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length

    def __len__(self) -> int:
        """Return the number of rows (examples) in the dataset."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Tokenize the row at positional index ``idx``.

        Returns:
            A dict of 1-D tensors: ``input_ids`` and ``attention_mask`` of
            length ``max_input_length``, and ``labels`` of length
            ``max_target_length`` in which padding positions are -100.
        """
        # One positional lookup instead of one per column.
        row = self.dataframe.iloc[idx]

        # Tokenize and prepare inputs for the model
        inputs = self.tokenizer(
            row["Law"],
            max_length=self.max_input_length,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
        )

        # Tokenize and prepare targets for the model
        targets = self.tokenizer(
            row["Law_Summary"],
            max_length=self.max_target_length,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
        )

        # The model's loss skips label positions equal to -100. Leaving the
        # pad token id in place would train the model to predict padding and
        # skew the reported loss towards the (easy) padded positions.
        target_ids = targets["input_ids"].flatten()
        labels = target_ids.masked_fill(
            target_ids == self.tokenizer.pad_token_id, -100
        )

        return {
            "input_ids": inputs["input_ids"].flatten(),
            "attention_mask": inputs["attention_mask"].flatten(),
            "labels": labels,
        }

# Load the pre-trained T5-Base tokenizer and the matching seq2seq model.
checkpoint_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(checkpoint_name)
model = T5ForConditionalGeneration.from_pretrained(checkpoint_name)

# Wrap each split in a LawDataset that shares the same tokenizer.
train_dataset, val_dataset = (
    LawDataset(split, tokenizer) for split in (train_df, val_df)
)

# Batches of four; only the training data is shuffled.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=4)
val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=4)

# Training hyper-parameters: three epochs of AdamW at a 5e-5 learning rate.
epochs = 3
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Prefer the GPU when CUDA is usable, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)
# Only the model is moved here; this statement does not move any batch data.
model.to(device)

# Training loop: each epoch runs one optimisation pass over the training set,
# then a gradient-free pass over the validation set, printing the mean loss of each.
for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}"):
        optimizer.zero_grad()
        # The model was moved to `device`, so every batch tensor must be moved
        # there too; otherwise the forward pass fails with a device mismatch
        # whenever `device` is a GPU.
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # .item() copies the scalar to the host so the running total holds no graph.
        total_loss += loss.item()

    average_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}, Training Average Loss: {average_loss}")

    # Validation loop
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc="Validation"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

    average_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch + 1}, Validation Average Loss: {average_val_loss}")

# Save the fine-tuned model
model.save_pretrained("/content/drive/fine_tuned_T5_law_model")


In [None]:
# Hold out 10% of the rows for validation; the fixed seed keeps the split repeatable.
train_df, val_df = train_test_split(
    df,
    random_state=42,
    test_size=0.1,
)

In [None]:
# Define your labeled law dataset class
class LawDataset(Dataset):
    """Serve (law text, summary) rows of a DataFrame as tokenized T5 examples.

    Args:
        dataframe: Table with a "Law" column (source text) and a
            "Law_Summary" column (target summary).
        tokenizer: Callable Hugging Face-style tokenizer that exposes
            ``pad_token_id``.
        max_input_length: Fixed token length the source text is truncated
            or padded to.
        max_target_length: Fixed token length the summary is truncated or
            padded to.
    """

    def __init__(self, dataframe, tokenizer, max_input_length=512, max_target_length=150):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length

    def __len__(self) -> int:
        """Return the number of rows (examples) in the dataset."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Tokenize the row at positional index ``idx``.

        Returns:
            A dict of 1-D tensors: ``input_ids`` and ``attention_mask`` of
            length ``max_input_length``, and ``labels`` of length
            ``max_target_length`` in which padding positions are -100.
        """
        # One positional lookup instead of one per column.
        row = self.dataframe.iloc[idx]

        # Tokenize and prepare inputs for the model
        inputs = self.tokenizer(
            row["Law"],
            max_length=self.max_input_length,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
        )

        # Tokenize and prepare targets for the model
        targets = self.tokenizer(
            row["Law_Summary"],
            max_length=self.max_target_length,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
        )

        # The model's loss skips label positions equal to -100. Leaving the
        # pad token id in place would train the model to predict padding and
        # skew the reported loss towards the (easy) padded positions.
        target_ids = targets["input_ids"].flatten()
        labels = target_ids.masked_fill(
            target_ids == self.tokenizer.pad_token_id, -100
        )

        return {
            "input_ids": inputs["input_ids"].flatten(),
            "attention_mask": inputs["attention_mask"].flatten(),
            "labels": labels,
        }


In [None]:
# Load the pre-trained T5-Base tokenizer and the matching seq2seq model.
checkpoint_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(checkpoint_name)
model = T5ForConditionalGeneration.from_pretrained(checkpoint_name)

In [None]:
# Training hyper-parameters: three epochs of AdamW at a 5e-5 learning rate.
epochs = 3
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Prefer the GPU when CUDA is usable, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

# Only the model is moved here; this statement does not move any batch data.
model.to(device)

In [None]:
# Initialize tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")

# Create datasets and dataloaders for training and validation
train_dataset = LawDataset(train_df, tokenizer)
val_dataset = LawDataset(val_df, tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=4, shuffle=False)

In [None]:
# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}"):
        optimizer.zero_grad()
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    average_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}, Training Average Loss: {average_loss}")

    # Validation loop
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc="Validation"):
            input_ids = batch["input_ids"]
            attention_mask = batch["attention_mask"]
            labels = batch["labels"]

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

    average_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch + 1}, Validation Average Loss: {average_val_loss}")

# Save the fine-tuned model
model.save_pretrained(r"C:\Users\lafri\Shamini DSGP\fine_tuned_T5_law_model")


import pandas as pd

csv_path = "Law_DataSet_utf8.csv"  # Replace with your actual CSV file path

# Try different encodings (e.g., 'utf-8', 'latin1', 'ISO-8859-1') in order and
# keep the first one that decodes the file; the chosen encoding is printed.
# NOTE: 'latin1' and 'ISO-8859-1' name the same codec, which accepts any byte
# sequence, so in practice it is the catch-all after 'utf-8'.
encodings = ['utf-8', 'latin1', 'ISO-8859-1']

for encoding in encodings:
    # Keep the try body to the single call that can raise the decode error.
    try:
        df = pd.read_csv(csv_path, encoding=encoding)
    except UnicodeDecodeError:
        continue
    print(encoding)
    break
else:
    # Without this, a failed load would silently leave `df` undefined or
    # pointing at a DataFrame from an earlier cell.
    raise ValueError(
        f"Could not decode {csv_path!r} with any of the encodings: {encodings}"
    )

df.head(10)
