In [None]:
#!pip install transformers datasets torch




In [None]:
import pandas as pd

# Path of the training CSV on Google Drive.
# NOTE(review): this needs Drive to be mounted at /content/drive; the only
# drive.mount() call visible in this notebook sits in a later cell.
data_path = '/content/drive/MyDrive/Trimester5/trainData.csv'

# Read the whole CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer=data_path)

# Preview the first five rows (five is the default for head()).
print(df.head(n=5))

                                               input  \
0                               What is a gift deed?   
1          What are the key elements of a gift deed?   
2     What is the legal significance of a gift deed?   
3                        Can a gift deed be revoked?   
4  Does the donee need to pay taxes on the gifted...   

                                              output  \
0  A gift deed is a legal document that records t...   
1  The key elements of a gift deed include: \n1. ...   
2  A gift deed legally transfers ownership of the...   
3  Generally, a gift deed is irrevocable once exe...   
4  Tax implications for the donee depend on the j...   

                                                text  
0  What is a gift deed? A gift deed is a legal do...  
1  What are the key elements of a gift deed? The ...  
2  What is the legal significance of a gift deed?...  
3  Can a gift deed be revoked? Generally, a gift ...  
4  Does the donee need to pay taxes on the gifted..

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
# NOTE(review): the following cell performs the same split again and redefines format_data.
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# Prepare the data for the T5 model
def format_data(row):
    """Build the model input/target pair for one dataset row.

    Args:
        row: Mapping-like object exposing the 'input', 'text' and 'output' keys.

    Returns:
        dict with 'input_text' ("question: <input>  context: <text>") and
        'target_text' (the row's 'output' value, unchanged).
    """
    # NOTE(review): in the sample rows printed earlier in this notebook, `text`
    # starts with the question and continues with the answer, so the context
    # may already contain the target -- confirm this is intended.
    question, context = row['input'], row['text']
    return {
        "input_text": f"question: {question}  context: {context}",
        "target_text": row['output'],
    }

# Run format_data over every row (axis='columns' is the same as axis=1):
# each call receives one row of the frame.
train_data = train_df.apply(format_data, axis='columns')
val_data = val_df.apply(format_data, axis='columns')


In [None]:
from sklearn.model_selection import train_test_split

# 80/20 train/validation split with a fixed seed; each part is then
# re-indexed from 0 and the shuffled original index is dropped.
train_df, val_df = (
    part.reset_index(drop=True)
    for part in train_test_split(df, test_size=0.2, random_state=42)
)

# Prepare the data for the T5 model by creating input and target texts
def format_data(row):
    """Turn one dataset row into the text pair used for fine-tuning.

    Every field is passed through str(), so non-string cells are rendered
    in their string form rather than raising.

    Args:
        row: Mapping-like object exposing the 'input', 'text' and 'output' keys.

    Returns:
        pd.Series indexed by 'input_text' ("question: <input>  context: <text>")
        and 'target_text' (the stringified 'output').
    """
    # NOTE(review): in the sample rows printed earlier in this notebook, `text`
    # starts with the question and continues with the answer, so the context
    # may already contain the target -- confirm this is intended.
    question = str(row['input'])
    context = str(row['text'])
    answer = str(row['output'])
    prompt = f"question: {question}  context: {context}"
    return pd.Series({'input_text': prompt, 'target_text': answer})

# Format every row of both splits; because format_data returns a Series,
# the result is a DataFrame with 'input_text' and 'target_text' columns.
train_formatted = train_df.apply(format_data, axis='columns')
val_formatted = val_df.apply(format_data, axis='columns')

# Preview the first five formatted rows of each split.
print(train_formatted.head(n=5))
print(val_formatted.head(n=5))


                                          input_text  \
0  question: What is Article 17 of the Indian Con...   
1  question: What is the legal significance of a ...   
2  question: What is Article 21A of the Indian Co...   
3  question: What details about children should b...   
4  question: What is this document about?  contex...   

                                         target_text  
0  Article 17 abolishes untouchability and forbid...  
1  A gift deed legally transfers ownership of the...  
2  Article 21A provides the right to free and com...  
3  The petition should state the number of childr...  
4  This document is a petition for divorce by mut...  
                                          input_text  \
0  question: What is Article 224A of the Indian C...   
1  question: What is Article 100 of the Indian Co...   
2  question: What is Article 198 of the Indian Co...   
3  question: Why should the court grant the inter...   
4  question: What is Article 23 of the Indian Con... 

In [None]:
from transformers import T5Tokenizer

# Load the tokenizer for the 't5-small' checkpoint. It is stored in the
# module-level name `tokenizer`, which tokenize_example below reads.
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Function to tokenize a single example
def tokenize_example(example):
    """Tokenize one formatted row into model-ready tensors.

    Uses the module-level `tokenizer`. The input text is padded/truncated to
    512 tokens and the target text to 128 tokens.

    Args:
        example: Mapping-like object exposing 'input_text' and 'target_text'.

    Returns:
        dict with 'input_ids', 'attention_mask' and 'labels' tensors, each
        squeezed to drop size-1 dimensions.
    """
    def _encode(text, limit):
        # Fixed-length encoding returned as PyTorch tensors.
        return tokenizer(
            str(text),
            padding='max_length',
            truncation=True,
            max_length=limit,
            return_tensors='pt'
        )

    source = _encode(example['input_text'], 512)
    target = _encode(example['target_text'], 128)

    # Padding positions in the labels are overwritten (in place) with -100
    # so that they are ignored by the loss.
    labels = target['input_ids']
    labels[labels == tokenizer.pad_token_id] = -100

    return {
        'input_ids': source['input_ids'].squeeze(),
        'attention_mask': source['attention_mask'].squeeze(),
        'labels': labels.squeeze()
    }

# Tokenize every formatted row of both splits (axis='columns' is the same
# as axis=1, so tokenize_example receives one row per call).
train_encodings = train_formatted.apply(tokenize_example, axis='columns')
val_encodings = val_formatted.apply(tokenize_example, axis='columns')


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    """Dataset over pre-tokenized examples.

    `encodings` is iterated once per field; every item must provide
    'input_ids', 'attention_mask' and 'labels' tensors that can be stacked.
    """

    _FIELDS = ('input_ids', 'attention_mask', 'labels')

    def __init__(self, encodings):
        def _stacked(field):
            # One tensor per field, with the examples along dimension 0.
            return torch.stack([item[field] for item in encodings])

        self.input_ids = _stacked('input_ids')
        self.attention_mask = _stacked('attention_mask')
        self.labels = _stacked('labels')

    def __len__(self):
        # Number of examples == size of the stacking dimension.
        return self.input_ids.shape[0]

    def __getitem__(self, idx):
        # Slice each stacked tensor at idx, keeping the field names as keys.
        return {field: getattr(self, field)[idx] for field in self._FIELDS}

# Wrap the tokenized splits in Dataset objects.
train_dataset = CustomDataset(train_encodings)
val_dataset = CustomDataset(val_encodings)

# Batches of 8: training batches are reshuffled, validation keeps dataset order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=8)


In [None]:
from transformers import T5ForConditionalGeneration, AdamW
import torch.nn as nn

# Pick the training device: the GPU when CUDA is available, otherwise the CPU.
# The fallback has to be 'cpu' -- 'gpu' is not a device type torch.device
# accepts, so using it would raise on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Load the pretrained 't5-small' model and move it to the chosen device
model = T5ForConditionalGeneration.from_pretrained('t5-small')
model.to(device)

# Optimizer over all model parameters
optimizer = AdamW(model.parameters(), lr=5e-5)

# Set the model to training mode
model.train()

# Training loop: one pass over train_loader per epoch
epochs = 120
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    epoch_loss = 0
    for batch in train_loader:
        # Move the batch tensors to the same device as the model
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass; passing labels makes the model return the loss
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass
        loss.backward()

        # Clip the global gradient norm to 1.0 before the update
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        # Apply the update, then clear gradients for the next batch
        optimizer.step()
        optimizer.zero_grad()

        # Accumulate the scalar batch loss
        epoch_loss += loss.item()

    # Mean of the per-batch losses for this epoch
    avg_loss = epoch_loss / len(train_loader)
    print(f"Average Loss: {avg_loss:.4f}")


Using device: cuda




Epoch 1/120
Average Loss: 1.4794
Epoch 2/120
Average Loss: 1.3004
Epoch 3/120
Average Loss: 1.1679
Epoch 4/120
Average Loss: 1.1994
Epoch 5/120
Average Loss: 1.1151
Epoch 6/120
Average Loss: 1.1286
Epoch 7/120
Average Loss: 1.0381
Epoch 8/120
Average Loss: 1.0351
Epoch 9/120
Average Loss: 0.9805
Epoch 10/120
Average Loss: 0.9111
Epoch 11/120
Average Loss: 0.9109
Epoch 12/120
Average Loss: 0.8655
Epoch 13/120
Average Loss: 0.8599
Epoch 14/120
Average Loss: 0.8469
Epoch 15/120
Average Loss: 0.8268
Epoch 16/120
Average Loss: 0.8362
Epoch 17/120
Average Loss: 0.8363
Epoch 18/120
Average Loss: 0.7435
Epoch 19/120
Average Loss: 0.7304
Epoch 20/120
Average Loss: 0.7567
Epoch 21/120
Average Loss: 0.6989
Epoch 22/120
Average Loss: 0.6895
Epoch 23/120
Average Loss: 0.7217
Epoch 24/120
Average Loss: 0.6878
Epoch 25/120
Average Loss: 0.6625
Epoch 26/120
Average Loss: 0.6426
Epoch 27/120
Average Loss: 0.6357
Epoch 28/120
Average Loss: 0.6537
Epoch 29/120
Average Loss: 0.6292
Epoch 30/120
Average Lo

In [None]:
# Switch the model to evaluation mode before generating.
model.eval()

# No gradients are needed for generation.
with torch.no_grad():
    for batch in val_loader:
        # Move the two tensors generate() needs onto the model's device.
        input_ids, attention_mask = (
            batch[key].to(device) for key in ('input_ids', 'attention_mask')
        )

        # Generate at most 128 tokens per example.
        generated_ids = model.generate(
            max_length=128,
            input_ids=input_ids,
            attention_mask=attention_mask,
        )

        # Convert token ids back to text, dropping special tokens.
        generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

        for text in generated_text:
            print(f"Generated Answer: {text}")

        # Demonstration only: stop after the first validation batch.
        break

Generated Answer: Article 224A allows for the appointment of retired judges at sittings of the High Courts. The Chief Justice of a High Court, with the prior consent of the President, can request a retired judge to sit and act as a judge of that High Court for a temporary period.
Generated Answer: Article 100 deals with voting in Houses of Parliament, the power of Houses to act notwithstanding vacancies, and quorum requirements.
Generated Answer: Article 198 outlines the special procedure in respect of money bills in the states, similar to the procedure at the Union level.
Generated Answer: The court should grant the interim injunction to the chain of events establish a prima facie case and further investigation or action is needed. Irreparable damage will be caused which wouldn’t be compensated in monetary terms.
Generated Answer: Article 23 prohibits trafficking in human beings and forced labour.
Generated Answer: Yes, the attorney has the power to initiate, defend, assign, assign, a

In [None]:
def summarize_text(text):
    """Generate a summary of `text` with the module-level `model` and `tokenizer`.

    The text is prefixed with "summarize: ", truncated to 512 tokens, moved to
    `device`, and decoded with 4-beam search (40 to 150 tokens).

    Args:
        text: The text to summarize.

    Returns:
        The first generated sequence decoded to a string, special tokens removed.
    """
    # NOTE(review): the fine-tuning inputs in this notebook are formatted as
    # "question: ...  context: ...", whereas this prompt uses "summarize: " --
    # confirm this is intended.
    prompt = f"summarize: {text}"

    token_ids = tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True)
    token_ids = token_ids.to(device)

    beam_options = dict(
        max_length=150,
        min_length=40,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    generated = model.generate(token_ids, **beam_options)

    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Example usage
# Summarize the first three `text` entries of the full dataset.
# `summary` and `original_text` stay bound at module level after the loop.
for i in range(3):
    original_text = df['text'].iat[i]
    summary = summarize_text(original_text)
    print(f"Original Text: {original_text}")
    print(f"Summary: {summary}\n")


Original Text: What is a gift deed? A gift deed is a legal document that records the voluntary transfer of ownership of a property or asset from a donor to a donee without any exchange of money. It is commonly used to gift cash, property, or other assets.
Summary: a gift deed is a legal document that records the voluntary transfer of ownership of a property or asset from a donor to a donee without any exchange of money. it is commonly used to gift cash, property, or other assets.

Original Text: What are the key elements of a gift deed? The key elements of a gift deed include: \n1. Details of the donor and donee (name, age, and residence).\n2. Description of the gifted asset or property.\n3. Statement of consideration (natural love and affection).\n4. Acceptance of the gift by the donee.\n5. Transfer of possession of the gifted asset or property.\n6. Declaration that the donor has no further interest or right in the gifted asset or property.
Summary: the key elements of a gift deed inc

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Directory in Google Drive where the model and tokenizer are written
save_directory = '/content/drive/MyDrive/t5_model'

# Create the directory (and any missing parents) if it is not there yet.
# exist_ok=True replaces a separate exists() check and its
# check-then-create race.
import os
os.makedirs(save_directory, exist_ok=True)

# Save the trained model and tokenizer
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

print(f"Model and tokenizer saved to {save_directory}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Model and tokenizer saved to /content/drive/MyDrive/t5_model


In [None]:
from google.colab import drive
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Directory the model and tokenizer are loaded from (the same path the save
# cell writes to). It has its own name so that `model` only ever refers to the
# loaded network, never first to a path string and then to the model object.
model_path = '/content/drive/MyDrive/t5_model'

# The reloaded model is not moved to `device` here; the evaluate_text
# functions defined later do not move their inputs either.
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

print("Model and tokenizer loaded successfully!")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Model and tokenizer loaded successfully!


In [None]:
def evaluate_text(text, max_length=50):
    """Generate a summary of `text` with the module-level `model` and `tokenizer`.

    Args:
        text: Text to summarize; it is prefixed with "summarize: " and
            truncated to 512 tokens.
        max_length: Upper bound on the generated length in tokens.

    Returns:
        The first generated sequence decoded to a string, special tokens removed.
    """
    prompt = "summarize: " + text
    encoded = tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True)

    # 4-beam search, at least 10 tokens, stopping once enough beams finished.
    decoding_options = {
        "max_length": max_length,
        "min_length": 10,
        "length_penalty": 2.0,
        "num_beams": 4,
        "early_stopping": True,
    }
    output_ids = model.generate(encoded, **decoding_options)

    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# NOTE(review): this cell never assigns `summary` at module level (the name
# inside evaluate_text is local to that function). The value displayed is
# whatever an earlier cell left behind -- in this notebook, the last value
# assigned in the summarize_text example loop.
summary


'a gift deed legally transfers ownership of the gifted asset or property from the donor to the donee. once executed and accepted, the donor relinquishes all rights and claims to the gifted asset or property, making the donee the absolute owner.'

Rouge Score

- ROUGE (Recall-Oriented Understudy for Gisting Evaluation) is a set of metrics and a software package used for evaluating automatic summarization and machine translation software in natural language processing.
- The metrics compare an automatically produced summary or translation against one or more human-produced reference summaries or translations.

- Note that ROUGE is case insensitive, meaning that upper case letters are treated the same way as lower case letters.

In [None]:
!pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=8e2e8f12eab1d7111b7ff8558eb98a5f7cd156476a95146baf87ea96d4100665
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from datasets import load_metric
import rouge_score



# ROUGE scorer used to compare generated text against reference summaries.
rouge = load_metric(path="rouge")

# Tiny hand-written evaluation set (replace with your actual test dataset).
# "label" only matters for classification-style evaluation and is not read below.
test_data = [
    dict(
        input="Artificial Intelligence is revolutionizing industries worldwide.",
        summary="AI is transforming global industries.",
        label="positive",
    ),
    dict(
        input="The history of the Internet dates back to the 1960s.",
        summary="The Internet's history started in the 1960s.",
        label="neutral",
    ),
]

# Containers for predictions/references (and labels, for classification tasks).
generated_summaries, references = [], []
true_labels, predicted_labels = [], []

# Run the model over every example.
for data in test_data:
    input_text, true_summary = data["input"], data["summary"]
    # true_label = data["label"]  # Uncomment for classification tasks

    generated_summary = evaluate_text(input_text)

    # Keep prediction and reference aligned by position.
    generated_summaries.append(generated_summary)
    references.append(true_summary)
    # true_labels.append(true_label)
    # predicted_labels.append(classification_label)  # Uncomment for classification tasks

    # Show the example next to the model output.
    print(f"Input Text: {input_text}")
    print(f"True Summary: {true_summary}")
    print(f"Generated Summary: {generated_summary}")
    print()

# Score all predictions against their references.
rouge_scores = rouge.compute(predictions=generated_summaries, references=references)
print("ROUGE Scores:")
for key in rouge_scores:
    # Report the F-measure of the `mid` aggregate for each ROUGE variant.
    print(f"{key}: {rouge_scores[key].mid.fmeasure:.4f}")

The repository for rouge contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/rouge.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y
Input Text: Artificial Intelligence is revolutionizing industries worldwide.
True Summary: AI is transforming global industries.
Generated Summary: Artificial Intelligence is revolutionizing industries worldwide.

Input Text: The history of the Internet dates back to the 1960s.
True Summary: The Internet's history started in the 1960s.
Generated Summary: the history of the Internet dates back to the 1960s.

ROUGE Scores:
rouge1: 0.4596
rouge2: 0.1250
rougeL: 0.4040
rougeLsum: 0.4040


In [None]:
import pandas as pd
from datasets import load_metric
from transformers import T5Tokenizer, T5ForConditionalGeneration

# ROUGE scorer used after the evaluation loop.
rouge = load_metric(path="rouge")

# Reload tokenizer and model from the path stored on the current `model`
# object (model.name_or_path).
tokenizer = T5Tokenizer.from_pretrained(model.name_or_path)
model = T5ForConditionalGeneration.from_pretrained(model.name_or_path)

# Start from empty prediction/reference lists each time this cell runs.
generated_summaries, references = [], []

# Function to generate summaries using the T5 model
def evaluate_text(input_text):
    """Generate text for `input_text` with the module-level `model` and `tokenizer`.

    The input is prefixed with "summarize: ", truncated to 512 tokens, and
    decoded with 5-beam search producing between 60 and 300 tokens.

    Args:
        input_text: Text to feed the model.

    Returns:
        The first generated sequence decoded to a string, special tokens removed.
    """
    prompt = "summarize: " + input_text
    encoded = tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True)

    beam_options = dict(
        max_length=300,
        min_length=60,
        length_penalty=1.0,
        num_beams=5,
        no_repeat_ngram_size=3,  # never repeat the same 3-gram
        early_stopping=True,
    )
    output_ids = model.generate(encoded, **beam_options)

    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Evaluate on the first 10 rows of `df`.
# `df` is the full, unsplit frame, so these rows are not limited to the
# validation split. Only the question (`input`) is passed to the model.
for index, row in df.iloc[:10].iterrows():
    input_text, true_summary = row['input'], row['output']

    generated_summary = evaluate_text(input_text)

    # Keep prediction and reference aligned by position.
    generated_summaries.append(generated_summary)
    references.append(true_summary)

    # Show the example next to the model output.
    print(f"Input Text: {input_text}")
    print(f"True Summary: {true_summary}")
    print(f"Generated Summary: {generated_summary}")
    print()

# Score all predictions against their references.
rouge_scores = rouge.compute(predictions=generated_summaries, references=references)

# Report the F-measure of the `mid` aggregate for each ROUGE variant.
print("ROUGE Scores:")
for key in rouge_scores:
    print(f"{key}: {rouge_scores[key].mid.fmeasure:.4f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Input Text: What is a gift deed?
True Summary: A gift deed is a legal document that records the voluntary transfer of ownership of a property or asset from a donor to a donee without any exchange of money. It is commonly used to gift cash, property, or other assets.
Generated Summary: a gift deed is a given to a person whose name is "the most important person in the world" it is an honour to have a loved one whose life is ruined by a firestorm of fire, fire, and a curse.

Input Text: What are the key elements of a gift deed?
True Summary: The key elements of a gift deed include: \n1. Details of the donor and donee (name, age, and residence).\n2. Description of the gifted asset or property.\n3. Statement of consideration (natural love and affection).\n4. Acceptance of the gift by the donee.\n5. Transfer of possession of the gifted asset or property.\n6. Declaration that the donor has no further interest or right in the gifted asset or property.
Generated Summary: The gift deed contains 

In [None]:
import pandas as pd
from datasets import load_metric
from transformers import T5Tokenizer, T5ForConditionalGeneration

# ROUGE scorer used after the evaluation loop.
rouge = load_metric(path="rouge")

# Reload tokenizer and model from the path stored on the current `model`
# object (model.name_or_path).
tokenizer = T5Tokenizer.from_pretrained(model.name_or_path)
model = T5ForConditionalGeneration.from_pretrained(model.name_or_path)

# Start from empty prediction/reference lists each time this cell runs.
generated_summaries, references = [], []

# Function to generate summaries using the T5 model
def evaluate_text(input_text, do_sample=False):
    """Generate text for `input_text` with the module-level `model` and `tokenizer`.

    The input is prefixed with "summarize: ", truncated to 512 tokens, and
    decoded with 7 beams producing between 60 and 150 tokens.

    Args:
        input_text: Text to feed the model.
        do_sample: When False (the default), decoding is plain beam search.
            The sampling settings (top_k, top_p, temperature) only take
            effect when sampling is switched on, so they are passed only
            when this flag is True; passing them alongside plain beam
            search has no effect on the output.

    Returns:
        The first generated sequence decoded to a string, special tokens removed.
    """
    inputs = tokenizer.encode("summarize: " + input_text, return_tensors="pt", max_length=512, truncation=True)

    generation_kwargs = dict(
        max_length=150,
        min_length=60,
        length_penalty=1.2,
        num_beams=7,
        no_repeat_ngram_size=3,  # never repeat the same 3-gram
        early_stopping=True,
    )
    if do_sample:
        # Opt-in stochastic decoding; results then vary between calls.
        generation_kwargs.update(do_sample=True, top_k=50, top_p=0.95, temperature=0.7)

    summary_ids = model.generate(inputs, **generation_kwargs)

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Evaluate on the first 10 rows of `df`.
# `df` is the full, unsplit frame, so these rows are not limited to the
# validation split. Only the question (`input`) is passed to the model.
for index, row in df.iloc[:10].iterrows():
    input_text, true_summary = row['input'], row['output']

    generated_summary = evaluate_text(input_text)

    # Keep prediction and reference aligned by position.
    generated_summaries.append(generated_summary)
    references.append(true_summary)

    # Show the example next to the model output.
    print(f"Input Text: {input_text}")
    print(f"True Summary: {true_summary}")
    print(f"Generated Summary: {generated_summary}")
    print()

# Score all predictions against their references.
rouge_scores = rouge.compute(predictions=generated_summaries, references=references)

# Report the F-measure of the `mid` aggregate for each ROUGE variant.
print("ROUGE Scores:")
for key in rouge_scores:
    print(f"{key}: {rouge_scores[key].mid.fmeasure:.4f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Input Text: What is a gift deed?
True Summary: A gift deed is a legal document that records the voluntary transfer of ownership of a property or asset from a donor to a donee without any exchange of money. It is commonly used to gift cash, property, or other assets.
Generated Summary: The gift deed is a gift from a charity whose mission is to give back to a loved one a life threatening gift. it is an honour to be a member of a family whose wishes are to be fulfilled if you are a donor to an individual whose family is in need of care.

Input Text: What are the key elements of a gift deed?
True Summary: The key elements of a gift deed include: \n1. Details of the donor and donee (name, age, and residence).\n2. Description of the gifted asset or property.\n3. Statement of consideration (natural love and affection).\n4. Acceptance of the gift by the donee.\n5. Transfer of possession of the gifted asset or property.\n6. Declaration that the donor has no further interest or right in the gift

In [None]:
import pandas as pd
from datasets import load_metric
from transformers import T5Tokenizer, T5ForConditionalGeneration

# ROUGE scorer used after the evaluation loop.
rouge = load_metric(path="rouge")

# Reload tokenizer and model from the path stored on the current `model`
# object (model.name_or_path).
tokenizer = T5Tokenizer.from_pretrained(model.name_or_path)
model = T5ForConditionalGeneration.from_pretrained(model.name_or_path)

# Start from empty prediction/reference lists each time this cell runs.
generated_summaries, references = [], []

# Function to generate summaries using the T5 model
def evaluate_text(input_text, do_sample=False):
    """Generate text for `input_text` with the module-level `model` and `tokenizer`.

    The input is prefixed with "summarize: ", truncated to 512 tokens, and
    decoded with 8 beams producing between 80 and 200 tokens.

    Args:
        input_text: Text to feed the model.
        do_sample: When False (the default), decoding is plain beam search.
            The sampling settings (top_p, temperature) only take effect
            when sampling is switched on, so they are passed only when this
            flag is True; passing them alongside plain beam search has no
            effect on the output.

    Returns:
        The first generated sequence decoded to a string, special tokens removed.
    """
    inputs = tokenizer.encode("summarize: " + input_text, return_tensors="pt", max_length=512, truncation=True)

    generation_kwargs = dict(
        max_length=200,
        min_length=80,
        length_penalty=1.0,
        num_beams=8,
        no_repeat_ngram_size=3,  # never repeat the same 3-gram
        early_stopping=True,
    )
    if do_sample:
        # Opt-in stochastic decoding; results then vary between calls.
        generation_kwargs.update(do_sample=True, top_p=0.92, temperature=0.6)

    summary_ids = model.generate(inputs, **generation_kwargs)

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Evaluate on the first 10 rows of `df`.
# `df` is the full, unsplit frame, so these rows are not limited to the
# validation split. Only the question (`input`) is passed to the model.
for index, row in df.iloc[:10].iterrows():
    input_text, true_summary = row['input'], row['output']

    generated_summary = evaluate_text(input_text)

    # Keep prediction and reference aligned by position.
    generated_summaries.append(generated_summary)
    references.append(true_summary)

    # Show the example next to the model output.
    print(f"Input Text: {input_text}")
    print(f"True Summary: {true_summary}")
    print(f"Generated Summary: {generated_summary}")
    print()

# Score all predictions against their references.
rouge_scores = rouge.compute(predictions=generated_summaries, references=references)

# Report the F-measure of the `mid` aggregate for each ROUGE variant.
print("ROUGE Scores:")
for key in rouge_scores:
    print(f"{key}: {rouge_scores[key].mid.fmeasure:.4f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Input Text: What is a gift deed?
True Summary: A gift deed is a legal document that records the voluntary transfer of ownership of a property or asset from a donor to a donee without any exchange of money. It is commonly used to gift cash, property, or other assets.
Generated Summary: the gift deed is a gift from a friend whose name is elizabeth, a native of the u.s., to a man whose mother was born in awe of april a few years ago, was written by the couple. the couple has been married for a year and have two children whose parents are married.

Input Text: What are the key elements of a gift deed?
True Summary: The key elements of a gift deed include: \n1. Details of the donor and donee (name, age, and residence).\n2. Description of the gifted asset or property.\n3. Statement of consideration (natural love and affection).\n4. Acceptance of the gift by the donee.\n5. Transfer of possession of the gifted asset or property.\n6. Declaration that the donor has no further interest or right i

In [None]:
import pandas as pd
from datasets import load_metric
from transformers import T5Tokenizer, T5ForConditionalGeneration
import random

# ROUGE scorer used after the evaluation loop.
rouge = load_metric(path="rouge")

# Load tokenizer and model by name.
# NOTE(review): 't5-small' is the stock checkpoint, not the fine-tuned weights
# saved to Drive earlier in this notebook -- confirm a baseline run is intended.
model_name = 't5-small'  # replace with your model name if different
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Start from empty prediction/reference lists each time this cell runs.
generated_summaries, references = [], []

# Function to generate summaries using the T5 model
def evaluate_text(input_text, do_sample=False):
    """Generate text for `input_text` with the module-level `model` and `tokenizer`.

    The input is prefixed with "summarize: ", truncated to 512 tokens, and
    decoded with 8 beams producing between 80 and 200 tokens.

    Args:
        input_text: Text to feed the model.
        do_sample: When False (the default), decoding is plain beam search.
            The sampling settings (top_p, temperature) only take effect
            when sampling is switched on, so they are passed only when this
            flag is True; passing them alongside plain beam search has no
            effect on the output.

    Returns:
        The first generated sequence decoded to a string, special tokens removed.
    """
    inputs = tokenizer.encode("summarize: " + input_text, return_tensors="pt", max_length=512, truncation=True)

    generation_kwargs = dict(
        max_length=200,
        min_length=80,
        length_penalty=1.0,
        num_beams=8,
        no_repeat_ngram_size=3,  # never repeat the same 3-gram
        early_stopping=True,
    )
    if do_sample:
        # Opt-in stochastic decoding; results then vary between calls.
        generation_kwargs.update(do_sample=True, top_p=0.92, temperature=0.6)

    summary_ids = model.generate(inputs, **generation_kwargs)

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Draw a fixed-seed random sample of rows from the full, unsplit frame
# (so the sample is not limited to the validation split).
num_samples = 10
sampled_df = df.sample(random_state=42, n=num_samples)

# Evaluate each sampled row; only the question (`input`) is passed to the model.
for index, row in sampled_df.iterrows():
    input_text, true_summary = row['input'], row['output']

    generated_summary = evaluate_text(input_text)

    # Keep prediction and reference aligned by position.
    generated_summaries.append(generated_summary)
    references.append(true_summary)

    # Show the example next to the model output.
    print(f"Input Text: {input_text}")
    print(f"True Summary: {true_summary}")
    print(f"Generated Summary: {generated_summary}")
    print()

# Score all predictions against their references.
rouge_scores = rouge.compute(predictions=generated_summaries, references=references)

# Report the F-measure of the `mid` aggregate for each ROUGE variant.
print("ROUGE Scores:")
for key in rouge_scores:
    print(f"{key}: {rouge_scores[key].mid.fmeasure:.4f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Input Text: What is Article 224A of the Indian Constitution?
True Summary: Article 224A allows for the appointment of retired judges at sittings of the High Courts. The Chief Justice of a High Court, with the prior consent of the President, can request a retired judge to sit and act as a judge of that High Court for a temporary period.
Generated Summary: what is Article 224A of the Indian Constitution? what is the article 224a of the. Indian constitution? what does the Indian constitution mean? what do you think? what are the provisions of this article?. what does it mean? if you have a question, please contact us on 08457 555 111 or click here for more information.

Input Text: What is Article 100 of the Indian Constitution?
True Summary: Article 100 deals with voting in Houses of Parliament, the power of Houses to act notwithstanding vacancies, and quorum requirements.
Generated Summary: what is Article 100 of the Indian Constitution? what is the article 100 of. the Indian constituti