# Example 01

In [None]:
import pandas as pd

# Workbook uploaded to the Colab session; it holds the source paragraph
file_path = '/content/Benefits_of_fruits.xlsx'
df = pd.read_excel(io=file_path)

# Keep the preview as the cell's last expression so the notebook renders it
df.head(n=5)

Unnamed: 0,Importance_of_Fruits
0,پھلوں اور سبزیوں کا روزمرہ کی خوراک میں شامل ہ...


# Sentence Tokenization

In [None]:
import re
import pandas as pd

# Workbook that holds the source paragraph
input_file = '/content/Benefits_of_fruits.xlsx'
df = pd.read_excel(input_file)

# The paragraph is read from row 0 / column 0 of the first sheet
text = df.iloc[0, 0]  # Adjust indexing if the text is in a different cell

# Echo the paragraph so a wrong cell is noticed immediately
print("Loaded text:\n", text)

# A sentence boundary is whitespace that follows an Urdu full stop or question
# mark and precedes another Arabic-script character.
sentence_boundary = re.compile(r'(?<=[۔؟])\s+(?=[\u0600-\u06FF])')

# Trim every fragment and drop the ones that end up empty
sentences = []
for fragment in sentence_boundary.split(text):
    fragment = fragment.strip()
    if fragment:
        sentences.append(fragment)

# Show the numbered sentences for a visual check
print("\nTokenized sentences:")
for number, sentence in enumerate(sentences, start=1):
    print(f"Sentence {number}: {sentence}")

# One sentence per row, in document order
sentences_df = pd.DataFrame({'Sentence': sentences})

# Persist the sentences for the next pipeline step
output_file = '/content/Tokenized sentences.xlsx'
sentences_df.to_excel(output_file, index=False)

print(f"\nSentences have been saved to {output_file}")


Loaded text:
 پھلوں اور سبزیوں کا روزمرہ کی خوراک میں شامل ہونا انسانی صحت کے لیے بے حد ضروری ہے کیونکہ یہ قدرتی وٹامنز، معدنیات، اور فائبر کے بہترین ذرائع ہیں، جو جسم کو غذائیت فراہم کرنے کے ساتھ مختلف بیماریوں سے بچاؤ اور جسمانی نظام کی بہتری میں اہم کردار ادا کرتے ہیں۔ ان میں موجود اینٹی آکسیڈنٹس جسم کو آزاد ریڈیکلز سے محفوظ رکھتے ہیں، جو کینسر اور دل کی بیماریوں کا باعث بن سکتے ہیں۔ فائبر سے بھرپور یہ غذائیں نظامِ ہاضمہ کو بہتر بناتی ہیں اور قبض جیسے مسائل سے نجات دلاتی ہیں۔ کم کیلوریز اور زیادہ غذائیت کی وجہ سے یہ وزن کم کرنے میں مددگار ہوتی ہیں اور دل کی صحت کو بہتر بناتی ہیں، کیونکہ یہ کولیسٹرول کی سطح کم کرنے اور بلڈ پریشر کو متوازن رکھنے میں معاون ثابت ہوتی ہیں۔ پھلوں اور سبزیوں میں موجود وٹامنز، جیسے وٹامن اے، وٹامن سی، اور وٹامن کے، جلد کی خوبصورتی اور آنکھوں کی صحت کو برقرار رکھتے ہیں، جبکہ معدنیات، جیسے پوٹاشیم اور میگنیشیم، ہڈیوں کو مضبوط اور صحت مند بناتے ہیں۔ ان تمام فوائد کی بنا پر پھلوں اور سبزیوں کو اپنی روزمرہ کی خوراک کا مستقل حصہ بنانا ایک صحت مند زندگی کے لیے لاز

# Punctuation Removal

In [None]:
import pandas as pd
import re

# Read back the sentences written by the tokenization step
file_path = '/content/Tokenized sentences.xlsx'
df = pd.read_excel(io=file_path)

# Helper that strips Urdu and Latin punctuation marks from one cell value
def remove_punctuation(text):
    """Return ``text`` with punctuation marks deleted.

    Values that are not strings (for example NaN from an empty cell) are
    returned unchanged.
    """
    if not isinstance(text, str):
        return text
    # Character class listing every punctuation mark to delete
    punctuation = re.compile(r'[۔،!؟“”\'"\'`()\-—؛:;]')
    return punctuation.sub('', text)

# Clean every value of the 'Sentence' column element by element
df['Sentence'] = df['Sentence'].map(remove_punctuation)

# Write the cleaned sentences to their own workbook for the scoring step
df.to_excel('/content/Punctuation_removal.xlsx', index=False)

print("Punctuation removed and data saved.")


Punctuation removed and data saved.


# Sentence scoring

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Load the punctuation-free sentences produced by the previous step
data = pd.read_excel('/content/Punctuation_removal.xlsx')

# A sentence that consisted only of punctuation is read back from Excel as NaN,
# which the vectorizer cannot process. Treat it as an empty document instead;
# it then simply receives a score of 0.
sentences = data['Sentence'].fillna('').astype(str).tolist()

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Fit on all sentences and get one TF-IDF row per sentence
tfidf_matrix = vectorizer.fit_transform(sentences)

# Score = sum of the TF-IDF weights in each row. np.asarray(...).ravel()
# flattens the row sums whether they come back as np.matrix or as ndarray,
# unlike `.A1`, which exists only on np.matrix.
sentence_scores = np.asarray(tfidf_matrix.sum(axis=1)).ravel()

# Pair every sentence with its score, keeping document order
scored_sentences_df = pd.DataFrame({
    'Sentence': sentences,
    'Score': sentence_scores
})

# Save the scored sentences for the selection step
scored_sentences_file_path = '/content/Sentence_scoring.xlsx'
scored_sentences_df.to_excel(scored_sentences_file_path, index=False)

print(f"Scored sentences saved to: {scored_sentences_file_path}")


Scored sentences saved to: /content/Sentence_scoring.xlsx


# Sentence selection

In [None]:
import pandas as pd
import os

# File paths
scored_sentences_file_path = '/content/Sentence_scoring.xlsx'
selected_sentences_file_path ='/content/Selected sentences.xlsx'

# Parameters
top_n = 3  # Adjust based on how many sentences you want in your summary
score_column = 'Score'

try:
    # Fail with a clear message when the scoring step has not been run
    if not os.path.isfile(scored_sentences_file_path):
        raise FileNotFoundError(f"File not found: {scored_sentences_file_path}")

    # Load the scored sentences from the previous step
    scored_sentences_df = pd.read_excel(scored_sentences_file_path)
    print(f"Columns in the DataFrame: {scored_sentences_df.columns}")

    # Check if DataFrame is empty
    if scored_sentences_df.empty:
        raise ValueError("The DataFrame is empty.")

    # Check if the Score column is present and numeric
    if score_column not in scored_sentences_df.columns:
        raise ValueError(f"The '{score_column}' column is missing from the input file.")

    if not pd.api.types.is_numeric_dtype(scored_sentences_df[score_column]):
        raise ValueError(f"The '{score_column}' column must contain numeric values.")

    # Pick the top N sentences by score, then put them back in document order.
    # nlargest() alone returns rows sorted by score, and the next step joins
    # them as-is, which can scramble the order of the summary. The row index
    # read from the workbook reflects the original sentence order.
    selected_sentences_df = scored_sentences_df.nlargest(top_n, score_column).sort_index()

    # Save the selected sentences to a new Excel file
    selected_sentences_df.to_excel(selected_sentences_file_path, index=False)
    print(f"Selected sentences saved to {selected_sentences_file_path}")

except FileNotFoundError as fnf_error:
    print(fnf_error)
except ValueError as ve:
    print(f"ValueError: {ve}")
except Exception as e:
    print(f"An error occurred: {e}")


Columns in the DataFrame: Index(['Sentence', 'Score'], dtype='object')
Selected sentences saved to /content/Selected sentences.xlsx


# Extractive summary generation

In [None]:
import pandas as pd

# Read the sentences chosen by the selection step
input_file_path = '/content/Selected sentences.xlsx'
selected_sentences_df = pd.read_excel(input_file_path)

# Quick look at what was loaded
print("DataFrame Columns:", selected_sentences_df.columns)
print("DataFrame Head:")
print(selected_sentences_df.head())

if 'Sentence' not in selected_sentences_df.columns:
    # Nothing to join without the sentence column
    print("Column 'Sentence' does not exist in the DataFrame.")
else:
    # Join the sentences, in row order, into one space-separated summary
    final_summary = ' '.join(selected_sentences_df['Sentence'])
    print("Final Summary:")
    print(final_summary)

    # Single-row frame that carries the summary text
    final_summary_df = pd.DataFrame({'Final Summary': [final_summary]})

    # Write the summary; a failed write is reported instead of raised
    final_summary_file_path = '/content/Extractive_summary.xlsx'
    try:
        final_summary_df.to_excel(final_summary_file_path, index=False, engine='openpyxl')
        print(f"Summary saved to {final_summary_file_path}")
    except Exception as e:
        print("Error saving file:", e)


DataFrame Columns: Index(['Sentence', 'Score'], dtype='object')
DataFrame Head:
                                            Sentence     Score
0  پھلوں اور سبزیوں کا روزمرہ کی خوراک میں شامل ہ...  6.498750
1  کم کیلوریز اور زیادہ غذائیت کی وجہ سے یہ وزن ک...  4.979412
2  ان تمام فوائد کی بنا پر پھلوں اور سبزیوں کو اپ...  4.895088
Final Summary:
پھلوں اور سبزیوں کا روزمرہ کی خوراک میں شامل ہونا انسانی صحت کے لیے بے حد ضروری ہے کیونکہ یہ قدرتی وٹامنز معدنیات اور فائبر کے بہترین ذرائع ہیں جو جسم کو غذائیت فراہم کرنے کے ساتھ مختلف بیماریوں سے بچاؤ اور جسمانی نظام کی بہتری میں اہم کردار ادا کرتے ہیں کم کیلوریز اور زیادہ غذائیت کی وجہ سے یہ وزن کم کرنے میں مددگار ہوتی ہیں اور دل کی صحت کو بہتر بناتی ہیں کیونکہ یہ کولیسٹرول کی سطح کم کرنے اور بلڈ پریشر کو متوازن رکھنے میں معاون ثابت ہوتی ہیں ان تمام فوائد کی بنا پر پھلوں اور سبزیوں کو اپنی روزمرہ کی خوراک کا مستقل حصہ بنانا ایک صحت مند زندگی کے لیے لازمی ہے
Summary saved to /content/Extractive_summary.xlsx


In [None]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


# Add Punctuation

In [None]:
import pandas as pd
import re

def add_punctuation(text):
    """Re-insert Urdu sentence punctuation into punctuation-free text.

    A full stop ('۔') is placed after every standalone 'ہے' / 'ہیں' that is not
    already followed by a sentence terminator. Spacing around terminators is
    then normalised, and the result always ends with a terminator.

    Args:
        text: The string to punctuate.

    Returns:
        The punctuated string with single spaces between words. Empty or
        whitespace-only input yields an empty string.
    """
    # Strip any leading/trailing whitespace
    text = text.strip()
    if not text:
        # Nothing to punctuate; do not produce a lone full stop
        return text

    # Add a full stop after 'ہے' / 'ہیں'. The negative look-ahead skips
    # occurrences that already carry a terminator (including the Urdu
    # question mark '؟'), so no doubled '۔۔' is produced.
    text = re.sub(r'\b(ہے|ہیں)\b(?!\s*[۔!?؟])', r'\1۔', text)

    # Remove whitespace before a terminator
    text = re.sub(r'\s+([۔!?؟])', r'\1', text)

    # Collapse every whitespace run to a single space
    text = re.sub(r'\s+', ' ', text).strip()

    # Terminate the text last, once the insertions above are done, so the
    # final sentence gets exactly one closing mark
    if not text.endswith(('۔', '!', '?', '؟')):
        text += '۔'

    return text

# Path to the input Excel file
file_path = '/content/Extractive_summary.xlsx'

# Load the extractive summary written by the previous step
df = pd.read_excel(file_path)

# 'Final Summary' is the column that receives the punctuation
if 'Final Summary' in df.columns:
    df['Final Summary'] = df['Final Summary'].astype(str).apply(add_punctuation)

    # Save the updated DataFrame to a new Excel file
    output_file_path = '/content/Final_Example1_summary.xlsx'
    df.to_excel(output_file_path, index=False)
    print(f"Updated file saved to {output_file_path}")
else:
    # The message names the column that is actually checked above
    print("The 'Final Summary' column does not exist in the provided Excel file.")


Updated file saved to /content/Final_Example1_summary.xlsx


# Extractive summary evaluation

In [None]:
from rouge import Rouge
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score

# Normalise text: lower-case it and collapse whitespace runs to single spaces
def preprocess_text(text):
    """Return ``text`` lower-cased, trimmed, with words separated by one space."""
    words = text.lower().split()
    return ' '.join(words)

# Load the source paragraph (used as the reference) and the punctuated
# extractive summary produced by the previous step
original_df = pd.read_excel('/content/Benefits_of_fruits.xlsx')
generated_df = pd.read_excel('/content/Final_Example1_summary.xlsx')

# Strip stray whitespace from the column names of both frames
original_df.columns = original_df.columns.str.strip()
generated_df.columns = generated_df.columns.str.strip()

# Ensure the data is aligned; each message names the column that is actually checked
assert 'Importance_of_Fruits' in original_df.columns, "The column 'Importance_of_Fruits' is not present in the original dataframe."
assert 'Final Summary' in generated_df.columns, "The column 'Final Summary' is not present in the generated dataframe."
assert len(original_df) == len(generated_df), "The number of rows in both dataframes must be the same."

# Extract and preprocess the summaries
original_summaries = [preprocess_text(summary) for summary in original_df['Importance_of_Fruits']]
generated_summaries = [preprocess_text(summary) for summary in generated_df['Final Summary']]

# Initialize the ROUGE evaluator
rouge = Rouge()

# Calculate ROUGE scores (hypotheses first, references second), averaged over rows
scores = rouge.get_scores(generated_summaries, original_summaries, avg=True)

# Print the ROUGE scores
print(f"ROUGE-1: Precision: {scores['rouge-1']['p']:.4f}, Recall: {scores['rouge-1']['r']:.4f}, F1: {scores['rouge-1']['f']:.4f}")
print(f"ROUGE-2: Precision: {scores['rouge-2']['p']:.4f}, Recall: {scores['rouge-2']['r']:.4f}, F1: {scores['rouge-2']['f']:.4f}")
print(f"ROUGE-L: Precision: {scores['rouge-l']['p']:.4f}, Recall: {scores['rouge-l']['r']:.4f}, F1: {scores['rouge-l']['f']:.4f}")

# Extract the original and generated text for tokenization
y_true = original_df['Importance_of_Fruits']
y_pred = generated_df['Final Summary']

# Tokenize the summaries into sets of whitespace-separated words
y_true_tokens = y_true.str.split().apply(set)
y_pred_tokens = y_pred.str.split().apply(set)

# Combine token sets to create a unified set of all unique tokens
all_tokens = set()
for tokens in y_true_tokens:
    all_tokens.update(tokens)
for tokens in y_pred_tokens:
    all_tokens.update(tokens)

# Build a 0/1 membership vector over the shared vocabulary
def create_binary_labels(tokens_list, all_tokens):
    """Return one flag per entry of ``all_tokens``: 1 if it is in ``tokens_list``, else 0."""
    return [int(token in tokens_list) for token in all_tokens]

# Create binary label vectors for true and predicted summaries
y_true_bin = [create_binary_labels(tokens, all_tokens) for tokens in y_true_tokens]
y_pred_bin = [create_binary_labels(tokens, all_tokens) for tokens in y_pred_tokens]

# Flatten the binary labels for all summaries
y_true_bin_flat = [item for sublist in y_true_bin for item in sublist]
y_pred_bin_flat = [item for sublist in y_pred_bin for item in sublist]

# Token-level precision / recall / F1 for the positive class ("token present").
# 'micro' averaging over both classes collapses all three metrics into plain
# accuracy, so 'binary' is used instead. zero_division=0 avoids warnings when
# one side has no tokens.
precision = precision_score(y_true_bin_flat, y_pred_bin_flat, average='binary', zero_division=0)
recall = recall_score(y_true_bin_flat, y_pred_bin_flat, average='binary', zero_division=0)
f1 = f1_score(y_true_bin_flat, y_pred_bin_flat, average='binary', zero_division=0)

# Report the token-level metrics
print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}')

from nltk.translate.bleu_score import corpus_bleu

# corpus_bleu expects a list of reference lists per hypothesis
reference_tokenized = [[ref.split()] for ref in original_summaries]
generated_tokenized = [gen.split() for gen in generated_summaries]

# Calculate BLEU score
bleu_score = corpus_bleu(reference_tokenized, generated_tokenized)
print(f'BLEU Score: {bleu_score}')



ROUGE-1: Precision: 0.9610, Recall: 0.6549, F1: 0.7789
ROUGE-2: Precision: 0.9009, Recall: 0.5587, F1: 0.6897
ROUGE-L: Precision: 0.9481, Recall: 0.6460, F1: 0.7684
BLEU Score: 0.4391315508064183


# Now Generate Abstractive Summary

In [None]:
import pandas as pd
from transformers import MBartTokenizer, MBartForConditionalGeneration
import torch

# The "facebook/mbart-large-50" checkpoint is saved with MBart50Tokenizer.
# Loading it through MBartTokenizer triggers a class-mismatch warning and may
# tokenize incorrectly, so the matching class is imported here.
from transformers import MBart50Tokenizer

# Initialize tokenizer; src_lang tags the input as Urdu
tokenizer = MBart50Tokenizer.from_pretrained("facebook/mbart-large-50", src_lang="ur_PK")

def tokenize_text(text, max_length=512):
    """Tokenize Urdu text using the mBART tokenizer.

    Args:
        text: The text to encode.
        max_length: Length the encoding is truncated and padded to.

    Returns:
        The tokenizer output as PyTorch tensors.
    """
    return tokenizer(text, truncation=True, padding="max_length", max_length=max_length, return_tensors='pt')

# Read the texts to summarise from the first column of a workbook
def load_example_text(file_path):
    """Return the first column of the Excel file at ``file_path`` as a list."""
    frame = pd.read_excel(file_path)
    first_column = frame.columns[0]
    return frame[first_column].tolist()

# Texts to summarise: the punctuated extractive summary
example_file_path = '/content/Final_Example1_summary.xlsx'
example_texts = load_example_text(example_file_path)

# Use the GPU when one is available, otherwise the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Load the pretrained mBART-50 weights and move the model to that device
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50")
model = model.to(device)

# Function to generate summary
def generate_summary(input_text, max_length=512):
    """Generate a summary of the input Urdu text.

    Args:
        input_text: The text to summarise.
        max_length: Cap used both for the encoded input length and for the
            generated sequence length.

    Returns:
        The decoded output string with special tokens removed.
    """
    # NOTE(review): "facebook/mbart-large-50" appears to be the pretrained
    # checkpoint, not one fine-tuned for summarisation - confirm the output is
    # more than a copy of the input.
    inputs = tokenize_text(input_text, max_length=max_length)
    input_ids = inputs['input_ids'].to(device)
    # The input is padded to max_length, so pass the mask explicitly to keep
    # the encoder from attending to padding positions
    attention_mask = inputs['attention_mask'].to(device)
    summary_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        min_length=50,  # Ensure a minimum length for the summary
        num_beams=8,    # Increased beam search
        length_penalty=1.2,  # Adjust length penalty for better summary length
        no_repeat_ngram_size=2,
        early_stopping=True
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Summarise every text loaded from the example file, in order
summaries = list(map(generate_summary, example_texts))

# Pair each source text with its generated summary
summary_columns = {
    'source_text': example_texts,
    'generated_summary': summaries
}
summary_df = pd.DataFrame(summary_columns)

# Write the pairs to their own workbook
output_file_path = '/content/Abstractive_Summary_example1.xlsx'
summary_df.to_excel(output_file_path, index=False)

print(f"Summaries have been saved to {output_file_path}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MBart50Tokenizer'. 
The class this function is called from is 'MBartTokenizer'.


pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

Summaries have been saved to /content/Abstractive_Summary_example1.xlsx


# Evaluation metrics of final summary




In [None]:
pip install bert-score


Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert-score
Successfully installed bert-score-0.3.13


In [None]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score
from rouge import Rouge
from nltk.translate.bleu_score import corpus_bleu
import bert_score
from bert_score import score as bert_score

# Normalise text: lower-case it and collapse whitespace runs to single spaces
def preprocess_text(text):
    """Return ``text`` lower-cased, trimmed, with words separated by one space."""
    words = text.lower().split()
    return ' '.join(words)

# Reference: the punctuated extractive summary. Candidate: the mBART output.
original_df = pd.read_excel('/content/Final_Example1_summary.xlsx')
generated_df = pd.read_excel('/content/Abstractive_Summary_example1.xlsx')

# Drop stray whitespace around the column names of both frames
original_df.columns = original_df.columns.str.strip()
generated_df.columns = generated_df.columns.str.strip()

# Both frames must expose the expected columns and have matching row counts
assert 'Final Summary' in original_df.columns, "The column 'Final Summary' is not present in the original dataframe."
assert 'generated_summary' in generated_df.columns, "The column 'generated_summary' is not present in the generated dataframe."
assert len(original_df) == len(generated_df), "The number of rows in both dataframes must be the same."

# Normalised text of every reference and candidate
original_summaries = list(map(preprocess_text, original_df['Final Summary']))
generated_summaries = list(map(preprocess_text, generated_df['generated_summary']))

# ROUGE over all rows (hypotheses first, references second), averaged
rouge = Rouge()
scores = rouge.get_scores(generated_summaries, original_summaries, avg=True)

# Report precision / recall / F1 for each ROUGE variant
print(f"ROUGE-1: Precision: {scores['rouge-1']['p']:.4f}, Recall: {scores['rouge-1']['r']:.4f}, F1: {scores['rouge-1']['f']:.4f}")
print(f"ROUGE-2: Precision: {scores['rouge-2']['p']:.4f}, Recall: {scores['rouge-2']['r']:.4f}, F1: {scores['rouge-2']['f']:.4f}")
print(f"ROUGE-L: Precision: {scores['rouge-l']['p']:.4f}, Recall: {scores['rouge-l']['r']:.4f}, F1: {scores['rouge-l']['f']:.4f}")

# Raw (un-normalised) columns used for the token-level comparison
y_true = original_df['Final Summary']
y_pred = generated_df['generated_summary']

# One set of whitespace-separated words per row
y_true_tokens = y_true.str.split().apply(set)
y_pred_tokens = y_pred.str.split().apply(set)

# Shared vocabulary: every word seen in any reference or candidate
all_tokens = set()
for tokens in [*y_true_tokens, *y_pred_tokens]:
    all_tokens.update(tokens)

# Build a 0/1 membership vector over the shared vocabulary
def create_binary_labels(tokens_list, all_tokens):
    """Return one flag per entry of ``all_tokens``: 1 if it is in ``tokens_list``, else 0."""
    return [int(token in tokens_list) for token in all_tokens]

# Create binary label vectors for true and predicted summaries
y_true_bin = [create_binary_labels(tokens, all_tokens) for tokens in y_true_tokens]
y_pred_bin = [create_binary_labels(tokens, all_tokens) for tokens in y_pred_tokens]

# Flatten the binary labels for all summaries
y_true_bin_flat = [item for sublist in y_true_bin for item in sublist]
y_pred_bin_flat = [item for sublist in y_pred_bin for item in sublist]

# Token-level precision / recall / F1 for the positive class ("token present").
# 'micro' averaging over both classes collapses all three metrics into plain
# accuracy (they always come out identical), so 'binary' is used instead.
# zero_division=0 avoids warnings when one side has no tokens.
precision = precision_score(y_true_bin_flat, y_pred_bin_flat, average='binary', zero_division=0)
recall = recall_score(y_true_bin_flat, y_pred_bin_flat, average='binary', zero_division=0)
f1 = f1_score(y_true_bin_flat, y_pred_bin_flat, average='binary', zero_division=0)

print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}')

# Calculate BLEU score; corpus_bleu expects a list of reference lists per hypothesis
reference_tokenized = [[ref.split()] for ref in original_summaries]
generated_tokenized = [gen.split() for gen in generated_summaries]
bleu_score = corpus_bleu(reference_tokenized, generated_tokenized)
print(f'BLEU Score: {bleu_score:.4f}')

# Calculate BERTScore. The summaries are Urdu, so pass lang="ur". lang="en"
# loads the English-only roberta-large model, whose similarity scores are not
# meaningful for Urdu text.
P, R, F1 = bert_score(generated_summaries, original_summaries, lang="ur", verbose=True)

# Print the BERTScore
print(f"BERTScore - Precision: {P.mean():.4f}, Recall: {R.mean():.4f}, F1: {F1.mean():.4f}")


ROUGE-1: Precision: 1.0000, Recall: 0.5844, F1: 0.7377
ROUGE-2: Precision: 1.0000, Recall: 0.4595, F1: 0.6296
ROUGE-L: Precision: 1.0000, Recall: 0.5844, F1: 0.7377
Precision: 0.5844, Recall: 0.5844, F1: 0.5844
BLEU Score: 0.2704


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 19.20 seconds, 0.05 sentences/sec
BERTScore - Precision: 0.9510, Recall: 0.9285, F1: 0.9396


In [None]:
# Required Libraries
from rouge import Rouge
import pandas as pd

# Normalise text: lower-case it and collapse whitespace runs to single spaces
def preprocess_text(text):
    """Return ``text`` lower-cased, trimmed, with words separated by one space."""
    words = text.lower().split()
    return ' '.join(words)

# Reference: the punctuated extractive summary. Candidate: the mBART output.
original_df = pd.read_excel('/content/Final_Example1_summary.xlsx')  # Original summaries
generated_df = pd.read_excel('/content/Abstractive_Summary_example1.xlsx')  # Generated summaries

# Drop stray whitespace around the column names of both frames
original_df.columns = original_df.columns.str.strip()
generated_df.columns = generated_df.columns.str.strip()

# Both frames must expose the expected columns and have matching row counts
assert 'Final Summary' in original_df.columns, "The column 'Final Summary' is not present in the original dataframe."
assert 'generated_summary' in generated_df.columns, "The column 'generated_summary' is not present in the generated dataframe."
assert len(original_df) == len(generated_df), "The number of rows in both dataframes must be the same."

# Normalised text of every reference and candidate
original_summaries = list(map(preprocess_text, original_df['Final Summary']))
generated_summaries = list(map(preprocess_text, generated_df['generated_summary']))

# ROUGE over all rows (hypotheses first, references second), averaged
rouge = Rouge()
scores = rouge.get_scores(generated_summaries, original_summaries, avg=True)


def _mean_prf_percent(metric):
    """Return the plain mean of a metric's precision, recall and F1 as a percentage."""
    return (metric['p'] + metric['r'] + metric['f']) / 3 * 100


# One combined figure per ROUGE variant
rouge1_percentage = _mean_prf_percent(scores['rouge-1'])
rouge2_percentage = _mean_prf_percent(scores['rouge-2'])
rougeL_percentage = _mean_prf_percent(scores['rouge-l'])

# Print overall percentages
print(f"Overall ROUGE-1: {rouge1_percentage:.2f}%")
print(f"Overall ROUGE-2: {rouge2_percentage:.2f}%")
print(f"Overall ROUGE-L: {rougeL_percentage:.2f}%")


Overall ROUGE-1: 77.40%
Overall ROUGE-2: 69.64%
Overall ROUGE-L: 77.40%
