In [8]:
# ROGUE SCORE T5
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration
from rouge_score import rouge_scorer

# Load pre-trained T5 model and tokenizer
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Load CSV file

#csv_file_path = "testing_dataset.csv"
#csv_file_path = "gen_testing.csv"
csv_file_path = "math_testing.csv"
df = pd.read_csv(csv_file_path)

# Calculate ROUGE metrics
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Accumulate ROUGE scores
total_rouge_scores = {'rouge1': 0, 'rouge2': 0, 'rougeL': 0}
num_entries = 0

for index, row in df.iterrows():
    try:
        source_text = row['source_text']
        reference_summary = row['reference_summary']

        # Check if the text is a string
        if not isinstance(source_text, str) or not isinstance(reference_summary, str):
            print(f"Skipping entry at index {index} due to missing or non-text data.")
            continue

        # Tokenize input text
        input_ids = tokenizer.encode(source_text, return_tensors="pt")

        # Generate output text
        output_ids = model.generate(input_ids)
        output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

        # Calculate ROUGE metrics
        scores = scorer.score(reference_summary, output_text)

        # Accumulate scores
        for rouge_type in total_rouge_scores:
            total_rouge_scores[rouge_type] += scores[rouge_type].fmeasure

        num_entries += 1

    except Exception as e:
        print(f"Error processing entry at index {index}: {e}")
        continue

# Calculate average ROUGE scores
average_rouge_scores = {rouge_type: total_rouge_scores[rouge_type] / num_entries for rouge_type in total_rouge_scores}

# Print average scores
print("\nAverage ROUGE Scores:")
print("ROUGE-1: {:.4f}".format(average_rouge_scores['rouge1']))
print("ROUGE-2: {:.4f}".format(average_rouge_scores['rouge2']))
print("ROUGE-L: {:.4f}".format(average_rouge_scores['rougeL']))


Token indices sequence length is longer than the specified maximum sequence length for this model (2074 > 512). Running this sequence through the model will result in indexing errors



Average ROUGE Scores:
ROUGE-1: 0.1049
ROUGE-2: 0.0323
ROUGE-L: 0.0878


# F1 Score etc

In [17]:
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load pre-trained T5 model and tokenizer
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Load CSV file
#csv_file_path = "math_testing.csv" 
#csv_file_path = "testing_dataset.csv"
csv_file_path = "gen_testing.csv"
df = pd.read_csv(csv_file_path)

# Iterate over different threshold lengths
for threshold_length in range(20, 201, 25):
    print(f"\nThreshold Length: {threshold_length}")

    # Initialize lists to store binary labels and predicted labels
    binary_generated_summaries = []
    predicted_labels = []

    # Accumulate metrics
    num_entries = 0
    total_accuracy = 0.0
    total_precision = 0.0
    total_recall = 0.0
    total_f1 = 0.0

    for index, row in df.iterrows():
        try:
            source_text = row['source_text']
            reference_summary = row['reference_summary']

            # Check if the text is a string
            if not isinstance(source_text, str) or not isinstance(reference_summary, str):
                print(f"Skipping entry at index {index} due to missing or non-text data.")
                continue

            # Tokenize input text
            input_ids = tokenizer.encode(source_text, return_tensors="pt")

            # Generate output text
            output_ids = model.generate(input_ids)
            output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

            # Determine binary label based on the threshold length
            binary_generated_summary = 1 if len(output_text) > threshold_length else 0
            binary_generated_summaries.append(binary_generated_summary)

            # Append true label and predicted label to lists (for illustration purposes)
            # Note: Replace the following line with your actual comparison logic
            predicted_labels.append(1 if len(output_text) > threshold_length else 0)

            num_entries += 1

        except Exception as e:
            print(f"Error processing entry at index {index}: {e}")
            continue

    # Calculate metrics
    accuracy = accuracy_score(binary_generated_summaries, predicted_labels)
    precision = precision_score(binary_generated_summaries, predicted_labels)
    recall = recall_score(binary_generated_summaries, predicted_labels)
    f1 = f1_score(binary_generated_summaries, predicted_labels)

    # Print metrics
    print("Accuracy: {:.4f}".format(accuracy))
    print("Precision: {:.4f}".format(precision))
    print("Recall: {:.4f}".format(recall))
    print("F1 Score: {:.4f}".format(f1))



Threshold Length: 20


Token indices sequence length is longer than the specified maximum sequence length for this model (623 > 512). Running this sequence through the model will result in indexing errors


Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000

Threshold Length: 45




Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000

Threshold Length: 70




Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000

Threshold Length: 95




Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000

Threshold Length: 120


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


Accuracy: 1.0000
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Threshold Length: 145


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


Accuracy: 1.0000
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Threshold Length: 170


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


Accuracy: 1.0000
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Threshold Length: 195




Accuracy: 1.0000
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


# Extractive

In [24]:
import pandas as pd
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest
from rouge import Rouge  # Install this library using: pip install rouge

# Load CSV file
csv_file_path = "math_testing.csv" 
#csv_file_path = "testing_dataset.csv"
#csv_file_path = "gen_testing.csv"
df = pd.read_csv(csv_file_path)

# Tokenization and preprocessing
nlp = spacy.load('en_core_web_sm')

# Initialize ROUGE scorer
rouge = Rouge()

# Initialize variables for accumulating scores
total_rouge1 = 0.0
total_rouge2 = 0.0
total_rougeL = 0.0

for index, row in df.iterrows():
    source_text = row['source_text']
    reference_summary = row['reference_summary']

    # Tokenization and preprocessing
    doc = nlp(source_text)

    # Word frequencies
    word_frequencies = {}
    for word in doc:
        if word.text.lower() not in STOP_WORDS and word.text.lower() not in punctuation:
            word_frequencies[word.text] = word_frequencies.get(word.text, 0) + 1

    # Normalize frequencies
    max_frequency = max(word_frequencies.values())
    word_frequencies = {word: freq / max_frequency for word, freq in word_frequencies.items()}

    # Sentence scores
    sentence_tokens = [sent for sent in doc.sents]
    sentence_scores = {sent: sum(word_frequencies.get(word.text.lower(), 0) for word in sent) for sent in sentence_tokens}

    # Extractive summarization
    select_length = int(len(sentence_tokens) * 0.3)
    summary = nlargest(select_length, sentence_scores, key=sentence_scores.get)
    generated_summary = ' '.join(word.text for sent in summary for word in sent)

    # ROUGE scoring
    if isinstance(reference_summary, str) and isinstance(generated_summary, str) and generated_summary.strip():  # Check if the generated summary is not empty
        rouge_scores = rouge.get_scores(generated_summary, reference_summary)[0]

        # Accumulate scores
        total_rouge1 += rouge_scores['rouge-1']['f']
        total_rouge2 += rouge_scores['rouge-2']['f']
        total_rougeL += rouge_scores['rouge-l']['f']

        # Print results for each entry (optional)
#         print(f"\nEntry {index + 1}")
#         print("Source Text:", source_text)
#         print("Reference Summary:", reference_summary)
#         print("Generated Summary:", generated_summary)
#         print("ROUGE-1 F1 Score:", rouge_scores['rouge-1']['f'])
#         print("ROUGE-2 F1 Score:", rouge_scores['rouge-2']['f'])
#         print("ROUGE-L F1 Score:", rouge_scores['rouge-l']['f'])
    else:
        print(f"\nSkipping entry {index + 1} due to invalid reference or empty generated summary.")

# Calculate average scores
num_entries = len(df)
avg_rouge1 = total_rouge1 / num_entries
avg_rouge2 = total_rouge2 / num_entries
avg_rougeL = total_rougeL / num_entries

# Print average scores
print("\nAverage ROUGE Scores:")
print("ROUGE-1: {:.4f}".format(avg_rouge1))
print("ROUGE-2: {:.4f}".format(avg_rouge2))
print("ROUGE-L: {:.4f}".format(avg_rougeL))



Average ROUGE Scores:
ROUGE-1: 0.2579
ROUGE-2: 0.0853
ROUGE-L: 0.2346


# F1 score etc

In [34]:
import pandas as pd
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load CSV file
#csv_file_path = "math_testing.csv"
csv_file_path = "testing_dataset.csv"
# csv_file_path = "gen_testing.csv"
df = pd.read_csv(csv_file_path)

# Tokenization and preprocessing
nlp = spacy.load('en_core_web_sm')

# Initialize variables for accumulating scores
total_rouge1 = 0.0
total_rouge2 = 0.0
total_rougeL = 0.0
total_accuracy = 0.0
total_precision = 0.0
total_recall = 0.0
total_f1 = 0.0

# Set the range of threshold lengths
threshold_range = range(20, 201, 25)

for threshold_length in threshold_range:
    # Initialize variables for each threshold
    binary_generated_summaries = []
    true_labels = []

    for index, row in df.iterrows():
        source_text = row['source_text']
        reference_summary = row['reference_summary']

        # Tokenization and preprocessing
        doc = nlp(source_text)

        # Sentence scores
        sentence_tokens = [sent for sent in doc.sents]
        sentence_lengths = [len(sent) for sent in sentence_tokens]

        # Check if the reference summary is valid
        if isinstance(reference_summary, str):
            # Generate binary labels based on the threshold length
            binary_generated_summary = 1 if any(length > threshold_length for length in sentence_lengths) else 0
            binary_generated_summaries.append(binary_generated_summary)

            # Append true label to the list
            true_label = 1 if len(reference_summary) > threshold_length else 0
            true_labels.append(true_label)
        else:
            print(f"Skipping entry {index + 1} due to invalid reference format.")

    # Check if there are valid entries
    if true_labels:
        # Calculate metrics
        accuracy = accuracy_score(true_labels, binary_generated_summaries)

        # Set zero_division='warn' to handle the warning
        precision = precision_score(true_labels, binary_generated_summaries, zero_division='warn')
        recall = recall_score(true_labels, binary_generated_summaries)
        f1 = f1_score(true_labels, binary_generated_summaries)

        # Accumulate metrics
        total_accuracy += accuracy
        total_precision += precision
        total_recall += recall
        total_f1 += f1

        # Print results for each threshold (optional)
        print(f"\nThreshold Length: {threshold_length} - Results:")
        print("Accuracy: {:.4f}".format(accuracy))
        print("Precision: {:.4f}".format(precision))
        print("Recall: {:.4f}".format(recall))
        print("F1 Score: {:.4f}".format(f1))
    else:
        print(f"No valid entries for threshold length {threshold_length}.")

# Calculate average scores
num_thresholds = len(threshold_range)
avg_accuracy = total_accuracy / num_thresholds
avg_precision = total_precision / num_thresholds
avg_recall = total_recall / num_thresholds
avg_f1 = total_f1 / num_thresholds

# Print average scores
print("\nAverage Metrics:")
print("Average Accuracy: {:.4f}".format(avg_accuracy))
print("Average Precision: {:.4f}".format(avg_precision))
print("Average Recall: {:.4f}".format(avg_recall))
print("Average F1 Score: {:.4f}".format(avg_f1))


Skipping entry 42 due to invalid reference format.

Threshold Length: 20 - Results:
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000
Skipping entry 42 due to invalid reference format.

Threshold Length: 45 - Results:
Accuracy: 0.7449
Precision: 1.0000
Recall: 0.7449
F1 Score: 0.8538
Skipping entry 42 due to invalid reference format.

Threshold Length: 70 - Results:
Accuracy: 0.2959
Precision: 1.0000
Recall: 0.2959
F1 Score: 0.4567
Skipping entry 42 due to invalid reference format.

Threshold Length: 95 - Results:
Accuracy: 0.1020
Precision: 1.0000
Recall: 0.1020
F1 Score: 0.1852
Skipping entry 42 due to invalid reference format.

Threshold Length: 120 - Results:
Accuracy: 0.0204
Precision: 1.0000
Recall: 0.0204
F1 Score: 0.0400
Skipping entry 42 due to invalid reference format.

Threshold Length: 145 - Results:
Accuracy: 0.0102
Precision: 1.0000
Recall: 0.0102
F1 Score: 0.0202
Skipping entry 42 due to invalid reference format.

Threshold Length: 170 - Results:
Accurac