<a href="https://colab.research.google.com/github/RegNLP/RePASs/blob/main/RIRAG_FleshScores.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install textstat


Collecting textstat
  Downloading textstat-0.7.5-py3-none-any.whl.metadata (15 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.17.2-py3-none-any.whl.metadata (3.2 kB)
Collecting cmudict (from textstat)
  Downloading cmudict-1.0.32-py3-none-any.whl.metadata (3.6 kB)
Downloading textstat-0.7.5-py3-none-any.whl (105 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.3/105.3 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cmudict-1.0.32-py3-none-any.whl (939 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m939.4/939.4 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyphen-0.17.2-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m45.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, cmudict, textstat
Successfully installed cmudict-1.0.32 pyphen-0.17.2 textstat-0.7.5


In [24]:
import json
import csv
import os
import textstat

def _readability_scores(text, suffix):
    """Score ``text`` with the three textstat readability metrics.

    Args:
        text: The text to score.
        suffix: Column-name suffix, "Source" or "Simplified".

    Returns:
        A dict mapping the CSV column names (each ending in ``suffix``) to the
        Flesch-Kincaid grade, Flesch reading ease and SMOG index of ``text``.
    """
    return {
        f"Flesch_Kincaid_Grade_{suffix}": textstat.flesch_kincaid_grade(text),
        f"Flesch_Reading_Ease_{suffix}": textstat.flesch_reading_ease(text),
        f"SMOG_Index_{suffix}": textstat.smog_index(text),
    }


def _format_summary(averages):
    """Render the averaged scores as the text stored in ``average_scores.txt``."""
    return (
        "Average Readability Scores:\n"
        "Source Text:\n"
        f"Flesch-Kincaid Grade Level: {averages['Flesch_Kincaid_Grade_Source']:.2f}\n"
        f"Flesch Reading Ease: {averages['Flesch_Reading_Ease_Source']:.2f}\n"
        f"SMOG Index: {averages['SMOG_Index_Source']:.2f}\n\n"
        "Simplified Text:\n"
        f"Flesch-Kincaid Grade Level: {averages['Flesch_Kincaid_Grade_Simplified']:.2f}\n"
        f"Flesch Reading Ease: {averages['Flesch_Reading_Ease_Simplified']:.2f}\n"
        f"SMOG Index: {averages['SMOG_Index_Simplified']:.2f}\n"
    )


def main(
    input_json_file="/content/drive/Othercomputers/MBZUAI/MBZUAI/RIRAG System Submission/0_Baseline/retrieval_results.rank_fusion_bm25_answers.json",
    output_folder_path="/content/drive/Othercomputers/MBZUAI/MBZUAI/RIRAG Task 2 Simplification Evaluations",
    # Spelling kept as-is: this string is a path component, so correcting it
    # would silently redirect output away from any existing results folder.
    method_name="statictical_readability_scores",
    team_name="0_Baseline/rank_fusion",
):
    """Compare readability of retrieved passages against the generated answer.

    For every item in the input JSON the joined ``RetrievedPassages`` (source)
    and the ``Answer`` (simplified) are scored with Flesch-Kincaid grade,
    Flesch reading ease and SMOG index. Per-item scores are written to
    ``readability_scores.csv`` and their averages to ``average_scores.txt``,
    both under ``output_folder_path/method_name/team_name``. The averages are
    also printed.

    Items whose passages or answer are missing/empty are reported and left out
    of both files, so placeholder text never contributes to the averages.

    Args:
        input_json_file: Path to a JSON file holding one item or a list of
            items with "QuestionID", "RetrievedPassages" and "Answer" keys,
            e.g. "data/input.json".
        output_folder_path: Root folder for results, e.g. "results".
        method_name: Sub-folder naming the evaluation method.
        team_name: Sub-folder naming the evaluated system, e.g. "TeamA".

    Raises:
        OSError: If the input file cannot be read or the outputs cannot be
            written.
        json.JSONDecodeError: If the input file is not valid JSON.
    """
    final_output_folder = os.path.join(output_folder_path, method_name, team_name)
    os.makedirs(final_output_folder, exist_ok=True)

    with open(input_json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # A single JSON object is treated as a one-item list.
    if not isinstance(data, list):
        data = [data]

    metric_columns = [
        "Flesch_Kincaid_Grade_Source",
        "Flesch_Reading_Ease_Source",
        "SMOG_Index_Source",
        "Flesch_Kincaid_Grade_Simplified",
        "Flesch_Reading_Ease_Simplified",
        "SMOG_Index_Simplified",
    ]

    rows = []
    for item in data:
        question_id = item.get("QuestionID", "")
        # `or` also covers keys that are present but null in the JSON.
        raw_text = " ".join(item.get("RetrievedPassages") or []).strip()
        simplified_text = (item.get("Answer") or "").strip()

        if not raw_text:
            print(f"Skipping {question_id} due to empty RetrievedPassages")
        if not simplified_text:
            print(f"Skipping {question_id} due to empty Answer")
        if not raw_text or not simplified_text:
            # Really skip: scoring a placeholder sentence would distort the
            # source-vs-simplified comparison.
            continue

        rows.append({
            "QuestionID": question_id,
            **_readability_scores(raw_text, "Source"),
            **_readability_scores(simplified_text, "Simplified"),
        })

    # Write individual item results to a CSV file.
    csv_file_path = os.path.join(final_output_folder, "readability_scores.csv")
    with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=["QuestionID"] + metric_columns)
        writer.writeheader()
        writer.writerows(rows)

    # Average each metric over the scored items; all zeros when none were scored.
    count = len(rows)
    averages = {
        column: (sum(row[column] for row in rows) / count) if count else 0
        for column in metric_columns
    }

    # Write average scores to a TXT file and echo the same text to the console.
    summary = _format_summary(averages)
    txt_file_path = os.path.join(final_output_folder, "average_scores.txt")
    with open(txt_file_path, 'w', encoding='utf-8') as txtfile:
        txtfile.write(summary)

    print("\n--- Results ---")
    print(summary)

# Run the scoring pipeline when this cell/script is executed directly.
if __name__ == "__main__":
    main()



--- Results ---
Average Readability Scores:
Source Text:
Flesch-Kincaid Grade Level: 25.76
Flesch Reading Ease: -2.23
SMOG Index: 14.08

Simplified Text:
Flesch-Kincaid Grade Level: 17.86
Flesch Reading Ease: 14.60
SMOG Index: 19.08

