<a href="https://colab.research.google.com/github/RegNLP/RePASs/blob/main/RIRAG_FluencyRelativeScore.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive into the Colab filesystem; the input and output paths
# used by main() below live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from torch.nn.utils.rnn import pad_sequence
import json
import csv
import os

class FluencyRelativeScore:
    """Relative fluency scorer based on GPT-2 next-token loss.

    Each text is tokenized, cut into fixed-size chunks, and every chunk is
    scored with the mean next-token cross-entropy under GPT-2. A text's loss
    is the mean of its chunk losses. ``score`` then compares the loss of a
    generated text against the loss of its source text.
    """

    # Label used to pad target sequences; positions carrying it are excluded
    # from both the loss and the per-chunk token count.
    _IGNORE_INDEX = -1

    def __init__(self, same_length=False, device=None):
        """Load GPT-2 and its tokenizer.

        Args:
            same_length: Stored on the instance; not used by the scoring code
                in this class.
            device: Device to run the model on (anything ``torch.device``
                accepts). When ``None``, CUDA is used if available, else CPU.
        """
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)
        self.model = GPT2LMHeadModel.from_pretrained("gpt2").to(self.device)
        self.tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
        # Half precision is only applied on GPU; on CPU the model stays in
        # float32.
        if self.device.type == "cuda":
            self.model.half()
        self.model.eval()
        self.same_length = same_length
        self.max_output_length = 80  # Model input length per chunk (BOS included)

    def split_into_chunks(self, text):
        """Tokenize ``text`` and split it into consecutive chunks.

        Each chunk holds at most ``max_output_length - 1`` tokens so that the
        chunk plus the prepended BOS token is ``max_output_length`` long.

        Returns:
            A list of token-id lists. An empty text yields one empty chunk
            (``[[]]``) so that it can still be scored (as BOS -> EOS) instead
            of producing an empty batch.
        """
        tokens = self.tokenizer.encode(text)
        if not tokens:
            return [[]]
        step = self.max_output_length - 1
        return [tokens[start:start + step] for start in range(0, len(tokens), step)]

    def preprocess_batch(self, decoded):
        """Build padded model inputs and targets for a list of texts.

        Args:
            decoded: Iterable of strings; the chunks of all texts are put into
                one batch.

        Returns:
            ``(inputs, targets, num_chunks)`` where ``inputs`` holds
            ``[BOS] + chunk`` rows, ``targets`` holds ``chunk + [EOS]`` rows
            (padded with ``_IGNORE_INDEX``), both moved to ``self.device``.
        """
        all_chunks = []
        for dec in decoded:
            all_chunks.extend(self.split_into_chunks(dec))  # Keep all chunks

        bos_id = self.tokenizer.bos_token_id
        eos_id = self.tokenizer.eos_token_id

        # The input padding value only has to be a valid token id: padded
        # positions are identified through the targets, never through this id.
        decs_inp = pad_sequence(
            [torch.tensor([bos_id] + chunk, dtype=torch.long) for chunk in all_chunks],
            batch_first=True, padding_value=0
        )
        decs_out = pad_sequence(
            [torch.tensor(chunk + [eos_id], dtype=torch.long) for chunk in all_chunks],
            batch_first=True, padding_value=self._IGNORE_INDEX
        )
        return decs_inp.to(self.device), decs_out.to(self.device), len(all_chunks)

    def text2loss(self, text):
        """Return the mean per-token loss over all chunks of ``text``.

        Args:
            text: List of strings scored together as one batch.

        Returns:
            A Python float: the average over chunks of each chunk's mean
            next-token cross-entropy.
        """
        txt_inp, txt_out, _ = self.preprocess_batch(text)

        with torch.no_grad():
            model_outputs = self.model(input_ids=txt_inp)
            # Compute the loss in float32 even when the model runs in half
            # precision, so the per-chunk sum does not lose accuracy.
            logits = model_outputs["logits"].float()
            crit = torch.nn.CrossEntropyLoss(ignore_index=self._IGNORE_INDEX, reduction='none')
            loss = crit(logits.reshape(-1, logits.size(-1)), txt_out.reshape(-1)).view(txt_out.shape)

            # Count real tokens from the targets. Counting inputs that differ
            # from the padding value would also drop genuine tokens whose id
            # equals that value and inflate the chunk's loss.
            token_count = (txt_out != self._IGNORE_INDEX).sum(dim=1)
            loss_per_chunk = loss.sum(dim=1) / token_count

        return loss_per_chunk.mean().item()  # Average loss across all chunks

    def score(self, sources, generateds, printing=False):
        """Compute relative fluency scores for source/generated text pairs.

        Args:
            sources: List of source texts.
            generateds: List of generated texts, aligned with ``sources``.
            printing: When true, print the resulting scores.

        Returns:
            A dict with ``"scores"`` (clamped to ``[0.001, 1.0]``),
            ``"sources_loss"`` and ``"generateds_loss"``, each a list of
            floats.
        """
        sources_score = torch.tensor([self.text2loss([src]) for src in sources])
        generateds_score = torch.tensor([self.text2loss([gen]) for gen in generateds])

        # Relative fluency score formula: 1.0 when the generated text is at
        # least as likely as the source, decreasing as its loss exceeds the
        # source loss.
        scores = (1.3 + sources_score - generateds_score) / 1.3
        scores = torch.clamp(scores, 0.001, 1.0).tolist()

        if printing:
            print("[fluency]", scores)
        return {"scores": scores, "sources_loss": sources_score.tolist(), "generateds_loss": generateds_score.tolist()}


def main(
    input_json_file="/content/drive/Othercomputers/MBZUAI/MBZUAI/RIRAG System Submission/0_Baseline/retrieval_results.passage_only_bm25_answers.json",
    output_folder_path="/content/drive/Othercomputers/MBZUAI/MBZUAI/RIRAG Task 2 Simplification Evaluations",
    method_name="gpt2_next_word_loss_fluency_score_chunked",
    team_name="0_Baseline/only_bm25",
):
    """Score every item of a JSON file and write per-item and average results.

    For each item, the joined ``RetrievedPassages`` are used as the source
    text and ``Answer`` as the generated text. Results are written to
    ``<output_folder_path>/<method_name>/<team_name>/`` as
    ``fluency_scores.csv`` (one row per item) and ``average_scores.txt``.

    Args:
        input_json_file: Path to the input JSON (a list of items, or a single
            item object).
        output_folder_path: Root folder for the results.
        method_name: Sub-folder naming the scoring method.
        team_name: Sub-folder naming the evaluated system.
    """
    final_output_folder = os.path.join(output_folder_path, method_name, team_name)
    os.makedirs(final_output_folder, exist_ok=True)

    with open(input_json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # A single JSON object is treated as a one-item dataset.
    if not isinstance(data, list):
        data = [data]

    scorer = FluencyRelativeScore()
    rows = []

    for item in data:
        # ``or`` also covers keys that are present but null in the JSON.
        retrieved_passages = item.get("RetrievedPassages") or []
        raw_text = " ".join(retrieved_passages)
        simplified_text = item.get("Answer") or ""

        results = scorer.score([raw_text], [simplified_text], printing=False)
        rows.append({
            "QuestionID": item.get("QuestionID", ""),
            "FluencyScore": results["scores"][0],
            "SourceLoss": results["sources_loss"][0],
            "GeneratedLoss": results["generateds_loss"][0],
        })

    csv_file_path = os.path.join(final_output_folder, "fluency_scores.csv")
    fieldnames = ["QuestionID", "FluencyScore", "SourceLoss", "GeneratedLoss"]
    with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)

    count = len(rows)
    if count > 0:
        avg_fluency_score = sum(row["FluencyScore"] for row in rows) / count
        avg_source_loss = sum(row["SourceLoss"] for row in rows) / count
        avg_generated_loss = sum(row["GeneratedLoss"] for row in rows) / count
    else:
        # No items: report zeros rather than dividing by zero.
        avg_fluency_score = avg_source_loss = avg_generated_loss = 0

    txt_file_path = os.path.join(final_output_folder, "average_scores.txt")
    with open(txt_file_path, 'w', encoding='utf-8') as txtfile:
        txtfile.write("Average Fluency Scores:\n")
        txtfile.write(f"Fluency Score: {avg_fluency_score:.4f}\n")
        txtfile.write(f"Source Loss: {avg_source_loss:.4f}\n")
        txtfile.write(f"Generated Loss: {avg_generated_loss:.4f}\n")


# Run the evaluation only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    main()


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (6511 > 1024). Running this sequence through the model will result in indexing errors
