In [None]:
# Base directory of the result files; the trailing "/" is relied on below.
RES_FOLDER = "/content/drive/MyDrive/Smruti-GEC-for-Gujarati/results/synthetic/"
# Full path of the result file to score (directory + file name, no separator added).
res_file = "".join((RES_FOLDER, "zs-gpt-4o-mini_results_2.json"))

# Installations

In [None]:
! pip install --upgrade --quiet nltk

In [None]:
import json
import nltk
import re
from nltk.translate.gleu_score import sentence_gleu, corpus_gleu

# Tokenizer

In [None]:
# Tokens dropped when GujaratiTokenizer is called with keep_stopwords=False.
stopwords = []


def GujaratiTokenizer(data, keep_stopwords=True):
    """Split a sentence into word and punctuation tokens.

    Processing order:
      1. Typographic quotes are normalised to their ASCII forms.
      2. The ellipsis character is replaced by a space (it yields no token).
      3. Each punctuation mark is padded with spaces so it becomes its own token.
      4. The text is split on whitespace and hyphens; hyphens yield no token.

    Args:
        data: The sentence string to tokenize.
        keep_stopwords: When False, tokens present in the module-level
            ``stopwords`` list are removed from the result.

    Returns:
        A list of non-empty token strings, in order of appearance.
    """
    # Normalise quotes BEFORE isolating punctuation; otherwise a curly single
    # quote stays glued to its word while a straight one is split off.
    data = re.sub(r'[‘’]', "'", data)
    data = re.sub(r'[”“]', '"', data)
    data = re.sub(r'…', ' ', data)

    # Surround every punctuation mark with spaces so it separates cleanly.
    data = re.sub(r'([.,;:\'\\"!?%#@*<>|+\-()])', r' \1 ', data)

    # Split on any whitespace (tabs, newlines and NBSP included, not just the
    # plain space) and on hyphens; empty pieces are discarded.
    tokens = [piece for piece in re.split(r'[\s-]+', data) if piece]

    if keep_stopwords:
        return tokens
    return [token for token in tokens if token not in stopwords]

In [None]:
def calculate_sentence_gleu(predicted_sentence, correct_sentences):
  """Calculates GLEU for one prediction against its reference sentence(s).

  Args:
    predicted_sentence: The predicted sentence string.
    correct_sentences: A list of correct sentence strings, or a single
      correct sentence string.

  Returns:
    The sentence-level GLEU score returned by nltk's sentence_gleu.
  """
  # A lone string would otherwise be iterated character by character below,
  # producing one bogus "reference" per character.
  if isinstance(correct_sentences, str):
    correct_sentences = [correct_sentences]

  tokenized_predicted = GujaratiTokenizer(predicted_sentence)
  tokenized_correct = [GujaratiTokenizer(s) for s in correct_sentences]
  return sentence_gleu(tokenized_correct, tokenized_predicted)

# Quick check: score one prediction against two reference sentences.
predicted = "મેં તેને સેક્રેટરી દ્વારા કહેવરવ્યું છે."
correct = [
    "મેં તેને સેક્રેટરી દ્વારા કહેવરાવ્યું છે.",
    "મેં તેને સેક્રેટરી દ્વારા કહેવડાવ્યું છે.",
]

gleu_score = calculate_sentence_gleu(
    predicted_sentence=predicted,
    correct_sentences=correct,
)
print(f"Sentence GLEU score: {gleu_score:.4f}")

# GLEU

In [None]:
def load_json_data(filepath):
    """Read the UTF-8 encoded JSON file at ``filepath`` and return the parsed object."""
    with open(filepath, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

def calculate_avg_gleu(data):
    """Average the sentence-level GLEU score over all entries.

    Args:
        data: Iterable of dicts, each with a "prediction" string and a
            "reference" that is either one sentence string or a list of
            sentence strings.

    Returns:
        The mean of the per-entry sentence_gleu scores, or 0 when ``data``
        has no entries.
    """
    total_gleu_score = 0
    num_sentences = 0

    for entry in data:
        predicted = entry["prediction"]
        correct_sentences = entry["reference"]

        # sentence_gleu expects a list of tokenized references (a list of
        # token lists). A single reference must therefore be wrapped;
        # passing its flat token list would make every token a separate
        # "reference" whose items are characters.
        if isinstance(correct_sentences, str):
            correct_sentences = [correct_sentences]

        tokenized_predicted = GujaratiTokenizer(predicted)
        tokenized_correct = [GujaratiTokenizer(s) for s in correct_sentences]

        total_gleu_score += sentence_gleu(tokenized_correct, tokenized_predicted)
        num_sentences += 1

    return total_gleu_score / num_sentences if num_sentences else 0

def calculate_corpus_gleu(data):
    """Compute a single corpus-level GLEU score over all entries.

    Args:
        data: Iterable of dicts, each with a "prediction" string and a
            "reference" that is either one sentence string or a list of
            sentence strings.

    Returns:
        The score returned by nltk's corpus_gleu for the tokenized
        references and predictions.
    """
    references = []
    hypotheses = []

    for entry in data:
        predicted = entry["prediction"]
        correct_sentences = entry["reference"]

        # corpus_gleu expects, for each hypothesis, a list of tokenized
        # references (a list of token lists). A single reference must be
        # wrapped; its flat token list would otherwise be read as many
        # one-word "references" made of characters.
        if isinstance(correct_sentences, str):
            correct_sentences = [correct_sentences]

        hypotheses.append(GujaratiTokenizer(predicted))
        references.append([GujaratiTokenizer(s) for s in correct_sentences])

    return corpus_gleu(references, hypotheses)

# Evaluate the results file

In [None]:
# Score the selected result file and print both GLEU aggregates.
try:
    filepath = res_file
    json_data = load_json_data(filepath)
    overall_gleu = calculate_avg_gleu(json_data)
    corpus_gleu_score = calculate_corpus_gleu(json_data)

    print(f"Sentence-level GLEU score (average): {overall_gleu:.8f}")
    print(f"Corpus-level GLEU score: {corpus_gleu_score:.8f}")

except FileNotFoundError:
    # Name the path so a wrong RES_FOLDER / file name is obvious.
    print(f"Error: JSON file not found: {filepath}")
except Exception as e:
    # Still best-effort (no re-raise), but include the exception type:
    # a bare KeyError message, for example, is only the missing key.
    print(f"An error occurred: {type(e).__name__}: {e}")
