In [None]:
# Folder that holds the results JSON files. Swap which RES_FOLDER line is
# active to switch between the human-annotated and the synthetic result sets.
# RES_FOLDER = "/content/drive/MyDrive/Smruti-GEC-for-Gujarati/results/human-annotated/"
RES_FOLDER = "/content/drive/MyDrive/Smruti-GEC-for-Gujarati/results/synthetic/"
# Results file that the cells below load and score.
res_file = RES_FOLDER + "cot_with_m1&m2_k1=7_k2=1.json"

In [None]:
# Mount Google Drive so the /content/drive/MyDrive/... paths used in this notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

# Python2 Installation and Git clone

In [None]:
# Install a Python 2 interpreter; the scoring cell runs the m2scorer script with `python2`.
!apt -qq update -y
!apt -qq install python2 -y

In [None]:
# Sanity check: confirm the python2 executable is available before scoring.
!python2 --version

In [None]:
!git clone https://github.com/keisks/m2scorer.git

# Tokenizer

In [None]:
import re, json
stopwords = []


def GujaratiTokenizer(data, keep_stopwords=True):
    """Split Gujarati text into word and punctuation tokens.

    Punctuation marks are padded with spaces so that each becomes its own
    token, the ellipsis character is replaced by a space, curly quotes are
    normalised to their ASCII forms, and the text is finally split on spaces
    and hyphens (so hyphens themselves never appear in the output).

    Args:
        data: The text to tokenize.
        keep_stopwords: When False, tokens found in the module-level
            ``stopwords`` list are left out of the result.

    Returns:
        The non-empty tokens, in their original order.
    """
    # Order matters: padding runs before quote normalisation, so curly single
    # quotes (which are not in the padded set) stay attached to their word.
    substitutions = (
        (r'([”“.,;:\'\\"!?%#@*<>|\+\-\(\)])', r' \1 '),
        (r"   ", ' '),
        (r'…', " "),
        (r'[‘’]', "'"),
        (r"[”“]", r'"'),
    )
    text = data
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    pieces = re.split(r'[ -]', text)
    if keep_stopwords:
        return [piece for piece in pieces if piece]
    return [piece for piece in pieces if piece and piece not in stopwords]

# M$^2$

In [None]:
def load_json_data(filepath):
    """Read the UTF-8 encoded JSON file at ``filepath`` and return the parsed content."""
    with open(filepath, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed

In [None]:
# Load the selected results file; the cells below read the "input",
# "reference" and "prediction" fields of each entry in json_data.
filepath = res_file
json_data = load_json_data(filepath)

In [None]:
# Write one line per entry in the form "<source> || <ref 1> | <ref 2> ...".
# Whitespace inside each sentence (newlines, tabs, repeated spaces) is collapsed
# to single spaces so every entry occupies exactly one line; the M2 generator
# reads this file line by line and would otherwise see a broken record.
with open('parallel.txt', 'w', encoding='utf-8') as f:
    for entry in json_data:
        incorrect = ' '.join(entry["input"].split())
        references = entry["reference"]
        # A single reference may be stored as a bare string instead of a list.
        if not isinstance(references, list):
            references = [references]
        correct_sentences = [' '.join(s.split()) for s in references]
        line = incorrect + ' || ' + ' | '.join(correct_sentences) + '\n'
        f.write(line)

In [None]:
# Write one tokenized prediction per line, in the same order as json_data.
# Whitespace is collapsed first: the tokenizer only splits on spaces and hyphens,
# so a newline inside a (multi-line) prediction would otherwise end up in
# pred.txt and give the scorer more hypothesis lines than source sentences.
with open('pred.txt', 'w', encoding='utf-8') as f:
    for entry in json_data:
        predicted = ' '.join(entry["prediction"].split())
        predicted = ' '.join(GujaratiTokenizer(predicted))
        f.write(predicted + '\n')

print("Predicted values written to pred.txt")

In [None]:
import difflib
from collections import defaultdict

def tokenize_gu(text):
    """Tokenize ``text`` with ``GujaratiTokenizer`` using its default settings."""
    tokens = GujaratiTokenizer(text)
    return tokens

def generate_edits(src_tokens, ref_tokens, annotator_id):
    """Build the M2 ``A`` lines that turn ``src_tokens`` into ``ref_tokens``.

    The two token lists are aligned with ``difflib.SequenceMatcher`` and every
    non-equal opcode becomes one annotation line.

    Args:
        src_tokens: Tokens of the source sentence.
        ref_tokens: Tokens of one reference correction.
        annotator_id: Identifier written into the last field of each line.

    Returns:
        A list of strings of the form
        ``A <start> <end>|||<type>|||<correction>|||REQUIRED|||-NONE-|||<annotator_id>``;
        the list is empty when both token lists are identical.
    """
    # difflib opcode tag -> coarse error type written to the M2 file.
    error_types = {
        'replace': 'R:OTHER',
        'insert':  'M:OTHER',
        'delete':  'U:OTHER'
    }
    # autojunk=False: the default heuristic stops frequent tokens from seeding
    # matches once the reference has 200+ tokens, which can turn a one-token
    # change into a single replace edit spanning the whole sentence.
    matcher = difflib.SequenceMatcher(None, src_tokens, ref_tokens, autojunk=False)
    edits = []
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == 'equal':
            continue
        # Deletions have no replacement text; M2 marks that with -NONE-.
        corrections = ' '.join(ref_tokens[j1:j2]) if j1 < j2 else '-NONE-'
        err_type = error_types[tag]
        edits.append(f"A {i1} {i2}|||{err_type}|||{corrections}|||REQUIRED|||-NONE-|||{annotator_id}")
    return edits

def generate_m2_from_parallel(input_path, output_path="gold.m2"):
    """Convert a parallel source/reference file into an M2 annotation file.

    Every non-blank line of ``input_path`` is split at the first ``||`` into a
    source sentence and a reference part; the reference part is split on ``|``
    into individual references. For each sentence the output holds an
    ``S <tokens>`` line, the edit lines for every reference (annotator ids are
    the reference positions 0, 1, ...), and a terminating blank line. A
    reference identical to the source contributes a single ``noop`` edit.

    Args:
        input_path: Path of the parallel text file to read.
        output_path: Path of the M2 file to write.

    Raises:
        ValueError: If a non-blank line contains no ``||`` separator.
    """
    with open(input_path, 'r', encoding='utf-8') as reader:
        stripped = (raw.strip() for raw in reader)
        records = [record for record in stripped if record]

    # Everything is built in memory first, so the output file is only opened
    # once the whole input has been processed.
    blocks = []
    for record in records:
        source_part, reference_part = record.split('||', 1)
        source_tokens = tokenize_gu(source_part)
        candidates = (piece.strip() for piece in reference_part.split('|'))
        references = [candidate for candidate in candidates if candidate]

        annotations = []
        for annotator, reference in enumerate(references):
            reference_tokens = tokenize_gu(reference)
            found = generate_edits(source_tokens, reference_tokens, annotator_id=str(annotator))
            if found:
                annotations.extend(found)
            else:
                annotations.append(f"A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||{annotator}")

        blocks.append((f"S {' '.join(source_tokens)}", annotations))

    with open(output_path, 'w', encoding='utf-8') as writer:
        for header, annotations in blocks:
            writer.write(header + "\n")
            for annotation in annotations:
                writer.write(annotation + "\n")
            writer.write("\n")

    print(f"Generated {output_path} successfully.")

In [None]:
# Build the M2 file from the source/reference pairs in parallel.txt.
# NOTE: despite its name, hyp.m2 holds the reference (gold) edits; the system
# output that gets scored against it is pred.txt.
generate_m2_from_parallel("parallel.txt", "hyp.m2")

# Score with the M$^2$ scorer

In [None]:
# Number of entries in the loaded results file (one line each was written to pred.txt and parallel.txt).
len(json_data)

In [None]:
# Score pred.txt (system output) against hyp.m2 (gold edits).
# NOTE(review): the absolute /content/... paths assume the notebook's working
# directory is /content, because the earlier cells cloned m2scorer and wrote
# pred.txt / hyp.m2 using relative paths.
!python2 /content/m2scorer/scripts/m2scorer.py /content/pred.txt /content/hyp.m2

# Scratch: compare result files and count tokens

In [None]:
# prompt: Write the code to compare two json results files, by printing missmatched predictions,
# 29.86 32.47 32.74 33.21 32.15
def compare_predictions(file1_path, file2_path):
    """Print every entry whose prediction differs between two results files.

    Entries are paired by position. Predictions are compared after stripping
    surrounding whitespace; the input and reference shown for a mismatch are
    taken from the first file.

    Args:
        file1_path: Path of the first results JSON file.
        file2_path: Path of the second results JSON file.

    Returns:
        None. The report is printed.
    """
    data1 = load_json_data(file1_path)
    data2 = load_json_data(file2_path)

    # zip() stops at the shorter file; report that instead of silently
    # ignoring the tail (and possibly claiming that everything matches).
    if len(data1) != len(data2):
        print(f"Warning: the files have different lengths ({len(data1)} vs {len(data2)}); "
              f"only the first {min(len(data1), len(data2))} entries are compared.")

    mismatched_predictions = []

    for entry1, entry2 in zip(data1, data2):
        prediction1 = entry1["prediction"].strip()
        prediction2 = entry2["prediction"].strip()
        if prediction1 != prediction2:
            mismatched_predictions.append({
                "input": entry1["input"].strip(),
                "prediction_file1": prediction1,
                "prediction_file2": prediction2,
                # Stored as-is, whether it is a single string or a list of references.
                "reference": entry1["reference"]
            })

    if not mismatched_predictions:
        print("All predictions match between the two files.")
    else:
        print("Mismatched predictions found:")
        for mismatch in mismatched_predictions:
            print(f"Input: {mismatch['input']}")
            print(f"Prediction from file 1: {mismatch['prediction_file1']}")
            print(f"Prediction from file 2: {mismatch['prediction_file2']}")
            print(f"Reference(s): {mismatch['reference']}")
            print("-" * 20)

# Compare two result files from RES_FOLDER; edit the two file names below to
# choose which runs are compared.
file1_path = RES_FOLDER + "cot_with_m1&m2_k1=5_k2=2.json"
file2_path = RES_FOLDER + "cot_with_m1_k1=3.json"

compare_predictions(file1_path, file2_path)

In [None]:
import json
import os

def count_total_tokens(file_path):
    """Count ``GujaratiTokenizer`` tokens in a ``.json`` or ``.txt`` file.

    For ``.json`` files the content is iterated as a sequence of entries; the
    tokens of each entry's ``"input"`` (default ``""``) and of every
    ``"reference"`` (a string or a list of strings, default ``[]``) are
    counted. For ``.txt`` files the tokens of every non-blank line are counted.

    Args:
        file_path: Path of the file; the extension selects the format.

    Returns:
        The total number of tokens.

    Raises:
        ValueError: If the extension is neither ``.json`` nor ``.txt``.
    """
    extension = os.path.splitext(file_path)[1]
    if extension not in (".json", ".txt"):
        raise ValueError(f"Unsupported file format: {extension}")

    if extension == ".txt":
        with open(file_path, 'r', encoding='utf-8') as handle:
            stripped_lines = (raw.strip() for raw in handle)
            return sum(len(GujaratiTokenizer(text)) for text in stripped_lines if text)

    with open(file_path, 'r', encoding='utf-8') as handle:
        entries = json.load(handle)

    count = 0
    for record in entries:
        count += len(GujaratiTokenizer(record.get("input", "")))
        refs = record.get("reference", [])
        # A single reference may be stored as a bare string.
        ref_texts = [refs] if isinstance(refs, str) else refs
        count += sum(len(GujaratiTokenizer(ref)) for ref in ref_texts)
    return count

In [None]:
# Total token count (inputs plus references) of the validation set stored on the mounted Drive.
count_total_tokens("/content/drive/MyDrive/Smruti-GEC-for-Gujarati/data/validation_set.json")