# Experiment 1

In [30]:
# -------------------------------
# Required imports
# -------------------------------
import os
import subprocess
import sys

import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline

# Show the working directory so the relative paths used below can be checked.
# This has to run after `import os`: calling it first raises NameError on a
# fresh kernel (it only worked when `os` was left over from an earlier cell).
print(os.getcwd())

# -------------------------------
# Paths
# -------------------------------
csv_path = "./IndicGEC2025/Hindi/train.csv"
ssf_tokenizer_script = "./Tokenizer_for_Indian_Languages/tokenize_in_SSF_format_with_sentence_tokenization.py"

# -------------------------------
# Load ILID model
# -------------------------------
# The tokenizer is loaded from the MuRIL base checkpoint; the classifier
# weights are loaded from the ILID checkpoint further down.
tokenizer_model = "google/muril-base-cased"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_model)

# Device index handed to the pipeline: 0 when CUDA is available, otherwise -1.
if torch.cuda.is_available():
    device = 0
else:
    device = -1

model = AutoModelForSequenceClassification.from_pretrained("pruthwik/ilid-muril-model")
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, device=device)

# ILID class index -> language code; the tuple is listed in index order (0..24).
index_to_label_dict = dict(enumerate((
    'asm', 'ben', 'brx', 'doi', 'eng', 'gom',
    'guj', 'hin', 'kan', 'kas', 'mai', 'mal',
    'mar', 'mni_Beng', 'mni_Mtei', 'npi', 'ory',
    'pan', 'san', 'sat', 'snd_Arab', 'snd_Deva',
    'tam', 'tel', 'urd',
)))

# -------------------------------
# Function to tokenize a sentence using SSF tokenizer
# -------------------------------
def tokenize_sentence_ssf(sentence, lang="hi"):
    """
    Tokenize a single sentence using the external SSF tokenizer script.

    The sentence is written to a temporary input file, the script referenced by
    ``ssf_tokenizer_script`` is run on it with the current Python interpreter,
    and the second tab-separated column of every remaining output line is
    collected as a token.

    Args:
        sentence: Sentence text to tokenize.
        lang: Value passed to the script's ``--lang`` option (default ``"hi"``).

    Returns:
        A list of tokens (strings). Blank input yields an empty list.

    Raises:
        subprocess.CalledProcessError: if the tokenizer script exits non-zero.
    """
    import tempfile  # local import so the function does not depend on cell-level imports

    text = sentence.strip()
    if not text:
        # Nothing to tokenize: skip the subprocess round-trip entirely.
        return []

    # Use a private scratch directory per call. With fixed file names in the
    # working directory, files were never removed and a stale temp_output.txt
    # from a previous sentence could be read back if the script wrote nothing.
    with tempfile.TemporaryDirectory() as workdir:
        input_path = os.path.join(workdir, "temp_input.txt")
        output_path = os.path.join(workdir, "temp_output.txt")

        with open(input_path, "w", encoding="utf-8") as f:
            f.write(text + "\n")

        # Run tokenizer script; check=True turns a failing run into an exception.
        subprocess.run(
            [sys.executable, ssf_tokenizer_script,
             "--input", input_path,
             "--output", output_path,
             "--lang", lang],
            check=True
        )

        # Read tokens from output
        tokens = []
        with open(output_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                # Skip blank lines and lines starting with "*" or "("
                # (presumably non-token SSF lines -- confirm against the script).
                if not line or line.startswith(("*", "(")):
                    continue
                parts = line.split("\t")
                if len(parts) >= 2:
                    tokens.append(parts[1])
    return tokens

# -------------------------------
# Function to assign token-level language
# -------------------------------
def label_tokens(tokens, sentence_lang):
    """
    Assign a token-level language to every token.

    A non-empty token made up entirely of ASCII characters (English words,
    digits, ASCII punctuation) is labelled 'eng'. Every other token, including
    one that merely contains an ASCII character next to non-ASCII text,
    inherits the sentence-level language.

    Args:
        tokens: Iterable of token strings.
        sentence_lang: Language code predicted for the whole sentence.

    Returns:
        A list of ``{"token": ..., "lang": ...}`` dicts, one per token, in order.
    """
    labeled = []
    for t in tokens:
        # Require *all* characters to be ASCII. The previous `any(...)` test
        # labelled an Indic-script token as 'eng' as soon as it contained a
        # single ASCII character such as a hyphen or a digit.
        is_english = bool(t) and t.isascii()
        labeled.append({"token": t, "lang": "eng" if is_english else sentence_lang})
    return labeled

# -------------------------------
# Load CSV
# -------------------------------
df = pd.read_csv(csv_path)
input_sentences, output_sentences = (
    df[column].tolist() for column in ("Input sentence", "Output sentence")
)

# -------------------------------
# Process sentences
# -------------------------------
input_tokens_list, output_tokens_list = [], []

for sentence_pair in zip(input_sentences, output_sentences):
    # Sentence-level language for the input, then the output sentence.
    # The class index is the piece after the first underscore of the label.
    pair_langs = []
    for sentence in sentence_pair:
        predicted_label = pipe([sentence])[0]['label']
        class_index = int(predicted_label.split('_')[1])
        pair_langs.append(index_to_label_dict[class_index])

    # Tokenize both sentences (input first, then output).
    pair_tokens = [tokenize_sentence_ssf(sentence) for sentence in sentence_pair]

    # Attach token-level languages and store each side in its own list.
    destinations = (input_tokens_list, output_tokens_list)
    for destination, tokens, lang in zip(destinations, pair_tokens, pair_langs):
        destination.append(label_tokens(tokens, lang))

# -------------------------------
# Example: row 137
# -------------------------------
row_index = 136  # 137th row (0-based index)
for heading, sentences, labeled_rows in (
    ("137th Input sentence:", input_sentences, input_tokens_list),
    ("\n137th Output sentence:", output_sentences, output_tokens_list),
):
    print(heading, sentences[row_index])
    print("Tokens with language:", labeled_rows[row_index])


e:\Rucha_ws\GitHub\bhasha


Device set to use cpu


137th Input sentence: ‡§á‡§® ‡§∏‡§¨ ‡§ï‡•á ‡§ï‡§æ‡§∞‡§£ ‡§¶‡•á‡§ñ‡§æ ‡§ó‡§Ø‡§æ ‡§π‡•à ‡§ï‡§ø ‡§µ‡§æ‡§Ø‡•Å ‡§Æ‡•á‡§Ç ‡§Ö‡§®‡§æ‡§Ü‡§µ‡§∏‡•ç‡§Ø‡§ï ‡§ú‡§ø‡§∏‡•á ‡§ú‡•à‡§∏‡•á, ‡§ú‡•ç‡§Ø‡§æ‡§¶‡§æ ‡§Æ‡§æ‡§§‡•ç‡§∞‡§æ ‡§Æ‡•á‡§Ç ‡§ï‡§æ‡§∞‡•ç‡§¨‡§® ‡§°‡§æ‡§à ‡§Ö‡§ï‡•ç‡§∏‡§æ‡§á‡§° (CO‚ÇÇ), ‡§è‡§Ç‡§µ ‡§Ö‡§®‡•ç‡§Ø ‡§ö‡§ø‡§ú ‡§Ü ‡§ú‡§æ‡§§‡•Ä‡•§
Tokens with language: [{'token': '‡§á‡§®', 'lang': 'hin'}, {'token': '‡§∏‡§¨', 'lang': 'hin'}, {'token': '‡§ï‡•á', 'lang': 'hin'}, {'token': '‡§ï‡§æ‡§∞‡§£', 'lang': 'hin'}, {'token': '‡§¶‡•á‡§ñ‡§æ', 'lang': 'hin'}, {'token': '‡§ó‡§Ø‡§æ', 'lang': 'hin'}, {'token': '‡§π‡•à', 'lang': 'hin'}, {'token': '‡§ï‡§ø', 'lang': 'hin'}, {'token': '‡§µ‡§æ‡§Ø‡•Å', 'lang': 'hin'}, {'token': '‡§Æ‡•á‡§Ç', 'lang': 'hin'}, {'token': '‡§Ö‡§®‡§æ‡§Ü‡§µ‡§∏‡•ç‡§Ø‡§ï', 'lang': 'hin'}, {'token': '‡§ú‡§ø‡§∏‡•á', 'lang': 'hin'}, {'token': '‡§ú‡•à‡§∏‡•á', 'lang': 'hin'}, {'token': ',', 'lang': 'eng'}, {'token': '‡§ú‡•ç‡§Ø‡§æ‡§¶‡§æ', 'lang': 'hin'}, {'token': '‡§Æ‡§æ‡§§‡•ç‡§∞‡§æ', 'lang': '

# Experiment 2

In [2]:
# -------------------------------
# Imports
# -------------------------------
import pandas as pd
import subprocess
import sys
import os
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
import torch
from indictrans import Transliterator  # Python class directly

# -------------------------------
# Paths
# -------------------------------
csv_path = "../IndicGEC2025/Hindi/train.csv"
ssf_tokenizer_script = "../Tokenizer_for_Indian_Languages/tokenize_in_SSF_format_with_sentence_tokenization.py"

# -------------------------------
# Load ILID model for sentence-level language identification
# -------------------------------
# The tokenizer is loaded from the MuRIL base checkpoint; the classifier
# weights are loaded from the ILID checkpoint further down.
tokenizer_model = "google/muril-base-cased"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_model)

# Device index handed to the pipeline: 0 when CUDA is available, otherwise -1.
if torch.cuda.is_available():
    device = 0
else:
    device = -1

model = AutoModelForSequenceClassification.from_pretrained("pruthwik/ilid-muril-model")
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, device=device)

# ILID class index -> language code; the tuple is listed in index order (0..24).
index_to_label_dict = dict(enumerate((
    'asm', 'ben', 'brx', 'doi', 'eng', 'gom',
    'guj', 'hin', 'kan', 'kas', 'mai', 'mal',
    'mar', 'mni_Beng', 'mni_Mtei', 'npi', 'ory',
    'pan', 'san', 'sat', 'snd_Arab', 'snd_Deva',
    'tam', 'tel', 'urd',
)))

# -------------------------------
# Tokenize a sentence using SSF tokenizer
# -------------------------------
def tokenize_sentence_ssf(sentence, lang="hi"):
    """
    Tokenize a single sentence using the external SSF tokenizer script.

    The sentence is written to a temporary input file, the script referenced by
    ``ssf_tokenizer_script`` is run on it with the current Python interpreter,
    and the second tab-separated column of every remaining output line is
    collected as a token.

    Args:
        sentence: Sentence text to tokenize.
        lang: Value passed to the script's ``--lang`` option (default ``"hi"``).

    Returns:
        A list of tokens (strings). Blank input yields an empty list.

    Raises:
        subprocess.CalledProcessError: if the tokenizer script exits non-zero.
    """
    import tempfile  # local import so the function does not depend on cell-level imports

    text = sentence.strip()
    if not text:
        # Nothing to tokenize: skip the subprocess round-trip entirely.
        return []

    # Use a private scratch directory per call. With fixed file names in the
    # working directory, files were never removed and a stale temp_output.txt
    # from a previous sentence could be read back if the script wrote nothing.
    with tempfile.TemporaryDirectory() as workdir:
        input_path = os.path.join(workdir, "temp_input.txt")
        output_path = os.path.join(workdir, "temp_output.txt")

        with open(input_path, "w", encoding="utf-8") as f:
            f.write(text + "\n")

        # check=True turns a failing tokenizer run into an exception.
        subprocess.run(
            [sys.executable, ssf_tokenizer_script,
             "--input", input_path,
             "--output", output_path,
             "--lang", lang],
            check=True
        )

        tokens = []
        with open(output_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                # Skip blank lines and lines starting with "*" or "("
                # (presumably non-token SSF lines -- confirm against the script).
                if not line or line.startswith(("*", "(")):
                    continue
                parts = line.split("\t")
                if len(parts) >= 2:
                    tokens.append(parts[1])
    return tokens

# -------------------------------
# Transliteration helpers
# -------------------------------
# Memo of transliterated tokens, filled by the labelling code.
trn_cache = {}
# One entry per target language: a Transliterator, or None when unsupported.
trn_dict = {}

def get_transliterator(main_lang):
    """
    Look up (or lazily build) the eng -> ``main_lang`` transliterator.

    The outcome is remembered in ``trn_dict`` so each language is attempted
    only once. Returns the Transliterator, or None when constructing it
    raises NotImplementedError (a warning is printed in that case).
    """
    try:
        return trn_dict[main_lang]
    except KeyError:
        pass  # first request for this language: build it below

    try:
        transliterator = Transliterator(source='eng', target=main_lang, build_lookup=True)
    except NotImplementedError:
        print(f"[Warning] Transliteration not implemented for eng-{main_lang}. Using raw tokens.")
        transliterator = None

    trn_dict[main_lang] = transliterator
    return transliterator

def label_and_transliterate_tokens(tokens, main_lang):
    """
    Label each token and transliterate ASCII tokens into ``main_lang``.

    Rules, applied in order:
      1. Tokens made only of punctuation characters -> lang "punct".
      2. All-ASCII tokens -> transliterated eng -> ``main_lang`` when a
         transliterator is available, otherwise kept as-is with lang "eng".
      3. Everything else -> kept as-is with lang ``main_lang``.

    Args:
        tokens: Iterable of token strings.
        main_lang: Sentence-level language code.

    Returns:
        A list of ``{"token": ..., "lang": ...}`` dicts, one per token, in order.
    """
    # The non-ASCII members are written as escapes because the original literal
    # had been corrupted by a bad re-encoding: curly double/single quotes
    # (U+201C, U+201D, U+2018, U+2019) plus danda / double danda
    # (U+0964, U+0965), the Devanagari sentence terminators.
    punctuation = set('.,!?()[]{}:;\'"\u201c\u201d\u2018\u2019\u0964\u0965')

    labeled = []
    trn = get_transliterator(main_lang)

    for t in tokens:
        # Punctuation
        if all(char in punctuation for char in t):
            labeled.append({"token": t, "lang": "punct"})
        # English/ASCII -> transliterate (if available). Every character must be
        # ASCII; the earlier `any(...)` test also caught Indic tokens that only
        # contained a stray ASCII character.
        elif t.isascii():
            if trn is None:
                # Fallback: keep token as-is
                labeled.append({"token": t, "lang": "eng"})
                continue
            # Key the memo by language as well as token, so a result produced
            # for one target language is never reused for another.
            cache_key = (main_lang, t)
            if cache_key not in trn_cache:
                trn_cache[cache_key] = trn.transform(t)
            labeled.append({"token": trn_cache[cache_key], "lang": main_lang})
        # Non-English word -> keep as main language
        else:
            labeled.append({"token": t, "lang": main_lang})
    return labeled

# -------------------------------
# Load CSV
# -------------------------------
df = pd.read_csv(csv_path)
input_sentences, output_sentences = (
    df[column].tolist() for column in ("Input sentence", "Output sentence")
)

input_tokens_list, output_tokens_list = [], []

# -------------------------------
# Process each sentence row
# -------------------------------
for sentence_pair in zip(input_sentences, output_sentences):
    # Sentence-level language for the input, then the output sentence.
    # The class index is the piece after the first underscore of the label.
    pair_langs = []
    for sentence in sentence_pair:
        predicted_label = pipe([sentence])[0]['label']
        class_index = int(predicted_label.split('_')[1])
        pair_langs.append(index_to_label_dict[class_index])

    # Tokenize both sentences (input first, then output).
    pair_tokens = [tokenize_sentence_ssf(sentence) for sentence in sentence_pair]

    # Label / transliterate and store each side in its own list.
    destinations = (input_tokens_list, output_tokens_list)
    for destination, tokens, lang in zip(destinations, pair_tokens, pair_langs):
        destination.append(label_and_transliterate_tokens(tokens, lang))

# -------------------------------
# Example row
# -------------------------------
row_index = 136
for heading, sentences, labeled_rows in (
    ("137th Input sentence:", input_sentences, input_tokens_list),
    ("\n137th Output sentence:", output_sentences, output_tokens_list),
):
    print(heading, sentences[row_index])
    print("Tokens with language:", labeled_rows[row_index])


ModuleNotFoundError: No module named 'pandas'

# Experiment 3

In [7]:
# -------------------------------
# Imports
# -------------------------------
import pandas as pd
import subprocess
import sys
import os
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
import torch
from indictrans import Transliterator  # Python class directly

# -------------------------------
# Paths
# -------------------------------
csv_path = "../IndicGEC2025/Hindi/train.csv"
ssf_tokenizer_script = "../Tokenizer_for_Indian_Languages/tokenize_in_SSF_format_with_sentence_tokenization.py"

# -------------------------------
# Load ILID model for sentence-level language identification
# -------------------------------
# The tokenizer is loaded from the MuRIL base checkpoint; the classifier
# weights are loaded from the ILID checkpoint further down.
tokenizer_model = "google/muril-base-cased"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_model)

# Device index handed to the pipeline: 0 when CUDA is available, otherwise -1.
if torch.cuda.is_available():
    device = 0
else:
    device = -1

model = AutoModelForSequenceClassification.from_pretrained("pruthwik/ilid-muril-model")
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, device=device)

# ILID class index -> language code; the tuple is listed in index order (0..24).
index_to_label_dict = dict(enumerate((
    'asm', 'ben', 'brx', 'doi', 'eng', 'gom',
    'guj', 'hin', 'kan', 'kas', 'mai', 'mal',
    'mar', 'mni_Beng', 'mni_Mtei', 'npi', 'ory',
    'pan', 'san', 'sat', 'snd_Arab', 'snd_Deva',
    'tam', 'tel', 'urd',
)))

# -------------------------------
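# Assign token-level language & transliterate
# (tokenize_sentence_ssf is not defined in this cell; the loop below relies on
#  a definition that already exists in the session from an earlier cell)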
# -------------------------------
# Characters treated as punctuation. The non-ASCII members are written as
# escapes because the original literal had been corrupted by a bad re-encoding:
# curly double/single quotes (U+201C, U+201D, U+2018, U+2019) plus danda and
# double danda (U+0964, U+0965), the Devanagari sentence terminators.
_PUNCTUATION = frozenset('.,!?()[]{}:;\'"\u201c\u201d\u2018\u2019\u0964\u0965')

# Transliterator instances keyed by (source, target); None marks a pair whose
# construction raised NotImplementedError.
_TRANSLITERATORS = {}


def _transliterate(token, source, target):
    """Return ``token`` transliterated source -> target, or None if unsupported."""
    pair = (source, target)
    if pair not in _TRANSLITERATORS:
        # Build once per language pair; this used to be rebuilt for every token.
        try:
            _TRANSLITERATORS[pair] = Transliterator(source=source, target=target, build_lookup=True)
        except NotImplementedError:
            _TRANSLITERATORS[pair] = None
    trn = _TRANSLITERATORS[pair]
    if trn is None:
        return None
    try:
        return trn.transform(token)
    except NotImplementedError:
        return None


def label_and_transliterate_tokens(tokens, main_lang):
    """
    Label each token and transliterate it into ``main_lang`` where applicable.

    Rules, applied in order:
      1. Tokens made only of punctuation characters -> lang "punct".
      2. "(X)" where X is ASCII and at most 3 characters -> X kept raw, lang "eng".
      3. All-ASCII tokens -> transliterated eng -> ``main_lang``; kept raw with
         lang "eng" when that pair is not implemented.
      4. Other tokens are assumed to be Hindi (placeholder for a token-level
         detector) and transliterated hin -> ``main_lang`` when they differ;
         kept raw with lang "hin" when that pair is not implemented.

    Args:
        tokens: Iterable of token strings.
        main_lang: Sentence-level language code.

    Returns:
        A list of ``{"token": ..., "lang": ...}`` dicts, one per token, in order.
    """
    labeled = []

    for t in tokens:
        # 1. Punctuation
        if all(char in _PUNCTUATION for char in t):
            labeled.append({"token": t, "lang": "punct"})
            continue

        # 2. Parenthesised short ASCII token -> keep raw, without the parentheses
        inner = t.strip("()")
        if t.startswith("(") and t.endswith(")") and inner.isascii() and len(inner) <= 3:
            labeled.append({"token": inner, "lang": "eng"})
            continue

        # 3. ASCII token -> transliterate to main_lang
        if t.isascii():
            translit = _transliterate(t, 'eng', main_lang)
            if translit is None:
                labeled.append({"token": t, "lang": "eng"})
            else:
                labeled.append({"token": translit, "lang": main_lang})
            continue

        # 4. Non-ASCII (Indic etc.)
        detected_lang = "hin"  # placeholder: replace with an actual token-level detector
        if detected_lang == main_lang:
            labeled.append({"token": t, "lang": main_lang})
            continue
        translit = _transliterate(t, detected_lang, main_lang)
        if translit is None:
            labeled.append({"token": t, "lang": detected_lang})
        else:
            labeled.append({"token": translit, "lang": main_lang})

    return labeled

# -------------------------------
# Load CSV
# -------------------------------
df = pd.read_csv(csv_path)
input_sentences, output_sentences = (
    df[column].tolist() for column in ("Input sentence", "Output sentence")
)

input_tokens_list, output_tokens_list = [], []

# -------------------------------
# Process each sentence row
# -------------------------------
for sentence_pair in zip(input_sentences, output_sentences):
    # Sentence-level language for the input, then the output sentence.
    # The class index is the piece after the first underscore of the label.
    pair_langs = []
    for sentence in sentence_pair:
        predicted_label = pipe([sentence])[0]['label']
        class_index = int(predicted_label.split('_')[1])
        pair_langs.append(index_to_label_dict[class_index])

    # Tokenize both sentences (input first, then output).
    pair_tokens = [tokenize_sentence_ssf(sentence) for sentence in sentence_pair]

    # Label / transliterate and store each side in its own list.
    destinations = (input_tokens_list, output_tokens_list)
    for destination, tokens, lang in zip(destinations, pair_tokens, pair_langs):
        destination.append(label_and_transliterate_tokens(tokens, lang))

# -------------------------------
# Example row
# -------------------------------
row_index = 136
for heading, sentences, labeled_rows in (
    ("137th Input sentence:", input_sentences, input_tokens_list),
    ("\n137th Output sentence:", output_sentences, output_tokens_list),
):
    print(heading, sentences[row_index])
    print("Tokens with language:", labeled_rows[row_index])


Device set to use cpu


137th Input sentence: ‡§á‡§® ‡§∏‡§¨ ‡§ï‡•á ‡§ï‡§æ‡§∞‡§£ ‡§¶‡•á‡§ñ‡§æ ‡§ó‡§Ø‡§æ ‡§π‡•à ‡§ï‡§ø ‡§µ‡§æ‡§Ø‡•Å ‡§Æ‡•á‡§Ç ‡§Ö‡§®‡§æ‡§Ü‡§µ‡§∏‡•ç‡§Ø‡§ï ‡§ú‡§ø‡§∏‡•á ‡§ú‡•à‡§∏‡•á, ‡§ú‡•ç‡§Ø‡§æ‡§¶‡§æ ‡§Æ‡§æ‡§§‡•ç‡§∞‡§æ ‡§Æ‡•á‡§Ç ‡§ï‡§æ‡§∞‡•ç‡§¨‡§® ‡§°‡§æ‡§à ‡§Ö‡§ï‡•ç‡§∏‡§æ‡§á‡§° (CO‚ÇÇ), ‡§è‡§Ç‡§µ ‡§Ö‡§®‡•ç‡§Ø ‡§ö‡§ø‡§ú ‡§Ü ‡§ú‡§æ‡§§‡•Ä‡•§
Tokens with language: [{'token': '‡§á‡§®', 'lang': 'hin'}, {'token': '‡§∏‡§¨', 'lang': 'hin'}, {'token': '‡§ï‡•á', 'lang': 'hin'}, {'token': '‡§ï‡§æ‡§∞‡§£', 'lang': 'hin'}, {'token': '‡§¶‡•á‡§ñ‡§æ', 'lang': 'hin'}, {'token': '‡§ó‡§Ø‡§æ', 'lang': 'hin'}, {'token': '‡§π‡•à', 'lang': 'hin'}, {'token': '‡§ï‡§ø', 'lang': 'hin'}, {'token': '‡§µ‡§æ‡§Ø‡•Å', 'lang': 'hin'}, {'token': '‡§Æ‡•á‡§Ç', 'lang': 'hin'}, {'token': '‡§Ö‡§®‡§æ‡§Ü‡§µ‡§∏‡•ç‡§Ø‡§ï', 'lang': 'hin'}, {'token': '‡§ú‡§ø‡§∏‡•á', 'lang': 'hin'}, {'token': '‡§ú‡•à‡§∏‡•á', 'lang': 'hin'}, {'token': ',', 'lang': 'punct'}, {'token': '‡§ú‡•ç‡§Ø‡§æ‡§¶‡§æ', 'lang': 'hin'}, {'token': '‡§Æ‡§æ‡§§‡•ç‡§∞‡§æ', 'lang':

# Experiment 4 

In [1]:
# -------------------------------
# Imports
# -------------------------------
import pandas as pd
import subprocess
import sys
import os
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
import torch
from indictrans import Transliterator  # Python class directly

# -------------------------------
# Paths
# -------------------------------
csv_path = "../IndicGEC2025/Hindi/train.csv"
ssf_tokenizer_script = "../Tokenizer_for_Indian_Languages/tokenize_in_SSF_format_with_sentence_tokenization.py"

# -------------------------------
# Load ILID model for sentence-level language identification
# -------------------------------
# The tokenizer is loaded from the MuRIL base checkpoint; the classifier
# weights are loaded from the ILID checkpoint further down.
tokenizer_model = "google/muril-base-cased"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_model)

# Device index handed to the pipeline: 0 when CUDA is available, otherwise -1.
if torch.cuda.is_available():
    device = 0
else:
    device = -1

model = AutoModelForSequenceClassification.from_pretrained("pruthwik/ilid-muril-model")
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, device=device)

# ILID class index -> language code; the tuple is listed in index order (0..24).
index_to_label_dict = dict(enumerate((
    'asm', 'ben', 'brx', 'doi', 'eng', 'gom',
    'guj', 'hin', 'kan', 'kas', 'mai', 'mal',
    'mar', 'mni_Beng', 'mni_Mtei', 'npi', 'ory',
    'pan', 'san', 'sat', 'snd_Arab', 'snd_Deva',
    'tam', 'tel', 'urd',
)))

# -------------------------------
# Tokenize a sentence using SSF tokenizer
# -------------------------------
def tokenize_sentence_ssf(sentence, lang="hi"):
    """
    Tokenize a single sentence using the external SSF tokenizer script.

    The sentence is written to a temporary input file, the script referenced by
    ``ssf_tokenizer_script`` is run on it with the current Python interpreter,
    and the second tab-separated column of every remaining output line is
    collected as a token.

    Args:
        sentence: Sentence text to tokenize.
        lang: Value passed to the script's ``--lang`` option (default ``"hi"``).

    Returns:
        A list of tokens (strings). Blank input yields an empty list.

    Raises:
        subprocess.CalledProcessError: if the tokenizer script exits non-zero.
    """
    import tempfile  # local import so the function does not depend on cell-level imports

    text = sentence.strip()
    if not text:
        # Nothing to tokenize: skip the subprocess round-trip entirely.
        return []

    # Use a private scratch directory per call. With fixed file names in the
    # working directory, files were never removed and a stale temp_output.txt
    # from a previous sentence could be read back if the script wrote nothing.
    with tempfile.TemporaryDirectory() as workdir:
        input_path = os.path.join(workdir, "temp_input.txt")
        output_path = os.path.join(workdir, "temp_output.txt")

        with open(input_path, "w", encoding="utf-8") as f:
            f.write(text + "\n")

        # check=True turns a failing tokenizer run into an exception.
        subprocess.run(
            [sys.executable, ssf_tokenizer_script,
             "--input", input_path,
             "--output", output_path,
             "--lang", lang],
            check=True
        )

        tokens = []
        with open(output_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                # Skip blank lines and lines starting with "*" or "("
                # (presumably non-token SSF lines -- confirm against the script).
                if not line or line.startswith(("*", "(")):
                    continue
                parts = line.split("\t")
                if len(parts) >= 2:
                    tokens.append(parts[1])
    return tokens

# -------------------------------
# Assign token-level language & transliterate
# -------------------------------
# Characters treated as punctuation. The non-ASCII members are written as
# escapes because the original literal had been corrupted by a bad re-encoding:
# curly double/single quotes (U+201C, U+201D, U+2018, U+2019) plus danda and
# double danda (U+0964, U+0965), the Devanagari sentence terminators.
_PUNCTUATION = frozenset('.,!?()[]{}:;\'"\u201c\u201d\u2018\u2019\u0964\u0965')

# Transliterator instances keyed by (source, target); None marks a pair whose
# construction raised NotImplementedError.
_TRANSLITERATORS = {}


def _transliterate(token, source, target):
    """Return ``token`` transliterated source -> target, or None if unsupported."""
    pair = (source, target)
    if pair not in _TRANSLITERATORS:
        # Build once per language pair; this used to be rebuilt for every token.
        try:
            _TRANSLITERATORS[pair] = Transliterator(source=source, target=target, build_lookup=True)
        except NotImplementedError:
            _TRANSLITERATORS[pair] = None
    trn = _TRANSLITERATORS[pair]
    if trn is None:
        return None
    try:
        return trn.transform(token)
    except NotImplementedError:
        return None


def label_and_transliterate_tokens(tokens, main_lang):
    """
    Label each token and transliterate it into ``main_lang`` where applicable.

    Rules, applied in order:
      1. Tokens made only of punctuation characters -> lang "punct".
      2. "(X)" where X is ASCII and at most 3 characters -> X kept raw, lang "eng".
      3. All-ASCII tokens -> transliterated eng -> ``main_lang``; kept raw with
         lang "eng" when that pair is not implemented.
      4. Other tokens are assumed to be Hindi (placeholder for a token-level
         model) and transliterated hin -> ``main_lang`` when they differ;
         kept raw with lang "hin" when that pair is not implemented.

    Args:
        tokens: Iterable of token strings.
        main_lang: Sentence-level language code.

    Returns:
        A list of ``{"token": ..., "lang": ...}`` dicts, one per token, in order.
    """
    labeled = []

    for t in tokens:
        # 1. Punctuation
        if all(char in _PUNCTUATION for char in t):
            labeled.append({"token": t, "lang": "punct"})
            continue

        # 2. Parenthesised short ASCII token -> keep raw, without the parentheses
        inner = t.strip("()")
        if t.startswith("(") and t.endswith(")") and inner.isascii() and len(inner) <= 3:
            labeled.append({"token": inner, "lang": "eng"})
            continue

        # 3. ASCII token -> transliterate to main_lang
        if t.isascii():
            translit = _transliterate(t, 'eng', main_lang)
            if translit is None:
                labeled.append({"token": t, "lang": "eng"})
            else:
                labeled.append({"token": translit, "lang": main_lang})
            continue

        # 4. Non-ASCII token
        detected_lang = "hin"  # placeholder (replace with token-level model if needed)
        if detected_lang == main_lang:
            labeled.append({"token": t, "lang": main_lang})
            continue
        translit = _transliterate(t, detected_lang, main_lang)
        if translit is None:
            labeled.append({"token": t, "lang": detected_lang})
        else:
            labeled.append({"token": translit, "lang": main_lang})

    return labeled

# -------------------------------
# Load CSV
# -------------------------------
df = pd.read_csv(csv_path)
input_sentences, output_sentences = (
    df[column].tolist() for column in ("Input sentence", "Output sentence")
)

input_tokens_list, output_tokens_list = [], []

# -------------------------------
# Process each sentence row
# -------------------------------
for sentence_pair in zip(input_sentences, output_sentences):
    # Sentence-level language for the input, then the output sentence.
    # The class index is the piece after the first underscore of the label.
    pair_langs = []
    for sentence in sentence_pair:
        predicted_label = pipe([sentence])[0]['label']
        class_index = int(predicted_label.split('_')[1])
        pair_langs.append(index_to_label_dict[class_index])

    # Tokenize both sentences (input first, then output).
    pair_tokens = [tokenize_sentence_ssf(sentence) for sentence in sentence_pair]

    # Label / transliterate and store each side in its own list.
    destinations = (input_tokens_list, output_tokens_list)
    for destination, tokens, lang in zip(destinations, pair_tokens, pair_langs):
        destination.append(label_and_transliterate_tokens(tokens, lang))

# -------------------------------
# Compare tokens ‚Üí assign grammar error label
# -------------------------------
# One record per sentence pair. "has_grammar_error" is 0 when the processed
# input and output token sequences are identical (same length, same tokens in
# the same order) and 1 otherwise.
out_rows = [
    {
        "Input sentence": source_sentence,
        "Output sentence": target_sentence,
        "has_grammar_error": int(
            [entry["token"] for entry in source_labeled]
            != [entry["token"] for entry in target_labeled]
        ),
    }
    for source_sentence, target_sentence, source_labeled, target_labeled in zip(
        input_sentences, output_sentences, input_tokens_list, output_tokens_list
    )
]

# -------------------------------
# Save new CSV
# -------------------------------
# Save into the current working directory. The previous expression,
# os.path.dirname("/"), evaluates to the filesystem root, so the file was
# written to "/labeled_sentences.csv".
output_csv_path = os.path.join(os.getcwd(), "labeled_sentences.csv")
pd.DataFrame(out_rows).to_csv(output_csv_path, index=False, encoding="utf-8")

# The check-mark is written as an escape; the original literal was mis-encoded.
print(f"\u2705 Labeled CSV saved to: {output_csv_path}")


ModuleNotFoundError: No module named 'pandas'