In [1]:
import numpy as np
import pandas as pd
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
import torch

In [2]:
import warnings

# Suppress all warnings
warnings.filterwarnings("ignore")


In [10]:
from google.colab import files
uploaded = files.upload()

Saving mt5_translation_results.csv to mt5_translation_results.csv
Saving translation_results.csv to translation_results.csv


In [11]:
df1 = pd.read_csv("mt5_translation_results.csv")
df2 = pd.read_csv("translation_results.csv")

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoModelForSequenceClassification
from transformers import pipeline

# Load models and tokenizers
eng_ner_model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
eng_ner_tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")

eng_sent_model = AutoModelForSequenceClassification.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")
eng_sent_tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")

# Load Hinglish model (for example purposes, use indic-bert)
hing_ner_model = AutoModelForTokenClassification.from_pretrained("ai4bharat/IndicNER")
hing_ner_tokenizer = AutoTokenizer.from_pretrained("ai4bharat/IndicNER")

hing_sent_model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-xlm-roberta-base-sentiment")
hing_sent_tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-xlm-roberta-base-sentiment")

# === Named Entity Recognition ===
eng_ner_pipeline = pipeline("ner", model=eng_ner_model, tokenizer=eng_ner_tokenizer, grouped_entities=True)
hing_ner_pipeline = pipeline("ner", model=hing_ner_model, tokenizer=hing_ner_tokenizer, grouped_entities=True)

# === Sentiment Analysis ===
eng_sent_pipeline = pipeline("sentiment-analysis", model=eng_sent_model, tokenizer=eng_sent_tokenizer)
hing_sent_pipeline = pipeline("sentiment-analysis", model=hing_sent_model, tokenizer=hing_sent_tokenizer)

# === MIPE-like Evaluation Metric (Simple Agreement Check) ===
def compare_outputs(eng_out, hing_out, task="NER"):
    if task == "NER":
        eng_entities = set([ent['entity_group'] for ent in eng_out])
        hing_entities = set([ent['entity_group'] for ent in hing_out])
        match = eng_entities == hing_entities
        return {"NER Consistency": match, "English Entities": eng_entities, "Hinglish Entities": hing_entities}
    elif task == "Sentiment":
        match = eng_out[0]['label'] == hing_out[0]['label']
        return {"Sentiment Consistency": match, "English Sentiment": eng_out[0]['label'], "Hinglish Sentiment": hing_out[0]['label']}

# Define the function to process the sentences
def evaluate_sentences(english_sentences, hinglish_sentences):
    ner_consistencies = []
    sentiment_consistencies = []

    # Iterate over sentences and process
    for eng_sentence, hing_sentence in zip(english_sentences, hinglish_sentences):
        # Get NER outputs
        eng_ner_output = eng_ner_pipeline(eng_sentence)
        hing_ner_output = hing_ner_pipeline(hing_sentence)

        # Get Sentiment Analysis outputs
        eng_sent_output = eng_sent_pipeline(eng_sentence)
        hing_sent_output = hing_sent_pipeline(hing_sentence)

        # Compare outputs
        ner_comparison = compare_outputs(eng_ner_output, hing_ner_output, task="NER")
        sentiment_comparison = compare_outputs(eng_sent_output, hing_sent_output, task="Sentiment")

        # Append results for average calculation
        ner_consistencies.append(ner_comparison["NER Consistency"])
        sentiment_consistencies.append(sentiment_comparison["Sentiment Consistency"])

        # Print the sentence comparison results
        print(f"English Sentence: {eng_sentence}")
        print(f"Hinglish Sentence: {hing_sentence}")
        print("NER Comparison:", ner_comparison)
        print("Sentiment Comparison:", sentiment_comparison)
        print("="*90)

    # Calculate and print average consistency
    avg_ner_consistency = sum(ner_consistencies) / len(ner_consistencies) if ner_consistencies else 0
    avg_sentiment_consistency = sum(sentiment_consistencies) / len(sentiment_consistencies) if sentiment_consistencies else 0

    print("\nAverage NER Consistency:", avg_ner_consistency)
    print("Average Sentiment Consistency:", avg_sentiment_consistency)

config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/669M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/667M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/667M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/346 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.72M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/841 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu


In [4]:
pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.1.1-py3-none-any.whl (19 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.1.1 sacrebleu-2.5.1


In [5]:
pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=c012c43dc7c61e7fd9d18cb38adf5ee055390a5024968e73b8e1e21a6bdd7204
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [6]:
import sacrebleu
import nltk
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from sklearn.metrics import accuracy_score

# Ensure that NLTK resources are downloaded
nltk.download('punkt')

def compute_metrics_for_multiple_sentences(eng_sentences, hing_sentences, references):
    """
    Computes various metrics for the comparison between multiple English and Hinglish sentences.

    Parameters:
    - eng_sentences (list of str): The list of English sentences to compare.
    - hing_sentences (list of str): The list of Hinglish sentences to compare.
    - references (list of str): The list of reference sentences for Exact Match and other comparisons.

    Returns:
    dict: A dictionary containing the average of computed metrics over all sentence pairs.
    """

    # Initialize accumulators for metric sums
    bleu_eng_sum = 0
    bleu_hing_sum = 0
    chrf_eng_sum = 0
    chrf_hing_sum = 0
    rouge_eng_sum = 0
    rouge_hing_sum = 0
    sacrebleu_eng_sum = 0
    sacrebleu_hing_sum = 0
    em_eng_sum = 0
    em_hing_sum = 0

    n = len(eng_sentences)  # Number of sentence pairs

    for eng_sentence, hing_sentence, reference in zip(eng_sentences, hing_sentences, references):
        # BLEU Metric
        bleu_eng_sum += sentence_bleu([reference.split()], eng_sentence.split())
        bleu_hing_sum += sentence_bleu([reference.split()], hing_sentence.split())

        # chrF Metric
        chrf_eng_sum += sacrebleu.corpus_chrf([eng_sentence], [reference]).score
        chrf_hing_sum += sacrebleu.corpus_chrf([hing_sentence], [reference]).score

        # ROUGE-L Metric
        scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
        rouge_eng_sum += scorer.score(reference, eng_sentence)['rougeL'].fmeasure
        rouge_hing_sum += scorer.score(reference, hing_sentence)['rougeL'].fmeasure

        # SacreBLEU Metric
        sacrebleu_eng_sum += sacrebleu.corpus_bleu([eng_sentence], [[reference]]).score
        sacrebleu_hing_sum += sacrebleu.corpus_bleu([hing_sentence], [[reference]]).score

        # Exact Match Metric (String comparison, not word-level)
        em_eng_sum += 100 if eng_sentence == reference else 0
        em_hing_sum += 100 if hing_sentence == reference else 0

    # Calculate averages for all metrics
    metrics = {
        "Average BLEU (English)": bleu_eng_sum / n,
        "Average BLEU (Hinglish)": bleu_hing_sum / n,
        "Average chrF (English)": chrf_eng_sum / n,
        "Average chrF (Hinglish)": chrf_hing_sum / n,
        "Average ROUGE-L (English)": rouge_eng_sum / n,
        "Average ROUGE-L (Hinglish)": rouge_hing_sum / n,
        "Average SacreBLEU (English)": sacrebleu_eng_sum / n,
        "Average SacreBLEU (Hinglish)": sacrebleu_hing_sum / n,
        "Average Exact Match (English)": em_eng_sum / n,
        "Average Exact Match (Hinglish)": em_hing_sum / n
    }

    return metrics




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# IndicBART evaluation

In [7]:
english_sentences = [
    "My smartwatch just died in the middle of a workout.",
    "The Wi-Fi router is acting up again.",
    "I need to clear my browser history.",
    "I forgot to cancel my subscription to that streaming service.",
    "The game crashed right before I reached the final boss.",
    "My phone froze when I was about to check an important message.",
    "I need to fix the bug in my code before the deadline.",
    "I'm trying to get my hands on the new gaming console.",
    "I ordered food online, but they gave me the wrong item.",
    "I need a caffeine boost to survive this meeting.",
    "Has my timer started?",
    "Set an alarm for me.",
    "Did I get new messages?",
    "What is the time right now?",
    "It will be sunny today."
]

hinglish_sentences = [
    "meri smartwatch just died in the middle of a workout.",
    "Wi-Fi router fir se chal raha hai.",
    "mujhe mere browser history clear karna hai.",
    "I forgot to cancel my subscription to that streaming service.",
    "The game crashed right before I reached the final boss.",
    "mera phone froze kab tha jab mujhe important message check karne ke liye.",
    "mujhe deadline se pehle mere code me bug fix karna hai.",
    "mai new gaming console par apne hands laana chahta hoon.",
    "I ordered food online, but they gave me the wrong item.",
    "mujhe is meeting me caffeine boost chahiye.",
    "Kya mera timer shuru hoga?",
    "Mere liye ek alarm set karen.",
    "Kya maine naye messages milgaye hai?",
    "Abhi ka time kya hai?",
    "Aaj dhoop hogi."
]

print("\n\nGLUECoS\n")
evaluate_sentences(english_sentences, hinglish_sentences)
print("\n\nMIPE\n")
metrics = compute_metrics_for_multiple_sentences(english_sentences, hinglish_sentences, english_sentences)

for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")




GLUECoS

English Sentence: My smartwatch just died in the middle of a workout.
Hinglish Sentence: meri smartwatch just died in the middle of a workout.
NER Comparison: {'NER Consistency': False, 'English Entities': set(), 'Hinglish Entities': {'PER'}}
Sentiment Comparison: {'Sentiment Consistency': False, 'English Sentiment': '1 star', 'Hinglish Sentiment': 'negative'}
English Sentence: The Wi-Fi router is acting up again.
Hinglish Sentence: Wi-Fi router fir se chal raha hai.
NER Comparison: {'NER Consistency': False, 'English Entities': {'MISC'}, 'Hinglish Entities': set()}
Sentiment Comparison: {'Sentiment Consistency': False, 'English Sentiment': '5 stars', 'Hinglish Sentiment': 'neutral'}
English Sentence: I need to clear my browser history.
Hinglish Sentence: mujhe mere browser history clear karna hai.
NER Comparison: {'NER Consistency': False, 'English Entities': set(), 'Hinglish Entities': {'PER', 'ORG'}}
Sentiment Comparison: {'Sentiment Consistency': False, 'English Sentimen

# mT5 evaluation

In [8]:
english_sentences = [
    "My smartwatch just died in the middle of a workout.",
    "The Wi-Fi router is acting up again.",
    "I need to clear my browser history.",
    "I forgot to cancel my subscription to that streaming service.",
    "The game crashed right before I reached the final boss.",
    "My phone froze when I was about to check an important message.",
    "I need to fix the bug in my code before the deadline.",
    "I'm trying to get my hands on the new gaming console.",
    "I ordered food online, but they gave me the wrong item.",
    "I need a caffeine boost to survive this meeting."
]

hinglish_sentences = [
    "meri workout ke middle me ek workout mein hein.",
    "Wi-Fi ka acting up kiya gaya tha.",
    "mujhe mere browser history ko clear karna chahiye.",
    "mai apne subscription ko cancel karna chahta hoon.",
    "maine shuru kiya tha uske baad game crash kiya tha.",
    "mere phone ko booze kiya tha jab maine important message check kiya tha.",
    "mujhe mere code se pehle fix karna chahiye.",
    "mai naya gaming device par milna chahta hoon.",
    "maine ek naya item kharidne ka hein.",
    "mujhe is meeting ko ek caffeine ka accha laga."
]


print("\n\nGLUECoS\n")
evaluate_sentences(english_sentences, hinglish_sentences)
print("\n\nMIPE\n")
metrics = compute_metrics_for_multiple_sentences(english_sentences, hinglish_sentences, english_sentences)

for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")



GLUECoS

English Sentence: My smartwatch just died in the middle of a workout.
Hinglish Sentence: meri workout ke middle me ek workout mein hein.
NER Comparison: {'NER Consistency': False, 'English Entities': set(), 'Hinglish Entities': {'PER'}}
Sentiment Comparison: {'Sentiment Consistency': False, 'English Sentiment': '1 star', 'Hinglish Sentiment': 'neutral'}
English Sentence: The Wi-Fi router is acting up again.
Hinglish Sentence: Wi-Fi ka acting up kiya gaya tha.
NER Comparison: {'NER Consistency': False, 'English Entities': {'MISC'}, 'Hinglish Entities': {'PER'}}
Sentiment Comparison: {'Sentiment Consistency': False, 'English Sentiment': '5 stars', 'Hinglish Sentiment': 'negative'}
English Sentence: I need to clear my browser history.
Hinglish Sentence: mujhe mere browser history ko clear karna chahiye.
NER Comparison: {'NER Consistency': False, 'English Entities': set(), 'Hinglish Entities': {'PER', 'ORG'}}
Sentiment Comparison: {'Sentiment Consistency': False, 'English Sentim

# Llama evaluation

In [9]:
english_sentences = [
    "My smartwatch just died in the middle of a workout.",
    "The Wi-Fi router is acting up again.",
    "I need to clear my browser history.",
    "I forgot to cancel my subscription to that streaming service.",
    "The game crashed right before I reached the final boss.",
    "My phone froze when I was about to check an important message.",
    "I need to fix the bug in my code before the deadline.",
    "I'm trying to get my hands on the new gaming console.",
    "I ordered food online, but they gave me the wrong item.",
    "I need a caffeine boost to survive this meeting."
]

hinglish_sentences = [
    "My smartwatch ko workout mein ek din mein mar diya.",
    "The Wi-Fi router is acting up again.",
    "mujhe apna browser history clear karna chahiye.",
    "mujhe wo streaming service mei apna subscription cancel karne ke liye yaad nahi bacha.",
    "The game crashed right before I reached the final boss.",
    "mujhe mere phone ko pankh milne par froze hoga.",
    "mujhe deadline se pehle code mei bug fix karna chahiye.",
    "mujhe new gaming console ke paas mujhe haan.",
    "I ordered food online, but they gave me the wrong item.",
    "mujhe is meeting me survive karne ke liye caffeine boost chahiye."
]


print("\n\nGLUECoS\n")
evaluate_sentences(english_sentences, hinglish_sentences)
print("\n\nMIPE\n")
metrics = compute_metrics_for_multiple_sentences(english_sentences, hinglish_sentences, english_sentences)

for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")



GLUECoS

English Sentence: My smartwatch just died in the middle of a workout.
Hinglish Sentence: My smartwatch ko workout mein ek din mein mar diya.
NER Comparison: {'NER Consistency': True, 'English Entities': set(), 'Hinglish Entities': set()}
Sentiment Comparison: {'Sentiment Consistency': False, 'English Sentiment': '1 star', 'Hinglish Sentiment': 'neutral'}
English Sentence: The Wi-Fi router is acting up again.
Hinglish Sentence: The Wi-Fi router is acting up again.
NER Comparison: {'NER Consistency': False, 'English Entities': {'MISC'}, 'Hinglish Entities': set()}
Sentiment Comparison: {'Sentiment Consistency': False, 'English Sentiment': '5 stars', 'Hinglish Sentiment': 'neutral'}
English Sentence: I need to clear my browser history.
Hinglish Sentence: mujhe apna browser history clear karna chahiye.
NER Comparison: {'NER Consistency': False, 'English Entities': set(), 'Hinglish Entities': {'PER', 'ORG'}}
Sentiment Comparison: {'Sentiment Consistency': False, 'English Sentimen

In [12]:
# Define the function to process the sentences
def evaluate_sentences1(english_sentences, hinglish_sentences):
    ner_consistencies = []
    sentiment_consistencies = []

    # Iterate over sentences and process
    for eng_sentence, hing_sentence in zip(english_sentences, hinglish_sentences):
        # Get NER outputs
        eng_ner_output = eng_ner_pipeline(eng_sentence)
        hing_ner_output = hing_ner_pipeline(hing_sentence)

        # Get Sentiment Analysis outputs
        eng_sent_output = eng_sent_pipeline(eng_sentence)
        hing_sent_output = hing_sent_pipeline(hing_sentence)

        # Compare outputs
        ner_comparison = compare_outputs(eng_ner_output, hing_ner_output, task="NER")
        sentiment_comparison = compare_outputs(eng_sent_output, hing_sent_output, task="Sentiment")

        # Append results for average calculation
        ner_consistencies.append(ner_comparison["NER Consistency"])
        sentiment_consistencies.append(sentiment_comparison["Sentiment Consistency"])

        # # Print the sentence comparison results
        # print(f"English Sentence: {eng_sentence}")
        # print(f"Hinglish Sentence: {hing_sentence}")
        # print("NER Comparison:", ner_comparison)
        # print("Sentiment Comparison:", sentiment_comparison)
        # print("="*90)

    # Calculate and print average consistency
    avg_ner_consistency = sum(ner_consistencies) / len(ner_consistencies) if ner_consistencies else 0
    avg_sentiment_consistency = sum(sentiment_consistencies) / len(sentiment_consistencies) if sentiment_consistencies else 0

    print("\nAverage NER Consistency:", avg_ner_consistency)
    print("Average Sentiment Consistency:", avg_sentiment_consistency)

In [13]:
df = df1
english_sentences = df["English"].tolist()
hinglish_sentences = df["Predicted_Hinglish"].tolist()


print("\n\nGLUECoS\n")
evaluate_sentences1(english_sentences, hinglish_sentences)
print("\n\nMIPE\n")
metrics = compute_metrics_for_multiple_sentences(english_sentences, hinglish_sentences, english_sentences)

for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")



GLUECoS


Average NER Consistency: 0.36
Average Sentiment Consistency: 0.0


MIPE

Average BLEU (English): 0.9600
Average BLEU (Hinglish): 0.0291
Average chrF (English): 15.8106
Average chrF (Hinglish): 4.2856
Average ROUGE-L (English): 1.0000
Average ROUGE-L (Hinglish): 0.2726
Average SacreBLEU (English): 96.0000
Average SacreBLEU (Hinglish): 12.2755
Average Exact Match (English): 100.0000
Average Exact Match (Hinglish): 0.0000


In [14]:
df = df2
english_sentences = df["English"].tolist()
hinglish_sentences = df["Predicted_Hinglish"].tolist()


print("\n\nGLUECoS\n")
evaluate_sentences1(english_sentences, hinglish_sentences)
print("\n\nMIPE\n")
metrics = compute_metrics_for_multiple_sentences(english_sentences, hinglish_sentences, english_sentences)

for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")



GLUECoS


Average NER Consistency: 0.425
Average Sentiment Consistency: 0.0


MIPE

Average BLEU (English): 0.9450
Average BLEU (Hinglish): 0.0534
Average chrF (English): 15.2936
Average chrF (Hinglish): 4.4407
Average ROUGE-L (English): 1.0000
Average ROUGE-L (Hinglish): 0.2878
Average SacreBLEU (English): 94.5000
Average SacreBLEU (Hinglish): 14.0348
Average Exact Match (English): 100.0000
Average Exact Match (Hinglish): 0.0000
