In [None]:
from datasets import load_dataset, load_from_disk

import json
import random

import numpy as np
import pandas as pd
from transformers import AutoTokenizer

# Seed Python's global RNG once, up front. Later cells build their pairs with
# `random.choice` / `random.shuffle`, so without a fixed seed a fresh
# "Restart Kernel -> Run All" writes different files on every run.
SEED = 42
random.seed(SEED)

# Tokenizer used by the cells below for token counts and length statistics.
tokenizer = AutoTokenizer.from_pretrained("LingoIITGN/ganga-2-1b")

In [None]:
# Read the token-count dictionary from disk; cells below add one entry per dataset.
with open("token_per_language.json", mode='r') as tally_file:
    token_per_language = json.loads(tally_file.read())

### Bitext Mining

Task: CrossSum

In [None]:
def _read_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    The file is decoded as UTF-8 explicitly rather than with the platform
    default encoding, and blank lines are skipped instead of being handed to
    `json.loads` (which would raise on them).
    """
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


# CrossSum test splits. The cells below treat 'text' as being in the first
# language of the variable name and 'summary' as being in the second.
english_hindi = _read_jsonl("./Data/Bitext Mining/CrossSum/english-hindi_CrossSum/english-hindi_test.jsonl")
hindi_english = _read_jsonl("./Data/Bitext Mining/CrossSum/hindi-english_CrossSum/hindi-english_test.jsonl")
hindi_hindi = _read_jsonl("./Data/Bitext Mining/CrossSum/hindi-hindi_CrossSum/hindi-hindi_test.jsonl")
english_english = _read_jsonl("./Data/Bitext Mining/CrossSum/english-english_CrossSum/english-english_test.jsonl")

In [None]:
def _n_tokens(text):
    """Return how many token ids `tokenizer.encode` produces for `text`."""
    return len(tokenizer.encode(text))


# english_hindi: 'text' is booked as English, 'summary' as Hindi.
total_english_tokens = sum(_n_tokens(row['text']) for row in english_hindi)
total_hindi_tokens = sum(_n_tokens(row['summary']) for row in english_hindi)
token_per_language['english_hindi_crosssum'] = dict(
    Hindi=total_hindi_tokens,
    English=total_english_tokens,
    Romanised_Hindi=0,
)

# hindi_english: 'text' is booked as Hindi, 'summary' as English.
total_english_tokens = sum(_n_tokens(row['summary']) for row in hindi_english)
total_hindi_tokens = sum(_n_tokens(row['text']) for row in hindi_english)
token_per_language['hindi_english_crosssum'] = dict(
    Hindi=total_hindi_tokens,
    English=total_english_tokens,
    Romanised_Hindi=0,
)

# hindi_hindi: both fields are booked as Hindi.
total_english_tokens = 0
total_hindi_tokens = sum(
    _n_tokens(row['text']) + _n_tokens(row['summary']) for row in hindi_hindi
)
token_per_language['hindi_hindi_crosssum'] = dict(
    Hindi=total_hindi_tokens,
    English=0,
    Romanised_Hindi=0,
)

# english_english: both fields are booked as English.
total_hindi_tokens = 0
total_english_tokens = sum(
    _n_tokens(row['text']) + _n_tokens(row['summary']) for row in english_english
)
token_per_language['english_english_crosssum'] = dict(
    Hindi=0,
    English=total_english_tokens,
    Romanised_Hindi=0,
)

In [None]:
# Collect every distinct article URL per language across the four CrossSum
# splits. Which side of a record is Hindi or English follows from the split:
# e.g. in english_hindi the source is English and the target is Hindi.
hindi_url = set()
english_url = set()

hindi_url.update(sample['target_url'] for sample in english_hindi)
hindi_url.update(sample['target_url'] for sample in hindi_hindi)
hindi_url.update(sample['source_url'] for sample in hindi_hindi)
hindi_url.update(sample['source_url'] for sample in hindi_english)

english_url.update(sample['source_url'] for sample in english_hindi)
english_url.update(sample['target_url'] for sample in english_english)
english_url.update(sample['source_url'] for sample in english_english)
english_url.update(sample['target_url'] for sample in hindi_english)

# Number the URLs in sorted order. Enumerating the raw set would tie each id to
# the set's iteration order, which for strings changes between interpreter
# runs (hash randomisation), so the same URL would get a different id on
# every run of the notebook.
hindi_url_dict = {url: f"hi_{idx}" for idx, url in enumerate(sorted(hindi_url))}
english_url_dict = {url: f"en_{idx}" for idx, url in enumerate(sorted(english_url))}

In [None]:
# Print the number of records loaded for each CrossSum split, one per line.
for records in (english_hindi, hindi_english, hindi_hindi, english_english):
    print(len(records))

In [None]:
# Spot-check a single record of the English/English split.
english_english[317]

In [None]:
# Hindi/Hindi CrossSum retrieval pairs. Even positions use the article as the
# query and its summary as the target; odd positions do the reverse.
# NOTE(review): summary_instruction ends without a trailing space before the
# article text, unlike query_instruction - confirm that is intended.
summary_instruction = "निर्देश: दिए गए पाठ के लिए सबसे प्रासंगिक सारांश प्राप्त करें। पाठ:"
query_instruction = "निर्देश: किसी दिए गए सारांश के लिए सबसे प्रासंगिक पैराग्राफ़ प्राप्त करें। सारांश: "

crosssum_hindi_hindi = []
for position, row in enumerate(hindi_hindi):
    if position % 2:
        # Odd position: the summary is the query, the id comes from the source URL.
        url_tag = hindi_url_dict[row['source_url']]
        prompt, answer = query_instruction + row['summary'], row['text']
    else:
        # Even position: the article is the query, the id comes from the target URL.
        url_tag = hindi_url_dict[row['target_url']]
        prompt, answer = summary_instruction + row['text'], row['summary']
    crosssum_hindi_hindi.append({
        'id': f"crosssum_hindi_hindi_{url_tag}",
        'source': prompt,
        'target': answer,
    })

# One JSON object per line, non-ASCII characters written as-is.
with open("Processed_data/crosssum_hindi_hindi_test.jsonl", "w", encoding="utf-8") as out_file:
    out_file.writelines(
        json.dumps(record, ensure_ascii=False) + "\n" for record in crosssum_hindi_hindi
    )

In [None]:
# Spot-check a single record of the Hindi/English split.
hindi_english[23]

In [None]:
# Hindi-article / English-summary retrieval pairs. Even positions query with the
# Hindi article (id from the English target URL); odd positions query with the
# English summary (id from the Hindi source URL).
# NOTE(review): summary_instruction ends without a trailing space before the
# article text, unlike query_instruction - confirm that is intended.
summary_instruction = "निर्देश: दिए गए पाठ के लिए सबसे प्रासंगिक सारांश प्राप्त करें। पाठ:"
query_instruction = "Instructions: Retrieve the most relevant paragraph for a given summary. Summary: "

crosssum_hindi_english = []
for position, row in enumerate(hindi_english):
    if position % 2:
        url_tag = hindi_url_dict[row['source_url']]
        prompt, answer = query_instruction + row['summary'], row['text']
    else:
        url_tag = english_url_dict[row['target_url']]
        prompt, answer = summary_instruction + row['text'], row['summary']
    crosssum_hindi_english.append({
        'id': f"crosssum_hindi_english_{url_tag}",
        'source': prompt,
        'target': answer,
    })

# One JSON object per line, non-ASCII characters written as-is.
with open("Processed_data/crosssum_hindi_english_test.jsonl", "w", encoding="utf-8") as out_file:
    out_file.writelines(
        json.dumps(record, ensure_ascii=False) + "\n" for record in crosssum_hindi_english
    )

In [None]:
# Spot-check a single record of the English/Hindi split.
english_hindi[23]

In [None]:
# English-article / Hindi-summary retrieval pairs. Even positions query with the
# English article (id from the Hindi target URL); odd positions query with the
# Hindi summary (id from the English source URL).
summary_instruction = "Instructions: Retrieve the most relevant summary for the given paragraph. Text: "
query_instruction = "निर्देश: किसी दिए गए सारांश के लिए सबसे प्रासंगिक पैराग्राफ़ प्राप्त करें। सारांश: "

crosssum_english_hindi = []
for position, row in enumerate(english_hindi):
    if position % 2:
        url_tag = english_url_dict[row['source_url']]
        prompt, answer = query_instruction + row['summary'], row['text']
    else:
        url_tag = hindi_url_dict[row['target_url']]
        prompt, answer = summary_instruction + row['text'], row['summary']
    crosssum_english_hindi.append({
        'id': f"crosssum_english_hindi_{url_tag}",
        'source': prompt,
        'target': answer,
    })

# One JSON object per line, non-ASCII characters written as-is.
with open("Processed_data/crosssum_english_hindi_test.jsonl", "w", encoding="utf-8") as out_file:
    out_file.writelines(
        json.dumps(record, ensure_ascii=False) + "\n" for record in crosssum_english_hindi
    )

In [None]:
# Mean tokenised length of each side of the two cross-lingual splits.
for label, records, field in (
    ("English article", english_hindi, 'text'),
    ("Hindi summary", english_hindi, 'summary'),
    ("Hindi article", hindi_english, 'text'),
    ("English summary", hindi_english, 'summary'),
):
    length = np.array([len(tokenizer.encode(sample[field])) for sample in records])
    print(f"{label} length: {length.mean()}")

In [None]:
# English/English CrossSum retrieval pairs. Even positions use the article as the
# query and its summary as the target; odd positions do the reverse.
summary_instruction = "Instructions: Retrieve the most relevant summary from a set of options for the given paragraph. Text: "
query_instruction = "Instructions: Retrieve the most relevant paragraph from a set of options for a given summary. Summary: "

crosssum_english_english = []
for position, row in enumerate(english_english):
    if position % 2:
        url_tag = english_url_dict[row['source_url']]
        prompt, answer = query_instruction + row['summary'], row['text']
    else:
        url_tag = english_url_dict[row['target_url']]
        prompt, answer = summary_instruction + row['text'], row['summary']
    crosssum_english_english.append({
        'id': f"crosssum_english_english_{url_tag}",
        'source': prompt,
        'target': answer,
    })

# One JSON object per line, non-ASCII characters written as-is.
with open("Processed_data/crosssum_english_english_test.jsonl", "w", encoding="utf-8") as out_file:
    out_file.writelines(
        json.dumps(record, ensure_ascii=False) + "\n" for record in crosssum_english_english
    )

Task: Flores

In [None]:
# Load the Flores Hindi/English test file. UTF-8 is requested explicitly so the
# Hindi text decodes the same way regardless of the platform default encoding.
with open("./Data/Bitext Mining/Flores/flores_hi_en_test.json", 'r', encoding="utf-8") as f:
    data = json.load(f)

In [None]:
# Spot-check a single Flores example.
data['examples'][230]

In [None]:
# Flores token tally: 'source' is booked as Hindi, 'target' as English.
total_hindi_tokens = sum(
    len(tokenizer.encode(example['source'])) for example in data['examples']
)
total_english_tokens = sum(
    len(tokenizer.encode(example['target'])) for example in data['examples']
)

token_per_language['flores'] = dict(
    Hindi=total_hindi_tokens,
    English=total_english_tokens,
    Romanised_Hindi=0,
)

In [None]:
# Flores retrieval pairs. Even positions query with the Hindi 'source' text and
# target the English 'target' text; odd positions do the reverse.
hindi_instruction = "निर्देश: दिए गए हिंदी पाठ के लिए अर्थ की दृष्टि से सर्वाधिक समान अंग्रेजी पाठ प्राप्त करें। पाठ: "
english_instruction = "Instructions: Retrieve the most semantically similar Hindi text for the given English text. Text: "

flores = []
for position, example in enumerate(data['examples']):
    if position % 2:
        prompt, answer = english_instruction + example['target'], example['source']
    else:
        prompt, answer = hindi_instruction + example['source'], example['target']
    flores.append({'id': f"flores_{position}", 'source': prompt, 'target': answer})

# One JSON object per line, non-ASCII characters written as-is.
with open("Processed_data/flores_test.jsonl", "w", encoding="utf-8") as out_file:
    out_file.writelines(
        json.dumps(record, ensure_ascii=False) + "\n" for record in flores
    )

In [None]:
# Token count of every Flores 'source' text. The array is built in one step:
# calling np.append inside the loop copies the whole array on every iteration.
# dtype=float keeps the float64 dtype the previous np.array([]) start produced.
len_of_texts = np.array(
    [len(tokenizer.encode(sample['source'])) for sample in data['examples']],
    dtype=float,
)

In [None]:
# Mean token count of the Flores 'source' texts.
len_of_texts.mean()

Task: LASER

In [None]:
def _read_lines(path):
    """Return the lines of a UTF-8 text file without their trailing newline.

    Iterating a file yields each line with its "\\n" still attached; left in
    place it ends up inside every prompt, target and token count built from
    these sentences. Empty lines are kept so the two files stay aligned.
    """
    with open(path, "r", encoding="utf-8") as f:
        return [line.rstrip("\n") for line in f]


english = _read_lines("./Data/Bitext Mining/LASER/tatoeba.hin-eng.eng")
hindi = _read_lines("./Data/Bitext Mining/LASER/tatoeba.hin-eng.hin")

# The cells below pair the two lists by position, so a length mismatch would
# silently pair unrelated sentences (or raise later on a missing index).
if len(english) != len(hindi):
    raise ValueError(
        f"LASER files are not aligned: {len(english)} English vs {len(hindi)} Hindi lines"
    )

length = np.array([len(tokenizer.encode(sample)) for sample in english])
print(f"Average length of english sentences: {length.mean()}")
length = np.array([len(tokenizer.encode(sample)) for sample in hindi])
print(f"Average length of hindi sentences: {length.mean()}")

In [None]:
# Show the first Hindi line as loaded.
hindi[0]

In [None]:
# Show the first English line as loaded.
english[0]

In [None]:
# LASER token tally. The English sentence is looked up by the Hindi sentence's
# position, so the lists are walked in lockstep.
total_english_tokens = 0
total_hindi_tokens = 0
for position, hindi_line in enumerate(hindi):
    hindi_len = len(tokenizer.encode(hindi_line))
    english_len = len(tokenizer.encode(english[position]))
    total_hindi_tokens += hindi_len
    total_english_tokens += english_len

token_per_language['laser'] = dict(
    Hindi=total_hindi_tokens,
    English=total_english_tokens,
    Romanised_Hindi=0,
)
token_per_language

In [None]:
# LASER retrieval pairs. Even positions query with the Hindi sentence and target
# the English one; odd positions do the reverse.
hindi_instruction = "निर्देश: दिए गए हिंदी पाठ के लिए अर्थ की दृष्टि से सर्वाधिक समान अंग्रेजी पाठ प्राप्त करें। पाठ: "
english_instruction = "Instructions: Retrieve the most semantically similar Hindi text for the given English text. Text: "

laser = []
for position, (hin, eng) in enumerate(zip(hindi, english)):
    if position % 2:
        prompt, answer = english_instruction + eng, hin
    else:
        prompt, answer = hindi_instruction + hin, eng
    laser.append({'id': f"laser_{position}", 'source': prompt, 'target': answer})

# One JSON object per line, non-ASCII characters written as-is.
with open("Processed_data/laser.jsonl", "w", encoding="utf-8") as out_file:
    out_file.writelines(
        json.dumps(record, ensure_ascii=False) + '\n' for record in laser
    )

Task: Mintaka

In [None]:
# Load the Mintaka test file. UTF-8 is requested explicitly so the Hindi
# translations decode the same way regardless of the platform default encoding.
with open("./Data/Bitext Mining/Mintaka/mintaka_test.json", 'r', encoding="utf-8") as f:
    samples = json.load(f)

In [None]:
# Mean tokenised length of the English questions and of their Hindi translations.
english_lengths = [len(tokenizer.encode(sample['question'])) for sample in samples]
length = np.array(english_lengths)
print(f"Average length of english questions: {length.mean()}")

hindi_lengths = [len(tokenizer.encode(sample['translations']['hi'])) for sample in samples]
length = np.array(hindi_lengths)
print(f"Average length of hindi questions: {length.mean()}")

In [None]:
# Spot-check a single Mintaka sample.
samples[23]

In [None]:
# Mintaka token tally: 'question' is booked as English, the 'hi' translation as Hindi.
total_english_tokens = sum(
    len(tokenizer.encode(sample['question'])) for sample in samples
)
total_hindi_tokens = sum(
    len(tokenizer.encode(sample['translations']['hi'])) for sample in samples
)

token_per_language['Mintaka'] = dict(
    Hindi=total_hindi_tokens,
    English=total_english_tokens,
    Romanised_Hindi=0,
)
token_per_language

In [None]:
# Mintaka retrieval pairs. Even positions query with the English question and
# target its Hindi translation; odd positions do the reverse.
mintaka = []

# NOTE(review): hindi_instruction does not end with a field label the way
# english_instruction ends with "Question: " - confirm that is intended.
hindi_instruction = "निर्देश: हिंदी प्रश्न के लिए शब्दार्थ की दृष्टि से सर्वाधिक समान अंग्रेजी प्रश्न को पुनः प्राप्त करें। "
english_instruction = "Instructions: Retrieve the most semantically similar Hindi question for the English question. Question: "

for idx, sample in enumerate(samples):

    if idx%2==0:
        data = {'id': f"mintaka_{idx}",
                'source': english_instruction + sample['question'],
                'target': sample['translations']['hi']}
    else:
        data = {'id': f"mintaka_{idx}",
                'source': hindi_instruction + sample['translations']['hi'],
                'target': sample['question']}

    mintaka.append(data)

# UTF-8 is requested explicitly: ensure_ascii=False writes the Hindi text
# verbatim, which fails or is mis-encoded when the platform default encoding is
# not UTF-8. The CrossSum, Flores and LASER writers already open their files
# this way.
with open("Processed_data/mintaka_test.jsonl", "w", encoding="utf-8") as f:

    for sample in mintaka:
        json.dump(sample, f, ensure_ascii=False)
        f.write('\n')

Task: PHINC

In [None]:
# Load the PHINC CSV and preview its first three rows.
df = pd.read_csv("./Data/Bitext Mining/PHINC/filtered_data.csv")
df.head(3)

In [None]:
# Mean tokenised length of both PHINC columns. The printed labels now name the
# column actually measured: 'Sentence' is the romanised-Hindi side (it is the
# side booked as 'Romanised_Hindi' in the token tally) and 'English_Translation'
# is the English side. The previous labels said "english questions" and
# "hindi questions" respectively.
length = np.array([len(tokenizer.encode(sample)) for sample in df['Sentence']])
print(f"Average length of romanised Hindi sentences: {length.mean()}")
length = np.array([len(tokenizer.encode(sample)) for sample in df['English_Translation']])
print(f"Average length of English translations: {length.mean()}")

In [None]:
# Spot-check the 'Sentence' value of the row labelled 23.
df.loc[23,'Sentence']

In [None]:
# Spot-check the 'English_Translation' value of the same row.
df.loc[23, 'English_Translation']

In [None]:
# PHINC token tally: 'Sentence' is booked as romanised Hindi and
# 'English_Translation' as English.
total_english_tokens = 0
total_r_english_tokens = 0
for sample in df.itertuples():
    r_eng = tokenizer.encode(sample.Sentence)
    eng = tokenizer.encode(sample.English_Translation)

    total_r_english_tokens+= len(r_eng)
    total_english_tokens+= len(eng)

# Stored under its own key. This cell previously wrote to
# token_per_language['Mintaka'], overwriting the Mintaka tally computed earlier
# with the PHINC numbers.
token_per_language['phinc'] = {'Hindi': 0,
                               'English': total_english_tokens,
                               'Romanised_Hindi': total_r_english_tokens}
token_per_language

In [None]:
# PHINC retrieval pairs. Even positions query with the romanised-Hindi sentence
# and target its English translation; odd positions do the reverse.
phinc = []

hindi_instruction = "nirdesh: die gae romanakrt hindi paath ke lie arth kee drshti se sabase adhik samaan angrejee paath praapt karen. paath: "
english_instruction = "Instructions: Retrieve the most semantically similar romanized Hindi text for the given English text. Text: "

# Walk the two columns by position instead of looking rows up with
# df.loc[idx, ...]: the result is the same for the default 0..n-1 index that
# read_csv produces, and does not break if the frame's index is ever changed.
for idx, (sentence, translation) in enumerate(zip(df['Sentence'], df['English_Translation'])):

    if idx%2==0:
        data = {'id': f"phinc_{idx}",
                'source': hindi_instruction + sentence,
                'target': translation}
    else:
        data = {'id': f"phinc_{idx}",
                'source': english_instruction + translation,
                'target': sentence}

    phinc.append(data)

# UTF-8 is requested explicitly because ensure_ascii=False writes any non-ASCII
# character verbatim and the platform default encoding may not support it.
with open("Processed_data/phinc.jsonl", "w", encoding="utf-8") as f:

    for sample in phinc:
        json.dump(sample, f, ensure_ascii=False)
        f.write('\n')

### Classification

Task: HindiDiscourseClassification

In [None]:
# Load the Hindi discourse dataset. UTF-8 is requested explicitly so the Hindi
# text decodes the same way regardless of the platform default encoding.
with open("./Data/Classification/HindiDiscourseClassification/discourse_dataset.json", 'r', encoding="utf-8") as f:
    data = json.load(f)

In [None]:
# Number of entries in the discourse dataset.
len(data)

In [None]:
# Mean tokenised length of the 'Sentence' field across all entries.
sentence_lengths = [len(tokenizer.encode(entry['Sentence'])) for entry in data.values()]
length = np.array(sentence_lengths)
print(f"Average length of hindi text: {length.mean()}")

In [None]:
# Spot-check the entry stored under key '0'.
data['0']

In [None]:
# Same inspection as the previous cell (entry stored under key '0').
data['0']

In [None]:
# Same-label pairs for HindiDiscourseClassification: every sentence, prefixed
# with the classification instruction, is paired with a randomly chosen *other*
# sentence that has the same "Discourse Mode" label.
instruction = "निर्देश: दिए गए हिंदी पाठ को निम्नलिखित में से किसी एक श्रेणी में वर्गीकृत करें: 'वर्णनात्मक', 'कथात्मक', 'संवाद', 'तर्कपूर्ण', 'सूचनात्मक', या 'अन्य'। पाठ: "

# Group the sentences by label, in the order `data` lists them.
label_groups = {}
for entry in data.values():
    label_groups.setdefault(entry["Discourse Mode"], []).append(entry["Sentence"])

discourse = []
for key, group in label_groups.items():

    # A label with a single sentence has nothing to be paired with; the previous
    # random.choice over an empty candidate list raised IndexError in that case.
    if len(group) < 2:
        continue

    for idx, sent in enumerate(group):
        # Draw uniformly from the other len(group) - 1 positions by sampling an
        # offset and stepping over `idx`. This replaces building and editing a
        # full index list for every sentence, which was quadratic per label.
        pick = random.randrange(len(group) - 1)
        if pick >= idx:
            pick += 1
        discourse.append({'id': f'discourse_{key}',
                          'source': instruction + sent,
                          'target': group[pick]})

random.shuffle(discourse)

# UTF-8 is requested explicitly because ensure_ascii=False writes the Hindi text
# verbatim and the platform default encoding may not be able to represent it.
with open("Processed_data/discourse.jsonl", "w", encoding="utf-8") as f:

    for sample in discourse:
        json.dump(sample, f, ensure_ascii=False)
        f.write('\n')

In [None]:
# Labels found while grouping the discourse sentences.
label_groups.keys()

In [None]:
# Token tally for the discourse pairs: both sides of every pair (the 'source'
# includes the instruction prefix) are booked as Hindi.
total_english_tokens = 0
total_r_english_tokens = 0
total_hindi_tokens = sum(
    len(tokenizer.encode(pair[side]))
    for pair in discourse
    for side in ('source', 'target')
)

token_per_language['discourse'] = dict(
    Hindi=total_hindi_tokens,
    English=total_english_tokens,
    Romanised_Hindi=total_r_english_tokens,
)
token_per_language

Task: Massive

In [None]:
# Load the MASSIVE hi-IN records (one JSON object per line). UTF-8 is requested
# explicitly so the Hindi utterances decode the same way on every platform;
# blank lines are skipped instead of being handed to json.loads.
with open("./Data/Classification/Massive/hi-IN.jsonl", 'r', encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

In [None]:
# Number of MASSIVE records loaded.
len(data)

In [None]:
# Spot-check a single MASSIVE record.
data[2]

In [None]:
# Same-label pairs for MASSIVE: every utterance, prefixed with the
# classification instruction, is paired with a randomly chosen *other*
# utterance that has the same "scenario" label.
# NOTE(review): every record in `data` is used here, whatever its 'partition'
# value - confirm that held-out partitions are meant to be included.
instruction = "निर्देश: दिए गए आदेश को निम्नलिखित में से किसी एक आशय श्रेणी में वर्गीकृत करें: 'अलार्म', 'ऑडियो', 'आईओटी', 'कैलेंडर', 'प्ले', 'सामान्य', 'डेटटाइम', 'टेकअवे', 'समाचार', 'संगीत', 'मौसम', 'क्यूए', 'सामाजिक', 'सिफारिश', 'खाना पकाना', 'परिवहन', 'ईमेल', 'सूचियाँ'। पाठ: "

# Group the utterances by scenario, in file order.
label_groups = {}
for item in data:
    label_groups.setdefault(item["scenario"], []).append(item["utt"])

massive = []
for key, group in label_groups.items():

    # A label with a single utterance has nothing to be paired with; the previous
    # random.choice over an empty candidate list raised IndexError in that case.
    if len(group) < 2:
        continue

    for idx, sent in enumerate(group):
        # Draw uniformly from the other len(group) - 1 positions by sampling an
        # offset and stepping over `idx`. This replaces building and editing a
        # full index list for every utterance, which was quadratic per label.
        pick = random.randrange(len(group) - 1)
        if pick >= idx:
            pick += 1
        massive.append({'id': f'massive_{key}',
                        'source': instruction + sent,
                        'target': group[pick]})

random.shuffle(massive)

# UTF-8 is requested explicitly because ensure_ascii=False writes the Hindi text
# verbatim and the platform default encoding may not be able to represent it.
with open("Processed_data/massive.jsonl", "w", encoding="utf-8") as f:

    for sample in massive:
        json.dump(sample, f, ensure_ascii=False)
        f.write('\n')

In [None]:
# Scenario labels found while grouping the MASSIVE utterances.
label_groups.keys()

In [None]:
# Mean tokenised utterance length, restricted to records whose partition is 'train'.
train_data = list(filter(lambda record: record['partition'] == 'train', data))
utterance_lengths = [len(tokenizer.encode(record['utt'])) for record in train_data]
length = np.array(utterance_lengths)
print(f"Average length of hindi text: {length.mean()}")

In [None]:
# Token tally for the MASSIVE pairs: both sides of every pair (the 'source'
# includes the instruction prefix) are booked as Hindi.
total_english_tokens = 0
total_r_english_tokens = 0
total_hindi_tokens = sum(
    len(tokenizer.encode(pair[side]))
    for pair in massive
    for side in ('source', 'target')
)

token_per_language['massive'] = dict(
    Hindi=total_hindi_tokens,
    English=total_english_tokens,
    Romanised_Hindi=total_r_english_tokens,
)
token_per_language

Task: SentimentAnalysisHindi

In [None]:
# Load the SentimentAnalysisHindi dataset from its saved on-disk copy.
ds = load_from_disk("./Data/Classification/SentimentAnalysisHindi")

In [None]:
# Mean tokenised length of the 'text' field over the train split.
text_lengths = [len(tokenizer.encode(row['text'])) for row in ds['train']]
length = np.array(text_lengths)
print(f"Average length of hindi text: {length.mean()}")

In [None]:
# Spot-check a single training example.
ds['train'][287]

In [None]:
# Same-label pairs for SentimentAnalysisHindi: every training text, prefixed
# with the classification instruction, is paired with a randomly chosen *other*
# text that has the same label.
instruction = "निर्देश: दिए गए पाठ को निम्नलिखित भावना श्रेणियों में से किसी एक में वर्गीकृत करें: सकारात्मक, नकारात्मक, या तटस्थ। पाठ: "

# Group the texts by label, in dataset order.
label_groups = {}
for item in ds['train']:
    label_groups.setdefault(item["label"], []).append(item["text"])

sentiment = []
for key, group in label_groups.items():

    # A label with a single text has nothing to be paired with; the previous
    # random.choice over an empty candidate list raised IndexError in that case.
    if len(group) < 2:
        continue

    for idx, sent in enumerate(group):
        # Draw uniformly from the other len(group) - 1 positions by sampling an
        # offset and stepping over `idx`. This replaces building and editing a
        # full index list for every text, which was quadratic per label.
        pick = random.randrange(len(group) - 1)
        if pick >= idx:
            pick += 1
        sentiment.append({'id': f'sentiment_{key}',
                          'source': instruction + sent,
                          'target': group[pick]})

random.shuffle(sentiment)

# UTF-8 is requested explicitly because ensure_ascii=False writes the Hindi text
# verbatim and the platform default encoding may not be able to represent it.
with open("Processed_data/sentiment.jsonl", "w", encoding="utf-8") as f:

    for sample in sentiment:
        json.dump(sample, f, ensure_ascii=False)
        f.write('\n')

In [None]:
# Labels found while grouping the sentiment texts.
label_groups.keys()

In [None]:
# Token tally for the sentiment pairs: both sides of every pair (the 'source'
# includes the instruction prefix) are booked as Hindi.
total_english_tokens = 0
total_r_english_tokens = 0
total_hindi_tokens = sum(
    len(tokenizer.encode(pair[side]))
    for pair in sentiment
    for side in ('source', 'target')
)

token_per_language['sentiment'] = dict(
    Hindi=total_hindi_tokens,
    English=total_english_tokens,
    Romanised_Hindi=total_r_english_tokens,
)
token_per_language

Task: Sentiment Analysis Joshi

In [None]:
import pandas as pd

# Load the Joshi sentiment CSV.
# NOTE(review): unlike the Shete and Review loaders further down, no dropna() is
# applied here - confirm the file has no missing values, because the next cell
# concatenates the first column onto a string.
data_csv = pd.read_csv("./Data/Classification/sent_hineng_joshi/sentiment_dataset.csv")

In [None]:
# Same-label pairs for the Joshi sentiment data: every text (first CSV column),
# prefixed with the classification instruction, is paired with a randomly
# chosen *other* text that has the same label (second CSV column).
# NOTE(review): the instruction ends with "पाठ:" and no trailing space before the
# text - confirm that is intended.
instruction = "निर्देश: दिए गए पाठ को निम्नलिखित भावना श्रेणियों में से किसी एक में वर्गीकृत करें: नकारात्मक (-1), तटस्थ (0), या सकारात्मक (1)। पाठ:"

# Group the texts by label, in file order.
label_groups = {}
for (_, item) in data_csv.iterrows():
    label_groups.setdefault(item.iloc[1], []).append(item.iloc[0])

sentiment = []
for key, group in label_groups.items():

    # A label with a single text has nothing to be paired with; the previous
    # random.choice over an empty candidate list raised IndexError in that case.
    if len(group) < 2:
        continue

    for idx, sent in enumerate(group):
        # Draw uniformly from the other len(group) - 1 positions by sampling an
        # offset and stepping over `idx`. This replaces building and editing a
        # full index list for every text, which was quadratic per label.
        pick = random.randrange(len(group) - 1)
        if pick >= idx:
            pick += 1
        sentiment.append({'id': f'sentiment_joshi_{key}',
                          'source': instruction + sent,
                          'target': group[pick]})

random.shuffle(sentiment)

# UTF-8 is requested explicitly because ensure_ascii=False writes the Hindi
# instruction verbatim and the platform default encoding may not support it.
with open("Processed_data/sentiment_joshi.jsonl", "w", encoding="utf-8") as f:

    for sample in sentiment:
        json.dump(sample, f, ensure_ascii=False)
        f.write('\n')

In [None]:
# Spot-check one generated pair (the list was shuffled, so this is an arbitrary one).
sentiment[23]

In [None]:
# Token tally for the Joshi pairs: both sides of every pair are booked as
# romanised Hindi.
# NOTE(review): 'source' includes the Devanagari instruction prefix, which is
# therefore counted as romanised Hindi too - confirm that is acceptable.
total_english_tokens = 0
total_hindi_tokens = 0
total_r_english_tokens = sum(
    len(tokenizer.encode(pair[side]))
    for pair in sentiment
    for side in ('source', 'target')
)

token_per_language['sentiment_joshi'] = dict(
    Hindi=total_hindi_tokens,
    English=total_english_tokens,
    Romanised_Hindi=total_r_english_tokens,
)
token_per_language

Task: Sentiment Shete

In [None]:
import pandas as pd

# Load the Shete sentiment CSV, print the raw row count, then drop every row
# that has a missing value.
data_csv = pd.read_csv("./Data/Classification/sent_hineng_shete/data.csv")
print(len(data_csv))
data_csv = data_csv.dropna()

In [None]:
# Same-label pairs for the Shete sentiment data: every text (first CSV column),
# prefixed with the classification instruction, is paired with a randomly
# chosen *other* text that has the same label (second CSV column, mapped to a
# short name through label_dict).
# NOTE(review): the instruction ends with "पाठ:" and no trailing space before the
# text - confirm that is intended.
instruction = "निर्देश: दिए गए पाठ को निम्नलिखित भावना श्रेणियों में से किसी एक में वर्गीकृत करें: नकारात्मक (-1), तटस्थ (0), या सकारात्मक (1)। पाठ:"
label_dict = {-1: 'neg',
              0: 'neu',
              1: 'pos'}

# Group the texts by mapped label, in file order. A label value outside
# label_dict raises KeyError.
label_groups = {}
for (_, item) in data_csv.iterrows():
    label_groups.setdefault(label_dict[item.iloc[1]], []).append(item.iloc[0])

sentiment = []
for key, group in label_groups.items():

    # A label with a single text has nothing to be paired with; the previous
    # random.choice over an empty candidate list raised IndexError in that case.
    if len(group) < 2:
        continue

    for idx, sent in enumerate(group):
        # Draw uniformly from the other len(group) - 1 positions by sampling an
        # offset and stepping over `idx`. This replaces building and editing a
        # full index list for every text, which was quadratic per label.
        pick = random.randrange(len(group) - 1)
        if pick >= idx:
            pick += 1
        sentiment.append({'id': f'sentiment_shete_{key}',
                          'source': instruction + sent,
                          'target': group[pick]})

random.shuffle(sentiment)

# UTF-8 is requested explicitly because ensure_ascii=False writes the Hindi
# instruction verbatim and the platform default encoding may not support it.
with open("Processed_data/sentiment_shete.jsonl", "w", encoding="utf-8") as f:

    for sample in sentiment:
        json.dump(sample, f, ensure_ascii=False)
        f.write('\n')

In [None]:
# Token tally for the Shete pairs: both sides of every pair are booked as
# romanised Hindi.
total_english_tokens = 0
total_hindi_tokens = 0
total_r_english_tokens = sum(
    len(tokenizer.encode(pair[side]))
    for pair in sentiment
    for side in ('source', 'target')
)

token_per_language['sentiment_shete'] = dict(
    Hindi=total_hindi_tokens,
    English=total_english_tokens,
    Romanised_Hindi=total_r_english_tokens,
)
token_per_language

Task: Sentiment Review

In [None]:
import pandas as pd

# Load the review sentiment CSV and drop every row that has a missing value.
data_csv = pd.read_csv("Data/Classification/sent_review/sentiment_reviews.csv")
data_csv = data_csv.dropna()

In [None]:
# Distinct values of the 'sentiment' column.
data_csv['sentiment'].unique()

In [None]:
# Same-label pairs for the review sentiment data: every text (first CSV column),
# prefixed with the classification instruction, is paired with a randomly
# chosen *other* text that has the same label (second CSV column).
# NOTE(review): the instruction ends with "पाठ:" and no trailing space before the
# text - confirm that is intended.
instruction = "निर्देश: दिए गए पाठ को निम्नलिखित भावना श्रेणियों में से किसी एक में वर्गीकृत करें: नकारात्मक (-1), या सकारात्मक (1)। पाठ:"

# Group the texts by label, in file order.
label_groups = {}
for (_, item) in data_csv.iterrows():
    label_groups.setdefault(item.iloc[1], []).append(item.iloc[0])

sentiment = []
for key, group in label_groups.items():

    # A label with a single text has nothing to be paired with; the previous
    # random.choice over an empty candidate list raised IndexError in that case.
    if len(group) < 2:
        continue

    for idx, sent in enumerate(group):
        # Draw uniformly from the other len(group) - 1 positions by sampling an
        # offset and stepping over `idx`. This replaces building and editing a
        # full index list for every text, which was quadratic per label.
        pick = random.randrange(len(group) - 1)
        if pick >= idx:
            pick += 1
        sentiment.append({'id': f'sentiment_review_{key}',
                          'source': instruction + sent,
                          'target': group[pick]})

random.shuffle(sentiment)

# UTF-8 is requested explicitly because ensure_ascii=False writes the Hindi text
# verbatim and the platform default encoding may not be able to represent it.
with open("Processed_data/sentiment_review.jsonl", "w", encoding="utf-8") as f:

    for sample in sentiment:
        json.dump(sample, f, ensure_ascii=False)
        f.write('\n')

In [None]:
# Token tally for the review pairs: both sides of every pair (the 'source'
# includes the instruction prefix) are booked as Hindi.
total_english_tokens = 0
total_r_english_tokens = 0
total_hindi_tokens = sum(
    len(tokenizer.encode(pair[side]))
    for pair in sentiment
    for side in ('source', 'target')
)

token_per_language['sentiment_review'] = dict(
    Hindi=total_hindi_tokens,
    English=total_english_tokens,
    Romanised_Hindi=total_r_english_tokens,
)
token_per_language

Task: Amazon Review

In [None]:
from datasets import load_dataset

# Load the "en" configuration of mteb/amazon_reviews_multi.
ds = load_dataset("mteb/amazon_reviews_multi", "en")

In [None]:
# Spot-check a single training example.
ds['train'][2]

In [None]:
# Distinct label values present in the train split.
{row['label'] for row in ds['train']}

In [None]:
# Same-label pairs for the Amazon reviews train split: every review, prefixed
# with the classification instruction, is paired with a randomly chosen *other*
# review that has the same label.
instruction = "Instruction: Classify the sentiment of the following Amazon product review into one of the following labels:\n0 - Very Negative  \n1 - Negative \n2 - Neutral  \n3 - Positive  \n4 - Very Positive \nReview: "

# Group the review texts by label, in dataset order.
label_groups = {}
for item in ds['train']:
    label_groups.setdefault(item['label'], []).append(item['text'])

amazon_review = []
for key, group in label_groups.items():

    # A label with a single review has nothing to be paired with; the previous
    # random.choice over an empty candidate list raised IndexError in that case.
    if len(group) < 2:
        continue

    for idx, sent in enumerate(group):
        # Draw uniformly from the other len(group) - 1 positions by sampling an
        # offset and stepping over `idx`. This replaces building and editing a
        # full index list for every review, which was quadratic per label.
        pick = random.randrange(len(group) - 1)
        if pick >= idx:
            pick += 1
        amazon_review.append({'id': f'amazon_review_{key}',
                              'source': instruction + sent,
                              'target': group[pick]})

random.shuffle(amazon_review)

# UTF-8 is requested explicitly because ensure_ascii=False writes any non-ASCII
# character in a review verbatim and the platform default encoding may not
# support it.
with open("Processed_data/amazon_review.jsonl", "w", encoding="utf-8") as f:

    for sample in amazon_review:
        json.dump(sample, f, ensure_ascii=False)
        f.write('\n')

In [None]:
# Spot-check one generated pair (the list was shuffled, so this is an arbitrary one).
amazon_review[23]

In [None]:
# Same-label pairs built from the Amazon reviews *validation* split and written
# to amazon_review_test.jsonl: every review, prefixed with the classification
# instruction, is paired with a randomly chosen *other* review with the same label.
# NOTE(review): ids here are 'amazon_review_val{label}' with no underscore before
# the label, unlike 'amazon_review_{label}' for the train pairs - left as is in
# case something downstream depends on it; confirm.
instruction = "Instruction: Classify the sentiment of the following Amazon product review into one of the following labels:\n0 - Very Negative  \n1 - Negative \n2 - Neutral  \n3 - Positive  \n4 - Very Positive \nReview: "

# Group the review texts by label, in dataset order.
label_groups = {}
for item in ds['validation']:
    label_groups.setdefault(item['label'], []).append(item['text'])

amazon_review_test = []
for key, group in label_groups.items():

    # A label with a single review has nothing to be paired with; the previous
    # random.choice over an empty candidate list raised IndexError in that case.
    if len(group) < 2:
        continue

    for idx, sent in enumerate(group):
        # Draw uniformly from the other len(group) - 1 positions by sampling an
        # offset and stepping over `idx`. This replaces building and editing a
        # full index list for every review, which was quadratic per label.
        pick = random.randrange(len(group) - 1)
        if pick >= idx:
            pick += 1
        amazon_review_test.append({'id': f'amazon_review_val{key}',
                                   'source': instruction + sent,
                                   'target': group[pick]})

random.shuffle(amazon_review_test)

# UTF-8 is requested explicitly because ensure_ascii=False writes any non-ASCII
# character in a review verbatim and the platform default encoding may not
# support it.
with open("Processed_data/amazon_review_test.jsonl", "w", encoding="utf-8") as f:

    for sample in amazon_review_test:
        json.dump(sample, f, ensure_ascii=False)
        f.write('\n')

In [None]:
# Token tally for the Amazon train pairs only (amazon_review_test is not
# counted): both sides of every pair are booked as English.
total_r_english_tokens = 0
total_hindi_tokens = 0
total_english_tokens = sum(
    len(tokenizer.encode(pair[side]))
    for pair in amazon_review
    for side in ('source', 'target')
)

token_per_language['amazon_review'] = dict(
    Hindi=total_hindi_tokens,
    English=total_english_tokens,
    Romanised_Hindi=total_r_english_tokens,
)
token_per_language

Task: ABP news

In [None]:
import json
# Load the ABP News classification file. UTF-8 is requested explicitly so the
# Hindi articles decode the same way regardless of the platform default encoding.
with open("Data/Classification/ABP_News/ABP_News_classification.json", "r", encoding="utf-8") as f:
    abp_news = json.load(f)

In [None]:
# Same-label pairs for ABP News: every article, prefixed with the classification
# instruction, is paired with a randomly chosen *other* article from the same
# 'domain'.
# NOTE(review): the instruction ends with "समाचार लेख:" and no trailing space
# before the article text - confirm that is intended.
instruction = "निर्देश: निम्नलिखित समाचार लेख को दिए गए श्रेणियों में से किसी एक में वर्गीकृत करें: श्रेणियाँ: gk, technology, business, entertainment, agriculture, astro, lifestyle, sports, education, states. समाचार लेख:"

# Group the articles by domain, in the order `abp_news` lists them.
label_groups = {}
for val in abp_news.values():
    label_groups.setdefault(val['domain'], []).append(val['article'])

abp_news_classification = []
for key, group in label_groups.items():

    # A domain with a single article has nothing to be paired with; the previous
    # random.choice over an empty candidate list raised IndexError in that case.
    if len(group) < 2:
        continue

    for idx, sent in enumerate(group):
        # Draw uniformly from the other len(group) - 1 positions by sampling an
        # offset and stepping over `idx`. This replaces building and editing a
        # full index list for every article, which was quadratic per domain.
        pick = random.randrange(len(group) - 1)
        if pick >= idx:
            pick += 1
        abp_news_classification.append({'id': f'abp_news_classification_{key}',
                                        'source': instruction + sent,
                                        'target': group[pick]})

random.shuffle(abp_news_classification)

# UTF-8 is requested explicitly because ensure_ascii=False writes the Hindi text
# verbatim and the platform default encoding may not be able to represent it.
with open("Processed_data/abp_news_classification.jsonl", "w", encoding="utf-8") as f:

    for sample in abp_news_classification:
        json.dump(sample, f, ensure_ascii=False)
        f.write('\n')

In [None]:
# Spot-check one generated pair (the list was shuffled, so this is an arbitrary one).
abp_news_classification[23]

In [None]:
# Token tally for the ABP News pairs: both sides of every pair (the 'source'
# includes the instruction prefix) are booked as Hindi.
total_english_tokens = 0
total_r_english_tokens = 0
total_hindi_tokens = sum(
    len(tokenizer.encode(pair[side]))
    for pair in abp_news_classification
    for side in ('source', 'target')
)

token_per_language['abp_news_classification'] = dict(
    Hindi=total_hindi_tokens,
    English=total_english_tokens,
    Romanised_Hindi=total_r_english_tokens,
)
token_per_language

Task: MTOP Intent Classification

In [None]:
from mteb.tasks import MTOPIntentClassification

# Instantiate the MTEB MTOP intent-classification task and call its loader.
# Presumably this populates `task.dataset`, which the following cells read - confirm.
task = MTOPIntentClassification()
task.load_data()

In [None]:
# Spot-check a single Hindi training example.
task.dataset['hi']['train'][23]

In [None]:
import random
import json

# Same-label pairs for MTOP intent classification (Hindi train split): every
# text, prefixed with the classification instruction, is paired with a randomly
# chosen *other* text that has the same label.
instruction = "निर्देश: दिए गए पाठ को उसके उद्देश्य के आधार पर वर्गीकृत करें। पाठ: "

# Group the texts by label, in dataset order.
label_groups = {}
for item in task.dataset['hi']['train']:
    label_groups.setdefault(item["label"], []).append(item["text"])

# Labels with fewer than two texts cannot form a pair and are left out.
drop_key = [key for key, group in label_groups.items() if len(group) < 2]

intent = []
for key, group in label_groups.items():

    if len(group) < 2:
        continue

    for idx, sent in enumerate(group):
        # Draw uniformly from the other len(group) - 1 positions by sampling an
        # offset and stepping over `idx`. This replaces building and editing a
        # full index list for every text, which was quadratic per label.
        pick = random.randrange(len(group) - 1)
        if pick >= idx:
            pick += 1
        intent.append({'id': f'intent_{key}',
                       'source': instruction + sent,
                       'target': group[pick]})

random.shuffle(intent)

# UTF-8 is requested explicitly because ensure_ascii=False writes the Hindi text
# verbatim and the platform default encoding may not be able to represent it.
with open("Processed_data/mtop_intent.jsonl", "w", encoding="utf-8") as f:

    for sample in intent:
        json.dump(sample, f, ensure_ascii=False)
        f.write('\n')

In [None]:
# Token tally for the MTOP intent pairs: both sides of every pair (the 'source'
# includes the instruction prefix) are booked as Hindi.
total_english_tokens = 0
total_r_english_tokens = 0
total_hindi_tokens = sum(
    len(tokenizer.encode(pair[side]))
    for pair in intent
    for side in ('source', 'target')
)

token_per_language['intent'] = dict(
    Hindi=total_hindi_tokens,
    English=total_english_tokens,
    Romanised_Hindi=total_r_english_tokens,
)
token_per_language

Task: XNLI

In [None]:
from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access this dataset
ds = load_dataset("mteb/xnli", "hi")

In [None]:
ds

In [None]:
ds['train'][23]

In [None]:
import random

xnli = []
label_groups = {}

instruction = "निर्देश: दिए गए प्रेज़म और हाइपोथेसिस के आधार पर निर्धारित करें कि संबंध 'अनुकूलन (entailment)', 'तटस्थ (neutral)', या 'विरोधाभासी (contradiction)' है। "

# Group the formatted premise/hypothesis strings by label. Iterating the split
# covers every row it contains (the row count used to be hard-coded as 392702)
# and reads each row once instead of three random-access lookups.
for row in ds['train']:
    pair_text = f"आधार: {row['premise']} परिकल्पना: {row['hypothesis']} "
    label_groups.setdefault(row["label"], []).append(pair_text)

# Pair each string with a random *different* string carrying the same label.
for key, sentences in label_groups.items():

    group_size = len(sentences)
    if group_size < 2:
        # A single-member label has no partner to draw from.
        continue

    for idx, sent in enumerate(sentences):

        # Uniform draw over the other positions without materialising an
        # index list per sentence (quadratic in the size of the label group).
        partner = random.randrange(group_size - 1)
        if partner >= idx:
            partner += 1

        xnli.append({'id': f'xnli_{key}',
                     'source': instruction + sent,
                     'target': sentences[partner]})

random.shuffle(xnli)

# Pin UTF-8: the records are written with ensure_ascii=False.
with open("Processed_data/xnli.jsonl", "w", encoding="utf-8") as f:

    for sample in xnli:
        json.dump(sample, f, ensure_ascii=False)
        f.write('\n')

In [None]:
xnli[1]

In [None]:
# Token accounting for the XNLI pairs; everything is booked as Hindi.
total_english_tokens = total_r_english_tokens = total_hindi_tokens = 0

for pair in xnli:
    for field in ('source', 'target'):
        total_hindi_tokens += len(tokenizer.encode(pair[field]))

token_per_language['xnli'] = {
    'Hindi': total_hindi_tokens,
    'English': total_english_tokens,
    'Romanised_Hindi': total_r_english_tokens,
}
token_per_language

Task: ai4bharat/samanantar

In [None]:
from datasets import load_dataset

data = {}

for language in ['as', 'bn', 'gu', 'hi', 'kn', 'ml', 'mr', 'or', 'pa', 'ta', 'te']:
    data[language] = load_dataset("ai4bharat/samanantar", language)

In [None]:
# Loads the same eleven samanantar configurations a second time, under
# individual names. The previous cell already stores them in `data[language]`.
# NOTE(review): none of the ds_* names is read by the cells visible here —
# confirm they are still needed before keeping this cell.
ds_as = load_dataset("ai4bharat/samanantar", "as")
ds_bn = load_dataset("ai4bharat/samanantar", "bn")
ds_gu = load_dataset("ai4bharat/samanantar", "gu")
ds_hi = load_dataset("ai4bharat/samanantar", "hi")
ds_kn = load_dataset("ai4bharat/samanantar", "kn")
ds_ml = load_dataset("ai4bharat/samanantar", "ml")
ds_mr = load_dataset("ai4bharat/samanantar", "mr")
ds_or = load_dataset("ai4bharat/samanantar", "or")
ds_pa = load_dataset("ai4bharat/samanantar", "pa")
ds_ta = load_dataset("ai4bharat/samanantar", "ta")
ds_te = load_dataset("ai4bharat/samanantar", "te")

In [None]:
data['as']['train'][0]

In [None]:
import random

language_classification = []

instruction = "निर्देश: दिए गए पाठ की भाषा को निम्नलिखित भाषाओं में से किसी एक के रूप में वर्गीकृत करें: असमिया (as), बांग्ला (bn), गुजराती (gu), हिंदी (hi), कन्नड़ (kn), मलयालम (ml), मराठी (mr), उड़िया (or), पंजाबी (pa), तमिल (ta), या तेलुगू (te)। पाठ: "

for key in ['as', 'bn', 'gu', 'hi', 'kn', 'ml', 'mr', 'or', 'pa', 'ta', 'te']:

    # Resolve the split once per language instead of on every row.
    train_split = data[key]['train']
    last_row = train_split.num_rows - 1

    # Rows 0..limit are used, i.e. limit + 1 rows per language.
    # NOTE(review): confirm 5001 / 501 rows (rather than 5000 / 500) is intended.
    limit = 5000 if key == 'hi' else 500

    for idx, sent in enumerate(train_split):

        if idx > limit:
            break

        # The positive is the mirrored row counted from the end of the same
        # language's split, so source and target share a language.
        language_classification.append({'id': f'samanantar_{key}',
                                        'source': instruction + sent['tgt'],
                                        'target': train_split[last_row - idx]['tgt']})

random.shuffle(language_classification)

# Pin UTF-8: the records are written with ensure_ascii=False.
with open("Processed_data/samanantar_language_classification.jsonl", "w", encoding="utf-8") as f:

    for sample in language_classification:
        json.dump(sample, f, ensure_ascii=False)
        f.write('\n')

### Translation

Task: Code Mixed

In [None]:
import pandas as pd

data_csv = pd.read_csv("Data/Translation/codemixed_parallel_corpus/English-Hindi code-mixed parallel corpus.csv")
data_csv = data_csv.dropna()

In [None]:
# Token accounting for the code-mixed corpus: the `Sentence` column is booked
# as romanised Hindi, `English_Translation` as English.
total_english_tokens, total_r_english_tokens, total_hindi_tokens = 0, 0, 0

for row in data_csv.itertuples():
    total_r_english_tokens += len(tokenizer.encode(row.Sentence))
    total_english_tokens += len(tokenizer.encode(row.English_Translation))

token_per_language['code_mixed'] = dict(Hindi=total_hindi_tokens,
                                        English=total_english_tokens,
                                        Romanised_Hindi=total_r_english_tokens)
token_per_language

In [None]:
# Accumulator for the code-mixed records and the two instruction prefixes that
# the loop below alternates between (first column as source on even index
# labels, second column as source on odd ones).
# NOTE(review): neither instruction mentions English, although the token-count
# cell treats the CSV columns as `Sentence` / `English_Translation` — confirm
# the wording matches the actual source/target languages.
code_mixed = []
hindi_instruction = "Nirdesh: Diye gaye Hindi vaakya se sabse saman romanised hindi vaakya dhunde. Vaakya: "
english_instruction = "Instruction: Find the most similar hindi sentence to the given romanised hindi sentence. Sentence: "

def remove_username(sentence):
    """Return `sentence` without the whitespace-separated tokens starting with '@'.

    The remaining tokens are re-joined with single spaces, so any run of
    whitespace in the input collapses to one space.
    """
    kept_tokens = []
    for token in sentence.split():
        if token.startswith('@'):
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

# Alternate the retrieval direction by the parity of the row's index label:
# even labels use column 0 as the query, odd labels use column 1.
for (idx, sample) in data_csv.iterrows():

    first_column = remove_username(sample.iloc[0])
    second_column = remove_username(sample.iloc[1])

    if idx % 2:
        data = {'id': f"code_mixed_{idx}",
                'source': english_instruction + second_column,
                'target': first_column}
    else:
        data = {'id': f"code_mixed_{idx}",
                'source': hindi_instruction + first_column,
                'target': second_column}

    code_mixed.append(data)

with open("Processed_data/code_mixed.jsonl", "w", encoding="utf-8") as f:
    for record in code_mixed:
        json.dump(record, f, ensure_ascii=False)
        f.write("\n")

Task: HinGE

In [None]:
import pandas as pd
import re

data_csv = pd.read_csv("Data/Translation/HinGE/HinGE.csv")
data_csv = data_csv.dropna()

In [None]:
data_csv.head(3)

In [None]:
# Token accounting for HinGE: only the `Hindi` and `English` columns are
# tokenised here; the romanised total stays at zero.
total_english_tokens = total_r_english_tokens = total_hindi_tokens = 0

for row in data_csv.itertuples():
    total_hindi_tokens += len(tokenizer.encode(row.Hindi))
    total_english_tokens += len(tokenizer.encode(row.English))

token_per_language['hinge'] = {
    'Hindi': total_hindi_tokens,
    'English': total_english_tokens,
    'Romanised_Hindi': total_r_english_tokens,
}
token_per_language

In [None]:
hinge = []
englishr_instruction = "Instruction: From the given English sentence, find the translated romanised Hindi sentence. Sentence: "
hindi_instruction = "निर्देश: दिए गए हिंदी वाक्य में से अनुवादित रोमनकृत हिंदी वाक्य चुनिए। वाक्य: "
hindir_instruction = "Nirdesh: Diye gaye romanised hindi vaakya se sabse saman hindi vaakya dhunde. Vaakya: "
hindire_instruction = "Nirdesh: Diye gaye romanised hindi vaakya se sabse saman english vaakya dhunde. Vaakya: "


for (idx, sample) in data_csv.iterrows():

    # Column 2 holds quoted romanised candidates; extract them once per row
    # and use the first one. Rows without any quoted candidate are skipped.
    romanised_candidates = re.findall(r"'(.*?)'", sample.iloc[2])
    if not romanised_candidates:
        continue
    romanised = romanised_candidates[0]

    # Column usage follows the instructions below: column 0 is the sentence fed
    # to the English instruction, column 1 the one fed to the Hindi instruction.
    english_sentence = sample.iloc[0]
    hindi_sentence = sample.iloc[1]

    if idx%2==0:
        # Hindi -> romanised, and romanised -> Hindi.
        hinge.append({'id': f"hinge_{idx}",
                      'source': hindi_instruction + hindi_sentence,
                      'target': romanised})
        hinge.append({'id': f"hinge_{idx}",
                      'source': hindir_instruction + romanised,
                      'target': hindi_sentence})
    else:
        # romanised -> English, and English -> romanised.
        # The romanised->English instruction was previously paired with the
        # Hindi column as its target; it now points at the English sentence.
        hinge.append({'id': f"hinge_{idx}",
                      'source': hindire_instruction + romanised,
                      'target': english_sentence})
        hinge.append({'id': f"hinge_{idx}",
                      'source': englishr_instruction + english_sentence,
                      'target': romanised})

    # Note: a trailing `hinge.append(data)` used to sit here. `data` is not
    # assigned anywhere in this loop, so it added one stale record from another
    # cell for every row; it has been removed.

with open("Processed_data/hinge.jsonl", "w", encoding="utf-8") as f:
    for sample in hinge:
        json.dump(sample, f, ensure_ascii=False)
        f.write("\n")

In [None]:
len(hinge)

### Retrieval

Task: IndicQA

In [None]:
# Read the Hindi IndicQA JSON as UTF-8 explicitly; the file holds Devanagari
# text and the platform default encoding is not guaranteed to be UTF-8.
with open("./Data/Retrieval/IndicQA/indicqa.hi.json", 'r', encoding="utf-8") as f:
    data = json.load(f)

In [None]:
data['data'][3]

In [None]:
# Unique contexts (first paragraph of every article) in first-seen order.
# dict.fromkeys de-duplicates while preserving order, so the numeric ids in
# context_dict are identical on every run; iterating a set of strings gives an
# order that changes between interpreter sessions.
context_list = list(dict.fromkeys(sample['paragraphs'][0]['context'] for sample in data['data']))
context_dict = {context: idx for idx, context in enumerate(context_list)}

In [None]:
data['data'][34]['paragraphs'][0]['qas']

In [None]:
indicqa = []

instruction = "निर्देश: दिए गए प्रश्न के आधार पर उपलब्ध विकल्पों में से सबसे प्रासंगिक गद्यांश चुनिए। प्रश्न: "

for sample in data['data']:

    # Only the first paragraph of each article is used.
    paragraph = sample['paragraphs'][0]
    context = paragraph['context']

    for qas in paragraph['qas']:

        # Questions whose category is 'No' are left out.
        if qas['category'] == 'No':
            continue

        # The id is the context's index, so all questions about the same
        # passage share one id.
        indicqa.append({
            'id': f"indicqa_{context_dict[context]}",
            'source': instruction + qas['question'],
            'target': context
        })

random.shuffle(indicqa)

# Pin UTF-8: the records are written with ensure_ascii=False.
with open("Processed_data/indicqa.jsonl", "w", encoding="utf-8") as f:

    for sample in indicqa:
        json.dump(sample, f, ensure_ascii=False)
        f.write('\n')

In [None]:
# Token accounting for the IndicQA pairs; question and passage both count as Hindi.
total_english_tokens, total_r_english_tokens, total_hindi_tokens = 0, 0, 0

for pair in indicqa:
    question_ids = tokenizer.encode(pair['source'])
    passage_ids = tokenizer.encode(pair['target'])
    total_hindi_tokens += len(question_ids)
    total_hindi_tokens += len(passage_ids)

token_per_language['indicqa'] = dict(Hindi=total_hindi_tokens,
                                     English=total_english_tokens,
                                     Romanised_Hindi=total_r_english_tokens)
token_per_language

Task: MLDR

In [None]:
# Load the MLDR test set, one JSON object per line, decoding explicitly as
# UTF-8 instead of the platform default.
with open("./Data/Retrieval/MLDR/test.jsonl", "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f]

In [None]:
positive_passage = []
negative_passage = []

# Collect every positive / negative passage text across all queries.
# The loop variables are deliberately not called `np`: that name is the numpy
# alias imported at the top of the notebook and must not be overwritten.
for sample in data:
    for positive in sample['positive_passages']:
        positive_passage.append(positive['text'])
    for negative in sample['negative_passages']:
        negative_passage.append(negative['text'])

# Passage -> id in first-seen order, built before the lists are turned into
# sets so the ids do not depend on set iteration order.
pp_dict = {passage: idx for idx, passage in enumerate(dict.fromkeys(positive_passage))}

positive_passage = set(positive_passage)
negative_passage = set(negative_passage)

In [None]:
data[0]

In [None]:
mldr = []

instruction = "निर्देश: दिए गए प्रश्न के आधार पर उपलब्ध विकल्पों में से सबसे प्रासंगिक अनुच्छेद को चुनें। प्रश्न: "

for sample in data:

    # Only the first positive passage of each query is used as the target.
    passage = sample['positive_passages'][0]['text']
    query = sample['query']

    # NOTE(review): the id prefix is spelled "mldir_" while the dataset is
    # MLDR — left unchanged because ids are data; confirm before renaming.
    mldr.append({
        'id': f"mldir_{pp_dict[passage]}",
        'source': instruction + query,
        'target': passage
    })

random.shuffle(mldr)

# Pin UTF-8: the records are written with ensure_ascii=False.
with open("Processed_data/mldr_test.jsonl", "w", encoding="utf-8") as f:

    for sample in mldr:
        json.dump(sample, f, ensure_ascii=False)
        f.write('\n')

In [None]:
# Token accounting for the MLDR pairs; query and passage both count as Hindi.
total_english_tokens = total_r_english_tokens = total_hindi_tokens = 0

for pair in mldr:
    for field in ('source', 'target'):
        total_hindi_tokens += len(tokenizer.encode(pair[field]))

token_per_language['mldr'] = {
    'Hindi': total_hindi_tokens,
    'English': total_english_tokens,
    'Romanised_Hindi': total_r_english_tokens,
}
token_per_language

Task: MLQA

In [None]:
# Load the three MLQA test files, decoding explicitly as UTF-8 (they contain
# Devanagari text). Only hin_hin_data is used by the cells that follow.
with open("./Data/Retrieval/MLQA_V1/test/test-context-en-question-hi.json", "r", encoding="utf-8") as f:
    hin_eng_data = json.load(f)
with open("./Data/Retrieval/MLQA_V1/test/test-context-hi-question-en.json", "r", encoding="utf-8") as f:
    eng_hin_data = json.load(f)
with open("./Data/Retrieval/MLQA_V1/test/test-context-hi-question-hi.json", "r", encoding="utf-8") as f:
    hin_hin_data = json.load(f)

In [None]:
english_context = []

# Unique Hindi contexts in first-seen order. dict.fromkeys de-duplicates while
# keeping order, so the ids below are the same on every run instead of
# following set iteration order.
hindi_context = list(dict.fromkeys(
    sample['context']
    for topic in hin_hin_data['data']
    for sample in topic['paragraphs']
))

hindi_context_dict = {passage: idx for idx, passage in enumerate(hindi_context)}

In [None]:
mlqa = []

# NOTE(review): unlike most other instructions in this notebook there is no
# space after the final colon, so the question is appended directly — confirm.
instruction = "निर्देश: प्रश्न के आधार पर उपलब्ध विकल्पों में से सबसे प्रासंगिक संदर्भ प्राप्त करें। प्रश्न:"

for topic in hin_hin_data['data']:

    for sample in topic['paragraphs']:

        context = sample['context']
        # All questions on the same context share one id.
        context_id = hindi_context_dict[context]

        for qas in sample['qas']:
            mlqa.append({
                'id': f"mlqa_{context_id}",
                'source': instruction + qas['question'],
                'target': context
            })

random.shuffle(mlqa)

# Pin UTF-8: the records are written with ensure_ascii=False.
with open("./Processed_data/mlqa_test.jsonl", "w", encoding="utf-8") as f:

    for sample in mlqa:
        json.dump(sample, f, ensure_ascii=False)
        f.write('\n')

In [None]:
# Token accounting for the MLQA pairs; question and context both count as Hindi.
total_english_tokens, total_r_english_tokens, total_hindi_tokens = 0, 0, 0

for pair in mlqa:
    question_ids = tokenizer.encode(pair['source'])
    context_ids = tokenizer.encode(pair['target'])
    total_hindi_tokens += len(question_ids) + len(context_ids)

token_per_language['mlqa'] = dict(Hindi=total_hindi_tokens,
                                  English=total_english_tokens,
                                  Romanised_Hindi=total_r_english_tokens)
token_per_language

Task: ABP news

In [None]:
import json

# Decode explicitly as UTF-8 rather than relying on the platform default.
with open("Data/Retrieval/ABP_news/ABP_new_query_doc.json", "r", encoding="utf-8") as f:
    query_doc = json.load(f)

In [None]:
abp = []

# NOTE(review): no space after the final colon, so the question is appended
# directly to the instruction — confirm this is intended.
instruction = "निर्देश: प्रश्न के आधार पर उपलब्ध विकल्पों में से सबसे प्रासंगिक समाचार लेख ढूंढें। प्रश्न:"
count = 0  # number of entries skipped because their question is None

for field, entries in query_doc.items():

    for idx, sample in enumerate(entries):

        # Each entry is indexed as [context, question].
        context = sample[0]
        question = sample[1]

        if question is None:
            count+=1
            continue

        abp.append({
            'id': f"abp_{field}_{idx}",
            'source': instruction + question,
            'target': context
        })

random.shuffle(abp)

# Pin UTF-8: the records are written with ensure_ascii=False.
with open("./Processed_data/abp_news.jsonl", "w", encoding="utf-8") as f:

    for sample in abp:
        json.dump(sample, f, ensure_ascii=False)
        f.write('\n')

In [None]:
abp[23]

In [None]:
# Token accounting for the ABP news pairs; both sides count as Hindi.
total_english_tokens = total_r_english_tokens = total_hindi_tokens = 0

for pair in abp:
    for field in ('source', 'target'):
        total_hindi_tokens += len(tokenizer.encode(pair[field]))

token_per_language['abp'] = {
    'Hindi': total_hindi_tokens,
    'English': total_english_tokens,
    'Romanised_Hindi': total_r_english_tokens,
}
token_per_language

Task: SQuAD

In [None]:
from datasets import load_dataset

ds = load_dataset("rajpurkar/squad")

In [None]:
ds['train'][2]

In [None]:
# Token accounting for the SQuAD train split; context and question both
# count as English.
total_english_tokens, total_r_english_tokens, total_hindi_tokens = 0, 0, 0

for row in ds['train']:
    context_ids = tokenizer.encode(row['context'])
    question_ids = tokenizer.encode(row['question'])
    total_english_tokens += len(context_ids) + len(question_ids)

token_per_language['squad'] = dict(Hindi=total_hindi_tokens,
                                   English=total_english_tokens,
                                   Romanised_Hindi=total_r_english_tokens)
token_per_language

In [None]:
squad = []

instruction = "Instruction: Given a question, retrieve the most relevant passage. Question: "

# Iterate the split once; each row was previously fetched twice through
# ds['train'][idx].
for idx, row in enumerate(ds['train']):

    squad.append({
        'id': f"squad_{idx}",
        'source': instruction + row['question'],
        'target': row['context']
    })

random.shuffle(squad)

# Pin UTF-8: the records are written with ensure_ascii=False.
with open("./Processed_data/squad.jsonl", "w", encoding="utf-8") as f:

    for sample in squad:
        json.dump(sample, f, ensure_ascii=False)
        f.write('\n')

In [None]:
# Same construction as the train cell, applied to the validation split and
# written to squad_test.jsonl. Rebinds `squad`, so the train list built above
# is no longer available after this cell runs.
squad = []

instruction = "Instruction: Given a question, retrieve the most relevant passage. Question: "

# Iterate the split once; each row was previously fetched twice by index.
for idx, row in enumerate(ds['validation']):

    squad.append({
        'id': f"squad_{idx}",
        'source': instruction + row['question'],
        'target': row['context']
    })

random.shuffle(squad)

# Pin UTF-8: the records are written with ensure_ascii=False.
with open("./Processed_data/squad_test.jsonl", "w", encoding="utf-8") as f:

    for sample in squad:
        json.dump(sample, f, ensure_ascii=False)
        f.write('\n')

Task: ELI5

In [None]:
from datasets import load_dataset

ds = load_dataset("sentence-transformers/eli5")

In [None]:
ds['train'][23]

In [None]:
# Token accounting for the ELI5 train split; answer and question both count
# as English.
total_english_tokens = total_r_english_tokens = total_hindi_tokens = 0

for row in ds['train']:
    total_english_tokens += len(tokenizer.encode(row['answer']))
    total_english_tokens += len(tokenizer.encode(row['question']))

token_per_language['eli5'] = {
    'Hindi': total_hindi_tokens,
    'English': total_english_tokens,
    'Romanised_Hindi': total_r_english_tokens,
}
token_per_language

In [None]:
eli5 = []

instruction = "Instruction: Given a question, retrieve the most relevant answer. Question: "

# Iterate the split once; each row was previously fetched twice by index.
for idx, row in enumerate(ds['train']):

    eli5.append({
        'id': f"eli5_{idx}",
        'source': instruction + row['question'],
        'target': row['answer']
    })

random.shuffle(eli5)

# Pin UTF-8: the records are written with ensure_ascii=False.
with open("./Processed_data/eli5.jsonl", "w", encoding="utf-8") as f:

    for sample in eli5:
        json.dump(sample, f, ensure_ascii=False)
        f.write('\n')

### Reranking

Task: Stackover flow

In [None]:
from datasets import load_dataset

ds = load_dataset("mteb/stackoverflowdupquestions-reranking")

In [None]:
# Token accounting for the StackOverflow duplicate-question train split: the
# query and its first positive are both counted as English.
total_english_tokens, total_r_english_tokens, total_hindi_tokens = 0, 0, 0

for row in ds['train']:
    query_ids = tokenizer.encode(row['query'])
    positive_ids = tokenizer.encode(row['positive'][0])
    total_english_tokens += len(query_ids) + len(positive_ids)

token_per_language['stackoverflow'] = dict(Hindi=total_hindi_tokens,
                                           English=total_english_tokens,
                                           Romanised_Hindi=total_r_english_tokens)

In [None]:
ds['train']

In [None]:
stackoverflow = []

instruction = "Instruction: Given a query, retrieve the most similar sentence. Query: "

# Iterate the split once; each row was previously fetched twice by index.
for idx, row in enumerate(ds['train']):

    # Only the first listed positive is used as the target.
    stackoverflow.append({
        'id': f"stackoverflow_{idx}",
        'source': instruction + row['query'],
        'target': row['positive'][0]
    })

random.shuffle(stackoverflow)

# Pin UTF-8: the records are written with ensure_ascii=False.
with open("./Processed_data/stackoverflow.jsonl", "w", encoding="utf-8") as f:

    for sample in stackoverflow:
        json.dump(sample, f, ensure_ascii=False)
        f.write('\n')

In [None]:
stackoverflow_test = []

instruction = "Instruction: Given a query, retrieve the most similar sentence. Query: "

# Build the held-out file from the *test* split. This cell previously read
# ds['train'], which made stackoverflow_test.jsonl a copy of the training
# pairs; the splitting cell uses that file as the validation set.
# NOTE(review): assumes the dataset exposes a 'test' split — confirm with `ds`.
for idx, row in enumerate(ds['test']):

    # Only the first listed positive is used as the target.
    stackoverflow_test.append({
        'id': f"stackoverflow_test_{idx}",
        'source': instruction + row['query'],
        'target': row['positive'][0]
    })

random.shuffle(stackoverflow_test)

# Pin UTF-8: the records are written with ensure_ascii=False.
with open("./Processed_data/stackoverflow_test.jsonl", "w", encoding="utf-8") as f:

    for sample in stackoverflow_test:
        json.dump(sample, f, ensure_ascii=False)
        f.write('\n')

In [None]:
import json
with open("token_per_language.json", 'w') as f:
    json.dump(token_per_language, f, indent=4)

### MTEB datasets

Task: XNLI New

In [None]:
from datasets import load_dataset

ds = load_dataset("mteb/xnli", "hi")

In [None]:
ds['train'][23]

In [None]:
import random
import json
xnli_new = []

instruction = "Instruction: Given a query, retrieve the most similar sentence. Query: "

for idx, sample in enumerate(ds['train']):

    # Keep only rows labelled 0.
    # NOTE(review): presumably 0 is the entailment class — confirm against the
    # dataset's label mapping.
    if sample['label'] != 0:
        continue

    # The hypothesis is the query and the premise is the passage to retrieve.
    # idx is the row's position in the full split, so ids are not contiguous.
    xnli_new.append({
        'id': f"xnli_new_{idx}",
        'source': instruction + sample['hypothesis'],
        'target': sample['premise']
    })

random.shuffle(xnli_new)

# Pin UTF-8: the records are written with ensure_ascii=False.
with open("./Processed_data/xnli_new.jsonl", "w", encoding="utf-8") as f:

    for sample in xnli_new:
        json.dump(sample, f, ensure_ascii=False)
        f.write('\n')

Task: Belebele

In [None]:
from datasets import load_dataset

ds_dev = load_dataset("facebook/belebele", "hin_Deva")
ds_latin = load_dataset("facebook/belebele", "hin_Latn")

In [None]:
ds_latin['test'][23]

In [None]:
import random
import json
belebele = []

eng_instruction = "Instruction: Given a query, retrieve the most similar passage. Query: "
hindi_instruction = "निर्देश: एक प्रश्न दिया गया है, सबसे समान अनुच्छेद को पुनः प्राप्त करें। प्रश्न: "

# hin_Latn rows get the English instruction.
for idx, sample in enumerate(ds_latin['test']):

    belebele.append({
        'id': f"belebele_{idx}",
        'source': eng_instruction + sample['question'],
        'target': sample['flores_passage']
    })

# hin_Deva ids continue where the hin_Latn ids stopped. The offset is taken
# from the number of records actually added rather than from the loop
# variable, which would be stale or undefined if the first split were empty.
id_offset = len(belebele)

for idx, sample in enumerate(ds_dev['test']):

    belebele.append({
        'id': f"belebele_{id_offset + idx}",
        'source': hindi_instruction + sample['question'],
        'target': sample['flores_passage']
    })

random.shuffle(belebele)

# Pin UTF-8: the records are written with ensure_ascii=False.
with open("./Processed_data/belebele.jsonl", "w", encoding="utf-8") as f:

    for sample in belebele:
        json.dump(sample, f, ensure_ascii=False)
        f.write('\n')

Task: IN22-Conv

In [None]:
from mteb.tasks import IN22ConvBitextMining

task = IN22ConvBitextMining()
task.load_data()
task.dataset

In [None]:
from datasets import load_dataset

ds = load_dataset("ai4bharat/IN22-Conv")

In [None]:
import random
import json
convbtm = []

hindi_instruction = "निर्देश: दिए गए पाठ का सबसे समान अनुवाद खोजें। पाठ: "

# Every non-Hindi column that the Hindi sentence is paired with.
target_languages = ['asm_Beng', 'ben_Beng', 'brx_Deva', 'doi_Deva', 'eng_Latn', 'gom_Deva', 'guj_Gujr', 'kan_Knda', 'kas_Arab', 'mai_Deva', 'mal_Mlym', 'mar_Deva', 'mni_Mtei', 'npi_Deva', 'ory_Orya', 'pan_Guru', 'san_Deva', 'sat_Olck', 'snd_Deva', 'tam_Taml', 'tel_Telu', 'urd_Arab']

for idx, sample in enumerate(ds['test']):

    # The Hindi side is the same for all targets of this row, so build it once.
    source = hindi_instruction + sample['hin_Deva']

    # One record per target language; all records of a row share the same id.
    for taregt_lang in target_languages:

        convbtm.append({
            'id': f"convbtm_{idx}",
            'source': source,
            'target': sample[taregt_lang]
        })

random.shuffle(convbtm)

# Pin UTF-8: the records are written with ensure_ascii=False.
with open("./Processed_data/convbtm.jsonl", "w", encoding="utf-8") as f:

    for sample in convbtm:
        json.dump(sample, f, ensure_ascii=False)
        f.write('\n')

Task: LinceMTBitextMining

In [None]:
from datasets import load_dataset

ds = load_dataset("mteb/LinceMTBitextMining")

In [None]:
ds['train'][23]

In [None]:
import random
import json
lince = []

# Instruction prefix for English -> romanised Hindi retrieval.
instruction = "Instruction: Find the most similar romanised Hindi sentence of the give english sentence. Sentence: "

# NOTE(review): this loop looks copied from the IN22-Conv cell and needs
# checking before it is run:
#   * `taregt_lang` is not defined in this cell; it is whatever value the
#     IN22-Conv loop left behind, and the cell fails on a fresh kernel.
#   * the 'hin_Deva' key and the 'test' split are assumed; the preview cell
#     for this dataset reads ds['train'] — confirm the real column names and
#     split with `ds` / `ds['train'][0]`.
#   * the instruction describes an English source sentence, yet the source is
#     read from a key named 'hin_Deva'.
for idx, sample in enumerate(ds['test']):


    sent1 = sample['hin_Deva']
    sent2 = sample[taregt_lang]

    lince.append({
        'id': f"lince_{idx}",
        'source': instruction + sent1,
        'target': sent2          
    })

random.shuffle(lince)
# NOTE(review): no explicit encoding here although ensure_ascii=False is used.
with open("./Processed_data/lince.jsonl", "w") as f:

    for sample in lince:
        json.dump(sample, f, ensure_ascii=False)
        f.write('\n')

Task: WikiReranking

In [None]:
from datasets import load_dataset

ds = load_dataset("ellamind/wikipedia-2023-11-reranking-multilingual", "hi")

In [None]:
ds['test'][23]

In [None]:
import random
import json
wikireranking = []

# NOTE(review): no space after the final colon, so the query is appended
# directly to the instruction — confirm this is intended.
hindi_instruction = "निर्देश: दिए गए प्रश्न के लिए सबसे अधिक समानता रखने वाले दस्तावेज़ को पहचानें और पुनः प्राप्त करें। प्रश्न:"

for idx, sample in enumerate(ds['test']):

    # Only the first listed positive document is used as the target.
    wikireranking.append({
        'id': f"wikireranking_{idx}",
        'source': hindi_instruction + sample['query'],
        'target': sample['positive'][0]
    })

random.shuffle(wikireranking)

# Pin UTF-8: the records are written with ensure_ascii=False.
with open("./Processed_data/wikireranking.jsonl", "w", encoding="utf-8") as f:

    for sample in wikireranking:
        json.dump(sample, f, ensure_ascii=False)
        f.write('\n')

Task: IndicCrosslingualSTS

In [None]:
from mteb.tasks import IndicCrosslingualSTS

task = IndicCrosslingualSTS()
task.load_data()
task.dataset

### Tokens per languages

In [None]:
# Grand totals per language across every dataset entry in token_per_language.
per_dataset_counts = list(token_per_language.values())

Hindi = sum(counts['Hindi'] for counts in per_dataset_counts)
English = sum(counts['English'] for counts in per_dataset_counts)
Romanised_Hindi = sum(counts['Romanised_Hindi'] for counts in per_dataset_counts)

for total in (Hindi, English, Romanised_Hindi):
    print(total)

### Token Length Info

In [None]:
import json
import glob

files = glob.glob("./Processed_data/*")

data = []
token_len = []

# Tokenise the source and target of every processed record and collect the
# two lengths; `data` keeps the parsed records themselves.
for file in files:

    # Decode as UTF-8 explicitly: the processed files contain raw non-ASCII
    # text (they are written with ensure_ascii=False).
    with open(file, 'r', encoding="utf-8") as f:

        for line in f:
            sample = json.loads(line)
            data.append(sample)

            token_len.append(len(tokenizer(sample['source']).input_ids))
            token_len.append(len(tokenizer(sample['target']).input_ids))

In [None]:
import numpy as np

token_len = np.array(token_len)

In [None]:
token_len.max()

In [None]:
token_len.mean()

In [None]:
np.median(token_len)

In [None]:
token_len = sorted(token_len)

In [None]:
import matplotlib.pyplot as plt

# Histogram of all collected source/target token lengths, drawn through the
# explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(token_len, bins=50, color='blue', alpha=0.7, edgecolor='black')
ax.set_xlabel("Token Length")
ax.set_ylabel("Frequency")
ax.set_title("Token Length Distribution")
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

### Spliting Data

In [None]:
from sklearn.model_selection import train_test_split
import glob, json, os
import random

SPLIT_SEED = 42
random.seed(SPLIT_SEED)

# Sorted so the files are processed in the same order on every machine; the
# order matters because random.shuffle below consumes the seeded RNG.
files = sorted(glob.glob("./Processed_data/*"))

# Datasets that ship with a separate "<name>_test.jsonl" file: the main file
# becomes train.jsonl unchanged and the _test file becomes val.jsonl.
HELD_OUT_TEST_DATASETS = {
    "crosssum_english_hindi", "crosssum_hindi_english",
    "crosssum_hindi_hindi", "crosssum_english_english",
    "flores", "mintaka",
    "mldr", "mlqa",
    "amazon_review", "squad",
    "stackoverflow",
}

def get_jsonl(file):
    """Read a JSON-lines file (UTF-8) and return its records as a list."""
    data = []
    with open(file, 'r', encoding="utf-8") as f:
        for sample in f:
            data.append(json.loads(sample))
    return data

def save_jsonl(data, path):
    """Write `data` to `path` as UTF-8 JSON lines, one record per line."""
    with open(path, 'w', encoding="utf-8") as f:
        for sample in data:
            json.dump(sample, f, ensure_ascii=False)
            f.write('\n')

for file in files:

    # *_test files are consumed through their main dataset below.
    if "_test" in file:
        continue

    root, ext = os.path.splitext(file)
    name = os.path.basename(file).split('.')[0]

    path = f"./training_data/{name}/"
    os.makedirs(path, exist_ok=True)

    data = get_jsonl(file)

    if name not in HELD_OUT_TEST_DATASETS:
        # No dedicated test file: carve out 20% as validation. random_state is
        # required for a repeatable split — train_test_split does not use the
        # `random` module seeded above.
        random.shuffle(data)
        train, val = train_test_split(data, test_size=0.2, random_state=SPLIT_SEED)

        save_jsonl(train, f"{path}train.jsonl")
        save_jsonl(val, f"{path}val.jsonl")
    else:
        save_jsonl(data, f"{path}train.jsonl")

        # "<dir>/<name>.jsonl" -> "<dir>/<name>_test.jsonl"
        test_path = f"{root}_test{ext}"
        save_jsonl(get_jsonl(test_path), f"{path}val.jsonl")

### Final training

In [None]:
import glob
import json
import random
import os

train_files = glob.glob("./training_data/*/train.jsonl", recursive=True)
val_files = glob.glob("./training_data/*/val.jsonl", recursive=True)

train_data = []
val_data = []

# Only the last assignment of exclude_files ever took effect; the earlier
# alternative is kept as a comment next to the other commented-out variant.
#exclude_files = ['sentiment_shete', 'sentiment_joshi', 'hinge', 'code_mixed', 'sentiment_review', 'abp_news', 'crosssum_english_english']
# NOTE(review): 'Wikireranking' is capitalised, but the processed file is
# written as wikireranking.jsonl, so this entry never matches a directory
# name — confirm whether that dataset is meant to be excluded.
exclude_files = ['xnli', 'Wikireranking', 'samanantar_language_classification']
#exclude_files = ['samanantar_language_classification']

# Only the first half of each of these datasets' train records is used.
# Their validation files are taken in full.
english_data_files = ['amazon_review', 'crosssum_english_english', 'eli5', 'squad']
#english_data_files = []

OUTPUT_DIR = "./new_training_data"

def dataset_name(path):
    """Return the dataset directory name of a ./training_data/<name>/<split>.jsonl path."""
    return os.path.basename(os.path.dirname(path))

def load_jsonl(path):
    """Read a UTF-8 JSON-lines file into a list of records."""
    with open(path, 'r', encoding="utf-8") as f:
        return [json.loads(line) for line in f]

def save_jsonl(data, path):
    """Write `data` to `path` as UTF-8 JSON lines, one record per line."""
    with open(path, 'w', encoding="utf-8") as f:
        for sample in data:
            json.dump(sample, f, ensure_ascii=False)
            f.write('\n')

for file in train_files:

    name = dataset_name(file)
    if name in exclude_files:
        #print(file)
        continue
    print(file)

    records = load_jsonl(file)
    if name in english_data_files:
        records = records[:len(records)//2]
    train_data.extend(records)

for file in val_files:

    if dataset_name(file) in exclude_files:
        continue

    val_data.extend(load_jsonl(file))


random.shuffle(train_data)
random.shuffle(val_data)

# The output directory is not created anywhere else in the notebook.
os.makedirs(OUTPUT_DIR, exist_ok=True)

save_jsonl(train_data, f"{OUTPUT_DIR}/train_data.jsonl")
save_jsonl(val_data, f"{OUTPUT_DIR}/val_data.jsonl")

In [None]:
len(train_data)

In [None]:
659297

In [None]:
len(train_data)

In [None]:
len(train_data)

#### Generate Hard Negative

In [None]:
from openai import OpenAI
import random
import time
import re
import json

def extract_hard_negative(response_text):
    """Return the text between the first pair of ## markers in a model reply.

    If the reply contains no ## marker at all, the whole reply is returned.
    Surrounding whitespace is stripped; None is treated as an empty reply.
    """
    parts = (response_text or "").split("##")
    candidate = parts[1] if len(parts) > 1 else parts[0]
    return candidate.strip()


def generate_query_for_article(sample, max_retries=None, retry_delay=5):
    """Ask the hosted LLM for one hard negative for a source/target pair.

    Args:
        sample: record with 'source' and 'target' text fields.
        max_retries: failed API calls tolerated before the last error is
            re-raised; None (default) retries indefinitely.
        retry_delay: seconds to wait between attempts.

    Returns:
        The hard negative text extracted from the model reply.

    Raises:
        KeyError: if the NVIDIA_API_KEY environment variable is not set.
    """
    import os

    # The key is read from the environment; it must never be committed with
    # the notebook. Any key that was previously hard-coded here should be
    # treated as leaked and revoked.
    client = OpenAI(
        base_url="https://integrate.api.nvidia.com/v1",
        api_key=os.environ["NVIDIA_API_KEY"]
    )
    
    system_message = """
        You are an AI assistant designed to generate challenging hard negative examples in the same language as the output. Your task is to produce exactly one concise and well-formed hard negative response that seems similar to the correct Output text, but is actually irrelevant for the given Input text. The hard negative should be misleading in a subtle way — close in topic or style, but not a valid answer. Make sure the grammar and vocabulary are correct. Wrap the hard negative inside ## markers like this: ## hard negative text ##*.**
    """

    # Retry in a loop instead of recursing, so a long outage cannot exhaust
    # the interpreter's recursion limit.
    failures = 0
    while True:
        try:
            completion = client.chat.completions.create(
                model="nvidia/llama-3.3-nemotron-super-49b-v1",
                messages=[
                    {"role": "system", "content": system_message},
                    {"role": "user", "content": f"Input Text: {sample['source']} \nOutput Text: {sample['target']} \nGenerate Hard Negative example: "}
                ],
                temperature=0.6,
                top_p=0.95,
                max_tokens=100,
                frequency_penalty=0,
                presence_penalty=0,
                stream=False
            )
            break

        except Exception as e:
            failures += 1
            print(f"Error: {e}")
            if max_retries is not None and failures > max_retries:
                raise
            time.sleep(retry_delay)

    # Tolerates replies that omit the ## markers instead of raising IndexError.
    return extract_hard_negative(completion.choices[0].message.content)

HARD_NEGATIVE_PATH = "./new_training_data/train_data_with_hard_negative.jsonl"

with open("./new_training_data/train_data.jsonl", "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f]

def save_hard_negative_checkpoint(records, path=HARD_NEGATIVE_PATH):
    """Write all records (processed or not) to `path` as UTF-8 JSON lines."""
    with open(path, "w", encoding="utf-8") as out:
        for record in records:
            json.dump(record, out, ensure_ascii=False)
            out.write('\n')

for idx, sample in enumerate(data):
    
    sample['hard_negative'] = generate_query_for_article(sample)
    sample['hard_negative_flag'] = 1

    # Periodic checkpoint so a crash does not lose everything generated so far.
    if idx%5000==0:
        print(f"Processed {idx} samples")
        save_hard_negative_checkpoint(data)

# Final write: without it, every record processed after the last multiple of
# 5000 existed only in memory and never reached the output file.
save_hard_negative_checkpoint(data)

### Get training data info

In [None]:
import os
import glob
import json
from collections import defaultdict
from transformers import AutoTokenizer

# Tokenizer used below to measure token lengths of the training samples.
# NOTE(review): the first cell of this notebook loads the tokenizer as
# "LingoIITGN/ganga-2-1b" (different casing) — confirm both ids resolve to the same repo.
tokenizer = AutoTokenizer.from_pretrained("LingoIITGN/Ganga-2-1B")

In [None]:
# Training files grouped by task family. Each entry is the path of one
# dataset's train.jsonl under ./training_data/.

# Bitext-mining datasets.
Bitext_Mining = [
    "./training_data/crosssum_english_english/train.jsonl",
    "./training_data/crosssum_hindi_hindi/train.jsonl",
    "./training_data/crosssum_english_hindi/train.jsonl",
    "./training_data/crosssum_hindi_english/train.jsonl",
    "./training_data/flores/train.jsonl",
    "./training_data/laser/train.jsonl",
    "./training_data/mintaka/train.jsonl",
    "./training_data/phinc/train.jsonl",
]

# Classification datasets.
Classification = [
    "./training_data/discourse/train.jsonl",
    "./training_data/massive/train.jsonl",
    "./training_data/sentiment_joshi/train.jsonl",
    "./training_data/sentiment_shete/train.jsonl",
    "./training_data/sentiment_review/train.jsonl",
    "./training_data/sentiment/train.jsonl",
    "./training_data/abp_news_classification/train.jsonl",
    "./training_data/amazon_review/train.jsonl",
]

# Retrieval datasets.
Retrieval = [
    "./training_data/abp_news/train.jsonl",
    "./training_data/indicqa/train.jsonl",
    "./training_data/mldr/train.jsonl",
    "./training_data/mlqa/train.jsonl",
    "./training_data/squad/train.jsonl",
    "./training_data/stackoverflow/train.jsonl",
    "./training_data/eli5/train.jsonl",
]

# Translation datasets.
Translation = [
    "./training_data/code_mixed/train.jsonl",
    "./training_data/hinge/train.jsonl",
]

In [None]:
# Map each training file path to the name of the task family that lists it.
# Families are applied in this fixed order, so a path that appeared in two
# lists would end up labelled with the later family.
task_dict = {}

for family_name, family_files in (
    ("Bitext_Mining", Bitext_Mining),
    ("Classification", Classification),
    ("Retrieval", Retrieval),
    ("Translation", Translation),
):
    for task in family_files:
        task_dict[task] = family_name

In [None]:
# `task_dict` is already keyed by file path (path -> task family), so the
# file -> task lookup is a plain copy of it. Unpacking its items as
# (task_type, list_of_files) would instead iterate over the characters of the
# task-family string and build a meaningless mapping.
file_to_task = dict(task_dict)

# Collect all per-dataset training files. Sorted so the report has a stable
# order; the pattern contains no "**", so no recursive matching is needed.
files = sorted(glob.glob("./training_data/*/train.jsonl"))


def _finite_or_none(value):
    """Return None for the float('inf') 'no samples seen' sentinel, else the value."""
    return None if value == float("inf") else value


# Report layout: overall totals, one aggregate per task family, one entry per file.
summary = {
    "overall_total_tokens": 0,
    "overall_samples": 0,
    "task_type_summary": defaultdict(lambda: {
        "total_tokens": 0,
        "total_samples": 0,
        "max_token_length": 0,
        "min_token_length": float("inf")
    }),
    "file_summary": {}
}

for file in files:
    num_tokens = 0
    num_sample = 0
    max_length = 0
    min_length = float("inf")

    with open(file, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines in the JSONL file
            sample = json.loads(line)
            # A sample's length is the token count of its source plus its target.
            token_len = len(tokenizer.encode(sample['source'])) + len(tokenizer.encode(sample['target']))

            num_tokens += token_len
            num_sample += 1
            max_length = max(max_length, token_len)
            min_length = min(min_length, token_len)

    # Files that are not listed under any task family are reported as "unknown".
    task_type = file_to_task.get(file, "unknown")

    # Update task type summary
    task_data = summary["task_type_summary"][task_type]
    task_data["total_tokens"] += num_tokens
    task_data["total_samples"] += num_sample
    task_data["max_token_length"] = max(task_data["max_token_length"], max_length)
    task_data["min_token_length"] = min(task_data["min_token_length"], min_length)

    summary['file_summary'][file] = {
        "task_type": task_type,
        "total_samples": num_sample,
        "total_tokens": num_tokens,
        "max_token_length": max_length,
        # An empty file never updates the sentinel; store null instead of the
        # non-standard JSON literal `Infinity`.
        "min_token_length": _finite_or_none(min_length),
    }

    summary["overall_total_tokens"] += num_tokens
    summary["overall_samples"] += num_sample

# Convert the defaultdict to a plain dict for saving, replacing any leftover
# infinity sentinel (task families with no samples) with null.
summary["task_type_summary"] = {
    task_type: {**stats, "min_token_length": _finite_or_none(stats["min_token_length"])}
    for task_type, stats in summary["task_type_summary"].items()
}

# Save to JSON
with open("training_data_stats.json", "w", encoding="utf-8") as f:
    json.dump(summary, f, indent=2)

In [2]:
from deepspeed.utils.zero_to_fp32 import convert_zero_checkpoint_to_fp32_state_dict

# Consolidate the DeepSpeed ZeRO checkpoint into a single fp32 state dict on disk.
best_model_dir = "./checkpoints/ganga-2-1b-embeddings-new-equall-eos-42-epoch-1/checkpoint-15000"
output_dir = "checkpoints/ganga-2-1b-embeddings-new-equall-eos-42-epoch-1/best_model"
# safe_serialization=False makes DeepSpeed write `pytorch_model.bin`, which is
# the file the following cell reads with torch.load(). With True the weights
# are written as `model.safetensors` and that load would not find its input.
convert_zero_checkpoint_to_fp32_state_dict(best_model_dir, output_dir, safe_serialization=False)

Processing zero checkpoint './checkpoints/ganga-2-1b-embeddings-new-equall-eos-42-epoch-1/checkpoint-15000/global_step15000'


Loading checkpoint shards: 100%|██████████| 1/1 [00:00<00:00, 1265.63it/s]


Detected checkpoint of type zero stage 3, world_size: 1
Parsing checkpoint created by deepspeed==0.16.8


Gathering sharded weights: 100%|██████████| 146/146 [00:00<00:00, 741366.08it/s]


Reconstructed Trainable fp32 state dict with 146 params 939591680 elements


Saving checkpoint shards: 100%|██████████| 1/1 [00:03<00:00,  3.71s/it]


In [3]:
import torch
from ganga_modeling import EmbeddingModel, BidirectionalMistralConfig, BidirectionalMistralModel
from transformers import AutoModel, AutoConfig

# Rebuild the embedding model from the consolidated fp32 weights and save its
# backbone into the same directory.
output_dir = "./checkpoints/ganga-2-1b-embeddings-new-equall-eos-42-epoch-1/best_model"
# NOTE(review): this expects a `pytorch_model.bin` inside output_dir — confirm the
# conversion step writes that file name rather than `model.safetensors`.
state_dict = torch.load(f"{output_dir}/pytorch_model.bin")

# NOTE(review): `base_model` is not used again in this cell; verify whether a later
# cell needs it before keeping this model load.
base_model = AutoModel.from_pretrained("LingoIITGN/Ganga-2-1B")

# Build the bidirectional backbone from the original model's configuration values.
original_config = AutoConfig.from_pretrained("LingoIITGN/Ganga-2-1B")
bidir_config = BidirectionalMistralConfig(**original_config.to_dict())
bidir_model = BidirectionalMistralModel(bidir_config)

# 'mean' is presumably the pooling mode of EmbeddingModel — TODO confirm in ganga_modeling.
model2 = EmbeddingModel(bidir_model, 'mean')
model2.load_state_dict(state_dict)
# `model2.base_model` is an attribute of the wrapper, not the `base_model` variable above.
model2.base_model.save_pretrained(output_dir)

In [None]:
# Display the repr of the embedding model built in the previous cell.
model2

### Finetune data

In [None]:
import os
import json
import random
from sklearn.model_selection import train_test_split

# Fixed seed so the shuffle and the train/validation split are reproducible.
SPLIT_SEED = 42

# Processed datasets that make up the fine-tuning set.
paths = ["./Processed_data/mtop_intent.jsonl",
         #"./Processed_data/samanantar_language_classification.jsonl",
         "./Processed_data/xnli_new.jsonl"]


def _write_jsonl(records, path):
    """Write `records` to `path` as UTF-8 JSON Lines (one JSON object per line)."""
    with open(path, "w", encoding="utf-8") as out:
        for record in records:
            json.dump(record, out, ensure_ascii=False)
            out.write('\n')


# Read every dataset into one list; blank lines are skipped.
finetuning_data = []
for path in paths:
    with open(path, 'r', encoding="utf-8") as f:
        finetuning_data.extend(json.loads(line) for line in f if line.strip())

# Use a dedicated generator so the global `random` state is left untouched.
random.Random(SPLIT_SEED).shuffle(finetuning_data)

# 90% training / 10% held-out; the held-out part is saved as validation data.
train, test = train_test_split(finetuning_data, test_size=0.1, random_state=SPLIT_SEED)

os.makedirs("./new_training_data2", exist_ok=True)

_write_jsonl(train, "./new_training_data2/train_data.jsonl")
_write_jsonl(test, "./new_training_data2/val_data.jsonl")

In [None]:
# Number of samples in the training split produced by train_test_split above.
len(train)

In [None]:
# Load the training samples from the hard-negative JSONL file: one JSON object
# per line, read as UTF-8 (the file holds non-ASCII text); blank lines are skipped.
with open("./new_training_data/train_data_with_hard_negative.jsonl", "r", encoding="utf-8") as f:
    train_data = [json.loads(line) for line in f if line.strip()]

In [None]:
# Number of samples loaded from train_data_with_hard_negative.jsonl.
len(train_data)

In [1]:
from datasets import load_dataset

# NOTE(review): the recorded output of this cell is
# "DataFilesNotFoundError: No (supported) data files found in Data"; the parquet
# file in ./Data is read directly with pandas in the next cell instead.
ds = load_dataset("./Data", split='train', )

  from .autonotebook import tqdm as notebook_tqdm


DataFilesNotFoundError: No (supported) data files found in Data

In [4]:
import pandas as pd

# Read the parquet shard into a DataFrame. "?download=true" is part of the path
# passed here — presumably the file kept the query suffix of its download URL; TODO confirm.
ds = pd.read_parquet("./Data/0000.parquet?download=true")

In [5]:
# Display the loaded DataFrame (the recorded output shows the first and last rows).
ds

Unnamed: 0,doc_id,url,title,eng_Latn,asm_Beng,hin_Deva,mal_Mlym,ben_Beng,guj_Gujr,san_Deva,kan_Knda,tel_Telu,mar_Deva,tam_Taml,ory_Orya,npi_Deva,pan_Guru,urd_Arab
0,0000052b8e57f36643caeeb7d1748e3d99a0cde0097db8...,https://en.wikipedia.org/wiki?curid=61293793,Bigger (Beyoncé song),"\nBigger (Beyoncé song)\n\n""Bigger"" (stylized ...","\nডাঙৰ (বেয় 'নচে গীত)\n\n""ডাঙৰ"" (ডাঙৰ বাক্যত ...","\nबड़ा (बेयोंसे गीत)\n\n""बड़ा"" (बड़े अक्षर में...","\nവലിയ (ബിയോൺസ് ഗാനം)\n\n""2019-ലെ ആൽബത്തിൽ നിന...","\nবড় (বেয়োন্সে গান)\n\n""বড়"" (বড় হাতের শৈলী...","\nમોટું (બિયોન્સે ગીત)\n\n"""" ""મોટું"" ""(મોટા અક...","\nबृहत् (बियोन्से गीतम्)\n\n""बिग्"" (बृहत्कृत्य...","\nದೊಡ್ಡದು (ಬಿಯಾನ್ಸ್ ಹಾಡು)\n\n""ದೊಡ್ಡದು"" (ದೊಡ್ಡ ...","\nపెద్దది (బియాన్స్ పాట)\n\n""పెద్దది"" (పెద్ద అ...","\nमोठे (बियॉन्से गाणे)\n\n""मोठे"" (मोठ्या अक्षर...","\nபெரிய (பியோனஸ் பாடல்)\n\n""பெரியது"" (பெரிய எழ...","\nବଡ଼ (ବିଯ଼ୋନ୍ସେ ଗୀତ)\n\n""ବଡ଼"" (ବଡ଼ ଅକ୍ଷରରେ ଶୈ...","\nठुलो (बियोन्से गीत)\n\n""ठुलो"" (ठुलो अक्षरमा ...","\nਵੱਡਾ (ਬੇਓਨਸੇ ਗੀਤ)\n\n""ਵੱਡਾ"" (ਵੱਡੇ ਅੱਖਰ ਵਿੱਚ ...","\nبڑا (بیونسی گانا)\n\n""بڑا"" (بڑے حروف میں اسٹ..."
1,000005ccabca8ce6ec2429b6b42b513ddad4dd32e61ec9...,https://en.wiktionary.org/wiki?curid=8195905,이다,\n이다\n\nKorean.\nEtymology 2.\nVerb.\nConjugat...,কোৰিয়ান। ব্যুৎপত্তি 2. ক্রিয়া। সংমিশ্ৰণ। টোক...,कोरियाई। व्युत्पत्ति 2. क्रिया। संयुग्मन। नोटः...,കൊറിയൻ. പദവ്യുത്പത്തി 2. ക്രിയ. സംയോജനം. കുറിപ...,কোরিয়ান। ব্যুৎপত্তি 2. ক্রিয়া। সংমিশ্রণ। দ্র...,કોરિયન. વ્યુત્પત્તિ 2. ક્રિયાપદ. સંયોજન. નોંધઃ...,कोरिया-देशः। व्युत्पत्ति 2. क्रियापद। संयोगः। ...,ಕೊರಿಯನ್. ವ್ಯುತ್ಪತ್ತಿ 2. ಕ್ರಿಯಾಪದ. ಸಂಯೋಗ. ಗಮನಿಸ...,కొరియన్. వ్యుత్పత్తి శాస్త్రం 2. క్రియ. సంయోగం...,कोरियन. व्युत्पत्तीशास्त्र 2. क्रियापद. संयोग....,கொரியன். சொற்பிறப்பியல் 2. வினைச்சொல். இணைத்தல...,କୋରିଆ। ବ୍ଯ଼ୁତ୍ପତ୍ତି 2. କ୍ରିଯ଼ା। ସଂଯୋଗ। ଟିପ୍ପଣୀ...,कोरियन। व्युत्पत्ति 2. क्रिया। संयोजन। नोटः जब...,ਕੋਰੀਆਈ. ਸ਼ਬਦ-ਵਿਗਿਆਨ 2. ਕ੍ਰਿਆ। ਸੰਯੋਜਨ. ਨੋਟਃ ਹਾਲ...,کوریائی۔ فعلیات 2. فعل۔ کنجگیشن۔ نوٹ: اگرچہ تج...
2,000006115d7c2d887cd80a9be89eb9ae4165d5627d9afd...,https://en.wikipedia.org/wiki?curid=57257423,Harriet White Fisher,\nHarriet White Fisher\n\nHarriet White Fisher...,\nহেৰিয়েট বগা মাছমৰীয়া\n\nহেৰিয়েট হোৱাইট ফি...,\nहैरियट सफेद मछुआरा\n\nहैरियट व्हाइट फिशर एंड...,,\nহ্যারিয়েট সাদা মাছরাঙা\n\nহ্যারিয়েট হোয়াই...,\nહેરિયટ સફેદ માછીમાર\n\nહેરિયટ વ્હાઇટ ફિશર એન...,\nह्यारियेट् श्वेतमत्स्यपालकः\n\nह्यारियेट् वै...,\nಹ್ಯಾರಿಯೆಟ್ ಬಿಳಿ ಮೀನುಗಾರ\n\nಹ್ಯಾರಿಯೆಟ್ ವೈಟ್ ಫ...,\nహారియట్ వైట్ ఫిషర్\n\nహారియెట్ వైట్ ఫిషర్ ఆం...,\nहॅरियट पांढरा मच्छीमार\n\nहॅरियट व्हाईट फिशर...,\nஹாரியட் வெள்ளை மீனவர்\n\nஹாரியட் வெள்ளை மீனவ...,\nହେରିଏଟ୍ ଧଳା ମତ୍ସ୍ଯ଼ଜୀବୀ\n\nହେରିଏଟ୍ ହ୍ୱାଇଟ୍ ଫ...,\nह्यारियट सेतो माछा मार्ने\n\nह्यारिएट ह्वाइट...,\nਹੈਰੀਏਟ ਚਿੱਟਾ ਮਛੇਰਾ\n\nਹੈਰੀਏਟ ਵ੍ਹਾਈਟ ਫਿਸ਼ਰ ਐਂ...,\nہیریٹ وائٹ فشر\n\nہیریٹ وائٹ فشر اینڈریو (18...
3,00000736d1c67c08e28234ea2e1bb4b3157c6db7d52bb6...,https://en.wikipedia.org/wiki?curid=86020,Iris (mythology),\nIris (mythology)\n\nAncient Greek personific...,\nআইৰিছ (পৌৰাণিক কাহিনী)\n\nপ্ৰাচীন গ্ৰীক ধৰ্ম...,\nआइरिस (पौराणिक कथा)\n\nप्राचीन यूनानी धर्म औ...,\nഐറിസ് (പുരാണം)\n\nപുരാതന ഗ്രീക്ക് മതത്തിലും ...,\nআইরিস (পৌরাণিক কাহিনী)\n\nপ্রাচীন গ্রীক ধর্ম...,\nઆઇરિસ (પૌરાણિક કથા)\n\nપ્રાચીન ગ્રીક ધર્મ અન...,\nऐरीस् (पौराणिक कथा)\n\nप्राचीन-ग्रीक्-धर्मस्...,\nಐರಿಸ್ (ಪುರಾಣ)\n\nಪ್ರಾಚೀನ ಗ್ರೀಕ್ ಧರ್ಮ ಮತ್ತು ಪ...,\nఐరిస్ (పురాణం)\n\nపురాతన గ్రీకు మతం మరియు పు...,\nआयरिस (पौराणिक कथा)\n\nप्राचीन ग्रीक धर्म आण...,\nஐரிஸ் (புராணம்)\n\nபண்டைய கிரேக்க மதம் மற்று...,\nଆଇରିସ (ପୌରାଣିକ କଥା)\n\nପ୍ରାଚୀନ ଗ୍ରୀକ୍ ଧର୍ମ ଏ...,\nआइरिस (पौराणिक कथा)\n\nप्राचीन ग्रिक धर्म र ...,\nਆਈਰਿਸ (ਮਿਥਿਹਾਸ)\n\nਪ੍ਰਾਚੀਨ ਯੂਨਾਨੀ ਧਰਮ ਅਤੇ ਮਿ...,\nآئیرس (افسانے)\n\nقدیم یونانی مذہب اور افسان...
4,00000768be7a3eb0f4fa50bfaf2a10f864e44c24c48342...,https://en.wikipedia.org/wiki?curid=64450824,Ioannis Kontoyiannis,\nIoannis Kontoyiannis\n\nGreek mathematician ...,\nইয়োনিছ কনটোয়িয়ানিছ\n\nইঅ 'নিছ কন্টয়ানিছ ...,\nइओनिस कोंटोयनिस\n\nइओनिस कोंटोयनिस (जन्म जनव...,,\nইয়নিস কনটোয়িয়ান্নিস\n\nইওনিস কনটোয়িয়ান্...,\nઇઓનિસ કોન્ટોયિયાનીસ\n\nઇઓનિસ કોન્ટોયિયાનીસ (...,\nइयोनिस् कोण्टोयानिस्\n\nइयोनिस् कोण्टोयानिस्...,\nಇಯೋನಿಸ್ ಕೊಂಟೊಯನ್ನಿಸ್\n\nಇಯೋನಿಸ್ ಕೊಂಟೊಯನ್ನಿಸ್...,\nఇయోనిస్ కొంటోయన్నిస్\n\nఇయోనిస్ కొంటోయాన్నిస...,\nइओनिस कोंटोयॅनिस\n\nइओनिस कोंटोयॅनिस (जन्म ज...,\nஅயோனிஸ் கொன்டோயியன்னிஸ்\n\nஅயோனிஸ் கொன்டோயான...,\nଆଯ଼ୋନିଜ୍ କୋଣ୍ଟୋଯ଼ିଆନିସ୍\n\nଆଇଓନିଜ୍ କୋଣ୍ଟୋଯ଼ି...,\nइओनिस कन्टोयनिस\n\nइओनिस कोन्टोयनिस (जन्म जन...,\nਆਇਓਨਿਸ ਕੋਂਟੋਇਆਨਿਸ\n\nਇਓਨਿਸ ਕੋਂਟੋਇਆਨਿਸ (ਜਨਮ ਜ...,\nآئیونیس کونٹوئینس\n\nآئیونیس کونٹوئینس (پیدا...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22266,01030f0c76ee812c9a71ddc6c0a7cd5365f154bc71d5fc...,https://en.wikipedia.org/wiki?curid=29701458,List of Oceanian records in track cycling,\nList of Oceanian records in track cycling\n\...,,\nट्रैक साइकिलिंग में महासागरीय रिकॉर्ड की सूच...,,\nট্র্যাক সাইক্লিং-এ মহাসাগরীয় রেকর্ডের তালিক...,\nટ્રેક સાઇકલિંગમાં મહાસાગરીય રેકોર્ડની યાદી\n...,\nट्र्याक्-सैक्लिङ्ग् मध्ये ओशिनियन्-अभिलेखाना...,\nಟ್ರ್ಯಾಕ್ ಸೈಕ್ಲಿಂಗ್ನಲ್ಲಿ ಸಾಗರ ದಾಖಲೆಗಳ ಪಟ್ಟಿ\n...,,\nट्रॅक सायकलिंगमधील सागरी विक्रमांची यादी\n\n...,\nதடம் சைக்கிள் ஓட்டுவதில் கடல் பதிவுகளின் பட்...,\nଟ୍ରାକ୍ ସାଇକ୍ଲିଂରେ ମହାସାଗରୀଯ଼ ରେକର୍ଡର ତାଲିକା\...,,\nਟਰੈਕ ਸਾਈਕਲਿੰਗ ਵਿੱਚ ਸਮੁੰਦਰੀ ਰਿਕਾਰਡ ਦੀ ਸੂਚੀ\n\...,
22267,010310f9d9135c2ac0c01cd48dc1fedc3c8ba27910d5de...,https://en.wikipedia.org/wiki?curid=177952,Great Zimbabwe,\nGreat Zimbabwe\n\nRuins of a medieval city i...,\nগ্ৰেট জিম্বাবৱে\n\nগ্ৰেট জিম্বাবৱে হৈছে আধুন...,\nमहान जिम्बाब्वे\n\nमहान जिम्बाब्वे आधुनिक जि...,\nഗ്രേറ്റ് സിംബാബ്വെ\n\nആധുനിക രാജ്യമായ സിംബാബ...,\nগ্রেট জিম্বাবুয়ে\n\nগ্রেট জিম্বাবুয়ে হল আধ...,\nગ્રેટ ઝિમ્બાબ્વે\n\nગ્રેટ ઝિમ્બાબ્વે એ આધુનિ...,\nग्रेट् जिम्बाब्वे\n\nग्रेट् जिम्बाब्वे इति आ...,\nಗ್ರೇಟ್ ಜಿಂಬಾಬ್ವೆ\n\nಗ್ರೇಟ್ ಜಿಂಬಾಬ್ವೆ ಆಧುನಿಕ ...,\nగ్రేట్ జింబాబ్వే\n\nగ్రేట్ జింబాబ్వే అనేది ఆ...,\nमहान झिम्बाब्वे\n\nग्रेट झिम्बाब्वे हे आधुनि...,\nபெரிய ஜிம்பாப்வே\n\nகிரேட் ஜிம்பாப்வே என்பது...,\nଗ୍ରେଟ ଜିମ୍ବାୱେ\n\nଗ୍ରେଟ ଜିମ୍ବାୱେ ହେଉଛି ଆଧୁନି...,\nग्रेट जिम्बाब्वे\n\nग्रेट जिम्बाब्वे आधुनिक ...,\nਮਹਾਨ ਜ਼ਿੰਬਾਬਵੇ\n\nਮਹਾਨ ਜ਼ਿੰਬਾਬਵੇ ਆਧੁਨਿਕ ਦੇਸ਼...,\nعظیم زمبابوے\n\nعظیم زمبابوے جدید ملک زمبابو...
22268,01031122c84ee82f58e0b3675cfb683d18e2eb3ace4c1f...,https://en.wikipedia.org/wiki?curid=24759609,Alberto Muñoz,\nAlberto Muñoz\n\nMexican professional wrestl...,\nআলবাৰ্টো মুনোজ\n\nইছমেল মুনোজ লোপেজ (ইংৰাজীঃ...,\nअल्बर्टो मुनोज़\n\nइस्माइल मुनोज़ लोपेज़ (15...,\nആൽബെർട്ടോ മുനോസ്\n\nആൽബെർട്ടോ മുനോസ് എന്ന റി...,\nআলবার্তো মুনোজ\n\nইসমাইল মুনোজ লোপেজ (15ই জা...,\nઆલ્બર્ટો મુનોઝ\n\nઇસમાઇલ મુનોઝ લોપેઝ (15 જાન...,\nअल्बर्टो मुनोज़्\n\nइस्मायिल् मुनोज़् लोपेज़...,\nಅಲ್ಬರ್ಟೊ ಮುನೋಜ್\n\nಇಸ್ಮಾಯೆಲ್ ಮುನೋಜ್ ಲೋಪೆಜ್ (...,\nఅల్బెర్టో మునోజ్\n\nఇస్మాయిల్ మునోజ్ లోపెజ్ ...,\nअल्बर्टो मुनोझ\n\nइस्मायेल मुनोझ लोपेझ (15 ज...,\nஅல்பெர்டோ முனோஸ்\n\nஇஸ்மாயில் முனோஸ் லோபஸ் (...,\nଆଲବର୍ଟୋ ମୁନୋଜ୍\n\nଇସମାଇଲ ମୁନୋଜ ଲୋପେଜ (ଜାନୁଆର...,\nअल्बर्टो मुनोज\n\nइस्मेल मुनोज लोपेज (जनवरी ...,\nਅਲਬਰਟੋ ਮੁਨੋਜ਼\n\nਇਸਮਾਇਲ ਮੁਨੋਜ਼ ਲੋਪੇਜ਼ (15 ਜਨ...,\nالبرٹو مونوز\n\nاسماعیل منوز لوپیز (15 جنوری...
22269,01031156f9e8456585fa6b657244f50df3bdf240e9c31b...,https://en.wikipedia.org/wiki?curid=17039720,A Vedic Word Concordance,\nA Vedic Word Concordance\n\nMulti-volume con...,\nএটা বৈদিক শব্দৰ সমন্বয়\n\nবৈদিক শব্দ কনকোৰ্...,\nएक वैदिक शब्द समन्वय\n\nएक वैदिक शब्द समन्वय...,\nഒരു വേദ വാക്ക് കോൺകോർഡൻസ്\n\nവൈദിക സംസ്കൃത ഗ...,\nএকটি বৈদিক শব্দ সঙ্গতি\n\nএকটি বৈদিক শব্দ সঙ...,,\nवैदिकशब्दः समन्वयः\n\nवैदिकशब्दः समन्वयः (सं...,\nವೈದಿಕ ಪದದ ಸಾಮರಸ್ಯ\n\nವೈದಿಕ ಪದವಾದ ಸಮನ್ವಯವು (ಸ...,\nవేద పదం సామరస్యం\n\nవేద పదం సామరస్యం (సంస్కృ...,\nएक वैदिक शब्द समन्वय\n\nवैदिक शब्द समन्वय (स...,\nஒரு வேத வார்த்தை இணக்கம்\n\nவேதச் சொல் ஒருங்...,\nଏକ ବୈଦିକ ଶବ୍ଦ ସମନ୍ୱଯ଼\n\nଏକ ବୈଦିକ ଶବ୍ଦ 'କନକୋ...,\nवैदिक शब्द समन्वय\n\nवैदिक शब्द समन्वय (संस्...,\nਇੱਕ ਵੈਦਿਕ ਸ਼ਬਦ ਸੰਜੋਗ\n\nਵੈਦਿਕ ਸ਼ਬਦ ਸੰਜੋਗ (ਸੰ...,\nایک ویدک لفظ مطابقت\n\nویدک لفظ مطابقت (سنسک...
