# Package Imports

In [7]:
import pandas as pd
import os
import re
from sklearn.utils import shuffle
import nltk
from nltk.tokenize import word_tokenize

# Model Imports

In [3]:
from Detect.IndicLID.Inference.ai4bharat.IndicLID import IndicLID                           # Detection

IndicLID_model = IndicLID(input_threshold = 0.5, roman_lid_threshold = 0.6)                 # Loading Model with recommended Hyperparams

# Dataset to dict of dfs

In [5]:
val_dataset_folder_path = r'F:\Projects\translate\IndicTrans2\Dataset\v2\Translation_val_data'

csv_files = os.listdir(val_dataset_folder_path)

dfs_dict = {}

for csv_file in csv_files:
    file_path = os.path.join(val_dataset_folder_path, csv_file)
    df_name = os.path.splitext(csv_file)[0]
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    df = pd.DataFrame({'Column': lines})
    dfs_dict[df_name] = df

In [6]:
for file_name, df in dfs_dict.items():
    print(f"DataFrame '{file_name}' has shape: {df.shape}")

DataFrame 'train_bn' has shape: (8604580, 1)
DataFrame 'train_bn_en' has shape: (8604580, 1)
DataFrame 'train_hi' has shape: (10125706, 1)
DataFrame 'train_hi_en' has shape: (10125706, 1)
DataFrame 'train_te' has shape: (4946035, 1)
DataFrame 'train_te_en' has shape: (4946035, 1)


In [7]:
subset_dict = {}
test_size = 10000
# Iterate through the original dictionary
for file_name, df in dfs_dict.items():
    # Get the first 10,000 rows of each DataFrame
    df_first_10000 = df.head(test_size)
    df_first_10000['Language_true'] = file_name.split('_', 1)[1]
    df_first_10000['id'] = range(1, test_size + 1)
    df_first_10000.rename(columns={'Column': 'Text'}, inplace=True)
    
    # Store the new DataFrame in the new dictionary
    subset_dict[file_name] = df_first_10000

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_first_10000['Language_true'] = file_name.split('_', 1)[1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_first_10000['id'] = range(1, test_size + 1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_first_10000.rename(columns={'Column': 'Text'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_inde

# Text Preprocessing 

In [3]:
def remove_mentions_and_urls(sentence):

    pattern = r'(@\S+|https://\S+|#\S+)'
    sentence = sentence.replace('\n', '')
    sentence = re.sub(pattern, '', sentence)
    
    if '\n' in sentence:
        print(sentence)

    return sentence

In [4]:
def remove_emojis(text):
    # Define a regular expression pattern to match emojis
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F700-\U0001F77F"  # alchemical symbols
                               u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                               u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                               u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                               u"\U0001FA00-\U0001FA6F"  # Chess Symbols
                               u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                               u"\U00002702-\U000027B0"  # Dingbats
                               u"\U000024C2-\U0001F251" 
                               "]+", flags=re.UNICODE)

    # Use re.sub to replace emojis with an empty string
    cleaned_text = emoji_pattern.sub(r'', text)

    return cleaned_text

In [5]:
def remove_english_words(text):

    words = word_tokenize(text)
    non_english_words = [word for word in words if not any(char.isalpha() and ord(char) < 128 for char in word)]
    return ' '.join(non_english_words)

In [10]:
# for file_name, df in subset_dict.items():
#     df['Text'] = df['Text'].astype(str).apply(remove_mentions_and_urls).apply(remove_emojis)
en_dfs = {key: df for key, df in subset_dict.items() if 'en' in key}
ind_dfs = {key: df for key, df in subset_dict.items() if 'en' not in key}

# Concatenate DataFrames from the filtered dictionary into a single DataFrame
merged_en_df = pd.concat(en_dfs.values(), ignore_index=True)
merged_ind_df = pd.concat(ind_dfs.values(), ignore_index=True)

merged_en_df['Text'] = merged_en_df['Text'].astype(str).apply(remove_mentions_and_urls).apply(remove_emojis)
merged_ind_df['Text'] = merged_ind_df['Text'].astype(str).apply(remove_mentions_and_urls).apply(remove_emojis)

# Shuffle the rows in random order
shuffled_df = shuffle(merged_ind_df)
shuffled_seqs = shuffled_df['Text'].to_list()

In [21]:
outputs = IndicLID_model.batch_predict(shuffled_seqs, batch_size = 128)

In [22]:
df = pd.DataFrame(outputs, columns = ['Text', 'Language_pred', 'Certainity', 'Model'])

In [45]:
eval_df = pd.merge(df, shuffled_df, on='Text')

In [46]:
eval_df['Language_pred'].value_counts()

Language_pred
tel_Telu     9988
ben_Beng     9467
hin_Deva     8724
mai_Deva      841
asm_Beng      485
kas_Deva      138
brx_Deva      126
mar_Deva       46
eng_Latn       41
doi_Deva       39
san_Deva       34
mni_Beng       28
nep_Deva       25
kok_Deva        7
other           5
sat_Olch        3
tel_Latn        2
mni_Meti        2
mni_Latn        2
kas_Latn        1
tam_Tamil       1
ori_Latn        1
Name: count, dtype: int64

In [49]:
hindi_categories = ['hin_Deva', 'mai_Deva', 'mar_Deva', 'san_Deva', 'hin_Latn', 'kas_Deva', 'pan_Latn', 'kok_Deva', 'brx_Deva', 'doi_Deva', 'urd_Latn', 'mai_Latn', 'brx_Latn', 'nep_Deva', 'asm_Latn']
ben_categories = ['ben_Beng', 'asm_Beng', 'ben_Latn', 'mni_Beng', 'sat_Olch']
tel_categories = ['tel_Telu', 'tel_Latn']
eng_categories = ['eng_Latn', 'mni_Latn', 'kok_Latn']

In [40]:
eng_df = eval_df[eval_df['Language_pred'] == 'eng_Latn']
eng_df['Native_text'] = eng_df['Text'].apply(remove_english_words)
fixed_outputs = IndicLID_model.batch_predict(eng_df['Native_text'], batch_size = 128)
fixed_df = pd.DataFrame(fixed_outputs, columns = ['Text', 'Language_pred', 'Certainity', 'Model'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eng_df['Native_text'] = eng_df['Text'].apply(remove_english_words)


In [69]:
eng_df.to_csv(r'C:\Users\useer\Downloads\eng_fixed.csv')

In [48]:
eng_df['Language_pred'] = fixed_df['Language_pred'].tolist()

eval_df = eval_df[~(eval_df['Language_pred'] == 'eng_Latn')]
eval_df = pd.concat([eval_df, eng_df], ignore_index=True)
eval_df

Unnamed: 0,Text,Language_pred,Certainity,Model,Language_true,id,Native_text
0,""""""" उन्होंने कहा, """"मैं यहां जोर देकर कहना चाह...",hin_Deva,0.999996,IndicLID-FTN,hi,9966,
1,কারও কাছে জীবনের অভ্যেস এক লহমায় বদলে যাওয়া।,ben_Beng,1.000049,IndicLID-FTN,bn,8871,
2,అలాంటి వారిని పొగడటం లేదు.,tel_Telu,1.000050,IndicLID-FTN,te,7224,
3,ঘটনায় জড়িত একজনকে আটক করা হয়েছে।,ben_Beng,1.000047,IndicLID-FTN,bn,5316,
4,తిరిగి తిరిగీ తిప్పలే!,tel_Telu,1.000025,IndicLID-FTN,te,365,
...,...,...,...,...,...,...,...
30001,"দর্শী এ,আর The Gallant Defender সিং পি, The Go...",ben_Beng,0.668298,IndicLID-FTR,bn,3699,"দর্শী এ , আর সিং পি ,"
30002,"""""""The BJP, however, has made it clear that it...",ben_Beng,0.999888,IndicLID-FTR,bn,8008,"`` `` '' , , ’ . `` `` , তবে তিনি এও পরিষ্কার ..."
30003,इस खबर को अंग्रेजी में पढ़ें- Behind ISI’s ren...,hin_Deva,0.999575,IndicLID-FTR,hi,4953,इस खबर को अंग्रेजी में पढ़ें- ’ : 2
30004,పూర్వ్ అనే దక్షిణ పూర్వ్ ఈంగ్లేన్డ్City in Eas...,tel_Telu,0.998167,IndicLID-FTR,te,8674,"పూర్వ్ అనే దక్షిణ పూర్వ్ ,"


In [49]:
correct_hindi_count = ((eval_df['Language_true'] == 'hi') & (eval_df['Language_pred'].isin(hindi_categories))).sum()
correct_ben_count = ((eval_df['Language_true'] == 'bn') & (eval_df['Language_pred'].isin(ben_categories))).sum()
correct_tel_count = ((eval_df['Language_true'] == 'te') & (eval_df['Language_pred'].isin(tel_categories))).sum()

In [50]:
hindi_acc = correct_hindi_count * 100 / ((eval_df['Language_true'] == 'hi').sum())
ben_acc = correct_ben_count * 100 / ((eval_df['Language_true'] == 'bn').sum())
tel_acc = correct_tel_count * 100 / ((eval_df['Language_true'] == 'te').sum())
total_valid = (eval_df['Language_true'] == 'hi').sum() + (eval_df['Language_true'] == 'bn').sum() + (eval_df['Language_true'] == 'te').sum()

print("Hindi Prediction accuracy:", hindi_acc, "out of", ((eval_df['Language_true'] == 'hi').sum()))
print("Bengali Prediction accuracy:", ben_acc, "out of", ((eval_df['Language_true'] == 'bn').sum()))
print("Telugu Prediction accuracy:", tel_acc, "out of", ((eval_df['Language_true'] == 'te').sum()))
print("Overall accuracy:", (correct_hindi_count + correct_ben_count + correct_tel_count) * 100 / total_valid, "out of", total_valid)

Hindi Prediction accuracy: 99.95001999200319 out of 10004
Bengali Prediction accuracy: 99.97000599880025 out of 10002
Telugu Prediction accuracy: 99.93 out of 10000
Overall accuracy: 99.9500099980004 out of 30006


In [21]:
eval_df.to_csv(r'C:\Users\useer\Downloads\eval_df.csv')
merged_en_df.to_csv(r'C:\Users\useer\Downloads\en_ref_df.csv')

In [51]:
rows_not_satisfying_conditions = eval_df[~((eval_df['Language_true'] == 'hi') & (eval_df['Language_pred'].isin(hindi_categories))) &
                                           ~((eval_df['Language_true'] == 'bn') & (eval_df['Language_pred'].isin(ben_categories))) &
                                           ~((eval_df['Language_true'] == 'te') & (eval_df['Language_pred'].isin(tel_categories)))]

In [52]:
rows_not_satisfying_conditions.to_excel(r'C:\Users\useer\Downloads\erroneous_entries_detection_iter3.xlsx')

In [None]:
rows_not_satisfying_conditions

# Translation

In [1]:
from inference.engine import Model as IndicTrans2                       # Translation
import warnings
warnings.filterwarnings("ignore")

In [2]:
IndicTrans2_model = IndicTrans2("models/indic-en-preprint/fairseq_model", model_type="fairseq")# , device='cuda') #kernel crash

Initializing sentencepiece model for SRC and TGT
Initializing model for translation



2024-02-24 11:42:27 | INFO | fairseq.tasks.translation | [SRC] dictionary: 122706 types
2024-02-24 11:42:27 | INFO | fairseq.tasks.translation | [TGT] dictionary: 32296 types


In [3]:
import pandas as pd
eng_ref_translation_df = pd.read_csv(r'C:\Users\useer\Downloads\en_ref_df.csv')
# input_df = pd.read_csv(r'C:\Users\useer\Downloads\eval_df.csv')

In [4]:
input_df = pd.read_csv(r'C:\Users\useer\Downloads\eng_fixed.csv')

In [5]:
input_df['match_id'] = input_df['Language_true'] + '_en_' + input_df['id'].astype(str)
eng_ref_translation_df['match_id'] = eng_ref_translation_df['Language_true'] + '_' + eng_ref_translation_df['id'].astype(str)

In [6]:
eng_ref_translation_df.head()

Unnamed: 0.1,Unnamed: 0,Text,Language_true,id,match_id
0,0,The International Civil Aviation Organisation ...,bn_en,1,bn_en_1
1,1,They can be deactivated using the control pane...,bn_en,2,bn_en_2
2,2,He was born at Ulail in Dhaka district of Beng...,bn_en,3,bn_en_3
3,3,Atra Gilatola Union,bn_en,4,bn_en_4
4,4,He is the father of five daughters.,bn_en,5,bn_en_5


In [7]:
input_df.head()

Unnamed: 0.1,Unnamed: 0,Text,Language_pred,Certainity,Model,Language_true,id,Native_text,match_id
0,522,Russia in Figures (PDF) (প্রতিবেদন)।,ben_Beng,0.998066,IndicLID-FTR,bn,506,( ) ( প্রতিবেদন ) ।,bn_en_506
1,947,अंग्रेजी में भी पढ़ें: LSR starts special law ...,hin_Deva,0.848665,IndicLID-FTR,hi,5384,अंग्रेजी में भी पढ़ें :,hi_en_5384
2,2664,"com, 29 नवंबर 2019 को Jagran ‘HiTech Awards 20...",hin_Deva,0.83092,IndicLID-FTR,hi,7292,", 29 नवंबर 2019 को ‘ 2019 का आयोजन कर रही है।",hi_en_7292
3,2960,"""উদ্ধৃতি টেমপ্লেট ইংরেজি প্যারামিটার ব্যবহার ক...",ben_Beng,5.824115,IndicLID-BERT,bn,8883,`` উদ্ধৃতি টেমপ্লেট ইংরেজি প্যারামিটার ব্যবহার...,bn_en_8883
4,3303,White House Threatens To Suspend Again Cnn Rep...,hin_Deva,0.622242,IndicLID-FTR,hi,960,| व्हाइट हाउस दोबारा निलंबित कर सकता है के जिम...,hi_en_960


In [8]:
hindi_categories = ['hin_Deva', 'mai_Deva', 'mar_Deva', 'san_Deva', 'hin_Latn', 'kas_Deva', 'pan_Latn', 'kok_Deva', 'brx_Deva', 'doi_Deva', 'urd_Latn', 'mai_Latn', 'brx_Latn', 'nep_Deva']
ben_categories = ['ben_Beng', 'asm_Beng', 'ben_Latn', 'mni_Beng']
tel_categories = ['tel_Telu', 'tel_Latn']
eng_categories = ['eng_Latn', 'mni_Latn', 'kok_Latn']

In [9]:
hindi_sentences = input_df[input_df['Language_pred'].isin(hindi_categories)][['Text', 'match_id']]

In [10]:
filtered_hindi_list = hindi_sentences['Text'].tolist()

for sent in filtered_hindi_list:
    if len(sent) > 256:
        filtered_hindi_list.remove(sent)

print(len(filtered_hindi_list))

20


In [11]:
hindi_translated = IndicTrans2_model.batch_translate(filtered_hindi_list, "hin_Deva", "eng_Latn")

2024-01-04 16:58:17 | INFO | fairseq.tasks.fairseq_task | can_reuse_epoch_itr = True
2024-01-04 16:58:17 | INFO | fairseq.tasks.fairseq_task | reuse_dataloader = True
2024-01-04 16:58:17 | INFO | fairseq.tasks.fairseq_task | rebuild_batches = False
2024-01-04 16:58:17 | INFO | fairseq.tasks.fairseq_task | creating new batches for epoch 1


In [12]:
hindi_sentences = hindi_sentences[hindi_sentences['Text'].isin(filtered_hindi_list)]
hindi_sentences['translated_text'] = hindi_translated
hindi_val_df = pd.merge(hindi_sentences, eng_ref_translation_df, on='match_id', how='inner')
hindi_val_df.head()

Unnamed: 0.1,Text_x,match_id,translated_text,Unnamed: 0,Text_y,Language_true,id
0,अंग्रेजी में भी पढ़ें: LSR starts special law ...,hi_en_5384,Also read: LSR starts special law course on wo...,15383,LSR starts special law course on women's issues,hi_en,5384
1,"com, 29 नवंबर 2019 को Jagran ‘HiTech Awards 20...",hi_en_7292,JAGRAN 'HiTech Awards 2019 is being held on 29...,17291,New Delhi | Jagran Technology Desk: Digital me...,hi_en,7292
2,White House Threatens To Suspend Again Cnn Rep...,hi_en_960,White House Threatens to Suspend CNN Reporter ...,10959,White House suspends press credentials of CNN ...,hi_en,960
3,सरणी नाम (_N):New RAID Array,hi_en_8809,_ Array name: New RAID Array,18808,Array _Name:,hi_en,8809
4,"साथियो, यहां आने से पहले मैं International ric...",hi_en_5292,"Friends, Before coming here, I had also visite...",15291,"Before coming here, I had visited the campus o...",hi_en,5292


In [21]:
hindi_val_df['Text_x'].to_excel(r'C:\Users\useer\Downloads\gtrans_eng_hin_labels.xlsx')

In [22]:
y = pd.read_excel(r'C:\Users\useer\Downloads\gtrans_eng_hin_labels_translated.xlsx')
y.head()

Unnamed: 0.1,Unnamed: 0,text_x
0,0,Also read in English: LSR starts special law ...
1,1,com is organizing Jagran'HiTech Awards 2019 o...
2,2,White House Threatens To Suspend Again Cnn Re...
3,3,Array Name (_N):New RAID Array
4,4,"Friends, before coming here, I had also gone ..."


In [9]:
x = pd.read_excel(r'C:\Users\useer\Downloads\hindi_text_gtrans.xlsx')
x.head()

Unnamed: 0.1,Unnamed: 0,Text,text_x
0,0,'गलतफहमी के कारण हुआ झगड़ा','The fight happened due to misunderstanding'
1,1,इसके अलावा यह अभ्‍यास दोनों सेनाओं के मध्‍य सम...,"Additionally, the exercise will mark a milesto..."
2,2,"हे परमेश्‍वर जब मैं बूढ़ा हो जाऊं, और मेरे बाल...","God, even when I grow old and my hair grows gr..."
3,3,इसी बात को लेकर दोनों पक्षों में जमकर मारामारी...,There was a fierce fight between the two sides...
4,4,कभी - कभी जीवन के तनावों और खिंचावों के कारण ल...,"Sometimes, due to the stresses and strains of ..."


In [13]:
from nltk.translate.bleu_score import sentence_bleu

def calculate_bleu_score(reference, translation):
    reference_tokenized = [reference.split()]
    translation_tokenized = translation.split()

    bleu_score = sentence_bleu(reference_tokenized, translation_tokenized)
    return bleu_score

In [14]:
from nltk.translate.bleu_score import corpus_bleu

def calculate_bleu_scores(candidate_list, reference_list):
    # candidate_list: List of translated sentences
    # reference_list: List of reference sentences

    # Tokenize sentences
    candidate_tokenized = [sentence.split() for sentence in candidate_list]
    reference_tokenized = [[sentence.split()] for sentence in reference_list]

    # Calculate BLEU score
    bleu_score = corpus_bleu(reference_tokenized, candidate_tokenized)

    return bleu_score


In [15]:
# Gtest data
translations = hindi_val_df['translated_text'].tolist()
reference_sentences = hindi_val_df['Text_y'].tolist()

bleu_score = calculate_bleu_scores(translations, reference_sentences)
print(f"BLEU Score of IndicTrans2: {bleu_score}")

BLEU Score of IndicTrans2: 0.22047813494583085


In [23]:
# Gtest data
translations = y['text_x'].tolist()
reference_sentences = hindi_val_df['Text_y'].tolist()

bleu_score = calculate_bleu_scores(translations, reference_sentences)
print(f"BLEU Score of Google Translate: {bleu_score}")

BLEU Score of Google Translate: 0.19120931692401408
