# Imports

In [1]:
import pandas as pd
from tqdm import tqdm
from ai4bharat.transliteration import XlitEngine                                            # Transliteration Module

In [16]:
# Instantiate only the transliteration engines that are actually needed.
# A wider beam enlarges the search (better accuracy, more compute time);
# 10 is the value recommended in the docs.
BEAM_WIDTH = 10

Translit_hindi_model = XlitEngine("hi", beam_width=BEAM_WIDTH)
# Translit_telugu_model = XlitEngine("te", beam_width=BEAM_WIDTH)   # Telugu engine disabled
Translit_bengali_model = XlitEngine("bn", beam_width=BEAM_WIDTH)

Initializing Multilingual model for transliteration


Loading dicts into RAM: 100%|██████████| 1/1 [00:06<00:00,  6.05s/it]


Initializing Multilingual model for transliteration


Loading dicts into RAM: 100%|██████████| 1/1 [00:06<00:00,  6.70s/it]


In [23]:
# Read the output file of the language-detection step into a DataFrame.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.

file_path = r"C:\Users\useer\Downloads\Mamata_replies_Aug_28_2024_detected.xlsx"
df = pd.read_excel(file_path)

In [24]:
# Discard rows that have no value in the 'Text' column.
df = df.dropna(subset = ['Text'])

In [25]:
# Show how many rows carry each 'consol_lang_tag' value.
df['consol_lang_tag'].value_counts()

consol_lang_tag
en              114
hi               32
bn               30
unidentified     16
Name: count, dtype: int64

In [17]:
# Function to use appropriate language model based on the 'consol_lang_tag' column

def selective_translit(sentence, tag, actual_tag):
    """Transliterate ``sentence`` with the engine matching its language tag.

    Parameters
    ----------
    sentence : str
        Text to transliterate. Anything that is not a non-empty string
        (NaN, None, numbers, '') is returned unchanged.
    tag : str
        Consolidated language tag. ``'bn'`` selects ``Translit_bengali_model``
        and ``'hi'`` selects ``Translit_hindi_model``; any other value leaves
        the sentence untouched.
    actual_tag : str
        Currently unused; it is only referenced by the disabled Telugu branch
        below and is kept so existing callers keep working.

    Returns
    -------
    The first value of the mapping returned by the engine's
    ``translit_sentence`` for ``'bn'``/``'hi'`` input, otherwise ``sentence``
    itself.
    """
    # Missing or non-text cells have nothing to transliterate; hand them back
    # as-is instead of passing them to the engine.
    if not isinstance(sentence, str) or not sentence:
        return sentence

    if tag == 'bn':
        engine = Translit_bengali_model
    # Telugu branch is disabled:
    # elif actual_tag == 'tel_Latn':
    #     engine = Translit_telugu_model
    elif tag == 'hi':
        engine = Translit_hindi_model
    else:
        return sentence

    # translit_sentence returns a mapping; keep its first value.
    return list(engine.translit_sentence(sentence).values())[0]

In [26]:
# Run transliteration
#
# Walk the three input columns in lockstep (no iterrows() / per-cell .at
# writes) and attach all results with a single positional column assignment,
# so 'translit_text' is created even when the frame is empty.

translit_inputs = zip(df['Text'], df['consol_lang_tag'], df['Language_pred'])
df['translit_text'] = [
    selective_translit(sentence, tag, actual_tag)
    for sentence, tag, actual_tag in tqdm(translit_inputs, total=len(df))
]

100%|██████████| 192/192 [00:18<00:00, 10.54it/s]


In [27]:
# Save the frame to an Excel workbook.
# NOTE(review): to_excel writes the index as an extra first column by default;
# re-reading such a file yields an 'Unnamed: 0' column. Consider index=False
# if the index is not needed — confirm with downstream consumers.

file_save_path = r"C:\Users\useer\Downloads\Mamata_replies_Aug_28_2024_translit.xlsx"
df.to_excel(file_save_path)

In [28]:
# Export only the 'translit_text' column. It goes to its own workbook: writing
# it to ..._translit.xlsx would overwrite the full export saved to that path.
df['translit_text'].to_excel(r'C:\Users\useer\Downloads\Mamata_replies_Aug_28_2024_translit_text_only.xlsx')

In [22]:
def replace_none_with_column2(row):
    """Return the row's transliteration, falling back to its original comment.

    Parameters
    ----------
    row : mapping-like
        A row as passed by ``df.apply(..., axis=1)``; must provide the keys
        ``'translit_text'`` and ``'comment'``.

    Returns
    -------
    ``row['translit_text']`` when it holds a usable value, otherwise
    ``row['comment']``.
    """
    translit = row['translit_text']
    # NaN is truthy in Python, so a bare truthiness test lets missing cells
    # through; test for missing values explicitly. Other falsy values
    # (None, '') keep falling back to the comment as before.
    if pd.isna(translit) or not translit:
        return row['comment']
    return translit

In [19]:
# Count the rows whose 'translit_text' value is missing.
df['translit_text'].isna().sum()

14141

In [23]:

# Row-wise pass: replace_none_with_column2 chooses the value to keep per row.
filled_translit = df.apply(replace_none_with_column2, axis=1)
df['translit_text'] = filled_translit
df

Unnamed: 0.1,Unnamed: 0,author_channel_id,video_id,comment_id,comment,publish_date,Preprocessed_text,emoji_list,Text,Language_pred,Certainty,Model,consol_lang_tag,translit_text
0,0,UCcPv-0y-RwPVEu-Im6w0-YA,RRe_Y9gHqJM,UgwsPd4B54QP5toSpM14AaABAg,‡¶ú‡¶Ø‡¶º ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ,2024-07-02T10:27:39Z,‡¶ú‡¶Ø‡¶º ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ,[],‡¶ú‡¶Ø‡¶º ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ,ben_Beng,1.000050,IndicLID-FTN,bn,‡¶ú‡¶Ø‡¶º ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ
1,32,UC95Rt-2QG6Y68mboHgoIxMQ,RRe_Y9gHqJM,Ugygi1htXl95_zSnTNd4AaABAg,‡¶°‡¶æ‡¶Ø‡¶º‡¶Æ‡¶®‡ßç‡¶° ‡¶π‡¶æ‡¶∞‡¶¨‡¶æ‡¶∞‡ßá‡¶∞ ‡¶∏‡¶æ‡¶ß‡¶æ‡¶∞‡¶£ ‡¶Æ‡¶æ‡¶®‡ßÅ‡¶∑‡ßá‡¶∞ ‡¶≠‡ßã‡¶ü‡ßá ‡¶Ø‡¶¶‡¶ø ‡¶ú‡¶ø‡¶§‡¶§...,2024-06-27T16:25:38Z,‡¶°‡¶æ‡¶Ø‡¶º‡¶Æ‡¶®‡ßç‡¶° ‡¶π‡¶æ‡¶∞‡¶¨‡¶æ‡¶∞‡ßá‡¶∞ ‡¶∏‡¶æ‡¶ß‡¶æ‡¶∞‡¶£ ‡¶Æ‡¶æ‡¶®‡ßÅ‡¶∑‡ßá‡¶∞ ‡¶≠‡ßã‡¶ü‡ßá ‡¶Ø‡¶¶‡¶ø ‡¶ú‡¶ø‡¶§‡¶§...,[],‡¶°‡¶æ‡¶Ø‡¶º‡¶Æ‡¶®‡ßç‡¶° ‡¶π‡¶æ‡¶∞‡¶¨‡¶æ‡¶∞‡ßá‡¶∞ ‡¶∏‡¶æ‡¶ß‡¶æ‡¶∞‡¶£ ‡¶Æ‡¶æ‡¶®‡ßÅ‡¶∑‡ßá‡¶∞ ‡¶≠‡ßã‡¶ü‡ßá ‡¶Ø‡¶¶‡¶ø ‡¶ú‡¶ø‡¶§‡¶§...,ben_Beng,1.000049,IndicLID-FTN,bn,‡¶°‡¶æ‡¶Ø‡¶º‡¶Æ‡¶®‡ßç‡¶° ‡¶π‡¶æ‡¶∞‡¶¨‡¶æ‡¶∞‡ßá‡¶∞ ‡¶∏‡¶æ‡¶ß‡¶æ‡¶∞‡¶£ ‡¶Æ‡¶æ‡¶®‡ßÅ‡¶∑‡ßá‡¶∞ ‡¶≠‡ßã‡¶ü‡ßá ‡¶Ø‡¶¶‡¶ø ‡¶ú‡¶ø‡¶§‡¶§...
2,33,UCnIAley2VwVBxxVf1NHKPQw,RRe_Y9gHqJM,UgxoFOuv1L5vRXvtPgx4AaABAg,I am very proud of you .,2024-06-26T16:51:40Z,I am very proud of you .,[],I am very proud of you .,kok_Latn,1.000044,IndicLID-FTR,en,I am very proud of you .
3,34,UCTVyVQtBb4eCDBC6OqUtt4g,RRe_Y9gHqJM,UgwoN-OAagLykdaobH54AaABAg,Bangladeshi gunda dharo jail bharo üîê,2024-06-26T04:35:39Z,Bangladeshi gunda dharo jail bharo $,['üîê'],Bangladeshi gunda dharo jail bharo $,guj_Latn,5.600022,IndicLID-BERT,unidentified,Bangladeshi gunda dharo jail bharo üîê
4,35,UCTVyVQtBb4eCDBC6OqUtt4g,RRe_Y9gHqJM,Ugw_3s2CEMcgCrsJ_BF4AaABAg,Bangladeshi gunda hatao West Bengal Bachao,2024-06-26T04:35:17Z,Bangladeshi gunda hatao West Bengal Bachao,[],Bangladeshi gunda hatao West Bengal Bachao,eng_Latn,0.999290,IndicLID-FTR,en,Bangladeshi gunda hatao West Bengal Bachao
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31875,485289,UCv0lt8m0Wld2la8DEXdjLmQ,z1qPF_xD1_A,Ugw4ErB0A6oxWqYAuSB4AaABAg,Lalsalam comred,2024-07-24T04:08:09Z,Lalsalam comred,[],Lalsalam comred,kas_Latn,0.898363,IndicLID-FTR,unidentified,Lalsalam comred
31876,485368,UCoAmD2WwaTYS5ndCGG8-noA,z1qPF_xD1_A,Ugzl6j7LBGZykCq2w_t4AaABAg,Inqilab zindabad,2024-07-23T15:34:03Z,Inqilab zindabad,[],Inqilab zindabad,snd_Latn,0.761096,IndicLID-FTR,unidentified,Inqilab zindabad
31877,485370,UCY54-tbS1toDRx-a_968GsQ,z1qPF_xD1_A,UgyZ5Eg7PmNAZk4-4EV4AaABAg,‡¶ï‡ßá ‡¶è‡¶á ‡¶á‡¶â‡¶∏‡ßÅ‡¶´ ‡¶§‡¶æ‡¶∞‡¶ø‡¶Æ‡¶æ‡¶ó‡ßÄ ?,2024-07-23T09:19:40Z,‡¶ï‡ßá ‡¶è‡¶á ‡¶á‡¶â‡¶∏‡ßÅ‡¶´ ‡¶§‡¶æ‡¶∞‡¶ø‡¶Æ‡¶æ‡¶ó‡ßÄ ?,[],‡¶ï‡ßá ‡¶è‡¶á ‡¶á‡¶â‡¶∏‡ßÅ‡¶´ ‡¶§‡¶æ‡¶∞‡¶ø‡¶Æ‡¶æ‡¶ó‡ßÄ ?,ben_Beng,0.987689,IndicLID-FTN,bn,‡¶ï‡ßá ‡¶è‡¶á ‡¶á‡¶â‡¶∏‡ßÅ‡¶´ ‡¶§‡¶æ‡¶∞‡¶ø‡¶Æ‡¶æ‡¶ó‡ßÄ ?
31878,485371,UCzs5fgYahkwQIgIVpr84FeQ,z1qPF_xD1_A,UgyEfigxo4s6Gtf_JnN4AaABAg,INQUILAB ZINDABAD ‚úä,2024-07-23T07:01:42Z,INQUILAB ZINDABAD $,['‚úä'],INQUILAB ZINDABAD $,eng_Latn,6.290914,IndicLID-BERT,en,INQUILAB ZINDABAD ‚úä


In [24]:
# Keep only the rows whose consolidated language tag is 'bn' or 'en'.
keep_tags = ['bn', 'en']
tag_mask = df['consol_lang_tag'].isin(keep_tags)
df_filtered = df[tag_mask]
df_filtered

Unnamed: 0.1,Unnamed: 0,author_channel_id,video_id,comment_id,comment,publish_date,Preprocessed_text,emoji_list,Text,Language_pred,Certainty,Model,consol_lang_tag,translit_text
0,0,UCcPv-0y-RwPVEu-Im6w0-YA,RRe_Y9gHqJM,UgwsPd4B54QP5toSpM14AaABAg,‡¶ú‡¶Ø‡¶º ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ,2024-07-02T10:27:39Z,‡¶ú‡¶Ø‡¶º ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ,[],‡¶ú‡¶Ø‡¶º ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ,ben_Beng,1.000050,IndicLID-FTN,bn,‡¶ú‡¶Ø‡¶º ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ
1,32,UC95Rt-2QG6Y68mboHgoIxMQ,RRe_Y9gHqJM,Ugygi1htXl95_zSnTNd4AaABAg,‡¶°‡¶æ‡¶Ø‡¶º‡¶Æ‡¶®‡ßç‡¶° ‡¶π‡¶æ‡¶∞‡¶¨‡¶æ‡¶∞‡ßá‡¶∞ ‡¶∏‡¶æ‡¶ß‡¶æ‡¶∞‡¶£ ‡¶Æ‡¶æ‡¶®‡ßÅ‡¶∑‡ßá‡¶∞ ‡¶≠‡ßã‡¶ü‡ßá ‡¶Ø‡¶¶‡¶ø ‡¶ú‡¶ø‡¶§‡¶§...,2024-06-27T16:25:38Z,‡¶°‡¶æ‡¶Ø‡¶º‡¶Æ‡¶®‡ßç‡¶° ‡¶π‡¶æ‡¶∞‡¶¨‡¶æ‡¶∞‡ßá‡¶∞ ‡¶∏‡¶æ‡¶ß‡¶æ‡¶∞‡¶£ ‡¶Æ‡¶æ‡¶®‡ßÅ‡¶∑‡ßá‡¶∞ ‡¶≠‡ßã‡¶ü‡ßá ‡¶Ø‡¶¶‡¶ø ‡¶ú‡¶ø‡¶§‡¶§...,[],‡¶°‡¶æ‡¶Ø‡¶º‡¶Æ‡¶®‡ßç‡¶° ‡¶π‡¶æ‡¶∞‡¶¨‡¶æ‡¶∞‡ßá‡¶∞ ‡¶∏‡¶æ‡¶ß‡¶æ‡¶∞‡¶£ ‡¶Æ‡¶æ‡¶®‡ßÅ‡¶∑‡ßá‡¶∞ ‡¶≠‡ßã‡¶ü‡ßá ‡¶Ø‡¶¶‡¶ø ‡¶ú‡¶ø‡¶§‡¶§...,ben_Beng,1.000049,IndicLID-FTN,bn,‡¶°‡¶æ‡¶Ø‡¶º‡¶Æ‡¶®‡ßç‡¶° ‡¶π‡¶æ‡¶∞‡¶¨‡¶æ‡¶∞‡ßá‡¶∞ ‡¶∏‡¶æ‡¶ß‡¶æ‡¶∞‡¶£ ‡¶Æ‡¶æ‡¶®‡ßÅ‡¶∑‡ßá‡¶∞ ‡¶≠‡ßã‡¶ü‡ßá ‡¶Ø‡¶¶‡¶ø ‡¶ú‡¶ø‡¶§‡¶§...
2,33,UCnIAley2VwVBxxVf1NHKPQw,RRe_Y9gHqJM,UgxoFOuv1L5vRXvtPgx4AaABAg,I am very proud of you .,2024-06-26T16:51:40Z,I am very proud of you .,[],I am very proud of you .,kok_Latn,1.000044,IndicLID-FTR,en,I am very proud of you .
4,35,UCTVyVQtBb4eCDBC6OqUtt4g,RRe_Y9gHqJM,Ugw_3s2CEMcgCrsJ_BF4AaABAg,Bangladeshi gunda hatao West Bengal Bachao,2024-06-26T04:35:17Z,Bangladeshi gunda hatao West Bengal Bachao,[],Bangladeshi gunda hatao West Bengal Bachao,eng_Latn,0.999290,IndicLID-FTR,en,Bangladeshi gunda hatao West Bengal Bachao
5,36,UCTVyVQtBb4eCDBC6OqUtt4g,RRe_Y9gHqJM,UgxzkSjDs_faUxr57gB4AaABAg,Bangladeshi TMC chor hatao West Bengal Bachao,2024-06-26T04:34:52Z,Bangladeshi TMC chor hatao West Bengal Bachao,[],Bangladeshi TMC chor hatao West Bengal Bachao,eng_Latn,0.999997,IndicLID-FTR,en,Bangladeshi TMC chor hatao West Bengal Bachao
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31870,485197,UCJ4fzqAk5gxRS4l77r00Szw,-fQp9DJc8kc,UgyHr7ii3B-x2lb66Qh4AaABAg,Inclab Jindabad,2024-07-22T14:00:18Z,Inclab Jindabad,[],Inclab Jindabad,eng_Latn,0.834536,IndicLID-FTR,en,Inclab Jindabad
31873,485285,UCgkUq--co3RKAA2RPJvV76A,z1qPF_xD1_A,UgyJc81HgFM8G33Cxoh4AaABAg,Red salute.,2024-07-24T16:46:48Z,Red salute.,[],Red salute.,eng_Latn,0.984346,IndicLID-FTR,en,Red salute.
31877,485370,UCY54-tbS1toDRx-a_968GsQ,z1qPF_xD1_A,UgyZ5Eg7PmNAZk4-4EV4AaABAg,‡¶ï‡ßá ‡¶è‡¶á ‡¶á‡¶â‡¶∏‡ßÅ‡¶´ ‡¶§‡¶æ‡¶∞‡¶ø‡¶Æ‡¶æ‡¶ó‡ßÄ ?,2024-07-23T09:19:40Z,‡¶ï‡ßá ‡¶è‡¶á ‡¶á‡¶â‡¶∏‡ßÅ‡¶´ ‡¶§‡¶æ‡¶∞‡¶ø‡¶Æ‡¶æ‡¶ó‡ßÄ ?,[],‡¶ï‡ßá ‡¶è‡¶á ‡¶á‡¶â‡¶∏‡ßÅ‡¶´ ‡¶§‡¶æ‡¶∞‡¶ø‡¶Æ‡¶æ‡¶ó‡ßÄ ?,ben_Beng,0.987689,IndicLID-FTN,bn,‡¶ï‡ßá ‡¶è‡¶á ‡¶á‡¶â‡¶∏‡ßÅ‡¶´ ‡¶§‡¶æ‡¶∞‡¶ø‡¶Æ‡¶æ‡¶ó‡ßÄ ?
31878,485371,UCzs5fgYahkwQIgIVpr84FeQ,z1qPF_xD1_A,UgyEfigxo4s6Gtf_JnN4AaABAg,INQUILAB ZINDABAD ‚úä,2024-07-23T07:01:42Z,INQUILAB ZINDABAD $,['‚úä'],INQUILAB ZINDABAD $,eng_Latn,6.290914,IndicLID-BERT,en,INQUILAB ZINDABAD ‚úä


In [25]:
# Write the filtered frame to an Excel workbook.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
df_filtered.to_excel(r'C:\Users\useer\Downloads\WB_Comments_June_Aug_2024_trans.xlsx')