# Imports

In [1]:
import pandas as pd
from inference.engine import Model as IndicTrans2                       # Translation
import warnings
import nest_asyncio
import asyncio
from tqdm.notebook import tqdm

warnings.filterwarnings("ignore")

In [2]:
# Load Model
# Builds the translation model object from the fairseq checkpoint directory
# (path is relative to the notebook's working directory). This instance is the
# one whose `batch_translate` method is called in the translation cell.
IndicTrans2_model = IndicTrans2(
    "models/indic-en-preprint/fairseq_model",
    model_type="fairseq",
)

Initializing sentencepiece model for SRC and TGT
Initializing model for translation



2024-02-16 10:37:03 | INFO | fairseq.tasks.translation | [SRC] dictionary: 122706 types
2024-02-16 10:37:03 | INFO | fairseq.tasks.translation | [TGT] dictionary: 32296 types


In [3]:
# Read transliteration output file
# The workbook is loaded into `df`; later cells read its `consol_lang_tag`
# and `translit_text` columns.
file_path = r"C:\Users\useer\Downloads\AP_JSP_YT_Hashtag_Comments_Jan_15_31_2024translit.xlsx"
df = pd.read_excel(io=file_path)

In [6]:
# Add based on languages needed
#
# Keys are the values looked up in the `consol_lang_tag` column; each value is
# the source-language tag handed to `batch_translate` for rows of that language.
# Rows whose tag is not listed here are filtered out before translation.
lang_tag_mapping = {
    'hi': 'hin_Deva',
    'te': 'tel_Telu',
    # 'bn': 'ben_Beng'
}

In [7]:
# Keep only rows whose `consol_lang_tag` has an entry in `lang_tag_mapping`,
# then split them by transliterated-text length: rows shorter than 400
# characters go on to translation, everything else is set aside in `anomaly_df`.

# .copy() gives an independent frame, so adding the 'len' column below is a
# plain column assignment rather than a write to a slice of `df`.
filtered_df = df[df['consol_lang_tag'].isin(lang_tag_mapping.keys())].copy()

# na_action='ignore' leaves missing text as NaN instead of calling len() on it,
# which would raise TypeError (a float NaN has no length).
filtered_df['len'] = filtered_df['translit_text'].map(len, na_action='ignore')

is_short = filtered_df['len'] < 400
len_filtered_df = filtered_df[is_short]
# Complement of the short rows: text of 400+ characters, plus any row whose
# text is missing (NaN length compares False above). These rows are kept but
# never sent to the translator.
anomaly_df = filtered_df[~is_short]

In [8]:
# Language based splitting and batch translating
#
# For every language in `lang_tag_mapping`, the matching rows of
# `len_filtered_df` are translated to "eng_Latn" in one batch call and the
# result is stored in a new 'translated_text' column. The per-language frames
# are collected and concatenated once after the loop into `translated_df`.

completion_len = 0
translated_parts = []
total_rows = len(len_filtered_df)

for lang_tag, src_lang_code in lang_tag_mapping.items():
    # .copy() so the new column is added to an independent frame, not to a
    # slice of `len_filtered_df`.
    lang_filtered_df = len_filtered_df[len_filtered_df['consol_lang_tag'] == lang_tag].copy()

    # Nothing to translate for this language: skip the model call (its
    # behaviour on an empty list is not visible here) and the progress line,
    # which would divide by zero when `len_filtered_df` itself is empty.
    if lang_filtered_df.empty:
        continue

    temp_translated = IndicTrans2_model.batch_translate(
        lang_filtered_df['translit_text'].to_list(),
        src_lang_code,
        "eng_Latn",
    )
    lang_filtered_df['translated_text'] = temp_translated
    translated_parts.append(lang_filtered_df)

    completion_len += len(lang_filtered_df)
    print(f'{completion_len * 100 / total_rows} % completed')

# Single concat instead of growing the frame inside the loop; fall back to an
# empty frame when no language had any rows.
translated_df = pd.concat(translated_parts, ignore_index=True) if translated_parts else pd.DataFrame()

2024-02-16 10:41:03 | INFO | fairseq.tasks.fairseq_task | can_reuse_epoch_itr = True
2024-02-16 10:41:03 | INFO | fairseq.tasks.fairseq_task | reuse_dataloader = True
2024-02-16 10:41:03 | INFO | fairseq.tasks.fairseq_task | rebuild_batches = False
2024-02-16 10:41:03 | INFO | fairseq.tasks.fairseq_task | creating new batches for epoch 1


18.793877680096134 % completed


2024-02-16 12:04:24 | INFO | fairseq.tasks.fairseq_task | can_reuse_epoch_itr = True
2024-02-16 12:04:24 | INFO | fairseq.tasks.fairseq_task | reuse_dataloader = True
2024-02-16 12:04:24 | INFO | fairseq.tasks.fairseq_task | rebuild_batches = False
2024-02-16 12:04:24 | INFO | fairseq.tasks.fairseq_task | creating new batches for epoch 1


100.0 % completed


In [10]:
# Append the rows that were held back in `anomaly_df` (they were never passed
# through the translation loop) so they still appear in the final output.
# NOTE(review): this rebinds `translated_df` from itself, so re-running the
# cell without re-running the translation cell appends `anomaly_df` again.
translated_df = pd.concat(
    objs=[translated_df, anomaly_df],
    ignore_index=True,
)

In [11]:
# Write the combined translated + held-back rows to an Excel workbook.
file_save_path = r'C:\Users\useer\Downloads\AP_JSP_YT_Hashtag_Comments_Jan_15_31_2024_translated.xlsx'
# index=False: the frame's index is a throwaway RangeIndex (the rows were
# concatenated with ignore_index=True), so writing it would only add a
# meaningless unnamed first column to the sheet.
translated_df.to_excel(file_save_path, index=False)