### Package Imports

In [25]:
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from Preprocessing import remove_mentions_and_urls, remove_emojis, remove_english_words, emoji_handler

In [5]:
from Detect.IndicLID.Inference.ai4bharat.IndicLID import IndicLID                           # Detection Module

### Load Model

In [6]:
# Instantiate the detection model with the recommended hyperparameters
IndicLID_model = IndicLID(
    input_threshold=0.5,
    roman_lid_threshold=0.6,
)



In [45]:
# Declare the column that holds the text and the column that identifies each row,
# then load the input spreadsheet.

text_col = 'text'
# NOTE(review): 'Unnamed: 0' is presumably an index column written by an earlier export — confirm.
unique_row_identifier_col = 'Unnamed: 0'
file_path = r"C:\Users\useer\Downloads\mamata_post_replies(next 1hr).xlsx"
input_df = pd.read_excel(file_path)
# input_df.drop_duplicates(subset = ['Author_channel_id', 'video_id', 'comment'], inplace = True)

In [46]:
# Mentions, URLs and special characters are stripped; each emoji is replaced with a '$' sign
# and the removed emojis are kept in a separate list column.
# To strip emojis entirely, use the commented-out line at the bottom instead.

input_df[['Preprocessed_text', 'emoji_list']] = (
    input_df[text_col]
    .astype(str)
    .apply(remove_mentions_and_urls)
    .apply(emoji_handler)
    .apply(pd.Series)  # expands each emoji_handler result into the two target columns
)
# input_df['Preprocessed_text'] = input_df[text_col].astype(str).apply(remove_mentions_and_urls).apply(remove_emojis)

In [38]:
def predict_batch(comment_batch, detection_batch_size):
    """Run the loaded IndicLID model on one batch of texts.

    Args:
        comment_batch: list of texts to classify.
        detection_batch_size: forwarded to the model as its ``batch_size`` argument.

    Returns:
        Whatever ``IndicLID_model.batch_predict`` returns for the batch
        (the calling cells treat it as a list of 4-tuples).
    """
    predictions = IndicLID_model.batch_predict(
        comment_batch,
        batch_size=detection_batch_size,
    )
    return predictions

In [39]:
# Batched detection with progress reporting (batches are processed one after another)

# Number of texts per batch; passed to the detection helper and reused as the model's batch size
batch_size = 75

def process_batches_by_index(text_list, batch_size, predict_fn=None):
    """Run language detection over ``text_list`` in consecutive batches.

    The texts are split into slices of at most ``batch_size`` items, each slice is
    passed to the detector, and the per-batch results are concatenated in batch
    order. Progress is printed each time the completed share rises by at least
    one whole percentage point, ending at 100 %.

    Args:
        text_list: sequence of texts to classify (anything that supports slicing).
        batch_size: positive number of texts per batch; it is also forwarded to
            the detector as its batch size.
        predict_fn: optional callable ``(batch, batch_size) -> iterable of results``.
            Defaults to the notebook's ``predict_batch`` helper.

    Returns:
        A list with the detector results of all batches, in batch order.

    Raises:
        ValueError: if ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    if predict_fn is None:
        # Resolved lazily so the default always picks up the current notebook helper.
        predict_fn = predict_batch

    batches = [text_list[i:i + batch_size] for i in range(0, len(text_list), batch_size)]
    total_batches = len(batches)

    # Batches are processed one after another: the model call is not known to be
    # thread-safe, so no executor is involved.
    results = []
    last_reported_percent = 0
    for completed, batch in enumerate(batches, start=1):
        results.extend(predict_fn(batch, batch_size))
        percent_done = completed * 100 // total_batches
        # Report only when a new whole percentage point has been reached.
        if percent_done > last_reported_percent:
            last_reported_percent = percent_done
            print(completed * 100 / total_batches, "% complete")
    return results


In [47]:

# Detect the language of every preprocessed text, then tabulate the detector output
detected_tuples = process_batches_by_index(
    input_df['Preprocessed_text'].tolist(),
    batch_size,
)
detected_df = pd.DataFrame(
    detected_tuples,
    columns=['Text', 'Language_pred', 'Certainty', 'Model'],
)

33.333333333333336 % complete
66.66666666666667 % complete


In [41]:
# Grouping of detector labels into the consolidated language buckets

hindi_categories = [
    'hin_Deva', 'mai_Deva', 'mar_Deva', 'san_Deva', 'hin_Latn',
    'kas_Deva', 'pan_Latn', 'kok_Deva', 'brx_Deva', 'doi_Deva',
    'urd_Latn', 'mai_Latn', 'brx_Latn', 'nep_Deva', 'asm_Latn',
]
ben_categories = [
    'ben_Beng', 'asm_Beng', 'ben_Latn', 'mni_Beng',
]
tel_categories = [
    'tel_Telu', 'tel_Latn',
]
eng_categories = [
    'eng_Latn', 'mni_Latn', 'kok_Latn', 'ori_Orya', 'mni_Meti',
]

# Observed detector behaviour: mni_Meti is returned for an empty string, sat_olch for
# punctuation-only text, and ori_Orya for a lone '$'.

In [53]:
# Select the rows whose predicted label falls in the English bucket and strip the
# English words from their text.
# NOTE(review): the recorded run of this cell failed with an NLTK LookupError for the
# 'punkt_tab' resource; run nltk.download('punkt_tab') once in this environment first.

# .copy() makes eng_df an independent frame, so adding a column below does not write
# into a slice of detected_df (avoids pandas' SettingWithCopyWarning).
eng_df = detected_df[detected_df['Language_pred'].isin(eng_categories)].copy()
eng_df['Native_text'] = eng_df['Text'].apply(remove_english_words)

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\useer/nltk_data'
    - 'f:\\Projects\\translate\\nltk_data'
    - 'f:\\Projects\\translate\\share\\nltk_data'
    - 'f:\\Projects\\translate\\lib\\nltk_data'
    - 'C:\\Users\\useer\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [15]:
# Second detection pass, this time on the texts with their English words removed

fixed_outputs = IndicLID_model.batch_predict(
    eng_df['Native_text'].tolist(),
    batch_size=batch_size,
)

KeyError: 'Native_text'

In [14]:
# Write the second-pass predictions onto the English-bucket rows, then put those rows
# back together with the rest of the detections.

# Column is spelled 'Certainty' so it matches the first-pass frame.
fixed_df = pd.DataFrame(fixed_outputs, columns=['Text', 'Language_pred', 'Certainty', 'Model'])

# Values are copied positionally.
# NOTE(review): this assumes batch_predict returns exactly one result per input, in
# input order — confirm against the detector.
# Certainty and Model are refreshed together with Language_pred so that the three
# columns always describe the same prediction. assign() returns a new frame, so
# nothing is written into a slice of detected_df.
eng_df = eng_df.assign(
    Language_pred=fixed_df['Language_pred'].tolist(),
    Certainty=fixed_df['Certainty'].tolist(),
    Model=fixed_df['Model'].tolist(),
)

# Replace the first-pass English-bucket rows with their corrected versions.
# NOTE: re-running this cell without re-running the detection cell appends eng_df again.
detected_df = detected_df[~(detected_df['Language_pred'].isin(eng_categories))]
detected_df = pd.concat([detected_df, eng_df], ignore_index=True)
detected_df['Text'] = detected_df['Text'].astype(str)

NameError: name 'fixed_outputs' is not defined

In [48]:
# Function to map detector labels to consolidated language tags

def language_mapper(lang_code):
    """Translate a detector label into a consolidated language tag.

    The label is looked up in the category lists in a fixed order (Hindi, Telugu,
    Bengali, English); the tag of the first list that contains it is returned.

    Args:
        lang_code: label produced by the detector.

    Returns:
        'hi', 'te', 'bn' or 'en', or 'unidentified' when no list contains the label.
    """
    tag_groups = (
        ('hi', hindi_categories),
        ('te', tel_categories),
        ('bn', ben_categories),
        ('en', eng_categories),
    )
    for tag, labels in tag_groups:
        if lang_code in labels:
            return tag
    return 'unidentified'

In [49]:
# Consolidated language tag for every detected label
detected_df['consol_lang_tag'] = detected_df['Language_pred'].map(language_mapper)

In [50]:
# Attach the detection results to the input rows by matching on the preprocessed text.
# The same text can occur many times (e.g. emoji-only comments that all become '$'),
# so the detections are reduced to one row per text first. Without that, k repeated
# texts would produce k*k merged rows before de-duplication.
# validate='many_to_one' makes pandas raise if the right side is not unique per text.
# Default inner join: input rows whose text has no detection are dropped.

merged_df = pd.merge(
    input_df,
    detected_df.drop_duplicates(subset='Text'),
    left_on='Preprocessed_text',
    right_on='Text',
    validate='many_to_one',
)
# Keep a single row per input identifier (guards against repeated identifiers in the input).
merged_df = merged_df.drop_duplicates(subset=unique_row_identifier_col)

In [52]:
# Save the merged results to Excel

file_save_path = r"C:\Users\useer\Downloads\Mamata_replies_Aug_28_2024_detected.xlsx"
# index=False: the post-merge index is only a positional counter; writing it would add
# a spurious unnamed first column next to the row-identifier column already in the data.
merged_df.to_excel(file_save_path, index=False)