### Package Imports

In [1]:
import pandas as pd
from Preprocessing import remove_mentions_and_urls, remove_emojis, remove_english_words

from Detect.IndicLID.Inference.ai4bharat.IndicLID import IndicLID                           # Detection Module

### Load Model

In [2]:
IndicLID_model = IndicLID(input_threshold = 0.5, roman_lid_threshold = 0.6)                 # Loading Model with recommended Hyperparams

detection_batch_size = 64



In [3]:
# Load dataset and preprocess

input_file_path = r'C:\Users\useer\Downloads\Twitter_Mention_Biweekly_16_31_Jan2023.xlsx'
input_df = pd.read_excel(input_file_path)
input_df['Preprocessed_text'] = input_df['Tweet'].astype(str).apply(remove_mentions_and_urls).apply(remove_emojis)

In [4]:
# Run detection model
detected_tuples = IndicLID_model.batch_predict(input_df['Preprocessed_text'].tolist(), batch_size = detection_batch_size)

# Turn detection out (in tuple format) to df
detected_df = pd.DataFrame(detected_tuples, columns = ['Text', 'Language_pred', 'Certainity', 'Model'])

In [5]:
# Detection tags classification

hindi_categories = ['hin_Deva', 'mai_Deva', 'mar_Deva', 'san_Deva', 'hin_Latn', 'kas_Deva', 'pan_Latn', 'kok_Deva', 'brx_Deva', 'doi_Deva', 'urd_Latn', 'mai_Latn', 'brx_Latn', 'nep_Deva', 'asm_Latn']
ben_categories = ['ben_Beng', 'asm_Beng', 'ben_Latn', 'mni_Beng', 'sat_Olch']
tel_categories = ['tel_Telu', 'tel_Latn']
eng_categories = ['eng_Latn', 'mni_Latn', 'kok_Latn']

## Evaluation before english character removal

In [6]:
detected_df['Language_true'] = input_df['Label'].tolist()

detected_df = detected_df[~detected_df['Language_pred'].isin(['mni_Meti', 'sat_Olch'])]     # mni_Meti corresponds to empty string and sat_olch to only punctuations

eval_df = detected_df

# Estimating the count of correct predictions
correct_hindi_count = ((eval_df['Language_true'] == 'Hindi') & (eval_df['Language_pred'].isin(hindi_categories))).sum()
correct_ben_count = ((eval_df['Language_true'] == 'Bengali') & (eval_df['Language_pred'].isin(ben_categories))).sum()
correct_tel_count = ((eval_df['Language_true'] == 'Telugu') & (eval_df['Language_pred'].isin(tel_categories))).sum()

In [7]:
# Estimating accuracy in each language

hindi_acc = str(round(correct_hindi_count * 100 / ((eval_df['Language_true'] == 'Hindi').sum()), 2)) + '%'
ben_acc = str(round(correct_ben_count * 100 / ((eval_df['Language_true'] == 'Bengali').sum()), 2)) + '%'
tel_acc = str(round(correct_tel_count * 100 / ((eval_df['Language_true'] == 'Telugu').sum()), 2)) + '%'
total_valid = (eval_df['Language_true'] == 'Hindi').sum() + (eval_df['Language_true'] == 'Bengali').sum() + (eval_df['Language_true'] == 'Telugu').sum()

print("Hindi Prediction accuracy:", hindi_acc, "out of", ((eval_df['Language_true'] == 'Hindi').sum()))
print("Bengali Prediction accuracy:", ben_acc, "out of", ((eval_df['Language_true'] == 'Bengali').sum()))
print("Telugu Prediction accuracy:", tel_acc, "out of", ((eval_df['Language_true'] == 'Telugu').sum()))
print("Overall accuracy:", str(round((correct_hindi_count + correct_ben_count + correct_tel_count) * 100 / total_valid, 2)) + '%', "out of", total_valid)

Hindi Prediction accuracy: 97.86% out of 234
Bengali Prediction accuracy: 100.0% out of 227
Telugu Prediction accuracy: 97.4% out of 231
Overall accuracy: 98.41% out of 692


## English Character Stripping

In [8]:
# Filter out sentences with 'eng_Latn' tag and strip english characters

eng_df = eval_df[eval_df['Language_pred'].isin(eng_categories)]
eng_df['Native_text'] = eng_df['Text'].apply(remove_english_words)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eng_df['Native_text'] = eng_df['Text'].apply(remove_english_words)


In [9]:
# Run Detection Model after english character stripping

fixed_outputs = IndicLID_model.batch_predict(eng_df['Native_text'].tolist(), batch_size = 32)

In [10]:
# Join back corrected results to parent df

fixed_df = pd.DataFrame(fixed_outputs, columns = ['Text', 'Language_pred', 'Certainity', 'Model'])
eng_df['Language_pred'] = fixed_df['Language_pred'].tolist()
eng_df.to_excel(r'C:\Users\useer\Downloads\xyz.xlsx')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eng_df['Language_pred'] = fixed_df['Language_pred'].tolist()


In [11]:

eval_df = eval_df[~(eval_df['Language_pred'].isin(eng_categories))]
eval_df = pd.concat([eval_df, eng_df], ignore_index=True)

## Evaluation after english character removal

In [12]:
# Estimating the count of correct predictions
correct_hindi_count = ((eval_df['Language_true'] == 'Hindi') & (eval_df['Language_pred'].isin(hindi_categories))).sum()
correct_ben_count = ((eval_df['Language_true'] == 'Bengali') & (eval_df['Language_pred'].isin(ben_categories))).sum()
correct_tel_count = ((eval_df['Language_true'] == 'Telugu') & (eval_df['Language_pred'].isin(tel_categories))).sum()

In [13]:
# Estimating accuracy in each language

hindi_acc = str(round(correct_hindi_count * 100 / ((eval_df['Language_true'] == 'Hindi').sum()), 2)) + '%'
ben_acc = str(round(correct_ben_count * 100 / ((eval_df['Language_true'] == 'Bengali').sum()), 2)) + '%'
tel_acc = str(round(correct_tel_count * 100 / ((eval_df['Language_true'] == 'Telugu').sum()), 2)) + '%'
total_valid = (eval_df['Language_true'] == 'Hindi').sum() + (eval_df['Language_true'] == 'Bengali').sum() + (eval_df['Language_true'] == 'Telugu').sum()

print("Hindi Prediction accuracy:", hindi_acc, "out of", ((eval_df['Language_true'] == 'Hindi').sum()))
print("Bengali Prediction accuracy:", ben_acc, "out of", ((eval_df['Language_true'] == 'Bengali').sum()))
print("Telugu Prediction accuracy:", tel_acc, "out of", ((eval_df['Language_true'] == 'Telugu').sum()))
print("Overall accuracy:", str(round((correct_hindi_count + correct_ben_count + correct_tel_count) * 100 / total_valid, 2)) + '%', "out of", total_valid)

Hindi Prediction accuracy: 98.29% out of 234
Bengali Prediction accuracy: 100.0% out of 227
Telugu Prediction accuracy: 97.84% out of 231
Overall accuracy: 98.7% out of 692


In [None]:
# Save erroneous entries

rows_not_satisfying_conditions = eval_df[~((eval_df['Language_true'] == 'hi') & (eval_df['Language_pred'].isin(hindi_categories))) &
                                           ~((eval_df['Language_true'] == 'bn') & (eval_df['Language_pred'].isin(ben_categories))) &   
                                           ~((eval_df['Language_true'] == 'te') & (eval_df['Language_pred'].isin(tel_categories)))]

rows_not_satisfying_conditions.to_excel(r'C:\Users\useer\Downloads\final_detect_check.xlsx')