# Imports

In [19]:
import logging
import re

import pandas as pd

# Display option: cap the number of rows pandas prints for a DataFrame
pd.set_option('display.max_rows', 5)

# Log record layout: timestamp, level name left-aligned in 8 columns, message
logger_format = '%(asctime)s %(levelname)-8s %(message)s'
logging.basicConfig(format=logger_format)

# Notebook-level logger; INFO and above are emitted
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Loading transliteration map

The first 3 columns (`count`, `%`, `% cumsum`) are statistics computed over the original full names that were transliterated to create this map; the last 2 columns were generated by the transliteration model:
* `name_English` - the transliteration.
* `is_male_first_name` - `1` if the name part is a male first name, `-1` if it is a female first name, and `0` if it is not a first name.

In [20]:
# Path of the Excel workbook that holds the transliteration map
transliteration_map_xlsx_path = "data/names2EN system=1 filtering=AR-FA N.xlsx"
# -----------
# index_col=0: the workbook's first column becomes the DataFrame index
transliteration_map = pd.read_excel(io=transliteration_map_xlsx_path, index_col=0)
transliteration_map

Unnamed: 0_level_0,count,%,% cumsum,name_English,is_male_first_name
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
محمد,67055,4.132909,4.132909,Mohammed,1
علي,63535,3.915955,8.048864,Ali,1
...,...,...,...,...,...
توتونجيان,1,0.000062,99.999938,Toutonjian,-1
يُسر,1,0.000062,100.000000,Yusr,-1


# Defining `transliterate` function

In [21]:
# Matches one character from the Arabic-script Unicode ranges listed in the class;
# used to detect name parts that contain Arabic or Persian characters.
filtering_pattern = r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFBC1\uFE70-\uFEFF]'
filtering_regex = re.compile(filtering_pattern)

# Matches a leading or a trailing run of characters that are neither ASCII letters
# nor in the Arabic-script ranges above, so that run can be stripped from a name part.
surrounding_chars_pattern = r'^[^a-zA-Z\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFBC1\uFE70-\uFEFF]+|[^a-zA-Z\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFBC1\uFE70-\uFEFF]+$'
surrounding_chars_regex = re.compile(surrounding_chars_pattern)
# -----------------

def transliterate(sentence,
                  transliteration_map: pd.DataFrame = transliteration_map, target_col: str = 'name_English',
                  surrounding_chars_regex: re.Pattern = surrounding_chars_regex,
                  filtering_regex: re.Pattern = filtering_regex) -> "tuple[str, list[str]]":
    """Transliterate a sentence (assumed to be an Arabic full name), one name part at a time.

    The sentence is cast to `str` and split on whitespace. Each name part is cleaned of
    surrounding characters with `surrounding_chars_regex` and looked up in the index of
    `transliteration_map`. When found, the cleaned part is replaced inside the original
    token by the value in `target_col`, so surrounding characters are kept as is.
    Name parts that are not found, or whose transliteration is null, are left unchanged.

    A warning is logged when a name part that is not in the map contains at least one
    character matched by `filtering_regex` (Arabic/Persian); such parts are also returned.

    Note: the default arguments are bound when this definition is executed, so re-run
    this cell after reloading `transliteration_map` or recompiling the regexes.

    Args:
        sentence: the text to transliterate; any object, it is converted with `str()`.
        transliteration_map: DataFrame whose index holds the source name parts.
        target_col: column of `transliteration_map` holding the transliteration.
        surrounding_chars_regex: compiled regex whose matches are removed from each
            name part before the lookup; if falsy (e.g. `None`), no cleaning is done.
        filtering_regex: compiled regex used to decide whether a name part missing
            from the map should be reported.

    Returns:
        A tuple `(transliterated_fullname, missing_names)`: the transliterated sentence
        with name parts joined by single spaces, and the list of original (uncleaned)
        name parts that matched `filtering_regex` but were not in the map.
    """
    known_names = transliteration_map.index  # loop-invariant, looked up once

    missing_names = []
    transliterated_parts = []
    for name in str(sentence).split():
        # Without a cleaning regex the raw token is looked up as is. The original code
        # left `name_clean` unassigned (or stale from the previous token) in that case.
        name_clean = surrounding_chars_regex.sub('', name) if surrounding_chars_regex else name

        if name_clean in known_names:
            name_EN = transliteration_map.loc[name_clean, target_col]
            if pd.isnull(name_EN):
                logger.warning(f"'{name}' EN transliteration is None -> leaving as is")
                transliterated_parts.append(name)
            else:
                if not isinstance(name_EN, str):
                    logger.info(f"'{name}' transliteration is not a string, so casting it")
                    name_EN = str(name_EN)
                # Replace inside the original token so surrounding characters survive
                transliterated_parts.append(name.replace(name_clean, name_EN))
        else:
            transliterated_parts.append(name)
            if filtering_regex.search(name_clean):
                missing_names.append(name)
                logger.warning(f"'{name_clean}' (cleaned of surrounding chars) contains Arabic or Persian letters but doesn't exist in transliteration_map -> leaving as is, consider transliterating and adding to transliteration_map!")
            else:
                logger.info(f"'{name}' not in transliteration_map -> leaving")

    # strip() keeps the original result when an outer part is empty or padded
    transliterated_fullname = ' '.join(transliterated_parts).strip()
    return transliterated_fullname, missing_names

# Testing `transliterate`

In [22]:
# Sample input: a Hebrew token with a colon, then Arabic name parts with parentheses and a hyphen
test_fullname = 'בדיקה: محمد (النبطية-بئر) جواد رضوان نصار'
# Expected result, taken from the output recorded for this cell with the current map file
expected_transliteration = 'בדיקה: Mohammed (Alnabatieh-Bir) Jawad Ridwan Nassar'
# -----------------

test_transliterated, missing_names = transliterate(sentence=test_fullname)
print(f"'{test_fullname}' ->\n'{test_transliterated}'")

# Fail loudly instead of relying on eyeballing the printed output
assert test_transliterated == expected_transliteration, f"unexpected transliteration: '{test_transliterated}'"
assert not missing_names, f"name parts missing from transliteration_map: {missing_names}"

2023-11-20 12:29:24,021 INFO     'בדיקה:' not in transliteration_map -> leaving


'בדיקה: محمد (النبطية-بئر) جواد رضوان نصار' ->
'בדיקה: Mohammed (Alnabatieh-Bir) Jawad Ridwan Nassar'
