# Generate locale JSON files locally from the received delta-translated Excel files

In [1]:
import pandas as pd
import openpyxl
import json
import os
import re

In [2]:
def reformat_json(json_obj):
    """Build a dictionary from an iterable of two-item entries.

    Every entry of ``json_obj`` is unpacked into a key and a value. When a
    key occurs more than once, the value of the last entry wins.
    """
    return {entry_key: entry_value for entry_key, entry_value in json_obj}

In [3]:
def set_values(df_row):
    """Overwrite a row's ``value`` with its translation when one is present.

    The translation is read from the column named by the module-level
    ``lang`` variable. A row whose translation is null is returned as is.
    """
    translation = df_row[lang]
    if pd.isnull(translation):
        return df_row
    df_row['value'] = translation
    return df_row

In [4]:
def set_variables(df_row):
    """Fill placeholder and anchor-tag values into one translation row.

    For every name in the module-level ``allowed_values`` list, a non-null
    cell in the column of that name replaces the ``<name>`` placeholder in
    both the translation column (named by the module-level ``lang`` variable)
    and the ``key`` column. When the row has a non-null ``a-tag-replacement``
    cell, the attributes of the first ``<a ...>`` tag in each of those two
    columns are replaced with that cell's text.

    The substitution is best-effort: when a column is missing or a cell is
    not a string, that step is skipped instead of raising. The translation
    column is updated before ``key``, so a row without a ``key`` column still
    gets its translation updated.

    Returns the same row object (mutated in place), which makes the function
    usable with ``DataFrame.apply(..., axis=1)``.
    """
    # Only "column missing" and "cell is not a string" are expected failures;
    # anything else propagates instead of being hidden by a bare ``except``.
    skippable = (KeyError, TypeError, AttributeError)

    def swap_anchor_attributes(text, attributes):
        # Positions are located in ``text`` itself, so the translation and the
        # key are each sliced with their own indices.
        tag_start = text.find('<a')
        if tag_start == -1:
            return text  # no anchor tag: nothing to replace
        tag_end = text.find('>', tag_start)  # the '>' that closes this tag
        if tag_end == -1:
            return text
        if tag_end == tag_start + 2:
            # Bare "<a>": there is no separator to keep, so add one.
            return text[:tag_end] + ' ' + attributes + text[tag_end:]
        # Keep "<a" plus the separator character that follows it.
        return text[:tag_start + 3] + attributes + text[tag_end:]

    for value in allowed_values:
        try:
            replacement = df_row[value]
            if pd.notna(replacement):
                placeholder = '<' + value + '>'
                df_row[lang] = df_row[lang].replace(placeholder, replacement)
                # NOTE(review): read_excels_as_df names this column 'Key', so
                # for its rows this lookup raises KeyError and the key is left
                # untouched -- confirm that is the intended behaviour.
                df_row['key'] = df_row['key'].replace(placeholder, replacement)
        except skippable:
            continue

    try:
        attributes = df_row['a-tag-replacement']
        if pd.notna(attributes):
            df_row[lang] = swap_anchor_attributes(df_row[lang], attributes)
            df_row['key'] = swap_anchor_attributes(df_row['key'], attributes)
    except skippable:
        pass

    return df_row

In [5]:
def load_json_as_df(json_data):
    """Turn a key -> value mapping into a DataFrame.

    The result has one row per mapping item with the columns ``Key`` and
    ``value``, plus ``Key_lower`` holding the lower-cased key.
    """
    records = list(json_data.items())
    frame = pd.DataFrame(records, columns=['Key', 'value'])
    return frame.assign(Key_lower=frame['Key'].str.lower())

In [6]:
def read_excels_as_df(translation_excel_files, language_code, language_name):
    """Collect the translated rows of every sheet of the given workbooks.

    Each sheet is parsed with ``header=1`` (the second row supplies the
    column names). From a sheet, only the English column (module-level
    ``english_col``), the ``language_name`` column and whichever columns
    listed in the module-level ``allowed_values`` exist are kept; rows with a
    null English or translated cell are dropped and the English column is
    renamed to ``Key``. Sheets without any columns are skipped.

    Args:
        translation_excel_files: paths of the workbooks to read.
        language_code: not used by this function; kept for the existing
            call signature.
        language_name: header of the translation column.

    Returns:
        A DataFrame with all collected rows after ``set_variables`` has been
        applied to every row.

    Raises:
        KeyError: when a non-empty sheet lacks the English or the
            ``language_name`` column.
    """
    # Start from an empty frame so the result has the 'Key' and language
    # columns even when no workbook contributes a row.
    frames = [pd.DataFrame([], columns=['Key', language_name])]
    for excel_file_name in translation_excel_files:
        # ExcelFile keeps the workbook open; the context manager closes it
        # once all of its sheets have been parsed.
        with pd.ExcelFile(excel_file_name) as excel:
            for sheet_name in excel.sheet_names:
                sheet = excel.parse(sheet_name=sheet_name, header=1)
                if len(sheet.columns) == 0:
                    continue
                wanted_columns = [english_col, language_name]
                wanted_columns.extend(
                    value for value in allowed_values if value in sheet.columns
                )
                translated_rows = sheet[wanted_columns].dropna(
                    subset=[english_col, language_name]
                )
                frames.append(translated_rows.rename(columns={english_col: 'Key'}))

    # Concatenate once instead of growing the frame sheet by sheet.
    excel_df = pd.concat(frames, axis=0)
    return excel_df.apply(set_variables, axis=1)

In [7]:
def read_excels(input_base_path, language_code, language_name):
    """Read every translation workbook of one language into a DataFrame.

    Workbooks are the ``.xlsx`` files inside ``<input_base_path>/<language_code>``
    whose names do not start with ``~``; they are processed in sorted name
    order by ``read_excels_as_df``.
    """
    language_dir = '{}/{}'.format(input_base_path, language_code)
    workbook_paths = []
    for file_name in sorted(os.listdir(language_dir)):
        if file_name.startswith('~') or not file_name.endswith('.xlsx'):
            continue
        workbook_paths.append(language_dir + "/" + file_name)
    return read_excels_as_df(workbook_paths, language_code, language_name)

In [8]:
def read_json(json_file_path):
    """Load a JSON file and return its parsed content.

    The file is decoded as UTF-8 explicitly, so files containing non-ASCII
    text load the same way regardless of the platform's default encoding.

    Args:
        json_file_path: path of the JSON file to read.

    Returns:
        The object produced by ``json.load`` for the file's content.
    """
    with open(json_file_path, encoding='utf-8') as f:
        return json.load(f)

In [9]:
def clean_excel_df(df, language_name):
    """Return a cleaned copy of the translation rows; ``df`` is not modified.

    Steps, in order:
      1. add ``Key_lower``: the ``Key`` column lower-cased and stripped;
      2. strip surrounding whitespace from the ``language_name`` column;
      3. drop rows with a duplicate ``Key_lower``, keeping the last one;
      4. in ``Key_lower`` and in the ``language_name`` column, convert each
         cell to ``str`` and remove a leading ``"x "``, then a leading
         ``"X "``, then a trailing ``" -"``.
    """
    def strip_markers(cell):
        text = str(cell)
        text = re.sub(r'^x ', '', text)
        text = re.sub(r'^X ', '', text)
        return re.sub(r' -$', '', text)

    cleaned = df.copy()
    cleaned['Key_lower'] = cleaned['Key'].str.lower().str.strip()
    cleaned[language_name] = cleaned[language_name].str.strip()
    # NOTE(review): duplicates are dropped before the markers are removed, so
    # two keys that differ only by a marker both survive -- confirm intended.
    cleaned = cleaned.drop_duplicates(subset=['Key_lower'], keep='last')
    cleaned['Key_lower'] = cleaned['Key_lower'].apply(strip_markers)
    cleaned[language_name] = cleaned[language_name].apply(strip_markers)
    return cleaned

In [10]:
def clean_json_df(df):
    """Return a normalised, de-duplicated copy of the locale JSON rows.

    A ``Key_lower`` column (the ``Key`` column lower-cased and stripped) is
    added, and rows repeating an earlier ``Key`` are dropped, keeping the
    first occurrence. ``df`` itself is not modified.

    The de-duplicated frame is what gets returned; previously it was computed
    and then discarded in favour of the frame that still had duplicates.
    """
    out_df = df.copy()
    out_df['Key_lower'] = out_df['Key'].str.lower().str.strip()
    return out_df.drop_duplicates(subset=['Key'], keep='first')

In [11]:
def write_df_to_json(df, output_json_path):
    """Write a two-column DataFrame to a JSON file as one key -> value object.

    The frame is serialised row by row (``orient='values'``) and every row is
    turned into one entry by ``reformat_json``, which unpacks each row into a
    key and a value -- so ``df`` must have exactly two columns.

    The output is written with ``ensure_ascii=False``, i.e. non-ASCII text is
    stored as is; the file is therefore opened as UTF-8 explicitly instead of
    relying on the platform's default encoding.

    Args:
        df: DataFrame whose rows are (key, value) pairs.
        output_json_path: path of the JSON file to create or overwrite.
    """
    rows = json.loads(df.to_json(orient='values'))
    reformatted_json = reformat_json(rows)

    with open(output_json_path, 'w', encoding='utf-8') as f:
        f.write(json.dumps(reformatted_json, indent=4, ensure_ascii=False))

In [21]:
def get_locale_data(input_base_path, language_code, language_name):
    """Merge the translated Excel rows of one language into its locale JSON.

    The Excel rows are read from ``<input_base_path>/<language_code>``, the
    current locale entries from the crowdsource-ui ``locales`` directory.
    Both are cleaned and right-merged on ``Key_lower``, so every existing
    locale key is kept; ``set_values`` then overwrites ``value`` wherever a
    translation is available.

    Side effect: sets the module-level ``lang`` variable to ``language_name``.

    Returns:
        A tuple ``(cleaned Excel rows, final frame, merged frame)`` where the
        final frame holds the ``Key_y`` and ``value`` columns of the merged
        frame with duplicate ``Key_y`` rows removed (first one kept).
    """
    # set_values and set_variables look up the translation column via this global.
    global lang
    lang = language_name

    raw_translations = read_excels(input_base_path, language_code, language_name)
    locale_json_path = './../../../crowdsource-ui/locales/{locale}.json'.format(locale=language_code)
    existing_entries = load_json_as_df(read_json(locale_json_path))

    translations = clean_excel_df(raw_translations, language_name)
    existing_entries = clean_json_df(existing_entries)

    merged_df = pd.merge(translations, existing_entries, on="Key_lower", how='right')
    merged_df = merged_df.apply(set_values, axis=1)

    # After the merge, 'Key_y' is the key as spelled in the locale JSON.
    final_df = merged_df[['Key_y', 'value']].drop_duplicates(subset='Key_y', keep='first')

    return translations, final_df, merged_df


In [43]:
def get_matched_count(excel_df, merged_df):
    """Count the rows of ``excel_df`` whose ``Key_lower`` occurs in ``merged_df``.

    Every row of ``excel_df`` is counted at most once, however often its key
    appears in ``merged_df['Key_lower']``. NaN keys never match, as NaN does
    not compare equal to anything.

    Args:
        excel_df: frame with a ``Key_lower`` column (the keys to look up).
        merged_df: frame with a ``Key_lower`` column (the keys to look in).

    Returns:
        The number of matching rows as an ``int``.
    """
    # One set lookup per key replaces a scan over all merged keys.
    merged_keys = set(merged_df['Key_lower'])
    # ``key == key`` is False only for NaN; this keeps NaN unmatched even
    # though a set would find a NaN object by identity.
    return sum(
        1 for key in excel_df['Key_lower'] if key == key and key in merged_keys
    )
            

In [42]:
# Header of the English source column in the translation workbooks.
english_col = 'English copy'
# Column names whose cells are substituted for '<name>' placeholders.
allowed_values = ['x', 'y', 'z', 'u', 'v', 'w']
# (locale code, translation column header) for every language to process.
languages = [
    ('hi', 'Hindi'),
    ('mr', 'Marathi'),
    ('ta', 'Tamil'),
    ('or', 'Odia'),
    ('kn', 'Kannada'),
    ('te', 'Telugu'),
    ('gu', 'Gujarati'),
    ('bn', 'Bengali'),
    ('pa', 'Punjabi'),
    ('as', 'Assamese'),
    ('ml', 'Malayalam'),
]

input_base_path = "./input_excel_files"
output_base_path = "./output_json_files"
os.makedirs(output_base_path, exist_ok=True)

# Print a per-language summary of how many keys exist and how many Excel
# rows matched an existing locale key.
for language_code, language_name in languages:
    excelDf_dropped, final_df, merged_df = get_locale_data(input_base_path, language_code, language_name)
    final_keys = final_df['Key_y']
    matched_keys = get_matched_count(excelDf_dropped, merged_df)
    print("****** LOCALE = {} **********".format(language_name))
    print("Final Number of Keys(WithoutDuplicates/WithDuplicates) = {}/{}".format(final_keys.nunique(), final_keys.count()))
    print("Matched Keys => {}/{}".format(matched_keys, excelDf_dropped['Key'].count()))
    print("****************")
    output_json_path = '{base_path}/{language}.json'.format(base_path=output_base_path, language=language_code)
    # Writing is disabled: the loop only reports counts. Uncomment to write the files.
#     write_df_to_json(final_df, output_json_path)


****** LOCALE = Hindi **********
Final Number of Keys(WithoutDuplicates/WithDuplicates) = 337/337
Matched Keys => 12/12
****************
****** LOCALE = Marathi **********
Final Number of Keys(WithoutDuplicates/WithDuplicates) = 337/337
Matched Keys => 12/12
****************
****** LOCALE = Tamil **********
Final Number of Keys(WithoutDuplicates/WithDuplicates) = 337/337
Matched Keys => 0/0
****************
****** LOCALE = Odia **********
Final Number of Keys(WithoutDuplicates/WithDuplicates) = 337/337
Matched Keys => 12/12
****************
****** LOCALE = Kannada **********
Final Number of Keys(WithoutDuplicates/WithDuplicates) = 337/337
Matched Keys => 12/12
****************
****** LOCALE = Telugu **********
Final Number of Keys(WithoutDuplicates/WithDuplicates) = 337/337
Matched Keys => 12/12
****************
****** LOCALE = Gujarati **********
Final Number of Keys(WithoutDuplicates/WithDuplicates) = 337/337
Matched Keys => 0/0
****************
****** LOCALE = Bengali **********
Fin