In [197]:

import json
import os
import re

import openpyxl
import pandas as pd
from pandas import ExcelWriter

In [198]:
# Language column currently being processed. Helper functions read this
# module-level name; getAllTranslations() reassigns it for every locale.
lang = 'Hindi'

# Header of the column holding the English source string in each sheet.
english_col = 'English copy'

# Optional placeholder columns; '<x>' in a string is filled from column 'x', etc.
allowed_values = ['x', 'y', 'z']

# (locale code, language column name) pairs, in processing order.
languages = [
    ('hi', 'Hindi'),
    ('gu', 'Gujarati'),
    ('ta', 'Tamil'),
    ('bn', 'Bengali'),
    ('te', 'Telugu'),
    ('pa', 'Punjabi'),
    ('mr', 'Marathi'),
    ('as', 'Assamese'),
    ('ml', 'Malayalam'),
    ('or', 'Odia'),
    ('kn', 'Kannada'),
]

In [199]:
def readInputJsonAsDf(input_json_file):
    """Load a flat ``{key: value}`` JSON file into a DataFrame.

    Parameters
    ----------
    input_json_file : str
        Path to a JSON file whose top level is an object (it is iterated
        with ``.items()``).

    Returns
    -------
    pandas.DataFrame
        One row per JSON entry with columns ``Key``, ``value`` and
        ``Key_lower`` (the key lower-cased, used for case-insensitive joins).
    """
    # The locale files contain non-ASCII text; read them as UTF-8 explicitly
    # instead of relying on the platform's default encoding.
    with open(input_json_file, encoding='utf-8') as f:
        js = json.load(f)

    out_df = pd.DataFrame(list(js.items()), columns=['Key', 'value'])
    out_df['Key_lower'] = out_df['Key'].str.lower()
    return out_df

In [200]:
def set_values(df_row):
    """Copy the row's translation into ``value`` when a translation exists.

    Reads the column named by the module-level ``lang``. If that cell is
    null the row is returned untouched; otherwise ``df_row['value']`` is
    overwritten with it. Intended for ``DataFrame.apply(..., axis=1)``.
    """
    translation = df_row[lang]
    if pd.notnull(translation):
        df_row['value'] = translation
    return df_row
    

In [201]:
def set_variables(df_row):
    """Fill ``<x>``/``<y>``/``<z>``-style placeholders in a translation row.

    For every name in the module-level ``allowed_values`` that is a column
    of the row and holds a string, the token ``'<name>'`` is replaced by
    that string in the translation column (module-level ``lang``) and in the
    key column.

    The key column is looked up as ``'Key'``, which is the name
    ``getTranslationFromSheets`` gives it. The previous lookup used
    ``'key'``; that column does not exist on those rows, and the resulting
    ``KeyError`` was swallowed by a bare ``except``, so keys were never
    substituted. A lower-case ``'key'`` column is still honoured if present.

    Substitution stays best-effort: missing columns, null cells and
    non-string cells are skipped rather than raising (non-string placeholder
    values were already skipped before, via a swallowed ``TypeError``).

    Parameters
    ----------
    df_row : pandas.Series
        One row, as passed by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    pandas.Series
        The same row object, updated in place.
    """
    text_columns = [lang] + [name for name in ('Key', 'key') if name in df_row.index]
    for value in allowed_values:
        if value not in df_row.index:
            continue
        replacement = df_row[value]
        # NaN (no placeholder value supplied) and non-text values are skipped.
        if not isinstance(replacement, str):
            continue
        placeholder = '<' + value + '>'
        # Each column is handled independently so a problem with one cannot
        # prevent the substitution in the other.
        for column in text_columns:
            text = df_row.get(column)
            if isinstance(text, str):
                df_row[column] = text.replace(placeholder, replacement)
    return df_row

In [202]:
def reformat_json(json_obj):
    """Turn an iterable of ``(key, value)`` pairs into a dict.

    Later pairs overwrite earlier ones that share a key.
    """
    return {key: value for key, value in json_obj}

In [203]:
def save_xls(list_dfs, xls_path):
    """Write each DataFrame to its own sheet of one Excel workbook.

    The n-th frame goes to a sheet named after the n-th entry of the
    module-level ``languages`` list (its language name).

    Parameters
    ----------
    list_dfs : list of pandas.DataFrame
        Frames to write, in the same order as ``languages``.
    xls_path : str
        Destination workbook path.
    """
    # Leaving the `with` block saves and closes the workbook, so no explicit
    # writer.save() is needed (that method no longer exists in pandas >= 2.0).
    with ExcelWriter(xls_path) as writer:
        for n, df in enumerate(list_dfs):
            df.to_excel(writer, sheet_name=languages[n][1])

In [204]:
def getTranslationFromSheets(locale, translation_excel_files):
    """Collect key/translation rows from every sheet of the given workbooks.

    Each sheet is parsed with its second row as the header (``header=1``).
    From every non-empty sheet the English column (module-level
    ``english_col``), the translation column (module-level ``lang``) and any
    placeholder columns listed in ``allowed_values`` are kept. Rows missing
    either the English text or the translation are dropped, and the English
    column is renamed to ``Key``. Finally ``set_variables`` is applied to
    every row.

    Parameters
    ----------
    locale : str
        Directory (relative to the working directory) holding the workbooks.
    translation_excel_files : list of str
        Workbook file names inside ``locale``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Key``, ``<lang>`` and any placeholder columns that were
        present. Original sheet row indices are preserved.

    Raises
    ------
    KeyError
        If a non-empty sheet lacks the English or the translation column.
    """
    sheet_frames = []
    for excel_file_name in translation_excel_files:
        # The context manager closes the workbook handle once it is parsed.
        with pd.ExcelFile(locale + '/' + excel_file_name) as excel:
            for sheetName in excel.sheet_names:
                sheet = excel.parse(sheet_name=sheetName, header=1)
                if len(sheet.columns) == 0:
                    continue

                missing = [col for col in (english_col, lang) if col not in sheet.columns]
                if missing:
                    # Same exception type as plain column selection would
                    # raise, but it says which file and sheet are at fault.
                    raise KeyError('{} / sheet {!r}: missing column(s) {}'.format(
                        excel_file_name, sheetName, missing))

                wanted = [english_col, lang] + [
                    value for value in allowed_values if value in sheet.columns
                ]
                sheet_frames.append(
                    sheet[wanted]
                    .dropna(subset=[english_col, lang])
                    .rename(columns={english_col: 'Key'})
                )

    # Concatenate once instead of growing a frame inside the loop.
    if sheet_frames:
        excelDf = pd.concat(sheet_frames, axis=0)
    else:
        excelDf = pd.DataFrame([], columns=['Key', lang])

    return excelDf.apply(set_variables, axis=1)

In [207]:
def _strip_sheet_markers(text):
    """Remove a leading ``x ``/``X `` marker and a trailing `` -`` from a string.

    Non-string cells (e.g. NaN) are returned unchanged so that a missing
    value is not turned into the literal text ``'nan'``.
    """
    if not isinstance(text, str):
        return text
    return re.sub(r' -$', '', re.sub(r'^X ', '', re.sub(r'^x ', '', text)))


def getAllTranslations():
    """Merge the Excel translations of every locale into the UI locale JSONs.

    For each ``(locale, language)`` pair in the module-level ``languages``:

    1. read every ``.xlsx`` workbook (ignoring ``~`` lock files) in
       ``./<locale>`` via ``getTranslationFromSheets``;
    2. read ``./../../../crowdsource-ui/locales/<locale>.json``;
    3. match the two case-insensitively on the stripped English key;
    4. overwrite the JSON value wherever a translation exists and write the
       result to ``./<locale>.json``;
    5. print the shape of the matched (inner-joined) rows.

    After all locales are processed, the per-language key/translation tables
    are saved to ``./allTrans.xlsx`` (one sheet per language).

    Side effects: reassigns the module-level ``lang`` for every locale,
    because the sheet helpers read it as a global.

    Returns
    -------
    pandas.DataFrame
        All per-language ``Key``/``<language>`` tables stacked row-wise; an
        empty frame when ``languages`` is empty.
    """
    global lang

    langs_translations = []
    for locale, language in languages:
        lang = language
        path_to_xl = './' + locale
        excel_files = sorted(os.listdir(path_to_xl))
        translation_excel_files = [
            file_name for file_name in excel_files
            if file_name.endswith('.xlsx') and not file_name.startswith('~')
        ]

        excelDf = getTranslationFromSheets(locale, translation_excel_files)
        out_df = readInputJsonAsDf('./../../../crowdsource-ui/locales/{locale}.json'.format(locale=locale))

        excelDf['Key_lower'] = excelDf['Key'].str.lower().str.strip()
        excelDf[language] = excelDf[language].str.strip()
        # When a key occurs several times the last sheet wins. copy() makes
        # the column assignments below act on an independent frame.
        excelDf = excelDf.drop_duplicates(subset=['Key_lower'], keep='last').copy()
        excelDf['Key_lower'] = excelDf['Key_lower'].apply(_strip_sheet_markers)
        excelDf[language] = excelDf[language].apply(_strip_sheet_markers)

        out_df['Key_lower'] = out_df['Key'].str.lower().str.strip()

        # Stripping markers can make keys collide again, so de-duplicate once more.
        excelDf_dropped = excelDf.drop_duplicates(subset=['Key_lower'], keep='first')
        out_df_dropped = out_df.drop_duplicates(subset=['Key'], keep='first')

        lang_translations = excelDf_dropped[['Key', language]].copy()
        langs_translations.append(lang_translations)

        # Right join keeps every JSON key, translated or not; the inner join
        # is only used to report how many keys were matched.
        merged_df = pd.merge(excelDf_dropped, out_df_dropped, on="Key_lower", how='right')
        new_trans = pd.merge(excelDf_dropped, out_df_dropped, on="Key_lower", how='inner')

        merged_df = merged_df.apply(set_values, axis=1)
        final_df = merged_df[['Key_y', 'value']].drop_duplicates(subset='Key_y', keep='first')
        print(new_trans.shape)

        jsonFile = json.loads(final_df.to_json(orient='values'))
        final_final_json = reformat_json(jsonFile)

        output_json_file = locale + '.json'
        # ensure_ascii=False emits raw non-ASCII text, so the file must be
        # opened as UTF-8 rather than with the platform default encoding.
        with open(output_json_file, 'w', encoding='utf-8') as f:
            json.dump(final_final_json, f, indent=4, ensure_ascii=False)

    save_xls(langs_translations, './allTrans.xlsx')

    # DataFrame.append was removed in pandas 2.0; stack the tables once here.
    if not langs_translations:
        return pd.DataFrame()
    return pd.concat(langs_translations)


In [208]:
# Named `all_translations` rather than `all` so the builtin all() stays usable
# in later cells.
all_translations = getAllTranslations()
all_translations.head()

(227, 8)
(346, 7)
(227, 5)
(311, 7)
(261, 7)
(291, 7)
(255, 7)
(308, 7)
(264, 7)
(316, 5)
(253, 5)


Unnamed: 0,Key,Hindi,Gujarati,Tamil,Bengali,Telugu,Punjabi,Marathi,Assamese,Malayalam,Odia,Kannada
1,Please validate any error message before proce...,"आगे बढ़ने से पहले, कृपया किसी गड़बड़ी वाले मैसेज ...",,,,,,,,,,
2,Help us understand what’s wrong with the recor...,"रिकॉर्डिंग में क्या गड़बड़ी है, यह समझने में हमा...",,,,,,,,,,
5,Levels and Badging @BoloIndia,लेवल और बैज @बोलो इंडिया,,,,,,,,,,
7,Participate more to gain badges,बैज पाने के लिए और भागीदारी करें,,,,,,,,,,
9,"Level - ""X""","लेवल - ""X""",,,,,,,,,,


In [85]:
all 219

Unnamed: 0,Hindi_key,Hindi_value,Punjabi_key,Punjabi_value
1,Please validate any error message before proce...,"आगे बढ़ने से पहले, कृपया किसी गड़बड़ी वाले मैसेज ...",,
2,Help us understand what’s wrong with the recor...,"रिकॉर्डिंग में क्या गड़बड़ी है, यह समझने में हमा...",,
5,Levels and Badging @BoloIndia,लेवल और बैज @बोलो इंडिया,,
7,Participate more to gain badges,बैज पाने के लिए और भागीदारी करें,,
9,"Level - ""X""","लेवल - ""X""",,
...,...,...,...,...
124,,,You've earned a Silver Bhasha Samarthak Badge ...,ਤੁਸੀਂ 50 ਰਿਕਾਰਡਿੰਗ ਨੂੰ ਪ੍ਰਮਾਣਿਤ ਕਰਕੇ ਸਿਲਵਰ ਭਾਸ਼...
125,,,You've earned a Gold Bhasha Samarthak Badge by...,ਤੁਸੀਂ 100 ਰਿਕਾਰਡਿੰਗਾਂ ਨੂੰ ਪ੍ਰਮਾਣਿਤ ਕਰਦਿਆਂ ਗੋਲਡ...
126,,,You've earned a Platinum Bhasha Samarthak Badg...,ਤੁਸੀਂ 200 ਰਿਕਾਰਡਿੰਗਾਂ ਨੂੰ ਪ੍ਰਮਾਣਿਤ ਕਰਕੇ ਇੱਕ ਪਲ...
127,,,Thank you for your translation efforts,ਤੁਹਾਡੇ ਅਨੁਵਾਦ ਦੇ ਕੋਸ਼ਿਸ਼ ਲਈ ਧੰਨਵਾਦ


In [None]:
## Test block: single-locale (Hindi) dry run of the translation merge; the JSON write is commented out

In [None]:
locale = 'hi'
# getTranslationFromSheets and set_values read the module-level `lang`. Pin it
# to this locale's language so the cell does not depend on whatever value an
# earlier cell (e.g. getAllTranslations) left behind.
lang = dict(languages)[locale]
path_to_xl = './'+ locale
excel_files = sorted(os.listdir(path_to_xl))
translation_excel_files = [pos_json for pos_json in excel_files if pos_json.endswith('.xlsx') and not pos_json.startswith('~')]

excelDf = getTranslationFromSheets(locale, translation_excel_files)
out_df = readInputJsonAsDf('./../../../crowdsource-ui/locales/{locale}.json'.format(locale=locale))

# Case-insensitive, whitespace-insensitive join key on both sides.
excelDf['Key_lower'] = excelDf['Key'].str.lower().str.strip()
out_df['Key_lower'] = out_df['Key'].str.lower().str.strip()

# When a key occurs in several sheets, the last occurrence wins.
excelDf = excelDf.drop_duplicates(subset=['Key_lower'], keep='last')

excelDf_dropped = excelDf.drop_duplicates(subset=['Key_lower'], keep='first')
out_df_dropped = out_df.drop_duplicates(subset=['Key'], keep='first')


# Right join: keep every JSON key, whether or not a translation was found.
merged_df = pd.merge(excelDf_dropped, out_df_dropped, on="Key_lower", how='right')

merged_df = merged_df.apply(set_values, axis = 1)

select_columns = ['Key_y', 'value']

filtered_merged_df = merged_df[select_columns]

final_df = filtered_merged_df.drop_duplicates(subset='Key_y', keep='first', inplace=False)

# List of [key, value] pairs; also consumed by the ASCII-count cell below.
jsonFile = final_df.to_json(orient='values')
jsonFile = json.loads(jsonFile)

final_final_json = reformat_json(jsonFile)

output_json_file = locale + '.json'

# with open(output_json_file, 'w') as f:
#     f.write(json.dumps(final_final_json, indent = 4, ensure_ascii=False))


In [None]:
import re

# Count how many values in `jsonFile` consist solely of characters in the
# U+0020..U+007F range, next to the total number of entries.
ascii_only = re.compile("^[\u0020-\u007F]+$")
count = 0
checkCount = 0
for key, value in jsonFile:
    count += 1
    if ascii_only.search(value):
        checkCount += 1
print(count, checkCount)

In [11]:
def validateAndPrintResult(locale,lang, translation_excel_files):
    """Print a report of sheets that lack the columns the merge needs.

    Every sheet of every workbook is parsed with its second row as the
    header (``header=1``). A line is printed for each sheet that has no
    ``'English copy'`` column and for each sheet that has no ``lang`` column.
    A banner line with the file name is printed before each workbook.

    Parameters
    ----------
    locale : str
        Directory (relative to the working directory) holding the workbooks.
    lang : str
        Name of the translation column to look for, e.g. ``'Kannada'``.
    translation_excel_files : list of str
        Workbook file names inside ``locale``.

    Returns
    -------
    None
    """
    for excel_file_name in translation_excel_files:
        print("---------------",excel_file_name, "------------------")
        # The context manager closes the workbook handle after its sheets are checked.
        with pd.ExcelFile(locale + '/' + excel_file_name) as excel:
            for sheetName in excel.sheet_names:
                sheet = excel.parse(sheet_name = sheetName, header=1)
                if 'English copy' not in sheet.columns:
                    print('{} does not have English copy or C in copy is in caps'.format(sheetName))
                if lang not in sheet.columns:
                    print('{} does not have {} column or has some hidden sheet(If so, unhide and delete it).'.format(sheetName, lang))

In [167]:
# Check every Kannada workbook for the columns the merge step relies on.
locale = 'kn'
path_to_xl = './' + locale
excel_files = sorted(os.listdir(path_to_xl))
# Keep real workbooks only; names starting with '~' are skipped.
translation_excel_files = [
    file_name
    for file_name in excel_files
    if file_name.endswith('.xlsx') and not file_name.startswith('~')
]
validateAndPrintResult(locale, 'Kannada', translation_excel_files)


--------------- a.xlsx ------------------
--------------- b.xlsx ------------------
--------------- c.xlsx ------------------
--------------- d.xlsx ------------------
--------------- e.xlsx ------------------
--------------- g.xlsx ------------------
--------------- h.xlsx ------------------
--------------- i.xlsx ------------------
--------------- j.xlsx ------------------
--------------- k.xlsx ------------------
--------------- l.xlsx ------------------
--------------- m.xlsx ------------------
--------------- n.xlsx ------------------
--------------- o.xlsx ------------------
--------------- p.xlsx ------------------
--------------- q.xlsx ------------------
--------------- r.xlsx ------------------
--------------- s.xlsx ------------------
