In [None]:
import pandas as pd
import openpyxl
import json
import os
import re

# JSON file whose key/value pairs are loaded below as the source strings.
input_json_file = 'sample/en.json'
# Workbook that receives the keys for which no translation was found.
out_excel_file = 'hi/hi_edited.xlsx'
# Disabled alternative, kept for reference: a fixed workbook list instead of the directory scan.
# translation_excel_file = ['sample/sample.xlsx']

In [None]:
# Find the translation workbooks (*.xlsx) inside the language folder.
path_to_json = './hi'
translation_excel_file = sorted(
    file_name
    for file_name in os.listdir(path_to_json)
    if file_name.endswith('.xlsx')
    # '~$...' files are the lock files Excel leaves next to an open workbook;
    # they are not readable workbooks.
    and not file_name.startswith('~$')
    # The report written to out_excel_file lives in this same folder; it is an
    # output of this notebook, not a translation source, so a re-run must not
    # feed it back in.
    and os.path.abspath(os.path.join(path_to_json, file_name)) != os.path.abspath(out_excel_file)
)
# sorted(): os.listdir returns names in arbitrary order, and the order decides
# which duplicate survives the later drop_duplicates(keep='first').


In [None]:
# Read the source strings. Decode as UTF-8 explicitly so the result does not
# depend on the platform's default text encoding.
with open(input_json_file, encoding='utf-8') as f:
    js = json.load(f)

In [None]:
# One row per JSON entry: the key in 'Key', its value in 'To be Translated'.
out_df = pd.DataFrame(
    data=[[json_key, json_value] for json_key, json_value in js.items()],
    columns=['Key', 'To be Translated'],
)

In [None]:
# Display the 'Key' column as a quick visual check of the loaded keys.
out_df['Key']

In [None]:
# Export only the 'Key' column (with its index) to a workbook in the working directory.
out_df["Key"].to_excel('tran.xlsx')

In [None]:
# Workbook column that holds the translated text.
lang = 'Hindi'
# Workbook column that holds the English source text; it is renamed to 'Key' after loading.
english_col = 'English copy'
# Optional workbook columns that are carried along whenever a sheet has them.
allowed_values = ['x','y','z']

In [None]:
# Stack every sheet of every translation workbook into one frame with the
# columns 'Key' (the English text), the translation column named by `lang`,
# and whichever optional columns from `allowed_values` a sheet provides.
# Frames are collected in a list and concatenated once, instead of re-copying
# the growing result on every sheet.
sheet_frames = [pd.DataFrame([], columns=['Key', lang])]
for excel_file_name in translation_excel_file:
    print(excel_file_name)
    count = 0  # number of usable rows taken from the current workbook
    # Resolve the name against the folder it was listed from, and use the
    # context manager so the workbook handle is closed after parsing.
    with pd.ExcelFile(os.path.join(path_to_json, excel_file_name)) as excel:
        for sheetName in excel.sheet_names:
            # header=1: the column titles are read from the second row of the sheet.
            sheet = excel.parse(sheet_name=sheetName, header=1)
            print(sheet.columns)
            FORMAT = [english_col, lang] + [
                value for value in allowed_values if value in sheet.columns
            ]
            sheet_new = (
                sheet[FORMAT]
                # A row is only usable when both the English text and its translation exist.
                .dropna(subset=[english_col, lang])
                .rename(columns={english_col: 'Key'})
            )
            count += len(sheet_new)
            sheet_frames.append(sheet_new)
excelDf = pd.concat(sheet_frames, axis=0)

In [None]:
# Add a lower-cased copy of 'Key' to both frames; it is the join column used
# to match workbook rows to JSON keys regardless of letter case.
for _frame in (excelDf, out_df):
    _frame['Key_lower'] = _frame['Key'].str.lower()

In [None]:
# Keep the first row per key (keep='first' is the pandas default): workbook
# rows are compared on the lower-cased key, JSON rows on the exact key.
excelDf_dropped = excelDf.drop_duplicates(subset='Key_lower')
out_df_dropped = out_df.drop_duplicates(subset='Key')

In [None]:
# Left join on the lower-cased key: every JSON row is kept, and rows without a
# matching workbook entry get NaN in the workbook columns.
df_diff = out_df_dropped.merge(excelDf_dropped, how='left', on='Key_lower')

In [None]:
# 'Key_x' is the JSON-side 'Key' column (both merged frames had a 'Key', so
# pandas suffixed them). Keep the first row for each JSON key.
df_diff = df_diff.drop_duplicates(subset='Key_x')

In [None]:
# Narrow the frame to the JSON key, the source text and the translation column.
df_diff = df_diff.loc[:, ['Key_x', 'To be Translated', lang]]

In [None]:
# Rows whose translation column is empty, i.e. keys with no translation found.
missing_translation = df_diff[lang].isna()
df_diff_a = df_diff.loc[missing_translation]

In [None]:
# Per-column non-null counts for the rows that have no translation.
df_diff.loc[df_diff[lang].isna()].count()

In [None]:
# Give the key column its plain name back for the export (rename returns a new frame).
df_diff_a = df_diff_a.rename(columns={'Key_x': 'Key'})

In [None]:
# Write the untranslated rows (index included) to the workbook named by out_excel_file.
df_diff_a.to_excel(out_excel_file)

In [None]:
# Rebuild the combined workbook frame from scratch, so that it holds only the
# columns read from the workbooks ('Key', the `lang` column and any optional
# `allowed_values` columns) before it is exported.
# Frames are collected in a list and concatenated once, instead of re-copying
# the growing result on every sheet.
sheet_frames = [pd.DataFrame([], columns=['Key', lang])]
for excel_file_name in translation_excel_file:
    print(excel_file_name)
    count = 0  # number of usable rows taken from the current workbook
    # Resolve the name against the folder it was listed from, and use the
    # context manager so the workbook handle is closed after parsing.
    with pd.ExcelFile(os.path.join(path_to_json, excel_file_name)) as excel:
        for sheetName in excel.sheet_names:
            # header=1: the column titles are read from the second row of the sheet.
            sheet = excel.parse(sheet_name=sheetName, header=1)
            print(sheet.columns)
            FORMAT = [english_col, lang] + [
                value for value in allowed_values if value in sheet.columns
            ]
            sheet_new = (
                sheet[FORMAT]
                # A row is only usable when both the English text and its translation exist.
                .dropna(subset=[english_col, lang])
                .rename(columns={english_col: 'Key'})
            )
            count += len(sheet_new)
            sheet_frames.append(sheet_new)
excelDf = pd.concat(sheet_frames, axis=0)

In [None]:
# Export the combined workbook rows (index included) to a single workbook.
excelDf.to_excel('allHindi.xlsx')

In [None]:
def getKeysWithTranslation(lang):
    """Return the keys of ``<lang>/<lang>.json`` whose value differs from the key.

    Args:
        lang: Name used for both the folder and the file, e.g. ``'hi'`` reads
            ``hi/hi.json`` relative to the current working directory.

    Returns:
        list: Keys in file order for which ``key != value``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode as UTF-8 explicitly: the platform default encoding is not
    # guaranteed to be able to read non-ASCII translation text.
    with open(lang+"/"+lang+".json", encoding="utf-8") as f:
        js = json.load(f)

    return [key for key, value in js.items() if key != value]
    

In [None]:
# List the keys in hi/hi.json whose value differs from the key.
getKeysWithTranslation('hi')

In [None]:
##--------- NEW CODE FOR TRANSLATION GENERATION ------

In [1]:
import pandas as pd
import openpyxl
import json
import os
import re
from pandas import ExcelWriter

In [2]:

def findKeysWithoutTranslation(lang, locales_dir='./../../../crowdsource-ui/locales'):
    """Return the keys of ``<locales_dir>/<lang>.json`` that are still untranslated.

    A key counts as untranslated when its value is non-empty and equal to the
    key itself.

    Args:
        lang: Language code; selects the file ``<lang>.json``.
        locales_dir: Folder that contains the locale files. Defaults to the
            location this notebook has always read from.

    Returns:
        list: Untranslated keys, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    locale_path = os.path.join(locales_dir, '{lang}.json'.format(lang=lang))
    # Decode as UTF-8 explicitly: the platform default encoding is not
    # guaranteed to be able to read non-ASCII translation text.
    with open(locale_path, encoding='utf-8') as f:
        js = json.load(f)

    return [key for key, value in js.items() if key == value and value]

In [3]:
def extract_tags(text):
    """Swap id-bearing markup in ``text`` for short placeholders.

    Every paired tag (``<a ...>...</a>``) or self-closing tag (``<img .../>``)
    whose source contains the substring ``id`` is replaced, left to right, by
    ``<x>``, ``<y>``, ``<z>``, ``<u>``, ``<v>``, ``<w>``. Each occurrence gets
    its own placeholder, so a tag that appears twice yields two placeholders
    that are both present in the returned text.

    Args:
        text: The string to process.

    Returns:
        tuple: ``(out_txt, matched_tags_dict)`` where ``out_txt`` is the text
        with placeholders and ``matched_tags_dict`` maps each placeholder
        letter to the original tag it stands for.

    Raises:
        IndexError: If more than six tags qualify for replacement.
    """
    allowed_replacements = ["x","y","z", "u", "v", "w"]
    regex = r"<(\S*?)[^>]*>.*?<\/\1>|<.*?\/>"
    matched_tags_dict = {}

    def _to_placeholder(match):
        """Return the placeholder for an id-bearing tag, or the tag unchanged."""
        matched_tag = match.group()
        # NOTE(review): this is a plain substring test, so any tag whose text
        # merely contains "id" (e.g. "width") also qualifies - confirm intent.
        if "id" not in matched_tag:
            return matched_tag
        index = len(matched_tags_dict)
        if index >= len(allowed_replacements):
            raise IndexError(
                "more than {} tags to replace in: {!r}".format(len(allowed_replacements), text)
            )
        replacement = allowed_replacements[index]
        matched_tags_dict[replacement] = matched_tag
        return '<{}>'.format(replacement)

    # re.sub rewrites exactly the matched span of each occurrence. A plain
    # str.replace would rewrite every identical tag at once and leave later
    # placeholders in the dict without a counterpart in the text.
    out_txt = re.sub(regex, _to_placeholder, text, flags=re.MULTILINE)
    return out_txt , matched_tags_dict

In [10]:
def save_xls(list_dfs, xls_path):
    """Write every DataFrame in ``list_dfs`` to its own sheet of one workbook.

    Args:
        list_dfs: Mapping of sheet name to DataFrame.
        xls_path: Path of the workbook to create.
    """
    # Leaving the `with` block saves and closes the workbook, so no explicit
    # writer.save() is needed (that method was removed in pandas 2.0).
    with ExcelWriter(xls_path) as writer:
        for lang_code, lang_df in list_dfs.items():
            # sheet_name is passed by keyword; passing it positionally is deprecated.
            lang_df.to_excel(writer, sheet_name=lang_code)
    print('done')

In [11]:
# For every language, build a frame of the keys that still need translating:
# 'English copy' holds the key with its id-bearing tags replaced by
# placeholders, and one column per placeholder holds the original tag.
languages = ['hi','gu','as','bn','ta','te','mr','pa','ml','or','kn']
all_dfs = {}
for lang_code in languages:
    keys_without_translation = findKeysWithoutTranslation(lang_code)
    # Collect plain records and build the frame once. DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame for every row.
    records = []
    for key in keys_without_translation:
        processed_text, extracted_tags = extract_tags(key)
        record = {'English copy': processed_text}
        for replacement in sorted(extracted_tags.keys()):
            record[replacement] = extracted_tags[replacement]
        records.append(record)
    # Columns in order of first appearance across the records, which is the
    # order that row-by-row appending produced.
    column_names = list(dict.fromkeys(name for record in records for name in record))
    all_dfs[lang_code] = pd.DataFrame(records, columns=column_names)

In [12]:
# Write one sheet per language code into a single workbook.
# NOTE(review): writing the legacy '.xls' format depends on an engine that
# newer pandas releases may no longer provide - confirm, or switch to '.xlsx'.
save_xls(all_dfs, 'final.xls')

done
