In [1]:
import os
import calendar
import pandas as pd
import threading
from tqdm.notebook import tqdm

In [2]:
# Root directory of the raw customs dataset that this notebook reads from.
base = "bucket/dataset_customs"

In [3]:
# Sub-directories found directly under `base` (the recorded run shows one for
# exports and one for imports).
# os.listdir() returns entries in arbitrary order, so the result is sorted to
# make the processing order reproducible across runs and filesystems.
im_ex_dir = sorted(
    path
    for path in (os.path.join(base, entry) for entry in os.listdir(base))
    if os.path.isdir(path)
)
im_ex_dir

['bucket/dataset_customs/export', 'bucket/dataset_customs/import']

In [4]:
def group_file_names(file_paths):
    """Bucket file paths by the leading part of their file name.

    The bucket key is the text before the first '-' in the last
    '/'-separated component of each path (presumably a year, given the
    output directory names produced from it).

    Returns a dict whose keys appear in ascending order and whose values are
    the sorted lists of paths sharing that key.
    """
    buckets = {}
    for path in file_paths:
        prefix = path.split('/')[-1].split('-')[0]
        buckets.setdefault(prefix, []).append(path)
    return {prefix: sorted(buckets[prefix]) for prefix in sorted(buckets)}

In [5]:
# Lookup table from abbreviated month name to month number.
# calendar.month_abbr has an empty string at index 0, so '' maps to 0 as well.
months = list(calendar.month_abbr)
months_dict = {abbr: months.index(abbr) for abbr in months}
# Also accept "Sept" as an alias for September (presumably because some
# input column headers use that spelling).
months_dict['Sept'] = 9


def month2int(month):
    """Return the month number for an abbreviated month name.

    Raises KeyError if `month` is not a key of `months_dict`.
    """
    return months_dict[month]


def convert2com(name):
    """Turn a header ending in "<month> <year>" into a "<year>-<MM>" label.

    '(' is treated as a separator and ')' is dropped, then the last two
    whitespace-separated tokens are read as the month abbreviation and the
    year, e.g. "Value (Jan 2001)" -> "2001-01".

    Raises ValueError if `name` has fewer than two tokens and KeyError if
    the month token is not known to `month2int`.
    """
    # split() without an argument collapses runs of whitespace and ignores
    # leading/trailing blanks. Splitting on a single ' ' would produce empty
    # tokens for input such as "Jan  2001" or "(Jan 2001) ", and an empty
    # month token silently maps to month 0.
    tokens = name.replace('(', ' ').replace(')', '').split()
    month_name, year = tokens[-2:]
    return "{}-{:02d}".format(year, month2int(month_name))

In [6]:
# Root directory that receives the merged CSV files.
base_out = "bucket/merge_dataset_customs"
# exist_ok=True keeps the cell safe to re-run without a separate existence
# check (which is racy), and makedirs also creates a missing parent directory.
os.makedirs(base_out, exist_ok=True)

# Parallel

In [10]:
def merge_files(key, data):
    """Merge one group of CSV files into a single CSV under `base_out/key`.

    For every path in `data` the CSV is read, only its 2nd and 3rd columns
    are kept, and the last kept column is renamed to `convert2com(<header>)`.
    The per-file frames are outer-merged one after another (pandas joins on
    the column names the two frames share) and the result is written,
    without the index, to `<base_out>/<key>/<parent dir of data[0]>.csv`.

    Parameters
    ----------
    key : str
        Name of the output sub-directory below `base_out`.
    data : list of str
        Non-empty list of '/'-separated CSV paths belonging to one group.

    Raises
    ------
    IndexError
        If `data` is empty or `data[0]` has no parent path component.

    NOTE(review): the output path depends only on `key` and the parent
    directory name of `data[0]`, so two calls sharing both values write to
    the same file. Confirm that input directories cannot collide.
    """
    sub_outpath = os.path.join(base_out, key)
    # This function is run as a thread target; exist_ok=True avoids the
    # check-then-create race of `if not exists: makedirs`.
    os.makedirs(sub_outpath, exist_ok=True)
    merge_df_name = "{}.csv".format(data[0].split('/')[-2])

    merged = None
    for f_path in data:
        frame = pd.read_csv(f_path)
        # Keep the 2nd and 3rd columns only; the last one is relabelled.
        columns_acc = list(frame.columns)[1:3]
        frame = frame[columns_acc].rename(
            columns={columns_acc[-1]: convert2com(columns_acc[-1])}
        )
        merged = frame if merged is None else pd.merge(merged, frame, how='outer')
    merged.to_csv(os.path.join(sub_outpath, merge_df_name), index=False)

In [None]:
# Walk every category directory below each entry of `im_ex_dir` and merge its
# files group by group, running one thread per group and waiting for all
# groups of a category before moving on to the next category.
t = []
for data_type in im_ex_dir:
    category_paths = sorted(
        path
        for path in (os.path.join(data_type, entry) for entry in os.listdir(data_type))
        if os.path.isdir(path)
    )
    for category_path in tqdm(category_paths):
        files_path = [
            path
            for path in (os.path.join(category_path, entry) for entry in os.listdir(category_path))
            if os.path.isfile(path)
        ]
        group_file = group_file_names(files_path)
        # Start from an empty list for every category. Reusing one
        # ever-growing list would re-join all previously finished threads on
        # each iteration and keep every Thread object alive until the end.
        t = []
        for group, paths in group_file.items():
            worker = threading.Thread(target=merge_files, args=(group, paths))
            t.append(worker)
            worker.start()
        for thread in t:
            thread.join()

In [None]:
'''single thread'''
# for data_type in im_ex_dir:
#     category_paths = [os.path.join(data_type, x) for x in os.listdir(data_type) if os.path.isdir(os.path.join(data_type, x))]
#     category_paths.sort()
#     for category_path in tqdm(category_paths):
#         files_path = [os.path.join(category_path, x) for x in os.listdir(category_path) if os.path.isfile(os.path.join(category_path, x))]
#         group_file = group_file_names(files_path)
#         for key, data in group_file.items():
#             sub_outpath = os.path.join(base_out, key)
#             if not os.path.exists(sub_outpath):
#                 os.makedirs(sub_outpath)
#             merge_df_name = "{}.csv".format(data[0].split('/')[-2])
#             df = None
#             for index, f_path in enumerate(data):
#                 df_tmp = pd.read_csv(f_path)
#                 columns_acc = list(df_tmp.columns)[1:3]
#                 df_tmp = df_tmp[columns_acc]
#                 df_tmp = df_tmp.rename(columns={columns_acc[-1]:convert2com(columns_acc[-1])})
#                 if index==0:
#                     df = df_tmp
#                 else:
#                     df = pd.merge(df, df_tmp, how='outer')
#             df.to_csv(os.path.join(sub_outpath, merge_df_name), index=False)

In [14]:
# Collect every sub-directory of the merge output root, then display the list.
out_dirs = list(
    filter(
        os.path.isdir,
        (os.path.join(base_out, entry) for entry in os.listdir(base_out)),
    )
)
out_dirs

['bucket/merge_dataset_customs/2001',
 'bucket/merge_dataset_customs/2002',
 'bucket/merge_dataset_customs/2003',
 'bucket/merge_dataset_customs/2004',
 'bucket/merge_dataset_customs/2005',
 'bucket/merge_dataset_customs/2006',
 'bucket/merge_dataset_customs/2007',
 'bucket/merge_dataset_customs/2008',
 'bucket/merge_dataset_customs/2009',
 'bucket/merge_dataset_customs/2010',
 'bucket/merge_dataset_customs/2011',
 'bucket/merge_dataset_customs/2012',
 'bucket/merge_dataset_customs/2013',
 'bucket/merge_dataset_customs/2014',
 'bucket/merge_dataset_customs/2015',
 'bucket/merge_dataset_customs/2016',
 'bucket/merge_dataset_customs/2017',
 'bucket/merge_dataset_customs/2018',
 'bucket/merge_dataset_customs/2019',
 'bucket/merge_dataset_customs/2020',
 'bucket/merge_dataset_customs/2021']

In [17]:
# Gather the regular files found directly inside each output directory and
# report how many there are in total.
all_files_path = []
for out_dir in out_dirs:
    candidates = (os.path.join(out_dir, entry) for entry in os.listdir(out_dir))
    all_files_path.extend(path for path in candidates if os.path.isfile(path))
print("Total files after merge : {:,}".format(len(all_files_path)))

Total files after merge : 26,334
