In [None]:
import pandas as pd
from pathlib import Path
import numpy as np
import uuid
import pickle

In [None]:
# Root folder for all data files, given relative to the notebook's working directory.
data_folder = Path('../Data/')

In [None]:
def optimize_df(df):
    """
    Reduce the memory footprint of a pandas DataFrame by narrowing column dtypes.

    Object columns are converted to ``category``. Integer columns are cast to the
    smallest signed integer type (int8, int16, int32 or int64) whose range contains
    every value of the column, bounds included. Float columns are left untouched.

    The DataFrame is modified in place and is also returned, so both
    ``optimize_df(df)`` and ``df = optimize_df(df)`` work.

    Parameters:
    -----------
    df : pandas DataFrame
        The input DataFrame to convert. It is mutated by this function.

    Returns:
    --------
    pandas DataFrame
        The same DataFrame object, with narrowed column dtypes.
    """

    # First, convert all object columns to category type
    obj_cols = df.select_dtypes(include=['object']).columns
    if len(obj_cols) > 0:
        df[obj_cols] = df[obj_cols].astype('category')

    # Next, downcast integer columns to the narrowest signed type that fits.
    # Float columns are deliberately not downcast: the former float branch was
    # disabled with the note that 'halffloat' is not supported by Arrow.
    candidate_types = (np.int8, np.int16, np.int32, np.int64)
    for col in df.select_dtypes(include=['int']).columns:
        c_min = df[col].min()
        c_max = df[col].max()
        for candidate in candidate_types:
            limits = np.iinfo(candidate)
            # Inclusive comparison: a value equal to the type's min/max still fits.
            # For an empty column min/max are NaN, every comparison is False and
            # the column keeps its current dtype.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    return df

In [None]:
# Load the 'Set_flag_AOS' and 'Set_couverture_AOS' extracts (semicolon-separated CSV files).
df_flag_aos = pd.read_csv(
    data_folder.joinpath('raw', 'GM', 'Full', 'Set_flag_AOS_231222.csv'),
    sep=';',
)
df_couverture_aos = pd.read_csv(
    data_folder.joinpath('raw', 'GM', 'Full', 'Set_couverture_AOS_231222.csv'),
    sep=';',
)


In [None]:
# Load the 'Set_res_couverture_LCA' extract (semicolon-separated CSV file).
df_couverture_lca = pd.read_csv(
    data_folder.joinpath('raw', 'GM', 'Full', 'Set_res_couverture_LCA_231222.csv'),
    sep=';',
)


In [None]:
# Show df_couverture_lca as the cell output for a quick visual check of the loaded table.
df_couverture_lca

In [None]:
# Old dataset for addresses (semicolon-separated CSV files).
df_aos_address = pd.read_csv(data_folder/'raw'/'GM'/'Full'/'AOS_address_masked_ssdoubl.csv', sep=';')
df_multiple_address_aos = pd.read_csv(data_folder/'raw'/'GM'/'Full'/'Output_adress_doubl_AOS_050123.csv', sep=';')
df_multiple_address_lca = pd.read_csv(data_folder/'raw'/'GM'/'Full'/'Output_adress_doubl_LCA_050123.csv', sep=';')
df_remaining_multiple_lca = pd.read_csv(data_folder/'raw'/'GM'/'Full'/'Remaining_LCA_address_masked_ssdoubl.csv', sep=';')

# df_couverture_aos, df_flag_aos and df_couverture_lca are already loaded from the
# same files by the preceding cells, so they are not read a second time here.
df_drug_aos = pd.read_csv(data_folder/'raw'/'GM'/'Full'/'Set_medic_AOS_231222.csv', sep=';')
df_prestation_aos = pd.read_csv(data_folder/'raw'/'GM'/'Full'/'Set_res_prest_AOS_231222.csv', sep=';')
df_prestation_lca = pd.read_csv(data_folder/'raw'/'GM'/'Full'/'SET_RES_PREST_LCA_231222.csv', sep=';')

In [None]:
# Load the address tables from the 'Santeintegra_adresses_27012023' folder
# (semicolon-separated CSV files).
df_aos_address_updated = pd.read_csv(
    data_folder.joinpath('raw', 'GM', 'Full', 'Santeintegra_adresses_27012023', 'AOS_masked_ssdoubl.csv'),
    sep=';',
)
df_multiple_address_aos_updated = pd.read_csv(
    data_folder.joinpath('raw', 'GM', 'Full', 'Santeintegra_adresses_27012023', 'Output_adress_doubl_AOS.csv'),
    sep=';',
)
df_multiple_address_lca_updated = pd.read_csv(
    data_folder.joinpath('raw', 'GM', 'Full', 'Santeintegra_adresses_27012023', 'Output_adress_doubl_LCA.csv'),
    sep=';',
)
df_remaining_multiple_lca_updated = pd.read_csv(
    data_folder.joinpath('raw', 'GM', 'Full', 'Santeintegra_adresses_27012023', 'LCA_miss_masked.csv'),
    sep=';',
)
df_remaining_multiple_aos_updated = pd.read_csv(
    data_folder.joinpath('raw', 'GM', 'Full', 'Santeintegra_adresses_27012023', 'LAMAL_miss_masked.csv'),
    sep=';',
)

In [None]:
import sys
def sizeof_fmt(num, suffix='B'):
    """
    Format a byte count as a human-readable string using binary (1024-based) prefixes.

    Parameters:
    -----------
    num : int or float
        The quantity to format; negative values are supported.
    suffix : str
        Unit suffix appended after the prefix (default 'B').

    Returns:
    --------
    str
        The value with one decimal, a space, the prefix ('', 'Ki', ..., 'Yi') and the suffix.
    """
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    position = 0
    while position < len(prefixes):
        # Stop at the first prefix for which the magnitude drops below 1024
        if abs(num) < 1024.0:
            return "%3.1f %s%s" % (num, prefixes[position], suffix)
        num = num / 1024.0
        position += 1
    # Anything still >= 1024 after 'Zi' is reported in 'Yi'
    return "%.1f %s%s" % (num, 'Yi', suffix)

# Print the ten names of the current namespace with the largest sys.getsizeof value,
# biggest first, each formatted with sizeof_fmt.
for name, size in sorted(
    [(var_name, sys.getsizeof(obj)) for var_name, obj in list(locals().items())],
    key=lambda entry: entry[1],
    reverse=True,
)[:10]:
    print("{:>30}: {:>8}".format(name, sizeof_fmt(size)))

In [None]:
# Narrow the dtypes of every loaded table with optimize_df.
(
    df_prestation_aos,
    df_prestation_lca,
    df_flag_aos,
    df_couverture_aos,
    df_couverture_lca,
    df_drug_aos,
) = [
    optimize_df(frame)
    for frame in (
        df_prestation_aos,
        df_prestation_lca,
        df_flag_aos,
        df_couverture_aos,
        df_couverture_lca,
        df_drug_aos,
    )
]

# Old dataset for addresses
(
    df_aos_address,
    df_multiple_address_aos,
    df_multiple_address_lca,
    df_remaining_multiple_lca,
) = [
    optimize_df(frame)
    for frame in (
        df_aos_address,
        df_multiple_address_aos,
        df_multiple_address_lca,
        df_remaining_multiple_lca,
    )
]

# Updated datasets
(
    df_aos_address_updated,
    df_multiple_address_aos_updated,
    df_multiple_address_lca_updated,
    df_remaining_multiple_lca_updated,
    df_remaining_multiple_aos_updated,
) = [
    optimize_df(frame)
    for frame in (
        df_aos_address_updated,
        df_multiple_address_aos_updated,
        df_multiple_address_lca_updated,
        df_remaining_multiple_lca_updated,
        df_remaining_multiple_aos_updated,
    )
]

## Add UUID

In [None]:
# Linkage: table whose rows pair an 'id_lamal' with an 'id_lca'.
# The path is built from the shared data_folder constant instead of a second hard-coded '../Data' string.
df_paires_lamal_lca = pd.read_csv(data_folder/'max_probs_w_zipcode_pour_david.csv')

In [None]:
# Give every row of the linkage table its own random identifier, stored as pandas 'string' dtype.
# One uuid4 is generated per row directly, which avoids a row-wise DataFrame.apply and also
# works when the table is empty.
# NOTE: uuid4 is random, so re-running this cell produces different identifiers; everything
# derived from the 'uuid' column must then be regenerated as well.
df_paires_lamal_lca['uuid'] = pd.array(
    [str(uuid.uuid4()) for _ in range(len(df_paires_lamal_lca))],
    dtype='string',
)

In [None]:
# Save the linkage table, including its 'uuid' column, next to the input file.
# The path is built from the shared data_folder constant instead of a second hard-coded '../Data' string.
df_paires_lamal_lca.to_csv(data_folder/'max_probs_w_zipcode_pour_david_w_uuid.csv', index=False)

In [None]:
# Build one lookup per identifier column: id value -> uuid of its linkage row.
dict_lamal_to_uuid, dict_lca_to_uuid = (
    df_paires_lamal_lca.set_index(id_column)['uuid'].to_dict()
    for id_column in ('id_lamal', 'id_lca')
)

In [None]:
# Build the two direct lookups between the identifier columns of the linkage table.
dict_lamal_to_lca, dict_lca_to_lamal = (
    df_paires_lamal_lca.set_index(key_column)[value_column].to_dict()
    for key_column, value_column in (('id_lamal', 'id_lca'), ('id_lca', 'id_lamal'))
)

In [None]:
# Persist the id_lamal -> uuid lookup as a pickle file.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open(data_folder/'processed'/'dict_lamal_to_uuid.pkl', 'wb') as handle:
    pickle.dump(dict_lamal_to_uuid, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Persist the id_lca -> uuid lookup as a pickle file.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open(data_folder/'processed'/'dict_lca_to_uuid.pkl', 'wb') as handle:
    pickle.dump(dict_lca_to_uuid, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
def _attach_uuid(frame, id_column, id_to_uuid):
    """Add a 'uuid' column to `frame` (in place) by mapping `id_column` through `id_to_uuid`."""
    frame['uuid'] = frame[id_column].map(id_to_uuid)


# Give every table a single 'uuid' key in place of the ID_LAMAL / ID_LCA pair.
# Tables keyed by ID_LCA
_attach_uuid(df_prestation_lca, 'ID_LCA', dict_lca_to_uuid)
_attach_uuid(df_couverture_lca, 'ID_LCA', dict_lca_to_uuid)

# Tables keyed by ID_LAMAL
_attach_uuid(df_prestation_aos, 'ID_LAMAL', dict_lamal_to_uuid)
_attach_uuid(df_couverture_aos, 'ID_LAMAL', dict_lamal_to_uuid)
_attach_uuid(df_drug_aos, 'ID_LAMAL', dict_lamal_to_uuid)
_attach_uuid(df_flag_aos, 'ID_LAMAL', dict_lamal_to_uuid)

## Export

In [None]:
def _export_frames_to_parquet(named_frames, out_dir):
    """
    Write each DataFrame to ``out_dir/<name>.parquet.gzip`` with gzip compression.

    Parameters:
    -----------
    named_frames : iterable of (str, pandas DataFrame)
        Pairs of output file stem and the DataFrame to write, processed in order.
    out_dir : pathlib.Path
        Destination folder; it is created (with parents) when it does not exist yet.
    """
    # Without this, to_parquet fails when the export folder is missing
    out_dir.mkdir(parents=True, exist_ok=True)
    for frame_name, frame in named_frames:
        frame.to_parquet(out_dir/(frame_name + '.parquet.gzip'), compression='gzip')


_export_frames_to_parquet(
    [
        ('df_prestation_aos', df_prestation_aos),
        ('df_prestation_lca', df_prestation_lca),
        ('df_flag_aos', df_flag_aos),
        ('df_couverture_aos', df_couverture_aos),
        ('df_couverture_lca', df_couverture_lca),
        ('df_drug_aos', df_drug_aos),
        ## Export old
        ('df_aos_address', df_aos_address),
        ('df_multiple_address_aos', df_multiple_address_aos),
        ('df_multiple_address_lca', df_multiple_address_lca),
        ('df_remaining_multiple_lca', df_remaining_multiple_lca),
        ## Export updated
        ('df_aos_address_updated', df_aos_address_updated),
        ('df_multiple_address_aos_updated', df_multiple_address_aos_updated),
        ('df_multiple_address_lca_updated', df_multiple_address_lca_updated),
        ('df_remaining_multiple_lca_updated', df_remaining_multiple_lca_updated),
        ('df_remaining_multiple_aos_updated', df_remaining_multiple_aos_updated),
    ],
    data_folder/'raw'/'GM'/'Full'/'Compressed files',
)

In [None]:
# Show df_prestation_aos as the cell output to inspect the table after the uuid mapping and export.
df_prestation_aos

In [None]:
# Print the ten names of the current namespace with the largest sys.getsizeof value,
# biggest first, each formatted with sizeof_fmt.
for name, size in sorted(
    [(var_name, sys.getsizeof(obj)) for var_name, obj in list(locals().items())],
    key=lambda entry: entry[1],
    reverse=True,
)[:10]:
    print("{:>30}: {:>8}".format(name, sizeof_fmt(size)))