In [1]:
import json
import re
import pandas as pd
import numpy as np
import os

The Flattening needs the schemas of the input CSV tables in order to load the (typed) Parquet tables. This notebook converts the tableschema files into the schema format expected as input by the Flattening.

NB: The tableschema files of the SNDS are collaboratively edited and may contain some errors. Nonetheless, we think that they are a good proxy for the real types of the data. They can be downloaded from the [HDH gitlab here](https://gitlab.com/healthdatahub/schema-snds/).

The output of this script can then be copied and pasted into the corresponding product file in the [Schema folder of the flattening](https://github.com/X-DataInitiative/SCALPEL-Flattening/tree/develop/src/main/resources/schema) of your project.

In [2]:
# Path to the folder holding the SNDS table schemas (one sub-folder per product).
# Set the SCHEMA_SNDS_PATH environment variable to point to your own checkout of
# schema-snds; the original author's local path is only kept as a fallback.
# os.path.join(..., '') guarantees a trailing separator, which the following
# cells rely on because they build paths by plain string concatenation.
path2schema_snds = os.path.join(
    os.environ.get(
        'SCHEMA_SNDS_PATH',
        '/home/matthieu/Documents/santeFrance/projets/hub_santé/healthdatahub_gitlab/schema-snds/schemas/',
    ),
    '',
)
print(os.listdir(path2schema_snds))

['DCIRS', 'PMSI', 'DAMIR', 'DCIR_DCIRS', 'DCIR', 'EGB', 'CARTOGRAPHIE_PATHOLOGIES', 'BENEFICIAIRE', 'Causes de décès']


In [3]:
# Product sub-folder to convert, relative to path2schema_snds. The trailing
# slash matters: paths are built below by plain string concatenation.
# NOTE: an earlier draft sketched a loop over several products
# (products_to_build = ['PMSI']); it is disabled and a single product folder
# is handled per run.
product = "PMSI/PMSI MCO/"
# Despite its name, `tables` is the path of the product folder, not a list of tables.
tables = path2schema_snds + product
# Show the first four tableschema files found in that folder.
os.listdir(tables)[:4]

['T_MCOaa_nnMED.json',
 'T_MCOaa_nnMEDATU.json',
 'T_MCOaa_nnBPHN.json',
 'T_MCOaa_nnORP.json']

In [4]:
# File name of the tableschema to convert (looked up inside the product folder).
table = "T_MCOaa_nnFCSTC.json"

In [5]:
# Sanity check: show the selected file name, then the full path it resolves to.
print(table, path2schema_snds + product + table, sep='\n')

T_MCOaa_nnFCSTC.json
/home/matthieu/Documents/santeFrance/projets/hub_santé/healthdatahub_gitlab/schema-snds/schemas/PMSI/PMSI MCO/T_MCOaa_nnFCSTC.json


In [6]:
# Derive the table name used by the Flattening from the tableschema file name
# (eg: 'T_MCOaa_nnFCSTC.json' -> 'MCO_FCSTC').
tab_name = table.split('/')[-1]
tab_name = re.sub('AAAA', '', tab_name)    # drop the year placeholder
tab_name = re.sub('__', '_', tab_name)     # collapse doubled underscores
tab_name = re.sub('aa_nn', '_', tab_name)  # replace the 'aa_nn' placeholder by a single '_'
tab_name = re.sub('^T_', '', tab_name)     # drop the leading 'T_' prefix
# Strip the file extension (instead of blindly cutting the last 5 characters).
tab_name = os.path.splitext(tab_name)[0]

# Load the tableschema.
# The encoding is set explicitly: JSON files are UTF-8, and relying on the
# platform default can fail on the accented characters found in the schemas.
# os.path.join avoids the doubled '/' produced by concatenating product + '/'.
with open(os.path.join(path2schema_snds, product, table), 'r', encoding='utf-8') as f:
    tab_schem = json.load(f)
print('There are {} fields in the tableschema of {} table'.format(len(tab_schem['fields']), tab_name))

There are 32 fields in the tableschema of MCO_FCSTC table


In [7]:
# Empty frame carrying the column layout of a Flattening schema file;
# it is populated in a later cell.
tab_flattening_schema = pd.DataFrame(
    columns=[
        'MEMNAME',
        'NAME',
        'FORMAT',
        'FORMATL',
        'FORMATD',
        'DATATYPE',
    ]
)

#### Small transformations

In [8]:
# Lookup table: tableschema field type -> Flattening DATATYPE.
# NOTE(review): 'date' maps to 'String' while 'datetime' maps to 'Date';
# presumably intentional, confirm against the Flattening expectations.
datatype_dic = dict(
    string='String',
    number='Double',
    integer='Integer',
    boolean='Integer',
    datetime='Date',
    date='String',
    object='String',
)

# FORMAT column of the Flattening schema
def create_format(datatype):
    """Return the FORMAT value matching a Flattening datatype.

    'Date' gives 'DATETIME', 'String' gives '$'; every other datatype has
    no format and gives NaN.
    """
    format_by_datatype = {'Date': 'DATETIME', 'String': '$'}
    return format_by_datatype.get(datatype, np.nan)

# FORMATD column of the Flattening schema (number of decimals)
def create_formatd(datatype):
    """Return the number of decimals for a datatype: 2 for 'Double', 0 otherwise."""
    return 2 if datatype == 'Double' else 0

# FORMATL column of the Flattening schema (length of the variable)
def create_length(datatype, length):
    """Return the length to declare for a variable.

    Dates always get a length of 20. For any other datatype the length read
    from the tableschema is returned unchanged, or 2 when it is None.
    """
    if datatype == 'Date':
        return 20
    if length is None:
        return 2
    return length

In [9]:
# Build one Flattening schema row per tableschema field.
# Rows are collected in a list and the DataFrame is created once: appending
# rows one at a time with .loc enlarges the frame on every insertion.
schema_rows = []
for var in tab_schem['fields']:
    datatype = datatype_dic[var['type']]

    dataformat = create_format(datatype)
    # .get(): a field without a "length" key yields None, so create_length
    # applies its default instead of the loop failing with a KeyError.
    formatl = create_length(datatype, var.get("length"))
    formatd = create_formatd(datatype)

    schema_rows.append([tab_name, var['name'], dataformat, formatl, formatd, datatype])

# Reuse the column layout of the frame defined earlier in the notebook.
tab_flattening_schema = pd.DataFrame(schema_rows, columns=tab_flattening_schema.columns)

In [10]:
# Preview the first rows of the generated Flattening schema.
tab_flattening_schema.head()

Unnamed: 0,MEMNAME,NAME,FORMAT,FORMATL,FORMATD,DATATYPE
0,MCO_FCSTC,ACT_COD,$,5,0,String
1,MCO_FCSTC,ACT_COE,,62,2,Double
2,MCO_FCSTC,ACT_DNB,,2,2,Double
3,MCO_FCSTC,ACT_NBR,,3,2,Double
4,MCO_FCSTC,AMC_MNR,,72,2,Double


In [11]:
# Write the schema to a ';'-separated CSV file named after the table.
# ****** TODO change to a path that works for you.********
path2schema = '../../dreesUtils/flattening_schemas/{}.csv'.format(tab_name)

# Create the target folder when it is missing, otherwise to_csv fails.
# The guard covers a path without any folder part (dirname is then '').
output_dir = os.path.dirname(path2schema)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Sort by table name then column name; the result is rebound to the same
# name rather than sorted with inplace=True.
tab_flattening_schema = tab_flattening_schema.sort_values(['MEMNAME', 'NAME'])
# Missing FORMAT values are written as empty fields.
tab_flattening_schema.to_csv(path2schema, sep=';', index=False, header=True, na_rep='')