In [1]:
import pyspark.sql.functions as func
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, DataFrame
import pyarrow.parquet as pq
import pandas as pd
import os

In [2]:
# Directory holding one sub-directory per source table (listed and read below).
path_parquet = '../src/test/resources/flattening/parquet-table/single_table'
# Directory where the merged (flat) parquet files are written.
path_PMSI_flat = '../src/test/resources/flattening/parquet-table/flat_table/PMSI_Flat'

In [3]:
# List the entries of the MCO_D table directory.
os.listdir('/'.join((path_parquet, 'MCO_D')))

['year=2007', 'year=2008', 'year=2006']

In [4]:
 # Read the whole MCO_D directory (all year sub-directories) as one DataFrame.
 pd.read_parquet('/'.join((path_parquet, 'MCO_D')))

Unnamed: 0,ASS_DGN,ETA_NUM,RSA_NUM,RSS_NUM,year
0,C66.9,10000123,10000543,217,2006
1,C66.5,10000123,20000123,217,2007
2,C652,10000123,30000852,217,2008


In [5]:
# Keep only the entries of the single-table directory whose name contains "MCO".
liste_bases = list(filter(lambda entry: "MCO" in entry, os.listdir(path_parquet)))

In [6]:
# Display the names of the selected MCO tables.
liste_bases

['MCO_D', 'MCO_C', 'MCO_B', 'MCO_UM', 'MCO_A']

In [7]:
# Load every MCO table into one pandas DataFrame per table, keyed by table name.
bases_MCO = {}

for base in liste_bases:
    path = path_parquet + '/' + base
    # Collect the yearly frames and concatenate once per table instead of
    # growing a DataFrame with pd.concat on every iteration.
    yearly_frames = []
    for year in os.listdir(path):
        # Entries are expected to be named "year=YYYY"; anything else would
        # break the year parsing below, so it is skipped.
        if not year.startswith("year="):
            continue
        temp_df = pd.read_parquet(path = path + '/' + year, engine="pyarrow")
        # The year is taken from the directory name (the text after "=").
        temp_df['year'] = int(year.split("=")[1])
        yearly_frames.append(temp_df)
    if yearly_frames:
        # Columns are ordered alphabetically on purpose: this is the layout the
        # previous incremental concat produced through its implicit
        # (deprecated) column sort, now made explicit and warning-free.
        bases_MCO[base] = pd.concat(yearly_frames, sort=False).sort_index(axis=1)
    else:
        # No yearly data found: keep an empty frame for this table.
        bases_MCO[base] = pd.DataFrame()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if __name__ == '__main__':


In [8]:
# Display the MCO_C table as loaded (row index restarts at 0 for each year).
bases_MCO['MCO_C']

Unnamed: 0,DAT_RET,ENT_DAT,ETA_NUM,FHO_RET,FOR_NUM,NAI_RET,NIR_RET,NUM_ENQ,PMS_RET,RSA_NUM,SEJ_NUM,SEJ_RET,SEX_RET,SOR_ANN,SOR_DAT,SOR_MOI,VID_HOSP_FOR,year
0,0,,10000123,0,,0,0,Patient_02,0,20000123,50,0,0,2007,,2,,2007
1,0,,10000123,0,,0,0,Patient_02,0,20000345,55,0,0,2007,,2,,2007
0,0,8032008.0,10000123,0,2.0,0,0,Patient_02,0,30000546,90,0,0,2008,10032008.0,3,10.0,2008
1,0,15032008.0,10000123,0,2.0,0,0,Patient_02,0,30000852,95,0,0,2008,17032008.0,3,10.0,2008
0,0,,10000123,0,,0,0,Patient_02,0,10000987,4,0,0,2006,,1,,2006
1,0,,10000123,0,,0,0,Patient_02,0,10000543,9,0,0,2006,,1,,2006


In [9]:
# Prefix the columns of every table except MCO_C with "<table>__".
# ETA_NUM, RSA_NUM and year are left unprefixed in all tables.
for base in bases_MCO:
    if base != 'MCO_C':
        prefix = base + '__'
        # Columns that already carry the prefix are left alone, so running
        # this cell a second time does not produce "<table>__<table>__col".
        new_cols = [
            col
            if col in ['ETA_NUM', 'RSA_NUM', 'year'] or col.startswith(prefix)
            else prefix + col
            for col in bases_MCO[base].columns
        ]
        bases_MCO[base].columns = new_cols
    print(base,' : ',bases_MCO[base].shape, " ", bases_MCO[base].columns)

MCO_D  :  (3, 5)   Index(['MCO_D__ASS_DGN', 'ETA_NUM', 'RSA_NUM', 'MCO_D__RSS_NUM', 'year'], dtype='object')
MCO_C  :  (6, 18)   Index(['DAT_RET', 'ENT_DAT', 'ETA_NUM', 'FHO_RET', 'FOR_NUM', 'NAI_RET',
       'NIR_RET', 'NUM_ENQ', 'PMS_RET', 'RSA_NUM', 'SEJ_NUM', 'SEJ_RET',
       'SEX_RET', 'SOR_ANN', 'SOR_DAT', 'SOR_MOI', 'VID_HOSP_FOR', 'year'],
      dtype='object')
MCO_B  :  (6, 64)   Index(['MCO_B__AGE_ANN', 'MCO_B__AGE_GES', 'MCO_B__AGE_JOU', 'MCO_B__BDI_COD',
       'MCO_B__BDI_DEP', 'MCO_B__BEB_SEJ', 'MCO_B__BEH_NBJ',
       'MCO_B__CAI_SUP_NBR', 'MCO_B__COD_IGS', 'MCO_B__COD_SEX',
       'MCO_B__DGN_PAL', 'MCO_B__DGN_REL', 'MCO_B__DLY_ACT', 'MCO_B__ENT_MOD',
       'MCO_B__ENT_PRV', 'ETA_NUM', 'MCO_B__ETE_GHS_NUM',
       'MCO_B__GHM_24705Z_ACT', 'MCO_B__GHM_24706Z_ACT',
       'MCO_B__GHM_24707Z_ACT', 'MCO_B__GHS_9510_ACT', 'MCO_B__GHS_9511_ACT',
       'MCO_B__GHS_9512_ACT', 'MCO_B__GHS_9515_ACT', 'MCO_B__GHS_9524_ACT',
       'MCO_B__GHS_NUM', 'MCO_B__GRC_GHM', 'MCO_B__GRC

# Construction de la base aplatie

MCO_B / MCO_C

In [10]:
# Central table: MCO_B left-joined with MCO_C on ETA_NUM and RSA_NUM.
# 'year' exists on both sides but is not a join key, so the result carries
# it twice, as year_x and year_y.
table_centrale = pd.merge(
    left=bases_MCO['MCO_B'],
    right=bases_MCO['MCO_C'],
    how='left',
    on=['ETA_NUM', 'RSA_NUM'],
)
print(table_centrale.shape)
# Write the central table uncompressed to the flat-table directory.
table_centrale.to_parquet(
    path_PMSI_flat + '/table_centrale.parquet',
    compression=None,
)

(6, 80)


ajout de MCO_A

In [11]:
# Left-join the MCO_A columns onto the central table on ETA_NUM and RSA_NUM.
temp_MCO_A = pd.merge(
    left=table_centrale,
    right=bases_MCO['MCO_A'],
    how='left',
    on=['ETA_NUM', 'RSA_NUM'],
)
# Write the joined table uncompressed, then report its shape.
temp_MCO_A.to_parquet(
    path_PMSI_flat + '/tc_MCO_A.parquet',
    compression=None,
)
print(temp_MCO_A.shape)
# The flat table starts as the MCO_A join; later cells stack the other joins under it.
PMSI_flat = temp_MCO_A

(6, 92)


ajout de MCO_D

In [12]:
# Left-join the MCO_D columns onto the central table on ETA_NUM and RSA_NUM,
# and write the result uncompressed.
temp_MCO_D = pd.merge(table_centrale, bases_MCO['MCO_D'], on=['ETA_NUM', 'RSA_NUM'], how='left')
temp_MCO_D.to_parquet(path_PMSI_flat + '/tc_MCO_D.parquet', compression=None)
print(temp_MCO_D.shape)
# Stack the MCO_A and MCO_D joins. The stack is rebuilt from temp_MCO_A
# rather than appended to PMSI_flat itself, so re-running this cell does not
# add the MCO_D rows a second time.
PMSI_flat = pd.concat([temp_MCO_A, temp_MCO_D], sort=True)
print(PMSI_flat.shape)

(6, 83)
(12, 94)


ajout de MCO_UM

In [13]:
# Left-join the MCO_UM columns onto the central table on ETA_NUM and RSA_NUM,
# and write the result uncompressed.
temp_MCO_UM = pd.merge(table_centrale, bases_MCO['MCO_UM'], on=['ETA_NUM', 'RSA_NUM'], how='left')
temp_MCO_UM.to_parquet(path_PMSI_flat + '/tc_MCO_UM.parquet', compression=None)
print(temp_MCO_UM.shape)
# Stack the three joins (MCO_A, MCO_D, MCO_UM). The stack is rebuilt from the
# three temp_* frames rather than appended to PMSI_flat itself, so re-running
# this cell does not add the MCO_UM rows a second time.
PMSI_flat = pd.concat([temp_MCO_A, temp_MCO_D, temp_MCO_UM], sort=True)
print(PMSI_flat.shape)
# year_x / year_y are the duplicated 'year' columns left by the central merge;
# they are dropped before writing the flat table.
del PMSI_flat['year_x'], PMSI_flat['year_y']
PMSI_flat.to_parquet(path_PMSI_flat + '/PMSI_flat.parquet', compression=None, index=False)

(6, 84)
(18, 97)


In [14]:
# Shape of the flat table after year_x / year_y were removed.
PMSI_flat.shape

(18, 95)

In [16]:
# Display the final flat table (the stacked MCO_A, MCO_D and MCO_UM joins).
PMSI_flat

Unnamed: 0,DAT_RET,ENT_DAT,ETA_NUM,FHO_RET,FOR_NUM,MCO_A__ACV_ACT,MCO_A__ANP_ACT,MCO_A__CDC_ACT,MCO_A__DOC_ACT,MCO_A__ENT_DAT_DEL,...,PMS_RET,RSA_NUM,SEJ_NUM,SEJ_RET,SEX_RET,SOR_ANN,SOR_DAT,SOR_MOI,VID_HOSP_FOR,year
0,0,,10000123,0,,1.0,1.0,AAAA123,,2.0,...,0,20000123,50,0,0,2007,,2,,2007.0
1,0,,10000123,0,,1.0,0.0,BBBB123,,2.0,...,0,20000345,55,0,0,2007,,2,,2007.0
2,0,8032008.0,10000123,0,2.0,1.0,1.0,AAAA123,,2.0,...,0,30000546,90,0,0,2008,10032008.0,3,10.0,2008.0
3,0,15032008.0,10000123,0,2.0,1.0,0.0,BBBB123,,2.0,...,0,30000852,95,0,0,2008,17032008.0,3,10.0,2008.0
4,0,,10000123,0,,1.0,1.0,AAAA123,,2.0,...,0,10000987,4,0,0,2006,,1,,2006.0
5,0,,10000123,0,,1.0,0.0,BBBB123,,2.0,...,0,10000543,9,0,0,2006,,1,,2006.0
0,0,,10000123,0,,,,,,,...,0,20000123,50,0,0,2007,,2,,2007.0
1,0,,10000123,0,,,,,,,...,0,20000345,55,0,0,2007,,2,,
2,0,8032008.0,10000123,0,2.0,,,,,,...,0,30000546,90,0,0,2008,10032008.0,3,10.0,
3,0,15032008.0,10000123,0,2.0,,,,,,...,0,30000852,95,0,0,2008,17032008.0,3,10.0,2008.0
