In [1]:
import pyspark.sql.functions as func
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, DataFrame
import pyarrow.parquet as pq
import pandas as pd
import os

In [2]:
# Fixture locations, expressed relative to the notebook's working directory.
_parquet_root = '../src/test/resources/flattening/parquet-table'
path_parquets = _parquet_root + '/single_table'           # source tables, one sub-directory each
path_PMSI_flat = _parquet_root + '/flat_table/PMSI_Flat'  # where the joined tables are written

In [3]:
# Show the entries available under the single-table directory.
# Note: os.listdir returns names in arbitrary order.
os.listdir(path_parquets)

['HAD_C',
 'HAD_D',
 'HAD_B',
 'IR_IMB_R',
 'MCO_D',
 'MCO_C',
 'MCO_B',
 'ER_PRS_F',
 'MCO_UM',
 'SSR_CSARR',
 'SSR_CCAM',
 'SSR_C',
 'SSR_D',
 'HAD_A',
 'SSR_B',
 'ER_PHA_F',
 'MCO_A',
 'ER_CAM_F',
 'IR_BEN_R']

In [4]:
# Keep only the entries whose name begins with 'MCO'.
liste_bases = [
    name
    for name in os.listdir(path_parquets)
    if name.startswith('MCO')
]

In [5]:
# Load one parquet file per MCO table into a pandas DataFrame, keyed by table name.
bases_MCO = {}

for base in liste_bases:
    # MCO_C is read from a nested 'NUM_ENQ=Patient_02' sub-directory; every other
    # table is read directly from its 'year=2008' directory.
    partition = 'year=2008/NUM_ENQ=Patient_02' if base == 'MCO_C' else 'year=2008'
    path = path_parquets + '/' + base + '/' + partition

    # os.listdir returns names in arbitrary order: sort so that the file picked is
    # the same on every run and every machine, even if several files are present.
    parquet_files = sorted(file for file in os.listdir(path) if file.endswith('parquet'))
    if not parquet_files:
        # Fail with an explicit message instead of an opaque IndexError.
        raise FileNotFoundError('No parquet file found in ' + path)

    parquet = parquet_files[0]
    bases_MCO[base] = pq.read_table(path + '/' + parquet).to_pandas()

In [6]:
# Prefix the columns of every table except MCO_B with '<table>__', so each column
# name records the table it came from. The join keys keep their bare names.
join_keys = ['ETA_NUM', 'RSA_NUM']

for base in bases_MCO:
    if base != 'MCO_B':
        prefix = base + '__'
        # Columns that already carry the prefix are left alone: this keeps the cell
        # idempotent, so re-running it does not stack prefixes
        # (e.g. 'MCO_D__MCO_D__ASS_DGN').
        bases_MCO[base].columns = [
            col if col in join_keys or col.startswith(prefix) else prefix + col
            for col in bases_MCO[base].columns
        ]
    print(base,' : ',bases_MCO[base].shape, " ", bases_MCO[base].columns)

MCO_D  :  (1, 4)   Index(['MCO_D__ASS_DGN', 'ETA_NUM', 'RSA_NUM', 'MCO_D__RSS_NUM'], dtype='object')
MCO_C  :  (2, 16)   Index(['MCO_C__DAT_RET', 'ETA_NUM', 'MCO_C__FHO_RET', 'MCO_C__FOR_NUM',
       'MCO_C__NAI_RET', 'MCO_C__NIR_RET', 'MCO_C__PMS_RET', 'RSA_NUM',
       'MCO_C__SEJ_NUM', 'MCO_C__SEJ_RET', 'MCO_C__SEX_RET', 'MCO_C__SOR_ANN',
       'MCO_C__SOR_MOI', 'MCO_C__VID_HOSP_FOR', 'MCO_C__ENT_DAT',
       'MCO_C__SOR_DAT'],
      dtype='object')
MCO_B  :  (2, 60)   Index(['BDI_DEP', 'BDI_COD', 'AGE_ANN', 'AGE_GES', 'AGE_JOU', 'BEB_SEJ',
       'BEH_NBJ', 'CAI_SUP_NBR', 'COD_IGS', 'COD_SEX', 'DGN_PAL', 'DGN_REL',
       'DLY_ACT', 'ENT_MOD', 'ENT_PRV', 'ETA_NUM', 'ETE_GHS_NUM',
       'GHS_9510_ACT', 'GHS_9511_ACT', 'GHS_9512_ACT', 'GHS_9515_ACT',
       'GHS_9524_ACT', 'GHS_NUM', 'GRC_GHM', 'GRC_RET', 'GRC_VER', 'GRG_GHM',
       'GRG_RET', 'GRG_VER', 'NBR_ACT', 'NBR_DGN', 'NBR_RUM', 'NBR_SEA',
       'NBR_SEA_SROS', 'NBR_SUP_NN1', 'NBR_SUP_NN2', 'NBR_SUP_NN3',
       'NBR_SUP_

# Construction de la base aplatie

MCO_B / MCO_C

In [7]:
# Central table: inner join of MCO_B and MCO_C on the two shared key columns.
table_centrale = bases_MCO['MCO_B'].merge(
    bases_MCO['MCO_C'],
    how='inner',
    on=['ETA_NUM', 'RSA_NUM'],
)
print(table_centrale.shape)
# Written without compression.
table_centrale.to_parquet(path_PMSI_flat + '/table_centrale.parquet', compression=None)

(2, 74)


ajout de MCO_A

In [8]:
# Inner join of the central table with MCO_A on the two shared key columns.
temp_MCO_A = table_centrale.merge(
    bases_MCO['MCO_A'],
    how='inner',
    on=['ETA_NUM', 'RSA_NUM'],
)
temp_MCO_A.to_parquet(path_PMSI_flat + '/tc_MCO_A.parquet', compression=None)
print(temp_MCO_A.shape)
# The flat table starts out as the MCO_A join (same object, not a copy).
PMSI_flat = temp_MCO_A

(2, 85)


ajout de MCO_D

In [9]:
# Inner join of the central table with MCO_D on the two shared key columns.
temp_MCO_D = pd.merge(table_centrale, bases_MCO['MCO_D'], on=['ETA_NUM', 'RSA_NUM'], how='inner')
temp_MCO_D.to_parquet(path_PMSI_flat + '/tc_MCO_D.parquet', compression=None)
print(temp_MCO_D.shape)
# Stack the MCO_A and MCO_D joins; sort=True orders the union of their columns.
# Built from temp_MCO_A rather than from PMSI_flat itself so that re-running this
# cell does not append the MCO_D rows a second time.
PMSI_flat = pd.concat([temp_MCO_A, temp_MCO_D], sort=True)
print(PMSI_flat.shape)

(1, 76)
(3, 87)


ajout de MCO_UM

In [10]:
# Inner join of the central table with MCO_UM on the two shared key columns.
temp_MCO_UM = pd.merge(table_centrale, bases_MCO['MCO_UM'], on=['ETA_NUM', 'RSA_NUM'], how='inner')
temp_MCO_UM.to_parquet(path_PMSI_flat + '/tc_MCO_UM.parquet', compression=None)
print(temp_MCO_UM.shape)
# Rebuild the flat table from the per-table joins instead of appending to the
# current value of PMSI_flat: the result is the same on a top-to-bottom run, and
# re-running this cell no longer appends the MCO_UM rows again.
flat_MCO_A_D = pd.concat([temp_MCO_A, temp_MCO_D], sort=True)
PMSI_flat = pd.concat([flat_MCO_A_D, temp_MCO_UM], sort=True)
print(PMSI_flat.shape)
PMSI_flat.to_parquet(path_PMSI_flat + '/PMSI_flat.parquet', compression=None)

(2, 77)
(5, 90)


In [11]:
# Display the flat table assembled by the preceding cells.
PMSI_flat

Unnamed: 0,AGE_ANN,AGE_GES,AGE_JOU,BDI_COD,BDI_DEP,BEB_SEJ,BEH_NBJ,CAI_SUP_NBR,COD_IGS,COD_SEX,...,SEJ_TYP,SEQ_RUM,SOR_ANN,SOR_DES,SOR_MOD,SOR_MOI,SUP_ENT_DPA,SUP_ENT_DPC,SUP_ENT_HEM,SUP_HEM_HS
0,35,,,75014,75,0,0,0,,1,...,,10,2008,,5,3,0,0,0,0
1,35,,,75014,75,1,0,0,,1,...,,10,2008,,5,3,0,0,0,0
0,35,,,75014,75,1,0,0,,1,...,,10,2008,,5,3,0,0,0,0
0,35,,,75014,75,0,0,0,,1,...,,10,2008,,5,3,0,0,0,0
1,35,,,75014,75,1,0,0,,1,...,,10,2008,,5,3,0,0,0,0
