In [1]:
import pyspark.sql.functions as func
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, DataFrame
import pyarrow.parquet as pq
import pandas as pd
import os

In [2]:
# All locations below are relative paths sharing the same resource root.
_flattening_dir = '../src/test/resources/flattening'

# Directory that is listed and read for the source CSV files.
csv_path = _flattening_dir + '/csv-table/PMSI'
# Directory that receives the flattened SSR parquet file.
path_SSR_flat = _flattening_dir + '/parquet-table/flat_table/SSR_Flat'
# Root for the per-table parquet output; the trailing slash matters because
# later cells build sub-paths by plain string concatenation.
parquet_path = _flattening_dir + "/parquet-table/single_table/"

In [3]:
# Keep only the entries of csv_path whose file name starts with 'T_SSR19'.
ssr_csv = list(
    filter(lambda file_name: file_name.startswith('T_SSR19'), os.listdir(csv_path))
)
print(ssr_csv)

['T_SSR19CCAM.csv', 'T_SSR19C.csv', 'T_SSR19B.csv', 'T_SSR19D.csv', 'T_SSR19CSARR.csv']


In [4]:
# Load every selected CSV into a DataFrame, keyed by a short table name
# derived from the file name (e.g. 'T_SSR19B.csv' -> 'SSR_B').
bases_SSR = {}

for csv_file in ssr_csv:
    table_df = pd.read_csv(csv_path + '/' + csv_file, sep=';')
    # Drop the extension, then strip the 'T_' prefix and turn '19' into '_'.
    table_name = csv_file.split('.')[0].replace('T_', '').replace('19', '_')
    bases_SSR[table_name] = table_df

In [5]:
# Write each source table to <parquet_path>/<table>/year=2019/<table>.parquet.
for name, base in bases_SSR.items():
    year_dir = os.path.join(parquet_path, name, 'year=2019')
    # makedirs(exist_ok=True) creates missing parents and does not raise when
    # the directory already exists, so the cell can be re-run safely
    # (os.mkdir raised FileExistsError on the second run).
    os.makedirs(year_dir, exist_ok=True)
    base.to_parquet(os.path.join(year_dir, name + '.parquet'), compression=None, index=False)

In [6]:
# Display the table names under which the loaded DataFrames are stored.
bases_SSR.keys()

dict_keys(['SSR_CCAM', 'SSR_C', 'SSR_B', 'SSR_D', 'SSR_CSARR'])

In [7]:
# Prefix the columns of every table except SSR_B with '<table>__' so that
# they stay distinguishable after merging. The join keys keep their names.
join_keys = ['ETA_NUM', 'RHA_NUM', 'RHS_NUM']

for base in bases_SSR:
    if base != 'SSR_B':
        prefix = base + '__'
        # Skip columns that already carry the prefix: this makes the cell
        # idempotent, whereas re-running it used to prefix them a second time.
        new_cols = [col if col in join_keys or col.startswith(prefix) else prefix + col
                    for col in bases_SSR[base].columns]
        bases_SSR[base].columns = new_cols
    print(base,' : ',bases_SSR[base].shape, " ", bases_SSR[base].columns)

SSR_CCAM  :  (3, 12)   Index(['ETA_NUM', 'SSR_CCAM__RHA_VER', 'RHA_NUM', 'RHS_NUM',
       'SSR_CCAM__CCAM_DEL_ENT_UM', 'SSR_CCAM__CCAM_ACT', 'SSR_CCAM__EXT_PMSI',
       'SSR_CCAM__CCAM_PHA_ACT', 'SSR_CCAM__CCAM_COD_ACT',
       'SSR_CCAM__CCAM_EXT_DOC', 'SSR_CCAM__CCAM_NBR_REA',
       'SSR_CCAM__CCAM_VAL_DAT'],
      dtype='object')
SSR_C  :  (2, 14)   Index(['ETA_NUM', 'SSR_C__NIR_RET', 'SSR_C__NAI_RET', 'SSR_C__SEX_RET',
       'SSR_C__SEJ_RET', 'SSR_C__DAT_RET', 'SSR_C__NUM_ENQ', 'RHA_NUM',
       'SSR_C__ENT_DAT', 'SSR_C__SOR_DAT', 'SSR_C__EXE_SOI_DTD',
       'SSR_C__EXE_SOI_DTF', 'SSR_C__MOI_LUN_1S', 'SSR_C__ANN_LUN_1S'],
      dtype='object')
SSR_B  :  (3, 54)   Index(['ETA_NUM', 'RHA_VER', 'GEN_VER', 'GRC_VER', 'GRC_GME', 'GRC_RET',
       'GRC_TOP_ERR', 'GRG_VER', 'GRG_GME', 'GRG_RET', 'GRG_TOP_ERR',
       'TYP_GEN_RHA', 'RHA_NUM', 'RHS_NUM', 'AGE_ANN', 'COD_SEX', 'BDI_COD',
       'BDI_DEP', 'RHS_ANT_SEJ_ENT', 'MOI_ANN_SOR_SEJ', 'HOS_TYP_UM',
       'AUT_TYP_UM', 'ETA_NUM

# Construction de la base aplatie

SSR_B / SSR_C

In [8]:
# Left-join SSR_C onto SSR_B using ETA_NUM and RHA_NUM as keys; the result
# is the central table that the remaining SSR tables are joined to.
table_centrale = bases_SSR['SSR_B'].merge(
    bases_SSR['SSR_C'],
    how='left',
    on=['ETA_NUM', 'RHA_NUM'],
)
print(table_centrale.shape)

(3, 66)


ajout de SSR_CCAM

In [9]:
# Left-join SSR_CCAM onto the central table; this first merged frame is the
# starting point of the flat table.
temp_SSR_CCAM = table_centrale.merge(
    bases_SSR['SSR_CCAM'],
    how='left',
    on=['ETA_NUM', 'RHA_NUM', 'RHS_NUM'],
)
print(temp_SSR_CCAM.shape)
SSR_flat = temp_SSR_CCAM

(3, 75)


ajout de SSR_CSARR

In [10]:
# Left-join SSR_CSARR onto the central table, then stack the result under
# the SSR_CCAM merge.
temp_SSR_CSARR = pd.merge(table_centrale, bases_SSR['SSR_CSARR'], on=['ETA_NUM', 'RHA_NUM', 'RHS_NUM'], how='left')
print(temp_SSR_CSARR.shape)
# Rebuild from temp_SSR_CCAM instead of appending to SSR_flat: the result is
# the same on a linear run, but re-running the cell no longer duplicates rows.
SSR_flat = pd.concat([temp_SSR_CCAM, temp_SSR_CSARR], sort=True)
print(SSR_flat.shape)

(3, 80)
(6, 89)


ajout de SSR_D

In [11]:
# Left-join SSR_D onto the central table, assemble the final flat table and
# write it to parquet.
temp_SSR_D = pd.merge(table_centrale, bases_SSR['SSR_D'], on=['ETA_NUM', 'RHA_NUM', 'RHS_NUM'], how='left')
print(temp_SSR_D.shape)
# Rebuild the flat table from the three merged frames rather than appending
# to the previous SSR_flat, so re-running this cell cannot duplicate rows.
SSR_flat = pd.concat([temp_SSR_CCAM, temp_SSR_CSARR, temp_SSR_D], sort=True)
SSR_flat['year'] = 2019
print(SSR_flat.shape)
# Make sure the output directory exists before writing.
os.makedirs(path_SSR_flat, exist_ok=True)
SSR_flat.to_parquet(path_SSR_flat + '/SSR_flat.parquet', compression=None, index=False)

(3, 68)
(9, 92)


In [12]:
# Display the column index of the flattened table.
SSR_flat.columns

Index(['AGE_ANN', 'ALI_DEP', 'ANC_CHI', 'AUT_TYP_UM', 'BDI_COD', 'BDI_DEP',
       'COD_SEX', 'CON_DEP', 'CPT_DEP', 'DEB_FIN', 'DEB_SEM', 'DPL_DEP',
       'ENT_MOD', 'ENT_PRV', 'ETA_NUM', 'ETA_NUM_GEO', 'ETL_AFF', 'EXB_NBJ',
       'EXB_TOP', 'EXH_NBJ', 'FP_PEC', 'GEN_VER', 'GMT_NUM', 'GRC_GME',
       'GRC_RET', 'GRC_TOP_ERR', 'GRC_VER', 'GRG_GME', 'GRG_RET',
       'GRG_TOP_ERR', 'GRG_VER', 'HAB_DEP', 'HOS_TYP_UM', 'JP_HWE', 'JP_WE',
       'LIT_DEDIE', 'MOI_ANN', 'MOI_ANN_SOR_SEJ', 'MOR_PRP', 'NBR_CCAM',
       'NBR_CSARR', 'NBR_DGN', 'REHOS_PRJ_THP', 'REL_DEP', 'RHA_NUM',
       'RHA_VER', 'RHS_ANT_SEJ_ENT', 'RHS_NUM', 'SCORE_RR', 'SEJ_ANT',
       'SOR_DES', 'SOR_MOD', 'SSR_CCAM__CCAM_ACT', 'SSR_CCAM__CCAM_COD_ACT',
       'SSR_CCAM__CCAM_DEL_ENT_UM', 'SSR_CCAM__CCAM_EXT_DOC',
       'SSR_CCAM__CCAM_NBR_REA', 'SSR_CCAM__CCAM_PHA_ACT',
       'SSR_CCAM__CCAM_VAL_DAT', 'SSR_CCAM__EXT_PMSI', 'SSR_CCAM__RHA_VER',
       'SSR_CSARR__APP_SUP', 'SSR_CSARR__CLAS_NBR_PAT_PS',
       'SSR_