In [1]:
# Phcli Jupyter Python Template
#
# User manual:
# 1. Define global variables in the first input cell.
# 2. Phcli automatically initializes the Spark Session in the second input cell.
# 3. After `phcli maxauto dag`, every print is converted to a logger.debug() call.
# 4. Start coding from the third input cell; after `phcli maxauto dag` all of that
#    code is collected into a single method.


# Config defined in here

############## == config == ###################
job_name = "union_drug_analyse"
job_runtime = "python3"
job_command = "submit"
job_timeout = 720.0
############## == config == ###################


# Variables defined in here

############## == input args == ###################
# NOTE(review): "paramater" is misspelled; left unchanged because the name may be
# referenced by the job definition outside this notebook -- confirm before renaming.
g_input_paramater = 'Empty'
g_partition_num = 2
############## == input args == ###################

############## == output args == ###################
g_out_parameter = 'Empty'
############## == output args == ###################

############## == preset function == ###################
from phcli.ph_max_auto.ph_hook.get_abs_path import get_result_path
from phcli.ph_max_auto.ph_hook.get_abs_path import get_depends_path
# Example usage of the preset helpers (disabled in this notebook):
# result_path_prefix = get_result_path({"name":job_name})
# depends_path = get_depends_path({"name":job_name})
############## == preset function == ###################


In [2]:
# Initialize the Spark Session
# YARN URL: http://161.189.223.227:8088/cluster
import os
from pyspark.sql import SparkSession, functions as F

# Build (or reuse) the YARN-backed Spark session with Hive support.
# One driver and one executor, each with 1 core / 4g; whole-stage codegen is
# switched off and Arrow is enabled for pandas conversions.
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("hadoop write his_doing.union_drug_analyse in jupyter using python3")
    .config("spark.driver.cores", "1")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.cores", "1")
    .config("spark.executor.memory", "4g")
    .config("spark.executor.instances", "1")
    .config('spark.sql.codegen.wholeStage', False)
    .config('spark.sql.execution.arrow.pyspark.enabled', True)
    .enableHiveSupport()
    .getOrCreate()
)

# S3A connector settings that carry no secrets are always applied.
hadoop_conf = spark._jsc.hadoopConfiguration()
hadoop_conf.set("com.amazonaws.services.s3.enableV4", "true")
hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
hadoop_conf.set("fs.s3a.endpoint", "s3.cn-northwest-1.amazonaws.com.cn")

# Credentials are read from the environment only. Never put key material in the
# notebook: the key pair that used to be hardcoded here as a fallback has been
# committed and must be treated as compromised and rotated.
access_key = os.getenv("AWS_ACCESS_KEY_ID")
secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")
if access_key and secret_key:
    hadoop_conf.set("fs.s3a.access.key", access_key)
    hadoop_conf.set("fs.s3a.secret.key", secret_key)
# When the variables are absent no static keys are configured, leaving S3A to
# use whatever credential source the cluster provides.

In [3]:

from pyspark.sql.types import IntegerType, DoubleType, StringType, StructType
from pyspark.sql.functions import col, date_format, count, isnull, lit
from pyspark.sql.functions import when, isnan, udf, pandas_udf, PandasUDFType
from pyspark.sql.window import Window
from pyspark.sql import functions as Func
from pyspark.sql import DataFrame, SparkSession    

from typing import Iterator

import pandas as pd
import re    


In [None]:
# %%

## 参数化文件读入


In [4]:
# %%
## ====== Input and output locations ======

g_whether_save_result = True

# Root directory; every path below is derived from it.
p_main_dir = "s3://ph-origin-files/user/zazhao/2020年结果-csv/"

# Input directory
p_patient_with_select_out = f"{p_main_dir}HIS_result/patient_with_select_result"

# Output file
p_patient_union_drug_analyse_out = f"{p_main_dir}HIS_result/patient_union_drug_analyse_result"

# Result table outputs
p_result_table_0 = f"{p_main_dir}HIS_result/Table_result/Table_result_0"
p_result_table_1 = f"{p_main_dir}HIS_result/Table_result/Table_result_1"
p_result_table_2 = f"{p_main_dir}HIS_result/Table_result/Table_result_2"


In [5]:
# %%

## Load the prescription table from the parquet input directory
df_patient_object = spark.read.format("parquet").load(p_patient_with_select_out)


In [6]:
# %%

# Single-drug vs. combination-therapy analysis

# Slim projection with the prescription date cast to int.
# NOTE(review): not referenced by any cell visible here; kept in case a later
# cell uses it.
df_data_temp = (
    df_patient_object
    .select(["医院ID", "就诊类型", "患者ID", "OUT_ID", "标准处方日期", "MOLECULE_CATEGORY", "MOLECULE"])
    .withColumn("标准处方日期", col("标准处方日期").cast("int"))
)

# Number of distinct molecule categories per visit and prescription day.
df_data_a = (
    df_patient_object
    .withColumn("RX_DATE_STD", col("标准处方日期"))
    .groupBy(["医院ID", "就诊类型", "患者ID", "OUT_ID", "RX_DATE_STD"])
    .agg(Func.countDistinct("MOLECULE_CATEGORY").alias("分子种类数"))
)

# Combination used per visit and prescription day, as "+"-joined strings.
# collect_set gives no ordering guarantee, so the set is sorted before joining:
# otherwise the same combination could surface as "A+B" in one group and "B+A"
# in another, which would be counted as two different formulas downstream.
df_data_b = (
    df_patient_object
    .withColumn("RX_DATE_STD", col("标准处方日期"))
    .groupBy(["医院ID", "就诊类型", "患者ID", "OUT_ID", "RX_DATE_STD"])
    .agg(
        Func.concat_ws(
            "+", Func.sort_array(Func.collect_set(col("MOLECULE_CATEGORY")))
        ).alias("formula"),
        Func.concat_ws(
            "+", Func.sort_array(Func.collect_set(col("MOLECULE")))
        ).alias("mole_comb"),
    )
)



# Flag the initial prescription day of each visit.
win = Window.partitionBy(["医院ID", "就诊类型", "患者ID", "OUT_ID"])

# SEQ numbers the prescription days of a visit from earliest to latest, so
# SEQ == 1 is the first prescription. (The previous descending order made
# IF_FIRST_RX mark the *last* prescription day instead.) Rows with a NULL date
# sort last so they are never flagged as the first prescription.
# MAX_SEQ is the highest SEQ within the visit.
df_data_c = (
    df_data_a
    .withColumn("SEQ", Func.row_number().over(win.orderBy(col("RX_DATE_STD").asc_nulls_last())))
    .withColumn("IF_FIRST_RX", when(col("SEQ") == 1, 1).otherwise(0))
    .withColumn("MAX_SEQ", Func.max(col("SEQ")).over(win))
)


# Attach the combination strings to the per-day sequence table.
visit_keys = ["医院ID", "就诊类型", "患者ID", "OUT_ID"]
df_data_d = df_data_c.join(df_data_b, on=visit_keys + ["RX_DATE_STD"], how="left")

# A visit counts as a regimen change when more than one distinct formula occurs in it.
formula_counts = (
    df_data_d
    .groupBy(visit_keys)
    .agg(Func.countDistinct("formula").alias("formula_numbers"))
)
df_data_e = formula_counts.withColumn(
    "IF_CHANGE_RX", Func.when(col("formula_numbers") > 1, 1).otherwise(0)
)

# Bring the change flag back onto every prescription day and restore the
# original date column name.
df_data_f = (
    df_data_d
    .join(df_data_e, on=visit_keys, how="left")
    .withColumnRenamed("RX_DATE_STD", "标准处方日期")
)

# Join the per-day flags back onto the prescription rows and classify each row
# as single-drug ("单药") or combination ("联用").

# Formula strings that pair a category with itself; these are treated as single-drug.
same_category_formulas = [
    "头孢菌素类+头孢菌素类", "青霉素类+青霉素类", "其他抗生素+其他抗生素",
    "头孢菌素酶抑制剂+头孢菌素酶抑制剂", "四环素类+四环素类",
    "氨基糖甙+氨基糖甙", "氟喹诺酮+氟喹诺酮",
]

# Precedence: same-category pair -> single; any other formula containing "+" ->
# combination; everything else (including a NULL formula) -> single.
# The regex is a raw string: "\+" in a normal string is an invalid escape sequence.
df_patient_analyse_std = (
    df_patient_object
    .join(df_data_f, on=["医院ID", "就诊类型", "患者ID", "OUT_ID", "标准处方日期"], how="left")
    .withColumn(
        "single_or_formula",
        Func.when(col("formula").isin(same_category_formulas), "单药")
            .when(col("formula").rlike(r"\+"), "联用")
            .otherwise("单药"),
    )
)


In [7]:
# %%

# Compute per-group sales (sum) and patient counts, then attach them to every row.

# Grouping keys. The same list drives both the aggregation and the join back,
# so it is defined exactly once to keep the two from drifting apart.
rule_ps = ["年","月","就诊类型","标准医保类型","性别","年龄区间","标准诊断","severe_case","标准科室",
           "single_or_formula","IF_FIRST_RX","IF_CHANGE_RX","formula","mole_comb",
           "白细胞计数","c反应蛋白","降钙素原","嗜肺军团菌","肺炎衣原体","肺炎支原体","冠状病毒",
           "合胞病毒","流感病毒","腺病毒","柯萨奇病毒","鲍曼氏不动杆菌","大肠埃希菌","肺炎克雷伯菌",
           "肺炎链球菌","金黄色葡萄球菌","流感嗜血菌","嗜麦芽寡养单胞菌","嗜麦芽窄食单胞菌","铜绿假单胞菌",
           "阴沟肠杆菌","混合感染","心律不齐","其他心血管疾病","脑血管疾病","神经系统疾病","高血糖","高血压",
           "高血脂","肝功能异常","肾功能异常","结缔组织病","COPD","哮喘","支气管扩张","恶性实体瘤",
           "HAP患者","seg1_grp1","seg1_grp2","seg2_grp1","seg3_grp1","seg3_grp2","seg3_grp3"]

# sales    = total 金额 within the group
# patients = number of distinct (患者ID, 就诊序号) pairs within the group
df_patient_std_ps = (
    df_patient_analyse_std
    .groupBy(rule_ps)
    .agg(
        Func.sum(col("金额")).alias("sales"),
        Func.countDistinct("患者ID", "就诊序号").alias("patients"),
    )
)

# NOTE(review): this is a plain equality join, so a row with NULL in any key
# column will not match its group and gets NULL sales/patients -- confirm the
# key columns are NULL-free upstream.
df_patient_std_pha = df_patient_analyse_std.join(df_patient_std_ps, rule_ps, "left")


In [25]:
# %%
# Derive the calendar quarter from the "<年>-<月>" string.
year_month = Func.concat_ws("-", col("年"), col("月"))
df_patient_std_pha = df_patient_std_pha.withColumn("Quarter", Func.quarter(year_month))

In [8]:
# %%

# Column groups shared by the three result projections below.
base_cols = ["年","Quarter","就诊类型","标准医保类型","性别","年龄区间","标准诊断","severe_case","标准科室"]
lab_and_pathogen_cols = ["白细胞计数","c反应蛋白","降钙素原","嗜肺军团菌","肺炎衣原体","肺炎支原体","冠状病毒",
                         "合胞病毒","流感病毒","腺病毒","柯萨奇病毒","鲍曼氏不动杆菌","大肠埃希菌","肺炎克雷伯菌",
                         "肺炎链球菌","金黄色葡萄球菌","流感嗜血菌","嗜麦芽寡养单胞菌","嗜麦芽窄食单胞菌","铜绿假单胞菌",
                         "阴沟肠杆菌","混合感染"]
comorbidity_cols = ["心律不齐","其他心血管疾病","脑血管疾病","神经系统疾病","高血糖","高血压",
                    "高血脂","肝功能异常","肾功能异常","结缔组织病","COPD","哮喘","支气管扩张","恶性实体瘤"]
metric_cols = ["sales","patients"]

# Table 0: base dimensions + comorbidities + change flag + HAP flag + metrics.
df_table_zero = df_patient_std_pha.select(
    base_cols + comorbidity_cols + ["IF_CHANGE_RX","HAP患者"] + metric_cols
)

# Table 1: adds product-level (pfc) columns plus lab / pathogen columns.
product_cols = ["PACK_ID","MOLECULE","MOLECULE_CATEGORY","BRAND","form","SPEC","PACK_NUMBER","MANUFACTURER"]
df_table_one = df_patient_std_pha.select(
    base_cols + product_cols + lab_and_pathogen_cols + comorbidity_cols + metric_cols
)

# Table 2: adds the single/combination regimen columns plus lab / pathogen columns.
regimen_cols = ["single_or_formula","IF_FIRST_RX","IF_CHANGE_RX","formula","mole_comb"]
df_table_two = df_patient_std_pha.select(
    base_cols + regimen_cols + lab_and_pathogen_cols + comorbidity_cols + metric_cols
)


In [13]:
# %%
# Rename the Chinese dimension columns to their English output names.
# Renames shared by all three result tables, applied in this order.
common_renames = [
    ("年", "Year"),
    ("标准医保类型", "std_charge_type"),
    ("年龄区间", "age_range"),
    ("标准诊断", "std_diag"),
    ("标准科室", "std_dept"),
]

# Table 0 additionally renames the HAP flag.
for old_name, new_name in common_renames + [("HAP患者", "time_diff_larger_than_2_days")]:
    df_table_zero = df_table_zero.withColumnRenamed(old_name, new_name)

# Table 1 additionally renames the pack id and the molecule category.
for old_name, new_name in common_renames + [("PACK_ID", "pfc"), ("MOLECULE_CATEGORY", "std_mole_category")]:
    df_table_one = df_table_one.withColumnRenamed(old_name, new_name)

# Table 2 only needs the shared renames.
for old_name, new_name in common_renames:
    df_table_two = df_table_two.withColumnRenamed(old_name, new_name)
                           

AnalysisException: cannot resolve '`月`' given input columns: [COPD, IF_CHANGE_RX, Quarter, Year, age_range, patients, sales, severe_case, std_charge_type, std_dept, std_diag, time_diff_larger_than_2_days, 其他心血管疾病, 哮喘, 就诊类型, 心律不齐, 性别, 恶性实体瘤, 支气管扩张, 神经系统疾病, 结缔组织病, 肝功能异常, 肾功能异常, 脑血管疾病, 高血压, 高血糖, 高血脂];;
'Project [Year#1839, CASE WHEN ('月 = 01) THEN 第一季度 WHEN ('月 = 02) THEN 第一季度 WHEN ('月 = 03) THEN 第一季度 WHEN ('月 = 04) THEN 第二季度 WHEN ('月 = 05) THEN 第二季度 WHEN ('月 = 06) THEN 第二季度 WHEN ('月 = 07) THEN 第三季度 WHEN ('月 = 08) THEN 第三季度 WHEN ('月 = 09) THEN 第三季度 WHEN ('月 = 10) THEN 第四季度 WHEN ('月 = 11) THEN 第四季度 WHEN ('月 = 12) THEN 第四季度 END AS Quarter#3841, 就诊类型#2, std_charge_type#1895, 性别#23, age_range#1923, std_diag#1951, severe_case#85, std_dept#1979, 心律不齐#33, 其他心血管疾病#35, 脑血管疾病#36, 神经系统疾病#37, 高血糖#38, 高血压#39, 高血脂#40, 肝功能异常#41, 肾功能异常#42, 结缔组织病#43, COPD#44, 哮喘#45, 支气管扩张#46, 恶性实体瘤#47, IF_CHANGE_RX#786, ... 3 more fields]
+- Project [Year#1839, Quarter#1867, 就诊类型#2, std_charge_type#1895, 性别#23, age_range#1923, std_diag#1951, severe_case#85, std_dept#1979, 心律不齐#33, 其他心血管疾病#35, 脑血管疾病#36, 神经系统疾病#37, 高血糖#38, 高血压#39, 高血脂#40, 肝功能异常#41, 肾功能异常#42, 结缔组织病#43, COPD#44, 哮喘#45, 支气管扩张#46, 恶性实体瘤#47, IF_CHANGE_RX#786, ... 3 more fields]
   +- Project [Year#1839, Quarter#1867, 就诊类型#2, std_charge_type#1895, 性别#23, age_range#1923, std_diag#1951, severe_case#85, 标准科室#51 AS std_dept#1979, 心律不齐#33, 其他心血管疾病#35, 脑血管疾病#36, 神经系统疾病#37, 高血糖#38, 高血压#39, 高血脂#40, 肝功能异常#41, 肾功能异常#42, 结缔组织病#43, COPD#44, 哮喘#45, 支气管扩张#46, 恶性实体瘤#47, IF_CHANGE_RX#786, ... 3 more fields]
      +- Project [Year#1839, Quarter#1867, 就诊类型#2, std_charge_type#1895, 性别#23, age_range#1923, 标准诊断#32 AS std_diag#1951, severe_case#85, 标准科室#51, 心律不齐#33, 其他心血管疾病#35, 脑血管疾病#36, 神经系统疾病#37, 高血糖#38, 高血压#39, 高血脂#40, 肝功能异常#41, 肾功能异常#42, 结缔组织病#43, COPD#44, 哮喘#45, 支气管扩张#46, 恶性实体瘤#47, IF_CHANGE_RX#786, ... 3 more fields]
         +- Project [Year#1839, Quarter#1867, 就诊类型#2, std_charge_type#1895, 性别#23, 年龄区间#83 AS age_range#1923, 标准诊断#32, severe_case#85, 标准科室#51, 心律不齐#33, 其他心血管疾病#35, 脑血管疾病#36, 神经系统疾病#37, 高血糖#38, 高血压#39, 高血脂#40, 肝功能异常#41, 肾功能异常#42, 结缔组织病#43, COPD#44, 哮喘#45, 支气管扩张#46, 恶性实体瘤#47, IF_CHANGE_RX#786, ... 3 more fields]
            +- Project [Year#1839, Quarter#1867, 就诊类型#2, 标准医保类型#52 AS std_charge_type#1895, 性别#23, 年龄区间#83, 标准诊断#32, severe_case#85, 标准科室#51, 心律不齐#33, 其他心血管疾病#35, 脑血管疾病#36, 神经系统疾病#37, 高血糖#38, 高血压#39, 高血脂#40, 肝功能异常#41, 肾功能异常#42, 结缔组织病#43, COPD#44, 哮喘#45, 支气管扩张#46, 恶性实体瘤#47, IF_CHANGE_RX#786, ... 3 more fields]
               +- Project [Year#1839, 月#30 AS Quarter#1867, 就诊类型#2, 标准医保类型#52, 性别#23, 年龄区间#83, 标准诊断#32, severe_case#85, 标准科室#51, 心律不齐#33, 其他心血管疾病#35, 脑血管疾病#36, 神经系统疾病#37, 高血糖#38, 高血压#39, 高血脂#40, 肝功能异常#41, 肾功能异常#42, 结缔组织病#43, COPD#44, 哮喘#45, 支气管扩张#46, 恶性实体瘤#47, IF_CHANGE_RX#786, ... 3 more fields]
                  +- Project [年#29 AS Year#1839, 月#30, 就诊类型#2, 标准医保类型#52, 性别#23, 年龄区间#83, 标准诊断#32, severe_case#85, 标准科室#51, 心律不齐#33, 其他心血管疾病#35, 脑血管疾病#36, 神经系统疾病#37, 高血糖#38, 高血压#39, 高血脂#40, 肝功能异常#41, 肾功能异常#42, 结缔组织病#43, COPD#44, 哮喘#45, 支气管扩张#46, 恶性实体瘤#47, IF_CHANGE_RX#786, ... 3 more fields]
                     +- Project [年#29, 月#30, 就诊类型#2, 标准医保类型#52, 性别#23, 年龄区间#83, 标准诊断#32, severe_case#85, 标准科室#51, 心律不齐#33, 其他心血管疾病#35, 脑血管疾病#36, 神经系统疾病#37, 高血糖#38, 高血压#39, 高血脂#40, 肝功能异常#41, 肾功能异常#42, 结缔组织病#43, COPD#44, 哮喘#45, 支气管扩张#46, 恶性实体瘤#47, IF_CHANGE_RX#786, ... 3 more fields]
                        +- Project [年#29, 月#30, 就诊类型#2, 标准医保类型#52, 性别#23, 年龄区间#83, 标准诊断#32, severe_case#85, 标准科室#51, single_or_formula#1222, IF_FIRST_RX#641, IF_CHANGE_RX#786, formula#616, mole_comb#624, 白细胞计数#60, C反应蛋白#61, 降钙素原#62, 嗜肺军团菌#63, 肺炎衣原体#64, 肺炎支原体#65, 冠状病毒#66, 合胞病毒#67, 流感病毒#68, 腺病毒#69, ... 83 more fields]
                           +- Join LeftOuter, (((((((((((((((((((((((((((((((((((((((((((((((((((((((((年#29 = 年#1526) AND (月#30 = 月#1527)) AND (就诊类型#2 = 就诊类型#1499)) AND (标准医保类型#52 = 标准医保类型#1549)) AND (性别#23 = 性别#1520)) AND (年龄区间#83 = 年龄区间#1580)) AND (标准诊断#32 = 标准诊断#1529)) AND (severe_case#85 = severe_case#1582)) AND (标准科室#51 = 标准科室#1548)) AND (single_or_formula#1222 = single_or_formula#1496)) AND (IF_FIRST_RX#641 = IF_FIRST_RX#1593)) AND (IF_CHANGE_RX#786 = IF_CHANGE_RX#1595)) AND (formula#616 = formula#1597)) AND (mole_comb#624 = mole_comb#1594)) AND (白细胞计数#60 = 白细胞计数#1557)) AND (C反应蛋白#61 = c反应蛋白#1558)) AND (降钙素原#62 = 降钙素原#1559)) AND (嗜肺军团菌#63 = 嗜肺军团菌#1560)) AND (肺炎衣原体#64 = 肺炎衣原体#1561)) AND (肺炎支原体#65 = 肺炎支原体#1562)) AND (冠状病毒#66 = 冠状病毒#1563)) AND (合胞病毒#67 = 合胞病毒#1564)) AND (流感病毒#68 = 流感病毒#1565)) AND (腺病毒#69 = 腺病毒#1566)) AND (柯萨奇病毒#70 = 柯萨奇病毒#1567)) AND (鲍曼氏不动杆菌#71 = 鲍曼氏不动杆菌#1568)) AND (大肠埃希菌#72 = 大肠埃希菌#1569)) AND (肺炎克雷伯菌#73 = 肺炎克雷伯菌#1570)) AND (肺炎链球菌#74 = 肺炎链球菌#1571)) AND (金黄色葡萄球菌#75 = 金黄色葡萄球菌#1572)) AND (流感嗜血菌#76 = 流感嗜血菌#1573)) AND (嗜麦芽寡养单胞菌#77 = 嗜麦芽寡养单胞菌#1574)) AND (嗜麦芽窄食单胞菌#78 = 嗜麦芽窄食单胞菌#1575)) AND (铜绿假单胞菌#79 = 铜绿假单胞菌#1576)) AND (阴沟肠杆菌#80 = 阴沟肠杆菌#1577)) AND (混合感染#84 = 混合感染#1581)) AND (心律不齐#33 = 心律不齐#1530)) AND (其他心血管疾病#35 = 其他心血管疾病#1532)) AND (脑血管疾病#36 = 脑血管疾病#1533)) AND (神经系统疾病#37 = 神经系统疾病#1534)) AND (高血糖#38 = 高血糖#1535)) AND (高血压#39 = 高血压#1536)) AND (高血脂#40 = 高血脂#1537)) AND (肝功能异常#41 = 肝功能异常#1538)) AND (肾功能异常#42 = 肾功能异常#1539)) AND (结缔组织病#43 = 结缔组织病#1540)) AND (COPD#44 = COPD#1541)) AND (哮喘#45 = 哮喘#1542)) AND (支气管扩张#46 = 支气管扩张#1543)) AND (恶性实体瘤#47 = 恶性实体瘤#1544)) AND (HAP患者#86 = HAP患者#1583)) AND (seg1_grp1#90 = seg1_grp1#1587)) AND (seg1_grp2#91 = seg1_grp2#1588)) AND (seg2_grp1#92 = seg2_grp1#1589)) AND (seg3_grp1#93 = seg3_grp1#1590)) AND (seg3_grp2#94 = seg3_grp2#1591)) AND (seg3_grp3#95 = seg3_grp3#1592))
                              :- Project [医院ID#1, 就诊类型#2, 患者ID#3, OUT_ID#5, 标准处方日期#7, uni_code#0, 就诊序号#4, MOLECULE#6, 标准入院时间#8, 标准出院时间#9, 药品名称#10, 规格#11, 剂型#12, 厂家#13, 医保类型#14, 科室#15, 省份#16, 城市#17, 医院等级#18, 处方日期#19, 入院时间#20, 出院时间#21, 年龄#22, 性别#23, ... 81 more fields]
                              :  +- Project [医院ID#1, 就诊类型#2, 患者ID#3, OUT_ID#5, 标准处方日期#7, uni_code#0, 就诊序号#4, MOLECULE#6, 标准入院时间#8, 标准出院时间#9, 药品名称#10, 规格#11, 剂型#12, 厂家#13, 医保类型#14, 科室#15, 省份#16, 城市#17, 医院等级#18, 处方日期#19, 入院时间#20, 出院时间#21, 年龄#22, 性别#23, ... 81 more fields]
                              :     +- Project [医院ID#1, 就诊类型#2, 患者ID#3, OUT_ID#5, 标准处方日期#7, uni_code#0, 就诊序号#4, MOLECULE#6, 标准入院时间#8, 标准出院时间#9, 药品名称#10, 规格#11, 剂型#12, 厂家#13, 医保类型#14, 科室#15, 省份#16, 城市#17, 医院等级#18, 处方日期#19, 入院时间#20, 出院时间#21, 年龄#22, 性别#23, ... 80 more fields]
                              :        +- Join LeftOuter, (((((医院ID#1 = 医院ID#917) AND (就诊类型#2 = 就诊类型#918)) AND (患者ID#3 = 患者ID#919)) AND (OUT_ID#5 = OUT_ID#921)) AND (标准处方日期#7 = 标准处方日期#902))
                              :           :- Relation[uni_code#0,医院ID#1,就诊类型#2,患者ID#3,就诊序号#4,OUT_ID#5,MOLECULE#6,标准处方日期#7,标准入院时间#8,标准出院时间#9,药品名称#10,规格#11,剂型#12,厂家#13,医保类型#14,科室#15,省份#16,城市#17,医院等级#18,处方日期#19,入院时间#20,出院时间#21,年龄#22,性别#23,... 72 more fields] parquet
                              :           +- Project [医院ID#917, 就诊类型#918, 患者ID#919, OUT_ID#921, RX_DATE_STD#207 AS 标准处方日期#902, 分子种类数#402L, SEQ#633, IF_FIRST_RX#641, MAX_SEQ#651, formula#616, mole_comb#624, formula_numbers#779L, IF_CHANGE_RX#786]
                              :              +- Project [医院ID#917, 就诊类型#918, 患者ID#919, OUT_ID#921, RX_DATE_STD#207, 分子种类数#402L, SEQ#633, IF_FIRST_RX#641, MAX_SEQ#651, formula#616, mole_comb#624, formula_numbers#779L, IF_CHANGE_RX#786]
                              :                 +- Join LeftOuter, ((((医院ID#917 = 医院ID#794) AND (就诊类型#918 = 就诊类型#795)) AND (患者ID#919 = 患者ID#796)) AND (OUT_ID#921 = OUT_ID#798))
                              :                    :- Project [医院ID#917, 就诊类型#918, 患者ID#919, OUT_ID#921, RX_DATE_STD#207, 分子种类数#402L, SEQ#633, IF_FIRST_RX#641, MAX_SEQ#651, formula#616, mole_comb#624]
                              :                    :  +- Join LeftOuter, (((((医院ID#917 = 医院ID#662) AND (就诊类型#918 = 就诊类型#663)) AND (患者ID#919 = 患者ID#664)) AND (OUT_ID#921 = OUT_ID#666)) AND (RX_DATE_STD#207 = RX_DATE_STD#410))
                              :                    :     :- Project [医院ID#917, 就诊类型#918, 患者ID#919, OUT_ID#921, RX_DATE_STD#207, 分子种类数#402L, SEQ#633, IF_FIRST_RX#641, MAX_SEQ#651]
                              :                    :     :  +- Project [医院ID#917, 就诊类型#918, 患者ID#919, OUT_ID#921, RX_DATE_STD#207, 分子种类数#402L, SEQ#633, IF_FIRST_RX#641, MAX_SEQ#651, MAX_SEQ#651]
                              :                    :     :     +- Window [max(SEQ#633) windowspecdefinition(医院ID#917, 就诊类型#918, 患者ID#919, OUT_ID#921, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS MAX_SEQ#651], [医院ID#917, 就诊类型#918, 患者ID#919, OUT_ID#921]
                              :                    :     :        +- Project [医院ID#917, 就诊类型#918, 患者ID#919, OUT_ID#921, RX_DATE_STD#207, 分子种类数#402L, SEQ#633, IF_FIRST_RX#641]
                              :                    :     :           +- Project [医院ID#917, 就诊类型#918, 患者ID#919, OUT_ID#921, RX_DATE_STD#207, 分子种类数#402L, SEQ#633, CASE WHEN (SEQ#633 = 1) THEN 1 ELSE 0 END AS IF_FIRST_RX#641]
                              :                    :     :              +- Project [医院ID#917, 就诊类型#918, 患者ID#919, OUT_ID#921, RX_DATE_STD#207, 分子种类数#402L, SEQ#633]
                              :                    :     :                 +- Project [医院ID#917, 就诊类型#918, 患者ID#919, OUT_ID#921, RX_DATE_STD#207, 分子种类数#402L, SEQ#633, SEQ#633]
                              :                    :     :                    +- Window [row_number() windowspecdefinition(医院ID#917, 就诊类型#918, 患者ID#919, OUT_ID#921, RX_DATE_STD#207 DESC NULLS LAST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS SEQ#633], [医院ID#917, 就诊类型#918, 患者ID#919, OUT_ID#921], [RX_DATE_STD#207 DESC NULLS LAST]
                              :                    :     :                       +- Project [医院ID#917, 就诊类型#918, 患者ID#919, OUT_ID#921, RX_DATE_STD#207, 分子种类数#402L]
                              :                    :     :                          +- Aggregate [医院ID#917, 就诊类型#918, 患者ID#919, OUT_ID#921, RX_DATE_STD#207], [医院ID#917, 就诊类型#918, 患者ID#919, OUT_ID#921, RX_DATE_STD#207, count(distinct MOLECULE_CATEGORY#998) AS 分子种类数#402L]
                              :                    :     :                             +- Project [uni_code#916, 医院ID#917, 就诊类型#918, 患者ID#919, 就诊序号#920, OUT_ID#921, MOLECULE#922, 标准处方日期#923, 标准入院时间#924, 标准出院时间#925, 药品名称#926, 规格#927, 剂型#928, 厂家#929, 医保类型#930, 科室#931, 省份#932, 城市#933, 医院等级#934, 处方日期#935, 入院时间#936, 出院时间#937, 年龄#938, 性别#939, ... 73 more fields]
                              :                    :     :                                +- Relation[uni_code#916,医院ID#917,就诊类型#918,患者ID#919,就诊序号#920,OUT_ID#921,MOLECULE#922,标准处方日期#923,标准入院时间#924,标准出院时间#925,药品名称#926,规格#927,剂型#928,厂家#929,医保类型#930,科室#931,省份#932,城市#933,医院等级#934,处方日期#935,入院时间#936,出院时间#937,年龄#938,性别#939,... 72 more fields] parquet
                              :                    :     +- Project [医院ID#662, 就诊类型#663, 患者ID#664, OUT_ID#666, RX_DATE_STD#410, formula#616, concat_ws(+, mole_comb#608) AS mole_comb#624]
                              :                    :        +- Project [医院ID#662, 就诊类型#663, 患者ID#664, OUT_ID#666, RX_DATE_STD#410, concat_ws(+, formula#606) AS formula#616, mole_comb#608]
                              :                    :           +- Aggregate [医院ID#662, 就诊类型#663, 患者ID#664, OUT_ID#666, RX_DATE_STD#410], [医院ID#662, 就诊类型#663, 患者ID#664, OUT_ID#666, RX_DATE_STD#410, collect_set(MOLECULE_CATEGORY#743, 0, 0) AS formula#606, collect_set(MOLECULE#667, 0, 0) AS mole_comb#608]
                              :                    :              +- Project [uni_code#661, 医院ID#662, 就诊类型#663, 患者ID#664, 就诊序号#665, OUT_ID#666, MOLECULE#667, 标准处方日期#668, 标准入院时间#669, 标准出院时间#670, 药品名称#671, 规格#672, 剂型#673, 厂家#674, 医保类型#675, 科室#676, 省份#677, 城市#678, 医院等级#679, 处方日期#680, 入院时间#681, 出院时间#682, 年龄#683, 性别#684, ... 73 more fields]
                              :                    :                 +- Relation[uni_code#661,医院ID#662,就诊类型#663,患者ID#664,就诊序号#665,OUT_ID#666,MOLECULE#667,标准处方日期#668,标准入院时间#669,标准出院时间#670,药品名称#671,规格#672,剂型#673,厂家#674,医保类型#675,科室#676,省份#677,城市#678,医院等级#679,处方日期#680,入院时间#681,出院时间#682,年龄#683,性别#684,... 72 more fields] parquet
                              :                    +- Project [医院ID#794, 就诊类型#795, 患者ID#796, OUT_ID#798, formula_numbers#779L, CASE WHEN (formula_numbers#779L > cast(1 as bigint)) THEN 1 ELSE 0 END AS IF_CHANGE_RX#786]
                              :                       +- Aggregate [医院ID#794, 就诊类型#795, 患者ID#796, OUT_ID#798], [医院ID#794, 就诊类型#795, 患者ID#796, OUT_ID#798, count(distinct formula#616) AS formula_numbers#779L]
                              :                          +- Project [医院ID#794, 就诊类型#795, 患者ID#796, OUT_ID#798, RX_DATE_STD#207, 分子种类数#402L, SEQ#633, IF_FIRST_RX#641, MAX_SEQ#651, formula#616, mole_comb#624]
                              :                             +- Join LeftOuter, (((((医院ID#794 = 医院ID#662) AND (就诊类型#795 = 就诊类型#663)) AND (患者ID#796 = 患者ID#664)) AND (OUT_ID#798 = OUT_ID#666)) AND (RX_DATE_STD#207 = RX_DATE_STD#410))
                              :                                :- Project [医院ID#794, 就诊类型#795, 患者ID#796, OUT_ID#798, RX_DATE_STD#207, 分子种类数#402L, SEQ#633, IF_FIRST_RX#641, MAX_SEQ#651]
                              :                                :  +- Project [医院ID#794, 就诊类型#795, 患者ID#796, OUT_ID#798, RX_DATE_STD#207, 分子种类数#402L, SEQ#633, IF_FIRST_RX#641, MAX_SEQ#651, MAX_SEQ#651]
                              :                                :     +- Window [max(SEQ#633) windowspecdefinition(医院ID#794, 就诊类型#795, 患者ID#796, OUT_ID#798, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS MAX_SEQ#651], [医院ID#794, 就诊类型#795, 患者ID#796, OUT_ID#798]
                              :                                :        +- Project [医院ID#794, 就诊类型#795, 患者ID#796, OUT_ID#798, RX_DATE_STD#207, 分子种类数#402L, SEQ#633, IF_FIRST_RX#641]
                              :                                :           +- Project [医院ID#794, 就诊类型#795, 患者ID#796, OUT_ID#798, RX_DATE_STD#207, 分子种类数#402L, SEQ#633, CASE WHEN (SEQ#633 = 1) THEN 1 ELSE 0 END AS IF_FIRST_RX#641]
                              :                                :              +- Project [医院ID#794, 就诊类型#795, 患者ID#796, OUT_ID#798, RX_DATE_STD#207, 分子种类数#402L, SEQ#633]
                              :                                :                 +- Project [医院ID#794, 就诊类型#795, 患者ID#796, OUT_ID#798, RX_DATE_STD#207, 分子种类数#402L, SEQ#633, SEQ#633]
                              :                                :                    +- Window [row_number() windowspecdefinition(医院ID#794, 就诊类型#795, 患者ID#796, OUT_ID#798, RX_DATE_STD#207 DESC NULLS LAST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS SEQ#633], [医院ID#794, 就诊类型#795, 患者ID#796, OUT_ID#798], [RX_DATE_STD#207 DESC NULLS LAST]
                              :                                :                       +- Project [医院ID#794, 就诊类型#795, 患者ID#796, OUT_ID#798, RX_DATE_STD#207, 分子种类数#402L]
                              :                                :                          +- Aggregate [医院ID#794, 就诊类型#795, 患者ID#796, OUT_ID#798, RX_DATE_STD#207], [医院ID#794, 就诊类型#795, 患者ID#796, OUT_ID#798, RX_DATE_STD#207, count(distinct MOLECULE_CATEGORY#875) AS 分子种类数#402L]
                              :                                :                             +- Project [uni_code#793, 医院ID#794, 就诊类型#795, 患者ID#796, 就诊序号#797, OUT_ID#798, MOLECULE#799, 标准处方日期#800, 标准入院时间#801, 标准出院时间#802, 药品名称#803, 规格#804, 剂型#805, 厂家#806, 医保类型#807, 科室#808, 省份#809, 城市#810, 医院等级#811, 处方日期#812, 入院时间#813, 出院时间#814, 年龄#815, 性别#816, ... 73 more fields]
                              :                                :                                +- Relation[uni_code#793,医院ID#794,就诊类型#795,患者ID#796,就诊序号#797,OUT_ID#798,MOLECULE#799,标准处方日期#800,标准入院时间#801,标准出院时间#802,药品名称#803,规格#804,剂型#805,厂家#806,医保类型#807,科室#808,省份#809,城市#810,医院等级#811,处方日期#812,入院时间#813,出院时间#814,年龄#815,性别#816,... 72 more fields] parquet
                              :                                +- Project [医院ID#662, 就诊类型#663, 患者ID#664, OUT_ID#666, RX_DATE_STD#410, formula#616, concat_ws(+, mole_comb#608) AS mole_comb#624]
                              :                                   +- Project [医院ID#662, 就诊类型#663, 患者ID#664, OUT_ID#666, RX_DATE_STD#410, concat_ws(+, formula#606) AS formula#616, mole_comb#608]
                              :                                      +- Aggregate [医院ID#662, 就诊类型#663, 患者ID#664, OUT_ID#666, RX_DATE_STD#410], [医院ID#662, 就诊类型#663, 患者ID#664, OUT_ID#666, RX_DATE_STD#410, collect_set(MOLECULE_CATEGORY#743, 0, 0) AS formula#606, collect_set(MOLECULE#667, 0, 0) AS mole_comb#608]
                              :                                         +- Project [uni_code#661, 医院ID#662, 就诊类型#663, 患者ID#664, 就诊序号#665, OUT_ID#666, MOLECULE#667, 标准处方日期#668, 标准入院时间#669, 标准出院时间#670, 药品名称#671, 规格#672, 剂型#673, 厂家#674, 医保类型#675, 科室#676, 省份#677, 城市#678, 医院等级#679, 处方日期#680, 入院时间#681, 出院时间#682, 年龄#683, 性别#684, ... 73 more fields]
                              :                                            +- Relation[uni_code#661,医院ID#662,就诊类型#663,患者ID#664,就诊序号#665,OUT_ID#666,MOLECULE#667,标准处方日期#668,标准入院时间#669,标准出院时间#670,药品名称#671,规格#672,剂型#673,厂家#674,医保类型#675,科室#676,省份#677,城市#678,医院等级#679,处方日期#680,入院时间#681,出院时间#682,年龄#683,性别#684,... 72 more fields] parquet
                              +- Aggregate [年#1526, 月#1527, 就诊类型#1499, 标准医保类型#1549, 性别#1520, 年龄区间#1580, 标准诊断#1529, severe_case#1582, 标准科室#1548, single_or_formula#1496, IF_FIRST_RX#1593, IF_CHANGE_RX#1595, formula#1597, mole_comb#1594, 白细胞计数#1557, c反应蛋白#1558, 降钙素原#1559, 嗜肺军团菌#1560, 肺炎衣原体#1561, 肺炎支原体#1562, 冠状病毒#1563, 合胞病毒#1564, 流感病毒#1565, 腺病毒#1566, ... 33 more fields], [年#1526, 月#1527, 就诊类型#1499, 标准医保类型#1549, 性别#1520, 年龄区间#1580, 标准诊断#1529, severe_case#1582, 标准科室#1548, single_or_formula#1496, IF_FIRST_RX#1593, IF_CHANGE_RX#1595, formula#1597, mole_comb#1594, 白细胞计数#1557, c反应蛋白#1558, 降钙素原#1559, 嗜肺军团菌#1560, 肺炎衣原体#1561, 肺炎支原体#1562, 冠状病毒#1563, 合胞病毒#1564, 流感病毒#1565, 腺病毒#1566, ... 35 more fields]
                                 +- Project [医院ID#1498, 就诊类型#1499, 患者ID#1500, OUT_ID#1502, 标准处方日期#1504, uni_code#1497, 就诊序号#1501, MOLECULE#1503, 标准入院时间#1505, 标准出院时间#1506, 药品名称#1507, 规格#1508, 剂型#1509, 厂家#1510, 医保类型#1511, 科室#1512, 省份#1513, 城市#1514, 医院等级#1515, 处方日期#1516, 入院时间#1517, 出院时间#1518, 年龄#1519, 性别#1520, ... 81 more fields]
                                    +- Project [医院ID#1498, 就诊类型#1499, 患者ID#1500, OUT_ID#1502, 标准处方日期#1504, uni_code#1497, 就诊序号#1501, MOLECULE#1503, 标准入院时间#1505, 标准出院时间#1506, 药品名称#1507, 规格#1508, 剂型#1509, 厂家#1510, 医保类型#1511, 科室#1512, 省份#1513, 城市#1514, 医院等级#1515, 处方日期#1516, 入院时间#1517, 出院时间#1518, 年龄#1519, 性别#1520, ... 81 more fields]
                                       +- Project [医院ID#1498, 就诊类型#1499, 患者ID#1500, OUT_ID#1502, 标准处方日期#1504, uni_code#1497, 就诊序号#1501, MOLECULE#1503, 标准入院时间#1505, 标准出院时间#1506, 药品名称#1507, 规格#1508, 剂型#1509, 厂家#1510, 医保类型#1511, 科室#1512, 省份#1513, 城市#1514, 医院等级#1515, 处方日期#1516, 入院时间#1517, 出院时间#1518, 年龄#1519, 性别#1520, ... 80 more fields]
                                          +- Join LeftOuter, (((((医院ID#1498 = 医院ID#917) AND (就诊类型#1499 = 就诊类型#918)) AND (患者ID#1500 = 患者ID#919)) AND (OUT_ID#1502 = OUT_ID#921)) AND (标准处方日期#1504 = 标准处方日期#902))
                                             :- Relation[uni_code#1497,医院ID#1498,就诊类型#1499,患者ID#1500,就诊序号#1501,OUT_ID#1502,MOLECULE#1503,标准处方日期#1504,标准入院时间#1505,标准出院时间#1506,药品名称#1507,规格#1508,剂型#1509,厂家#1510,医保类型#1511,科室#1512,省份#1513,城市#1514,医院等级#1515,处方日期#1516,入院时间#1517,出院时间#1518,年龄#1519,性别#1520,... 72 more fields] parquet
                                             +- Project [医院ID#917, 就诊类型#918, 患者ID#919, OUT_ID#921, RX_DATE_STD#207 AS 标准处方日期#902, 分子种类数#402L, SEQ#633, IF_FIRST_RX#1593, MAX_SEQ#651, formula#1597, mole_comb#1594, formula_numbers#779L, IF_CHANGE_RX#1595]
                                                +- Project [医院ID#917, 就诊类型#918, 患者ID#919, OUT_ID#921, RX_DATE_STD#207, 分子种类数#402L, SEQ#633, IF_FIRST_RX#1593, MAX_SEQ#651, formula#1597, mole_comb#1594, formula_numbers#779L, IF_CHANGE_RX#1595]
                                                   +- Join LeftOuter, ((((医院ID#917 = 医院ID#794) AND (就诊类型#918 = 就诊类型#795)) AND (患者ID#919 = 患者ID#796)) AND (OUT_ID#921 = OUT_ID#798))
                                                      :- Project [医院ID#917, 就诊类型#918, 患者ID#919, OUT_ID#921, RX_DATE_STD#207, 分子种类数#402L, SEQ#633, IF_FIRST_RX#1593, MAX_SEQ#651, formula#1597, mole_comb#1594]
                                                      :  +- Join LeftOuter, (((((医院ID#917 = 医院ID#662) AND (就诊类型#918 = 就诊类型#663)) AND (患者ID#919 = 患者ID#664)) AND (OUT_ID#921 = OUT_ID#666)) AND (RX_DATE_STD#207 = RX_DATE_STD#410))
                                                      :     :- Project [医院ID#917, 就诊类型#918, 患者ID#919, OUT_ID#921, RX_DATE_STD#207, 分子种类数#402L, SEQ#633, IF_FIRST_RX#1593, MAX_SEQ#651]
                                                      :     :  +- Project [医院ID#917, 就诊类型#918, 患者ID#919, OUT_ID#921, RX_DATE_STD#207, 分子种类数#402L, SEQ#633, IF_FIRST_RX#1593, MAX_SEQ#651, MAX_SEQ#651]
                                                      :     :     +- Window [max(SEQ#633) windowspecdefinition(医院ID#917, 就诊类型#918, 患者ID#919, OUT_ID#921, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS MAX_SEQ#651], [医院ID#917, 就诊类型#918, 患者ID#919, OUT_ID#921]
                                                      :     :        +- Project [医院ID#917, 就诊类型#918, 患者ID#919, OUT_ID#921, RX_DATE_STD#207, 分子种类数#402L, SEQ#633, IF_FIRST_RX#1593]
                                                      :     :           +- Project [医院ID#917, 就诊类型#918, 患者ID#919, OUT_ID#921, RX_DATE_STD#207, 分子种类数#402L, SEQ#633, CASE WHEN (SEQ#633 = 1) THEN 1 ELSE 0 END AS IF_FIRST_RX#1593]
                                                      :     :              +- Project [医院ID#917, 就诊类型#918, 患者ID#919, OUT_ID#921, RX_DATE_STD#207, 分子种类数#402L, SEQ#633]
                                                      :     :                 +- Project [医院ID#917, 就诊类型#918, 患者ID#919, OUT_ID#921, RX_DATE_STD#207, 分子种类数#402L, SEQ#633, SEQ#633]
                                                      :     :                    +- Window [row_number() windowspecdefinition(医院ID#917, 就诊类型#918, 患者ID#919, OUT_ID#921, RX_DATE_STD#207 DESC NULLS LAST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS SEQ#633], [医院ID#917, 就诊类型#918, 患者ID#919, OUT_ID#921], [RX_DATE_STD#207 DESC NULLS LAST]
                                                      :     :                       +- Project [医院ID#917, 就诊类型#918, 患者ID#919, OUT_ID#921, RX_DATE_STD#207, 分子种类数#402L]
                                                      :     :                          +- Aggregate [医院ID#917, 就诊类型#918, 患者ID#919, OUT_ID#921, RX_DATE_STD#207], [医院ID#917, 就诊类型#918, 患者ID#919, OUT_ID#921, RX_DATE_STD#207, count(distinct MOLECULE_CATEGORY#998) AS 分子种类数#402L]
                                                      :     :                             +- Project [uni_code#916, 医院ID#917, 就诊类型#918, 患者ID#919, 就诊序号#920, OUT_ID#921, MOLECULE#922, 标准处方日期#923, 标准入院时间#924, 标准出院时间#925, 药品名称#926, 规格#927, 剂型#928, 厂家#929, 医保类型#930, 科室#931, 省份#932, 城市#933, 医院等级#934, 处方日期#935, 入院时间#936, 出院时间#937, 年龄#938, 性别#939, ... 73 more fields]
                                                      :     :                                +- Relation[uni_code#916,医院ID#917,就诊类型#918,患者ID#919,就诊序号#920,OUT_ID#921,MOLECULE#922,标准处方日期#923,标准入院时间#924,标准出院时间#925,药品名称#926,规格#927,剂型#928,厂家#929,医保类型#930,科室#931,省份#932,城市#933,医院等级#934,处方日期#935,入院时间#936,出院时间#937,年龄#938,性别#939,... 72 more fields] parquet
                                                      :     +- Project [医院ID#662, 就诊类型#663, 患者ID#664, OUT_ID#666, RX_DATE_STD#410, formula#1597, concat_ws(+, mole_comb#608) AS mole_comb#1594]
                                                      :        +- Project [医院ID#662, 就诊类型#663, 患者ID#664, OUT_ID#666, RX_DATE_STD#410, concat_ws(+, formula#606) AS formula#1597, mole_comb#608]
                                                      :           +- Aggregate [医院ID#662, 就诊类型#663, 患者ID#664, OUT_ID#666, RX_DATE_STD#410], [医院ID#662, 就诊类型#663, 患者ID#664, OUT_ID#666, RX_DATE_STD#410, collect_set(MOLECULE_CATEGORY#743, 0, 0) AS formula#606, collect_set(MOLECULE#667, 0, 0) AS mole_comb#608]
                                                      :              +- Project [uni_code#661, 医院ID#662, 就诊类型#663, 患者ID#664, 就诊序号#665, OUT_ID#666, MOLECULE#667, 标准处方日期#668, 标准入院时间#669, 标准出院时间#670, 药品名称#671, 规格#672, 剂型#673, 厂家#674, 医保类型#675, 科室#676, 省份#677, 城市#678, 医院等级#679, 处方日期#680, 入院时间#681, 出院时间#682, 年龄#683, 性别#684, ... 73 more fields]
                                                      :                 +- Relation[uni_code#661,医院ID#662,就诊类型#663,患者ID#664,就诊序号#665,OUT_ID#666,MOLECULE#667,标准处方日期#668,标准入院时间#669,标准出院时间#670,药品名称#671,规格#672,剂型#673,厂家#674,医保类型#675,科室#676,省份#677,城市#678,医院等级#679,处方日期#680,入院时间#681,出院时间#682,年龄#683,性别#684,... 72 more fields] parquet
                                                      +- Project [医院ID#794, 就诊类型#795, 患者ID#796, OUT_ID#798, formula_numbers#779L, CASE WHEN (formula_numbers#779L > cast(1 as bigint)) THEN 1 ELSE 0 END AS IF_CHANGE_RX#1595]
                                                         +- Aggregate [医院ID#794, 就诊类型#795, 患者ID#796, OUT_ID#798], [医院ID#794, 就诊类型#795, 患者ID#796, OUT_ID#798, count(distinct formula#1597) AS formula_numbers#779L]
                                                            +- Project [医院ID#794, 就诊类型#795, 患者ID#796, OUT_ID#798, RX_DATE_STD#207, 分子种类数#402L, SEQ#633, IF_FIRST_RX#641, MAX_SEQ#651, formula#1597, mole_comb#624]
                                                               +- Join LeftOuter, (((((医院ID#794 = 医院ID#662) AND (就诊类型#795 = 就诊类型#663)) AND (患者ID#796 = 患者ID#664)) AND (OUT_ID#798 = OUT_ID#666)) AND (RX_DATE_STD#207 = RX_DATE_STD#410))
                                                                  :- Project [医院ID#794, 就诊类型#795, 患者ID#796, OUT_ID#798, RX_DATE_STD#207, 分子种类数#402L, SEQ#633, IF_FIRST_RX#641, MAX_SEQ#651]
                                                                  :  +- Project [医院ID#794, 就诊类型#795, 患者ID#796, OUT_ID#798, RX_DATE_STD#207, 分子种类数#402L, SEQ#633, IF_FIRST_RX#641, MAX_SEQ#651, MAX_SEQ#651]
                                                                  :     +- Window [max(SEQ#633) windowspecdefinition(医院ID#794, 就诊类型#795, 患者ID#796, OUT_ID#798, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS MAX_SEQ#651], [医院ID#794, 就诊类型#795, 患者ID#796, OUT_ID#798]
                                                                  :        +- Project [医院ID#794, 就诊类型#795, 患者ID#796, OUT_ID#798, RX_DATE_STD#207, 分子种类数#402L, SEQ#633, IF_FIRST_RX#641]
                                                                  :           +- Project [医院ID#794, 就诊类型#795, 患者ID#796, OUT_ID#798, RX_DATE_STD#207, 分子种类数#402L, SEQ#633, CASE WHEN (SEQ#633 = 1) THEN 1 ELSE 0 END AS IF_FIRST_RX#641]
                                                                  :              +- Project [医院ID#794, 就诊类型#795, 患者ID#796, OUT_ID#798, RX_DATE_STD#207, 分子种类数#402L, SEQ#633]
                                                                  :                 +- Project [医院ID#794, 就诊类型#795, 患者ID#796, OUT_ID#798, RX_DATE_STD#207, 分子种类数#402L, SEQ#633, SEQ#633]
                                                                  :                    +- Window [row_number() windowspecdefinition(医院ID#794, 就诊类型#795, 患者ID#796, OUT_ID#798, RX_DATE_STD#207 DESC NULLS LAST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS SEQ#633], [医院ID#794, 就诊类型#795, 患者ID#796, OUT_ID#798], [RX_DATE_STD#207 DESC NULLS LAST]
                                                                  :                       +- Project [医院ID#794, 就诊类型#795, 患者ID#796, OUT_ID#798, RX_DATE_STD#207, 分子种类数#402L]
                                                                  :                          +- Aggregate [医院ID#794, 就诊类型#795, 患者ID#796, OUT_ID#798, RX_DATE_STD#207], [医院ID#794, 就诊类型#795, 患者ID#796, OUT_ID#798, RX_DATE_STD#207, count(distinct MOLECULE_CATEGORY#875) AS 分子种类数#402L]
                                                                  :                             +- Project [uni_code#793, 医院ID#794, 就诊类型#795, 患者ID#796, 就诊序号#797, OUT_ID#798, MOLECULE#799, 标准处方日期#800, 标准入院时间#801, 标准出院时间#802, 药品名称#803, 规格#804, 剂型#805, 厂家#806, 医保类型#807, 科室#808, 省份#809, 城市#810, 医院等级#811, 处方日期#812, 入院时间#813, 出院时间#814, 年龄#815, 性别#816, ... 73 more fields]
                                                                  :                                +- Relation[uni_code#793,医院ID#794,就诊类型#795,患者ID#796,就诊序号#797,OUT_ID#798,MOLECULE#799,标准处方日期#800,标准入院时间#801,标准出院时间#802,药品名称#803,规格#804,剂型#805,厂家#806,医保类型#807,科室#808,省份#809,城市#810,医院等级#811,处方日期#812,入院时间#813,出院时间#814,年龄#815,性别#816,... 72 more fields] parquet
                                                                  +- Project [医院ID#662, 就诊类型#663, 患者ID#664, OUT_ID#666, RX_DATE_STD#410, formula#1597, concat_ws(+, mole_comb#608) AS mole_comb#624]
                                                                     +- Project [医院ID#662, 就诊类型#663, 患者ID#664, OUT_ID#666, RX_DATE_STD#410, concat_ws(+, formula#606) AS formula#1597, mole_comb#608]
                                                                        +- Aggregate [医院ID#662, 就诊类型#663, 患者ID#664, OUT_ID#666, RX_DATE_STD#410], [医院ID#662, 就诊类型#663, 患者ID#664, OUT_ID#666, RX_DATE_STD#410, collect_set(MOLECULE_CATEGORY#743, 0, 0) AS formula#606, collect_set(MOLECULE#667, 0, 0) AS mole_comb#608]
                                                                           +- Project [uni_code#661, 医院ID#662, 就诊类型#663, 患者ID#664, 就诊序号#665, OUT_ID#666, MOLECULE#667, 标准处方日期#668, 标准入院时间#669, 标准出院时间#670, 药品名称#671, 规格#672, 剂型#673, 厂家#674, 医保类型#675, 科室#676, 省份#677, 城市#678, 医院等级#679, 处方日期#680, 入院时间#681, 出院时间#682, 年龄#683, 性别#684, ... 73 more fields]
                                                                              +- Relation[uni_code#661,医院ID#662,就诊类型#663,患者ID#664,就诊序号#665,OUT_ID#666,MOLECULE#667,标准处方日期#668,标准入院时间#669,标准出院时间#670,药品名称#671,规格#672,剂型#673,厂家#674,医保类型#675,科室#676,省份#677,城市#678,医院等级#679,处方日期#680,入院时间#681,出院时间#682,年龄#683,性别#684,... 72 more fields] parquet


In [9]:
# %%

# Save the result: repartition to g_partition_num partitions, then write the
# patient-level analysis frame as parquet, overwriting the target path.
df_patient_analyse_std = df_patient_analyse_std.repartition(g_partition_num)
(
    df_patient_analyse_std.write
    .format("parquet")
    .mode("overwrite")
    .save(p_patient_union_drug_analyse_out)
)


In [13]:
# %%
# Repartition df_table_zero to g_partition_num partitions and write it as
# parquet to p_result_table_0, overwriting the target path.
df_table_zero = df_table_zero.repartition(g_partition_num)
(
    df_table_zero.write
    .format("parquet")
    .mode("overwrite")
    .save(p_result_table_0)
)



In [None]:
# %%
# Repartition df_table_one to g_partition_num partitions and write it as
# parquet to p_result_table_1, overwriting the target path.
df_table_one = df_table_one.repartition(g_partition_num)
(
    df_table_one.write
    .format("parquet")
    .mode("overwrite")
    .save(p_result_table_1)
)



In [None]:
# %%
# Repartition df_table_two to g_partition_num partitions and write it as
# parquet to p_result_table_2, overwriting the target path.
df_table_two = df_table_two.repartition(g_partition_num)
(
    df_table_two.write
    .format("parquet")
    .mode("overwrite")
    .save(p_result_table_2)
)
