In [1]:
from pyspark.sql.types import IntegerType, DoubleType, StringType, StructType, StructType
from pyspark.sql.functions import col, date_format, count, isnull, lit,first
from pyspark.sql.functions import when, isnan, udf, pandas_udf, PandasUDFType
from pyspark.sql import functions as Func
from pyspark.sql import functions as F
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from typing import Iterator
import pandas
import pandas as pd
import re

spark = SparkSession.builder \
    .master("yarn") \
    .appName("application name") \
    .config("spark.driver.cores", "1") \
    .config("spark.driver.memory", "1g") \
    .config("spark.executor.cores", "1") \
    .config("spark.executor.memory", "2g") \
    .config("spark.executor.instances", "2") \
    .config('spark.sql.execution.arrow.pyspark.enabled', True) \
    .getOrCreate()

In [2]:
g_whether_save_result = True

p_main_dir = "s3://ph-origin-files/user/zazhao/2020年结果-csv/"
p_patient = p_main_dir + "病人"
p_detection = p_main_dir + "检测"
p_data_summary = p_main_dir+"条目数汇总表-2020.csv"

p_mapping_file = p_main_dir+"清洗规则/"
p_out_main_dir = p_main_dir+"输出文件/"

In [3]:
## 读取检测数据
df_raw_detection = spark.read.csv(p_detection, header=True).repartition(160)

df_raw_detection = df_raw_detection.select([ 'PATIENT_ID', 'VISIT_ID', 'ITEM_NAME', 'SUBJECT', 'REPORT_ITEM_NAME', 
                                            'RESULT', 'UNITS', 'ABNORMAL_INDICATOR', 'REQUESTED_DATE_TIME', 
                                            'RESULTS_RPT_DATE_TIME', 'DEPT_NAME'])

df_raw_detection = df_raw_detection.withColumn("VISIT_ID", Func.col("VISIT_ID").cast("int"))\
                                    .withColumnRenamed("VISIT_ID", "就诊序号") \
                                    .withColumn("REQUESTED_DATE_TIME_STD",  date_format("REQUESTED_DATE_TIME", "yyyMM")) \
                                    .withColumn("RESULTS_RPT_DATE_TIME_STD", date_format("RESULTS_RPT_DATE_TIME", "yyyMM")) 
# df_raw_detection.where( df_raw_detection["VISIT_ID"].isNull() ).count()

In [4]:
## 读取病人数据

df_raw_patient = spark.read.csv( "s3://ph-origin-files/user/zazhao/2020年结果-csv/病人/列表1-2020年-1.csv", header=True).repartition(16)
# df_raw_patient.write.mode("overwrite").parquet("s3://ph-origin-files/user/zazhao/2020年结果-csv/temp/1")

old_col = ['省份', '城市', '医院等级', '就诊类型', '医院ID', '患者ID', '就诊序号', 
        '处方日期', '入院时间', '出院时间', '年龄', 
        '性别', '医保类型', '诊断', '科室', 
        '药品名称', '规格', '剂型', '厂家', '金额', '数量', '数量单位']
new_col = ["PROVINCE", "CITY", "HOSP_LEVEL", "TREAMENT_TYPE", "HOSP_ID", "PATIENT_ID", "VISIT_ID",
          "PRESCRIPTION_DATE", "ADMISSION_DATE" , "DISCHARGE_DATE",  "AGE", 
         "GENDER", "HIS_TYPE", "DIAGNOISE", "df_dept_mapping_NAME",
          "DRUG_NAME", "SPECIFICATION", "FORM", "MANUFACTURES", "MONEY", "NUMBER", "NUMBER_UNIT"]
df_raw_patient = df_raw_patient.select(old_col)
# df_raw_patient.write.mode("overwrite").parquet("s3://ph-origin-files/user/zazhao/2020年结果-csv/temp/2")


## 去除字符串前后的空格,因为会影响到和其他表间进行匹配
df_raw_patient = df_raw_patient.select([Func.trim(col(i)).alias(i)  for i in df_raw_patient.columns])
# df_raw_patient.write.mode("overwrite").parquet("s3://ph-origin-files/user/zazhao/2020年结果-csv/temp/3")

###################### 以下是需要转换列名为英文时才需要 
# # 列名标准化
# data_patient = data_patient.select( list( map( lambda x:col(x[0]).alias(x[1]),  zip(old_col, new_col) ) ))
# # 转换日期格式
# data_patient = data_patient.withColumn("PRESCRIPTION_DATE_STD", date_format("PRESCRIPTION_DATE", "yyyMM") )\
#                                     .withColumn("ADMISSION_DATE_STD", date_format("ADMISSION_DATE", "yyyMM") )\
#                                     .withColumn("DISCHARGE_DATE_STD", date_format("DISCHARGE_DATE", "yyyMM") )
# ## 年龄转换成数字
# df_patient = df_patient.withColumn("AGE", col("AGE").cast("int"))
######################

## 日期格式转换
df_raw_patient = df_raw_patient.withColumn("标准处方日期", date_format("处方日期", "yyyMM") )\
                                    .withColumn("标准入院时间", date_format("入院时间", "yyyMM") )\
                                    .withColumn("标准出院时间", date_format("出院时间", "yyyMM") )
# df_raw_patient.write.mode("overwrite").parquet("s3://ph-origin-files/user/zazhao/2020年结果-csv/temp/4")

## 年龄转换成数字
df_raw_patient = df_raw_patient.withColumn("年龄", col("年龄").cast("int"))
# df_raw_patient.write.mode("overwrite").parquet("s3://ph-origin-files/user/zazhao/2020年结果-csv/temp/5")

In [5]:
## ============== 一. 确定样本医院 =================
df_sample_hospital = df_raw_patient.select(['医院ID','省份', '城市', '医院等级']).distinct()

city_mapping = {
    '北京':'1','上海':'1','深圳':'1','广州':'1',
    '成都':'2','杭州':'2','重庆':'2','武汉':'2',
    '苏州':'2','西安':'2','天津':'2','南京':'2',
    '郑州':'2','长沙':'2','沈阳':'2','青岛':'2',
    '宁波':'2','东莞':'2','无锡':'2','昆明':'2',
    '大连':'2','厦门':'2','合肥':'2','佛山':'2',
    '福州':'2','哈尔滨':'2','济南':'2','温州':'2',
    '长春':'2','石家庄':'2','常州':'2','泉州':'2',
    '南宁':'2','贵阳':'2','南昌':'2','南通':'2',
    '金华':'2','徐州':'2','太原':'2','嘉兴':'2',
    '烟台':'2','惠州':'2','保定':'2','台州':'2',
    '中山':'2','绍兴':'2','乌鲁木齐':'2','潍坊':'2',
    '兰州':'2'
}

hosp_mapping = {
    '特级':'三级','三甲':'三级','二乙':'三级','三丙':'三级',
    '二甲':'二级','二乙':'二级','二丙':'二级',
    '甲':'一级','乙':'一级','丙':'一级'
}

@pandas_udf("string", PandasUDFType.SCALAR )
def pudf_city(x : Iterator[ pd.Series ] ):
    return x.apply(lambda i:  city_mapping[i] if i in city_mapping else "null" )

@pandas_udf("string", PandasUDFType.SCALAR )
def pudf_hosp(x : Iterator[ pd.Series ]):
    return x.apply(lambda i: hosp_mapping[i] if i in hosp_mapping else "null")

df_sample_hospital = df_sample_hospital.withColumn("城市等级",pudf_city( col("城市" ) ))
df_sample_hospital = df_sample_hospital.withColumn("新医院等级",pudf_hosp( col("医院等级") ))
df_sample_hospital = df_sample_hospital.select(['医院ID','省份', '城市', '医院等级', "城市等级","新医院等级"]).orderBy("医院ID")
# df_sample_hospital.write.mode("overwrite").parquet("s3://ph-origin-files/user/zazhao/2020年结果-csv/temp/6")



In [6]:
## 清洗药品名称
@pandas_udf("string", PandasUDFType.SCALAR)
def pudf_change(iterator: Iterator[pd.Series]) -> Iterator[pd.Series]:
    new_iter = iterator.apply( lambda x: changeSpecification(x) )
    return new_iter

def changeSpecification(x ):
    # 处理读入的字符为空的情况
    if x==None:
        return "null"
    elif re.findall( r'.*(莫西沙星).*', x):
        new_name = "莫西沙星"
    elif re.findall( r'.*(左氧氟沙星).*', x):
        new_name = "左氧氟沙星"
    elif re.findall( r'.*(头孢曲松).*', x):
        new_name = "头孢曲松"
    elif re.findall( r'.*(阿奇霉素).*', x):
        new_name = "阿奇霉素"
    elif re.findall( r'.*(多西环素).*', x):
        new_name = "多西环素"
    elif re.findall( r'.*(米诺环素).*', x):
        new_name = "米诺环素"
    elif (re.findall( r'.*(他唑巴坦|他唑邦坦|三唑巴坦|他唑巴).*', x)!=list()) \
            & ( re.findall( r'.*(哌拉西林).*', x)!=list() ):
        new_name = "哌拉西林他唑巴坦纳"
    elif ( re.findall( r'.*(哌拉西林).*', x)!=list() )\
            & ( re.findall( r'.*(舒巴坦).*', x)!=list() ):
        new_name = "哌拉西林舒巴坦纳"
    elif re.findall( r'.*(哌拉西林).*', x):
        new_name = "哌拉西林纳"
    elif ( re.findall( r'.*(头孢哌酮).*', x)!=list() )\
            & ( re.findall( r'.*(舒巴坦).*', x)!=list() ):
        new_name = "头孢哌酮钠舒巴坦钠"
    elif ( re.findall( r'.*(头孢哌酮).*', x)!=list() )\
            & ( re.findall( r'.*(他唑巴坦).*', x)!=list() ):
        new_name = "头孢哌酮钠他唑巴坦钠"
    elif re.findall( r'.*(头孢哌酮).*', x):
        new_name = "头孢哌酮钠"
    elif ( re.findall( r'.*(美洛西林).*', x)!=list() )\
            & ( re.findall( r'.*(舒巴坦).*', x)!=list() ):
        new_name = "美洛西林钠舒巴坦钠"
    elif re.findall( r'.*(美洛西林).*', x):
        new_name = "美洛西林钠"
    elif re.findall( r'.*(依替米星).*', x):
        new_name = "依替米星"
    elif re.findall( r'.*(头孢米诺).*', x):
        new_name = "头孢米诺"
    elif re.findall( r'.*(替加环素).*', x):
        new_name = "替加环素"
    elif re.findall( r'.*(头孢西丁).*', x):
        new_name = "头孢西丁"
    elif re.findall( r'.*(头孢他啶).*', x):
        new_name = "头孢他啶"
    elif re.findall( r'.*(厄他培南).*', x):
        new_name = "厄他培南"
    elif re.findall( r'.*(利奈唑胺).*', x):
        new_name = "利奈唑胺"    
    elif re.findall( r'.*(万古霉素).*', x):
        new_name = "万古霉素"
    elif ( re.findall( r'.*(头孢噻肟).*', x)!=list()) & \
            ( re.findall( r'.*(舒巴坦).*', x)!=list()):
        new_name = "头孢噻肟舒巴坦钠"
    elif re.findall( r'.*(头孢噻肟).*', x):
        new_name = "头孢噻肟钠"
    elif re.findall( r'.*(拉氧头孢).*', x):
        new_name = "拉氧头孢"
    elif re.findall( r'.*(环丙沙星).*', x):
        new_name = "环丙沙星"
    else:
        new_name ="null"
    return new_name

# data_temp = data_patient.withColumn("DRUG_NAME_STD", changeSpecification( col("DRUG_NAME"))) 
# data_patient_drug = data_patient.withColumn("标准药品名称", pudf_change( data_patient["药品名称"]) ) 

In [7]:
# %%

## ============== 诊断清洗 ======================

###  日期提取

### 清洗诊断列
@pandas_udf("string" , PandasUDFType.SCALAR)
def pudf_standDiagnoise(x:pd.Series)->pd.Series:
    return x.apply(lambda i: changeDiagnoise(i))

def changeDiagnoise(x):
    if x==None:
        new_x = "其他"
    elif re.findall(r".*(肺部感染|肺内感染|肺感染|支原体感染|衣原体感染).*", x):
        new_x = "肺部感染"
    elif re.findall( r"(肺炎|肺部炎症)", x):
        new_x = "肺炎"
    elif re.findall( r"(社区获得|CAP)", x) and (re.findall(r"CPAD", x)==list() ):
        new_x = "社区获得性肺炎"
    elif re.findall( r"(呼吸道感染|呼感)", x):
        new_x = "呼吸道感染"
    elif re.findall( r"(支气管肺炎)", x):
        new_x = "支气管肺炎"
    elif re.findall( r"(气管炎|急支|慢支|支气管周围炎|支炎)", x):
        new_x = "支气管炎"
    elif re.findall( r"(上感|上呼吸道感染)", x):
        new_x = "上呼吸道感染"
    elif re.findall( r"(扁桃体炎|扁桃体感染|扁桃体周围炎|化扁)", x):
        new_x = "扁桃体炎"
    elif re.findall( r"(咽炎|喉炎|咽峡炎|咽部感染|会厌炎)", x):
        new_x = "咽炎"
    elif re.findall( r"(流感|流行性感冒|甲流|乙流)", x):
        new_x = "流感"
    elif re.findall( r"(蜂窝织炎|蜂窝组织炎|丹毒|坏死性感染|化脓性感染|软组织感染|软组织炎)", x):
        new_x = "皮肤软组织感染"
    elif re.findall( r"(皮肤感染|皮炎|皮疹|湿疹|痤疮|毛囊炎|疖|外伤|烧伤|痈|疣)", x):
        new_x = "其他皮肤病"
    elif re.findall( r"(结膜炎|角膜炎|LASIK|睑板腺炎|睑腺炎|白内障|中耳炎|耳道炎|牙周炎|鼻炎|冠周炎|龈炎|睑缘炎|鼻窦炎)", x):
        new_x = "五官类疾病"
    elif re.findall( r"(胃炎|肠炎|食管炎|幽门螺杆菌感染|腹痛|阑尾炎|胆囊炎|胰腺炎|肠道感染|幽门螺旋杆菌|Hp感染|HP感染|腹泻)", x):
        new_x = "消化系统感染"
    elif re.findall( r"(泌尿系感染|尿路感染|尿道炎|前列腺炎|阴道炎|宫颈炎|尿道感染|尿路结石伴感染|盆腔炎)", x):
        new_x = "泌尿生殖系统感染"
    elif re.findall( r"(炎|感染)", x):
        new_x = "其他感染/炎症"
    elif re.findall( r"(呼吸困难|呼吸衰竭)", x):
        new_x = "呼吸困难"
    elif re.findall( r"(发烧|发热)", x):
        new_x = "发热"
    elif re.findall( r"(咳痰|有痰)", x):
        new_x = "咳痰"
    elif re.findall( r"(咳)", x):
        new_x = "咳嗽"
    elif re.findall( r"(感冒)", x):
        new_x = "普通感冒"
    elif re.findall( r"(咽痛|喉痛)", x):
        new_x = "咽痛"
    else:
        new_x = "其他"
    return new_x
    
df_patient_diagnois =  df_raw_patient.withColumn("标准诊断", pudf_standDiagnoise(  col("诊断") )) 
# df_patient_diagnois.show(1)

In [8]:
##  添加新的列
df_patient_diagnois_target  = df_patient_diagnois.withColumn("心律不齐",  when( col("诊断").\
                        rlike(r'心率失常|心律失常|心律不齐|心率不齐|心动过速|心动过缓|早搏|房室|QT|房颤|纤颤'), 1).otherwise(0) )\
                                                    .withColumn("心衰", when( col("诊断").rlike('心衰|心力衰竭'), 1 ).otherwise(0))\
                                                    .withColumn("其他心血管疾病", when( col("诊断").rlike('心功能|冠心病|冠状|动脉|心梗|心肌|心血管|心绞痛|心脏病'), 1 ).otherwise(0))\
                                                    .withColumn("脑血管疾病", when( col("诊断").rlike('脑梗|脑血管|中风|脑血栓|脑出血'), 1 ).otherwise(0))\
                                                    .withColumn("神经系统疾病", when( col("诊断").rlike('癫痫|EP|高颅压|颅内压增高|颅内高压|帕金森|阿尔兹海默|'+\
                                                                                                       '痴呆|神经炎|颅内感染|脑神经损害|脊神经|神经病|周围神经系统'), 1 ).otherwise(0))\
                                                    .withColumn("高血糖", when( col("诊断").rlike('高血糖'), 1 ).otherwise(0))\
                                                    .withColumn("高血压", when( col("诊断").rlike('高血压'), 1 ).otherwise(0))\
                                                    .withColumn("高血脂", when( col("诊断").rlike('高血脂|高脂|胆固醇'), 1 ).otherwise(0))\
                                                    .withColumn("肝功能异常", when( col("诊断").rlike('肝炎|肝损|肝功|肝硬|肝病|肝衰|肝纤维|药肝|脂肪肝'), 1 ).otherwise(0))\
                                                    .withColumn("肾功能异常", when( col("诊断").rlike('CRF|肾功|肾衰|肾病|透析|肾小管|肾小球|CAPD|尿毒|肾炎'), 1 ).otherwise(0))\
                                                    .withColumn("结缔组织病", when( col("诊断").rlike('结缔|风湿|关节炎'), 1 ).otherwise(0))\
                                                    .withColumn("COPD", when( col("诊断").rlike('COPD|慢性阻塞性肺|慢阻肺'), 1 ).otherwise(0))\
                                                    .withColumn("哮喘", when( col("诊断").rlike('哮喘|哮支'), 1 ).otherwise(0))\
                                                    .withColumn("支气管扩张", when( col("诊断").rlike('支气管扩张'), 1 ).otherwise(0))\
                                                    .withColumn("恶性实体瘤", when( ( col("诊断").rlike('癌|恶性肿瘤|恶性瘤|占位|放疗|化疗|CA|原位|转移|黑色素瘤') )
                                                                              &(col("诊断").rlike('CAPD|CAP')== False ) ,  1  ) .otherwise(0)) \
                                                    .withColumn("原始诊断字符数", Func.length( col("诊断") ) )
# df_diagnois_with_target.where(col("恶性实体瘤")==1).show(1, False, vertical=True)
# df_patient_diagnois_target.show(1,  vertical=True)

In [9]:
## ======================  清洗医保、科室清洗、医院ID ======================

p_dept_mapping = p_mapping_file + "科室清洗规则-重新划分Hanhui.csv"
df_dept_mapping = spark.read.csv(p_dept_mapping,header=True).withColumnRenamed("std_dept", "标准科室")


# 科室清洗
df_patient_dept = df_patient_diagnois_target.join(df_dept_mapping, on=["科室"], how="left")
# print("无法匹配到的科室  ", df_raw_patient.join(df_dept_mapping, on=["科室"], how="anti").count() )
# left_anti， leftouter


# 医保类型清洗
p_medical_insurance_mapping = p_mapping_file + "医保清洗规则.csv"
df_medical_insurance_mapping = spark.read.csv(p_medical_insurance_mapping,header=True) \
                                        .withColumnRenamed("std_charge_type", "标准医保类型")
# print(df_medical_insurance_mapping.columns)
df_patient_std = df_patient_dept.join(df_medical_insurance_mapping,  on="医保类型", how="left")

## 为 Null的医保类型转换成 其他
df_patient_std = df_patient_std.fillna("其他", subset=["医保类型", "标准医保类型"])
# df_patient_std.select("标准医保类型").groupBy("标准医保类型").agg( Func.count("*").alias("样本数") ).show()
# print("无法匹配到的医保类型:  ", df_patient_dept.join(df_medical_insurance_mapping,  on="医保类型", how="anti").count() )

In [10]:

## 输出无法匹配的科室 
# temp = df_raw_patient.join(df_dept_mapping, on=["科室"], how="anti").groupBy("科室").agg( Func.count("*").alias("样本数") )
# temp.repartition(1).write.mode("overwrite").csv(p_out_main_dir+"无法匹配到的科室", sep=',', header="true", encoding="utf-8")

# temp = df_patient_dept.join(df_medical_insurance_mapping,  on="医保类型", how="anti")
# temp.groupBy("医保类型").agg( Func.count("*").alias("样本数") ).show()

In [11]:
# 性别清洗

# df_patient.select("GENDER").distinct().show()
df_patient_std = df_patient_std.withColumn("标准性别",  Func.when(   ~( col("性别")=="男") & ( ~(col("性别")=="女") ) |\
                                                           col("性别").isNull()
                                                           , "其他" ) .otherwise( col("性别"))  )
df_patient_std.persist()
# df_sample_hospital.write.mode("overwrite").parquet("s3://ph-origin-files/user/zazhao/2020年结果-csv/temp/7")

DataFrame[医保类型: string, 科室: string, 省份: string, 城市: string, 医院等级: string, 就诊类型: string, 医院ID: string, 患者ID: string, 就诊序号: string, 处方日期: string, 入院时间: string, 出院时间: string, 年龄: int, 性别: string, 诊断: string, 药品名称: string, 规格: string, 剂型: string, 厂家: string, 金额: string, 数量: string, 数量单位: string, 标准处方日期: string, 标准入院时间: string, 标准出院时间: string, 标准诊断: string, 心律不齐: int, 心衰: int, 其他心血管疾病: int, 脑血管疾病: int, 神经系统疾病: int, 高血糖: int, 高血压: int, 高血脂: int, 肝功能异常: int, 肾功能异常: int, 结缔组织病: int, COPD: int, 哮喘: int, 支气管扩张: int, 恶性实体瘤: int, 原始诊断字符数: int, 标准清洗结果: string, 诊疗人次数占比: string, 标准科室: string, 标准医保类型: string, 标准性别: string]

In [12]:
# #1、诊断清洗的筛查
# # 判断有空的
df_patient_null_result =  df_patient_std.select([ Func.count( when(  Func.isnull(c)| col(c).isNull(), c ) ).alias(c) 
                                            for c in df_patient_std.columns])

# 诊断特殊字符 及 空值
df_temp = df_patient_std.where(    (col("心律不齐") == 0 ) & (col("其他心血管疾病") == 0 ) & \
                                ( col( "脑血管疾病" ) == 0) & ( col("神经系统疾病") == 0 ) & 
                                ( col("高血糖") == 0 ) & ( col("高血压") == 0 )& \
                                ( col("高血脂") == 0 ) & ( col("肝功能异常") == 0) & \
                                ( col("肾功能异常") == 0) & (col("结缔组织病") == 0) & \
                                ( col("COPD") == 0) & (col("哮喘") == 0 )& \
                                ( col("支气管扩张") == 0 )& ( col("恶性实体瘤") == 0 )& \
                                ( col("标准诊断") == '其他' )&  (col("原始诊断字符数") <= 1) )

# print( df_temp.count() )

# # 未清洗的数据
# ##患者层面未能清洗的数据

# ##医保

# ##科室

# df_temp.show(1)

In [13]:
#  二、药品信息清洗-产品匹配

# df_patient = df_patient.dropna()

# 读入药品标准表
p_drug_mapping = p_mapping_file + "raw_done.csv"
df_drug_mapping = spark.read.csv(p_drug_mapping, header=True,encoding="gbk")
df_drug_mapping = df_drug_mapping.withColumnRenamed("pfc", "PACK_ID")\
                                    .withColumnRenamed("brand", "BRAND")\
                                    .withColumnRenamed("molecule", "MOLECULE")\
                                    .withColumnRenamed("for", "FORM")\
                                    .withColumnRenamed("spec", "SPEC")\
                                    .withColumnRenamed("pack_number", "PACK_NUMBER")\
                                    .withColumnRenamed("manufacturer", "MANUFACTURER")

# print(df_drug_mapping)
# left 方式匹配
df_patient_std_ = df_patient_std.join(df_drug_mapping, on=["药品名称", "规格", "剂型", "厂家"], how="left")

# print("无法匹配到的产品名称  ", df_patient_std.join(df_drug_mapping, on=["药品名称", "规格", "剂型", "厂家"], how="anti").count() )

In [14]:
# p_drug_mapping = p_mapping_file + "raw_done.csv"
# df_drug_mapping = spark.read.csv(p_drug_mapping, header=True,encoding="gbk")
# df_drug_mapping.show(1, vertical=True)
# df_patient_std_.write.mode("overwrite").parquet("s3://ph-origin-files/user/zazhao/2020年结果-csv/temp/6")

In [15]:
# 三、给病人打标签

## 原始为feather格式文件,转换为CSV格式
p_patient_target = p_mapping_file + "标签病人层面_测试用.csv"
df_target_patient_mapping = spark.read.csv(p_patient_target, header=True) \
                        .withColumnRenamed("PATIENT_ID", "患者ID")\
                        .withColumnRenamed("VISIT_ID", "就诊序号")
df_target_patient_mapping = df_target_patient_mapping.withColumn("标准病人层面处方日期", date_format("处方日期", "yyyyMMdd") )\
                                    .withColumn("标准病人层面入院时间", date_format("入院时间", "yyyyMMdd") )\
                                    .withColumn("标准病人层面出院时间", date_format("出院时间", "yyyyMMdd") )

need_col = ['医院ID', '患者ID', '就诊序号',  '白细胞计数', 'C反应蛋白', '降钙素原', '嗜肺军团菌',
            '肺炎衣原体', '肺炎支原体', '冠状病毒', '合胞病毒', '流感病毒', '腺病毒', '柯萨奇病毒', '鲍曼氏不动杆菌', '大肠埃希菌', 
            '肺炎克雷伯菌', '肺炎链球菌', '金黄色葡萄球菌', '流感嗜血菌', '嗜麦芽寡养单胞菌', '嗜麦芽窄食单胞菌', '铜绿假单胞菌', 
            '阴沟肠杆菌', 'OUT_ID', '标准病人层面处方日期', '标准病人层面入院时间', '标准病人层面出院时间']


df_target_patient_mapping = df_target_patient_mapping.select( need_col )
# 因join产生的重复的列是改名还是直接删除
df_patient_std_1 = df_patient_std_.join( df_target_patient_mapping, on=["医院ID", "患者ID", "就诊序号"], how="left")

# df_patient_std_1 = df_patient_std_.join( df_target_patient_mapping, on=["医院ID","患者ID", "就诊序号", 
#                                                                  "处方日期", "入院时间"  ], how="anti")

# df_patient_std_1.count()
# print( "无法被打标签的病人样本:  ", df_patient_std_.join( df_target_patient_mapping, on=["医院ID","患者ID", ], how="anti").count() )
# df_patient_target.show(2, vertical=True)
# df_patient_with_target.show(2, vertical=True)
# print(df_patient_target.columns)
# df_patient_std_.write.mode("overwrite").parquet("s3://ph-origin-files/user/zazhao/2020年结果-csv/temp/8")

In [16]:
# p_patient_target = p_mapping_file + "标签病人层面_测试用.csv"
# df_target_patient_mapping = spark.read.csv(p_patient_target, header=True)
# df_target_patient_mapping.show(1)
# df_patient_std_1.write.mode("overwrite").parquet("s3://ph-origin-files/user/zazhao/2020年结果-csv/temp/9")

In [17]:
p_molecule = "s3://ph-origin-files/user/zazhao/2020年结果-csv/清洗规则/20个分子分类.csv"
df_molecule_class = spark.read.csv(p_molecule, header=True)\
                        .select(['分子名', 'Molecule', 'mole_category'])\
                        .withColumnRenamed("Molecule", "MOLECULE_OTHER")\
                        .withColumnRenamed("分子名", "MOLECULE")\
                        .withColumnRenamed("mole_category", "MOLECULE_CATEGORY")
# df_molecule_class.show(1)
# df_patient_std_1.show(1)
# print( "无法被匹配到标准分子类别的:  ", df_patient_std_.join(df_molecule_class, on="MOLECULE",how="anti" ).count() )
df_patient_std_2 =  df_patient_std_1.join(df_molecule_class, on="MOLECULE",how="left" )
# df_patient_with_mol_class.show(1, vertical=True)

In [18]:
# 五、标签列生成

df_patient_std_3 = df_patient_std_2.withColumn("年龄区间",  Func.when( col("年龄")<8, lit("<8") )\
                                                  .when( (col("年龄") >=8)&(col("年龄") <=14), lit("8-14") )\
                                                  .when( (col("年龄") >=15)&(col("年龄") <=18), lit("15-18") )\
                                                  .when( (col("年龄") >=19)&(col("年龄") <=45), lit("19-15") )\
                                                  .when( (col("年龄") >45)&(col("年龄") <=65), lit("46-65") )\
                                                  .when( (col("年龄") >65), lit(">65") ) )\
                                                .withColumn("混合感染", Func.when( ( col("鲍曼氏不动杆菌").contains("阳") )| \
                                                           (col("大肠埃希菌").contains("阳"))| (col("肺炎克雷伯菌").contains("阳"))| \
                                                           (col("肺炎链球菌").contains("阳"))| (col("金黄色葡萄球菌").contains("阳"))| \
                                                           (col("流感嗜血菌").contains("阳"))| (col("嗜麦芽寡养单胞菌").contains("阳"))| \
                                                           (col("嗜麦芽窄食单胞菌").contains("阳"))| (col("铜绿假单胞菌").contains("阳"))| \
                                                           (col("阴沟肠杆菌").contains("阳")),  10).otherwise(0)
                                                           )
df_patient_std_3 = df_patient_std_3.withColumn("混合感染", Func.when( ( col("冠状病毒").contains("阳") )| \
                                                           (col("合胞病毒").contains("阳"))| (col("流感病毒").contains("阳"))| \
                                                             (col("腺病毒").contains("阳")), col("混合感染")+10 ).otherwise( col("混合感染") )  )  
df_patient_std_3 = df_patient_std_3.withColumn("混合感染", Func.when( ( col("肺炎支原体").contains("阳") )| \
                                                           (col("肺炎衣原体").contains("阳"))| (col("嗜肺军团菌").contains("阳")), \
                                                              col("混合感染")+10 ).otherwise( col("混合感染") )  )  
# df_patient_with_target_.show(1, vertical=True)
a = '严重感染|重型感染|重度肺炎|重型肺炎|重度呼吸|重度上呼吸|重型呼吸|'+\
     '重型上呼吸|重症肺炎|严重肺炎|重症呼吸|重症上呼吸|重度感染|重症感染|'+\
     '高危感染|危重感染|感染\\（重|感染\\（中重|感染\\（高|感染\\（危|炎\\（重|炎\\（中重|炎\\（高|'+\
     '炎\\（危|感染\\(重|感染\\(中重|感染\\(高|感染\\(危|炎\\(重|炎\\(中重|炎\\(高|炎\\(危'

df_patient_std_3 = df_patient_std_3.withColumn("severe_case",Func.when( col("诊断").rlike(a),"Y").otherwise("N"))

In [19]:
## OUT_ID 匹配
p_out_id_mapping = p_mapping_file+"门诊诊疗周期.csv"
df_out_id_mapping = spark.read.csv(p_out_id_mapping, header=True)

# OUT_ID匹配  （对门诊部分14天为1诊疗周期的内容进行生成，代码未找到，但找到了结果文件，可直接进行匹配）
# 门诊部分信息
df_out_id_mapping = df_out_id_mapping.select(["HCODE", "PATIENT_ID", "VISIT_ID", "OUT_ID"]).distinct()
df_out_id_mapping = df_out_id_mapping.select([ col("HCODE").alias("医院ID"),   col("PATIENT_ID").alias("患者ID"),
                                              col("VISIT_ID").alias("就诊序号"),    col("OUT_ID")])

##########################################  注意此处需要讨论 具体是和 那个表匹配
df_patient_std_4 = df_patient_std_3.drop("OUT_ID")\
                    .join(df_out_id_mapping,on=["医院ID", "患者ID", "就诊序号"], how="left") 
# 如果OUT_ID 为空就用 就诊序号代替            
df_patient_std_4 = df_patient_std_4.withColumn( "OUT_ID", when( col("OUT_ID").isNull(), df_patient_std_4["就诊序号"]) \
                                                    .otherwise( col("OUT_ID") ) )

# df_patient_std_4.write.mode("overwrite").parquet("s3://ph-origin-files/user/zazhao/2020年结果-csv/temp/11")

In [20]:
df_patient_std_4.show(1)

+---------+--------------------+--------+---------------+------------------------------+----+----+----------+--------+--------+----+----+--------+--------+--------------------+--------------------+--------------------+----+----+-----+-----+----+--------+------------+------------+------------+--------+--------+----+--------------+----------+------------+------+------+------+----------+----------+----------+----+----+----------+----------+--------------+------------+--------------+--------+------------+--------+-------+------+------+----+-----------+--------------------+----------+---------+--------+----------+----------+----------+--------+--------+--------+------+----------+--------------+----------+------------+----------+--------------+----------+----------------+----------------+------------+----------+--------------------+--------------------+--------------------+--------------------+-----------------+--------+--------+-----------+------+
|   医院ID|              患者ID|就诊序号|       MOL

In [21]:

# ## 筛选研究范围

# df_patient_std_4 = df_patient_std_2.withColumn("其他心血管疾病",when(col("诊断").rlike('心功能|冠心病|冠状|动脉|心梗|心肌|心血管|心绞痛|心脏病'),1).otherwise(0)) \
#             .withColumn("心衰",when(col("诊断").rlike('心衰|心力衰竭'),1).otherwise(0)) \
#             .withColumn("age_range",when(col("年龄") < 8,"<8") \
#                     .when(( col("年龄") >=8) & ( col("年龄") <=14),"8-14") \
#                     .when( ( col("年龄") >=15) & ( col("年龄") <=18), "15-18") \
#                     .when(( col("年龄") <=19) & ( col("年龄") <=45),"19-45") \
#                     .when(( col("年龄") >45) & ( col("年龄") <=65),"46-65") \
#                     .when(( col("年龄") >65),">65"))
df_patient_std_4 =  df_patient_std_4.withColumn("uni_code",df_patient_std_4["医院ID"] + df_patient_std_4["患者ID"])

# 使用替加环素的患者 
## 及只要使用过 替加环素的患者,就找出来
df_data_part = df_patient_std_4.filter(col("MOLECULE") == "替加环素").select("uni_code").distinct()
            

df_patient_std_5 = df_patient_std_4.filter(  col("标准诊断").rlike('社区获得性肺炎|肺部感染|呼吸道感染|支气管肺炎|肺炎') ) \
            .filter(col("MOLECULE_CATEGORY").isNotNull()) \
            .filter(~col("剂型").rlike('滴耳剂|眼用凝胶|滴眼剂|凝胶剂|软膏剂') )

## 过滤不含替加环素的患者 的数据
df_patient_std_5 = df_patient_std_5.join(df_data_part, on="uni_code", how="left_anti" )

## 是否需要筛选住院部分
## 是否需要 分类类别 重新定义
# delivery_base = delivery_base.withColumn("mole_category",   Func.when(col("mole_category") == "环素类","四环素类") \
#                                                                  .otherwise(col("mole_category")) )


In [22]:
## 添加额外标签
df_patient_std_5 = df_patient_std_5.withColumn("细菌感染", Func.when( (col("鲍曼氏不动杆菌").contains("阳"))|(col("大肠埃希菌").contains("阳"))|\
                                                                 (col("肺炎克雷伯菌").contains("阳")) |(col("肺炎链球菌").contains("阳"))|\
                                                                 (col("金黄色葡萄球菌").contains("阳")) | (col("流感嗜血菌").contains("阳"))|\
                                                                 (col("嗜麦芽寡养单胞菌").contains("阳")) | (col("嗜麦芽窄食单胞菌").contains("阳"))|\
                                                                 (col("铜绿假单胞菌").contains("阳")) | (col("阴沟肠杆菌").contains("阳"))
                                                            ,1).otherwise(0) )\
                                .withColumn("病毒感染", Func.when( (col("冠状病毒").contains("阳"))| (col("合胞病毒").contains("阳"))|\
                                                                 (col("流感病毒").contains("阳"))| (col("腺病毒").contains("阳")) 
                                                            ,1).otherwise(0) )\
                                .withColumn("非典型病原菌感染", Func.when( (col("肺炎支原体").contains("阳"))| (col("肺炎衣原体").contains("阳")) | \
                                                                 (col("嗜肺军团菌").contains("阳"))
                                                            ,1).otherwise(0)) \
                                .withColumn("seg1_grp1", Func.when( (col("年龄区间").contains("8-14")) |(col("年龄区间").contains("15-18") )
                                                            ,1).otherwise(0)) \
                                .withColumn("seg1_grp2", Func.when( (col("神经系统疾病")==1 ) | (col("心律不齐")==1 ) |\
                                                                    (col("心衰")==1 )
                                                            ,1).otherwise(0)) \
                                .withColumn("seg2_grp1", Func.when( (col("年龄区间").contains("8-14")) |(col("年龄区间").contains("15-18") )|\
                                                                    (col("心律不齐")==1 ) | (col("心衰")==1 )| \
                                                                    (col("神经系统疾病")==1 ) | (col("肝功能异常")==1 )| \
                                                                    (col("肾功能异常")==1 ) | (col("COPD")==1 )| \
                                                                    (col("恶性实体瘤")==1 ) 
                                                            ,1).otherwise(0)) \
                                .withColumn("seg3_grp1", Func.when( ( (col("非典型病原菌感染")==1 ) & (col("细菌感染")==1 ) )|\
                                                                    ( (col("非典型病原菌感染")==1 ) & (col("病毒感染")==1 ) )
                                                            ,1).otherwise(0)) \
                                .withColumn("seg3_grp2", Func.when( (col("嗜麦芽窄食单胞菌").contains("阳") ) |\
                                                                    (col("鲍曼氏不动杆菌").contains("阳") )| \
                                                                    (col("金黄色葡萄球菌").contains("阳") ) |
                                                                    (col("大肠埃希菌").contains("阳") ) 
                                                            ,1).otherwise(0)) \
                                .withColumn("seg3_grp3", Func.when( col("seg3_grp2").isNull()
                                                            ,0).otherwise( col("seg3_grp2"))) 
                                
# df_patient_std_5.write.mode("overwrite").parquet("s3://ph-origin-files/user/zazhao/2020年结果-csv/temp/12")

In [23]:
df_patient_std_5.show(1)

+--------+---------+--------------------+--------+---------------+------------------------------+----+----+----------+--------+--------+----+----+--------+--------+--------------------+--------------------+--------------------+----+----+---------+------+----+--------+------------+------------+------------+--------+--------+----+--------------+----------+------------+------+------+------+----------+----------+----------+----+----+----------+----------+--------------+------------+--------------+--------+------------+--------+-------+------+------+----+-----------+--------------------+----------+---------+--------+----------+----------+----------+--------+--------+--------+------+----------+--------------+----------+------------+----------+--------------+----------+----------------+----------------+------------+----------+--------------------+--------------------+--------------------+--------------------+-----------------+--------+--------+-----------+------+--------+--------+------------

In [24]:
from pyspark.sql import Window

# @pandas_udf("int", PandasUDFType)
# def computeCategory(x:pd.Series):
#     return 

# 单药 和 联药分析
# 输入数据是  病人层面表 + 分子标准名称表 + 门诊诊疗周期 + F(额外标签)

# 联用种类个数
df_data_temp = df_patient_std_2.select(["医院ID", "就诊类型", "患者ID", "OUT_ID", "标准处方日期", "MOLECULE_CATEGORY", "MOLECULE"])\
                                .withColumn("标准处方日期", col("标准处方日期").cast("int"))

df_data_a = df_patient_std_2.withColumn("RX_DATE_STD", col("标准处方日期")) \
                            .groupBy(["医院ID", "就诊类型", "患者ID", "OUT_ID", "RX_DATE_STD"])\
                            .agg( Func.countDistinct("MOLECULE_CATEGORY").alias("分子种类数") )

# df_data_a.orderBy(["医院ID", "就诊类型", "患者ID", "OUT_ID", "RX_DATE_STD"]).show(20)

# 联用方式
df_data_b = df_patient_std_2.withColumn("RX_DATE_STD", col("标准处方日期")) \
                            .groupBy(["医院ID", "就诊类型", "患者ID", "OUT_ID", "RX_DATE_STD"])\
                            .agg(  Func.collect_set(col("MOLECULE_CATEGORY")).alias("formula"), \
                                 Func.collect_set( col("MOLECULE")  ).alias("mole_comb") )
df_data_b = df_data_b.withColumn("formula", Func.concat_ws("+", col("formula")) )\
                        .withColumn("mole_comb", Func.concat_ws("+", col("mole_comb")))



# 是否为初始药
win = Window.partitionBy(["医院ID", "就诊类型", "患者ID", "OUT_ID"]).orderBy( col("RX_DATE_STD").desc() )
df_data_c = df_data_a.withColumn("SEQ", Func.row_number().over( win ))\
                        .withColumn("IF_FIRST_RX", when( col("SEQ")==1, 1).otherwise(0) )
# df_data_c.show()
df_data_c_max = df_data_c.groupBy( ["医院ID", "就诊类型", "患者ID", "OUT_ID" ]).agg(Func.max("SEQ").alias("MAX_SEQ") )
# df_data_c_max.show()

df_data_c = df_data_c.join( df_data_c_max, on=[ "医院ID", "就诊类型", "患者ID", "OUT_ID" ], how="inner")
# df_data_c_max.show()


# 合并上面三个表
df_data_d = df_data_c.join(df_data_b, on=["医院ID", "就诊类型", "患者ID", "OUT_ID", "RX_DATE_STD"], how="left")

# 是否为换药
df_data_e = df_data_d.groupBy(["医院ID", "就诊类型", "患者ID", "OUT_ID" ])\
                        .agg( Func.countDistinct("formula").alias("formula_numbers") )\
                        .withColumn("IF_CHANGE",  Func.when(col("formula_numbers")>1, 1).otherwise(0))

# 合并
df_data_f = df_data_d.join( df_data_e, on=["医院ID", "就诊类型", "患者ID", "OUT_ID" ], how="left")
df_data_f = df_data_f.withColumnRenamed("RX_DATE_STD", "标准处方日期" )

# 和处方数据进行匹配
df_patient_std_2_1 = df_patient_std_5.join(df_data_f, on=["医院ID", "就诊类型", "患者ID", "OUT_ID", "标准处方日期"], how="left" )\
                                        .withColumn("single_or_formula", Func.when( col("formula").rlike("\+")
                                                            ,"联用").otherwise("单药") )
df_patient_std_2_1 = df_patient_std_2_1.withColumn("single_or_formula", Func.when( col("formula").isin(
                                                                ['头孢菌素类+头孢菌素类','青霉素类+青霉素类','其他抗生素+其他抗生素',
                                                                   '头孢菌素酶抑制剂+头孢菌素酶抑制剂','四环素类+四环素类',
                                                                   '氨基糖甙+氨基糖甙','氟喹诺酮+氟喹诺酮']
                                                            ),"单药").otherwise( col("single_or_formula") ) )


# df_patient_std_2_1.write.mode("overwrite").parquet("s3://ph-origin-files/user/zazhao/2020年结果-csv/temp/13")

# df_data_f.write.mode("overwrite").parquet("s3://ph-origin-files/user/zazhao/2020年结果-csv/temp/14")
# df_patient_std_2_1.show(1)

In [32]:
# hap_tag该字段也暂时未知  ,"hap_tag"
patient_tag = df_patient_std_2_1.distinct().drop_duplicates([c for c in df_patient_diagnois_target.columns if c in ["医院ID","患者ID","OUT_ID","就诊类型","心律不齐","其他心血管疾病",
                                                                                                                                    "脑血管疾病","神经系统疾病","高血糖","高血脂","肝功能异常","肾功能异常",
                                                                                                                                    "结缔组织病","COPD","哮喘","支气管扩张","恶性实体瘤","心衰","白细胞计数","C反应蛋白","降钙素原",
                                                                                                                                    "嗜肺军团菌",
                                                                                                                                    "肺炎衣原体","肺炎支原体","冠状病毒","合胞病毒","流感病毒","腺病毒",
                                                                                                                                    "柯萨奇病毒","鲍曼氏不动杆菌","大肠埃希菌","肺炎克雷伯菌","肺炎链球菌","金黄色葡萄球菌",
                                                                                                                                    "流感嗜血菌","嗜麦芽寡养单胞菌","嗜麦芽窄食单胞菌","铜绿假单胞菌","阴沟肠杆菌",
                                                                                                                                    "seg1_grp1","seg1_grp2","seg2_grp1","seg3_grp1","seg3_grp2","seg3_grp3"]])
 

patient_tag = patient_tag.drop_duplicates(c for c in patient_tag.columns if c in ["医院ID","患者ID","OUT_ID","就诊类型"])
patient_tag.show(1)

+---------+--------+--------------------+------+------------+--------+--------+----------+----------------------+-------------------+----+----------+--------+--------------+----+----+--------+--------------------+--------------------+--------------------+----+----+--------------------------------+-----+----+--------+------------+------------+--------+--------+----+--------------+----------+------------+------+------+------+----------+----------+----------+----+----+----------+----------+--------------+------------+--------------+--------+------------+--------+-------+-----+--------+-------+-----------+--------------------------+----------+---------+--------+----------+----------+----------+--------+--------+--------+------+----------+--------------+----------+------------+----------+--------------+----------+----------------+----------------+------------+----------+--------------------+--------------------+--------------------+--------------+-----------------+--------+--------+--------

In [26]:
table_4_m = df_patient_std_2_1.groupBy(["医院ID","患者ID","OUT_ID","就诊类型","标准医保类型","标准性别","年龄","标准诊断",
                             "severe_case","标准科室","formula","mole_comb","single_or_formula","SEQ","标准处方日期"]).agg( Func.sum(col("金额")).alias("sales") ) \
                             .withColumn("uni_code",Func.concat(col("医院ID"),col("患者ID"),col("OUT_ID"),col("就诊类型")))\
                            .withColumn("标准处方日期", col("标准处方日期").cast("int"))

# table_4_m初始喹诺酮类换药
quinolone_before = table_4_m.filter( (col("formula").rlike('氟喹诺酮') )
                                    & (col("SEQ") == "1"))


win1 = Window.partitionBy("uni_code")
quinolone_after = table_4_m.join( quinolone_before.select("uni_code").distinct(), on=["uni_code"], how="inner")

quinolone_after = quinolone_after.withColumn("first_formula",  Func.lit("氟喹诺酮"))\
                                    .filter( ~( col("formula") == col("first_formula")) )\
                                    .withColumn("MIN_标准处方日期",  Func.min("标准处方日期").over(win1) )\
                                    .where(col("标准处方日期")==col("MIN_标准处方日期"))
                        

quinolone_before = quinolone_before.join(quinolone_after.select("uni_code"), on="uni_code", how="inner")

quinolone_before = quinolone_before.dropDuplicates( ["uni_code"])

# 将字段名换成对应的
quinolone_before = quinolone_before.withColumnRenamed("severe_case","severe_case_before") \
                .withColumnRenamed("formula","formula_before") \
                .withColumnRenamed("mole_comb","mole_comb_before") \
                .withColumnRenamed("single_or_formula","single_or_formula_before") \
                .withColumnRenamed("SEQ","SEQ_before") \
                .withColumnRenamed("标准处方日期","std_rx_date_before") \
                .withColumnRenamed("sales","sales_before")

# 为了防止字段重复·影响操作
quinolone_before = quinolone_before.select("医院ID","患者ID","OUT_ID","就诊类型","severe_case_before","formula_before","mole_comb_before"
                                          ,"single_or_formula_before","SEQ_before","std_rx_date_before","sales_before")

quinolone_after = quinolone_after.withColumnRenamed("severe_case","severe_case_after")\
                .withColumnRenamed("formula","formula_after") \
                .withColumnRenamed("mole_comb","mole_comb_after") \
                .withColumnRenamed("single_or_formula","single_or_formula_after") \
                .withColumnRenamed("SEQ","SEQ_after") \
                .withColumnRenamed("标准处方日期","std_rx_date_after") \
                .withColumnRenamed("sales","sales_after")
# 这段具体只出现三次  貌似没用
# mapping_inpatients_tag <- read_feather('L:/HIS Raw data/奥玛环素项目/07_标签/标签病人层面_0125.feather')
# mapping_inpatients_tag <- table_2[,c(4:6,79,29:42,80,55:75,103)] %>% 
#   distinct(患者ID, OUT_ID, .keep_all = T)

quinolone_delivery = quinolone_after.join(quinolone_before, on=["医院ID","患者ID","OUT_ID","就诊类型"], how="left")

quinolone_delivery.withColumn("std_dept.x",when(quinolone_delivery["标准科室"] == "重症医学科","ICU").otherwise(col("标准诊断")))

# quinolone_delivery = quinolone_delivery.join(patient_tag,["医院ID","患者ID","OUT_ID","就诊类型"],"left")

In [33]:
quinolone_delivery.show(1)

----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 60478)
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/home/hadoop/.local/lib/python3.7/site-packages/py4j/java_gateway.py", line 1207, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/hadoop/.local/lib/python3.7/site-packages/py4j/java_gateway.py", line 1033, in send_command
    response = connection.send_command(command)
  File "/home/hadoop/.local/lib/python3.7/site-packages/py4j/java_gateway.py", line 1212, in send_command
    "Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving
Traceback (most recent call last):
  File "/usr/lib64/python3.7/socketserver.py", line 316, in _handl

Py4JError: An error occurred while calling o1656.showString

In [28]:
#初始头孢类换药
cephalosporin_before = table_4_m.filter( (col("formula").rlike('头孢菌素类') )
                                    & (col("SEQ") == "1"))

win2 = Window.partitionBy("uni_code")
cephalosporin_after = table_4_m.join( cephalosporin_before.select("uni_code").distinct(), on=["uni_code"], how="inner")

cephalosporin_after = cephalosporin_after.withColumn("first_formula",  Func.lit("头孢菌素类"))\
                                    .filter( ~( col("formula") == col("first_formula")) )\
                                    .withColumn("MIN_标准处方日期",  Func.min("标准处方日期").over(win2) )\
                                    .where(col("标准处方日期")==col("MIN_标准处方日期"))

cephalosporin_before = cephalosporin_before.join(cephalosporin_after.select("uni_code"), on="uni_code", how="inner")
cephalosporin_before = cephalosporin_before.drop_duplicates(["uni_code"])
# %%%%%%%%%%%%
cephalosporin_before = cephalosporin_before.withColumnRenamed("severe_case","severe_case_before") \
                .withColumnRenamed("formula","formula_before") \
                .withColumnRenamed("mole_comb","mole_comb_before") \
                .withColumnRenamed("single_or_formula","single_or_formula_before") \
                .withColumnRenamed("SEQ","SEQ_before") \
                .withColumnRenamed("标准处方日期","std_rx_date_before") \
                .withColumnRenamed("sales","sales_before")

cephalosporin_before = cephalosporin_before.select("uni_code","医院ID","患者ID","OUT_ID","就诊类型","severe_case_before","formula_before","mole_comb_before",
                                           "single_or_formula_before","SEQ_before","std_rx_date_before","sales_before")

cephalosporin_after = cephalosporin_after.withColumnRenamed("severe_case","severe_case_after")\
                .withColumnRenamed("formula","formula_after") \
                .withColumnRenamed("mole_comb","mole_comb_after") \
                .withColumnRenamed("single_or_formula","single_or_formula_after") \
                .withColumnRenamed("SEQ","SEQ_after") \
                .withColumnRenamed("标准处方日期","std_rx_date_after") \
                .withColumnRenamed("sales","sales_after")

# ************
cephalosporin_delivery = cephalosporin_after.join(cephalosporin_before,["医院ID","患者ID","OUT_ID","就诊类型"],"left")

cephalosporin_delivery.withColumn("std_dept.x",when(cephalosporin_delivery["标准科室"] == "重症医学科","ICU").otherwise(col("标准诊断")))

cephalosporin_delivery = cephalosporin_delivery.join(patient_tag,["医院ID","患者ID","OUT_ID","就诊类型"],"left")



In [None]:
cephalosporin_delivery.show(1)

In [29]:
# ,"hap_tag"暂时未知
tag_all = df_patient_std_2_1.select("就诊类型","患者ID","OUT_ID","seg1_grp1","seg1_grp2","seg2_grp1","seg3_grp1","seg3_grp2","seg3_grp3") \
                         .drop_duplicates(c for c in patient_tag.columns if c in ["患者ID","OUT_ID","就诊类型"])

quinolone_delivery = quinolone_delivery.join(tag_all,["患者ID","OUT_ID","就诊类型"],"left")
cephalosporin_delivery = cephalosporin_delivery.join(tag_all,["患者ID","OUT_ID","就诊类型"],"left")


In [30]:
# quinolone_delivery.write.mode("overwrite").parquet("s3://ph-origin-files/user/zazhao/2020年结果-csv/temp/20")

# cephalosporin_delivery.write.mode("overwrite").parquet("s3://ph-origin-files/user/zazhao/2020年结果-csv/temp/21")
quinolone_delivery.show(1)

+--------------------+------+--------+---------+--------------------+------------+--------+----+--------+-----------------+--------+-----------------+----------------------------+-----------------------+---------+-----------------+------------------+-------------+----------------+------------------+-----------------+----------------------------+------------------------+----------+------------------+------------------+------------+--------+--------+-----------------+---------------------------------+--------------+----+--------+--------+--------------------+----+----+--------+--------------------+--------------------+--------------------+----+----+------------------------------+-----+----+--------+------------+------------+--------+--------+----+--------------+----------+------------+------+------+------+----------+----------+----------+----+----+----------+----------+--------------+------------+--------------+--------+------------+--------+-------+--------------------------+------+----

In [31]:
cephalosporin_delivery.show(1)

+--------------------+------+--------+---------+--------------------+------------+--------+----+--------+-----------------+--------+-------------------+--------------------------+-----------------------+---------+-----------------+-----------------+-------------+----------------+--------------------+------------------+-------------------+--------------------------+------------------------+----------+------------------+-----------------+------------+--------+--------+--------+--------------+------------+----+--------+--------+--------------------+----+----+--------+--------------------+--------------------+--------------------+----+----+------------------------------------+-----+----+--------+------------+------------+--------+--------+----+--------------+----------+------------+------+------+------+----------+----------+----------+----+----+----------+----------+--------------+------------+--------------+--------+------------+--------+-------+--------------+------+-----+-----------+---