In [1]:

import os
import sys
from datetime import datetime
from pyspark.sql import SparkSession, DataFrame, functions as F, types as T
from pyspark.sql.window import Window
import pyspark.pandas as ps

os.environ['YARN_CONF_DIR'] = '/opt/hadoop/etc/hadoop/'
spark = SparkSession.builder.master("yarn").appName("acg_testing").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/30 13:05:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/04/30 13:05:11 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.
24/04/30 13:05:14 WARN DataStreamer: Exception for BP-384813690-10.44.1.78-1677874809887:blk_1078131150_4487649
java.io.IOException: Bad response ERROR for BP-384813690-10.44.1.78-1677874809887:blk_1078131150_4487649 from datanode DatanodeInfoWithStorage[10.30.10.109:9866,DS-8c9823d7-0bc8-4648-bfeb-608c6127f82c,DISK]
	at org.apache.hadoop.hdfs.DataStreamer$ResponseProcessor.run(DataStreamer.java:1175)
24/04/30 13:05:14 WARN DataStreamer: Error Recovery for BP-384813690-10.44.1.78-1677874809887:blk_1078131150_4487649 in pipeline [DatanodeInfoWithStorage[10.30.10.108:9866,DS-e69df1a

In [2]:
# time check
start_time = datetime.now()

cohort_start_date='2021-11-01'
cohort_end_date='2022-10-31'

person_raw = spark.read.parquet('/data/pedsnet_dcc_v52/dcc_pedsnet/person')


person_input_df = person_raw\
    .filter(person_raw.gender_concept_id.isin(8507, 8532))\
    .withColumn('sex',F.when(F.col('gender_concept_id')==8507,'M').otherwise('F'))\
    .select('person_id','sex',F.col('birth_date').alias('date_of_birth'))
#Limit to male or female sex

visits_df_raw = spark.read.parquet('/data/pedsnet_dcc_v52/dcc_pedsnet/visit_occurrence')
visits_df_intermediate = visits_df_raw\
    .join(person_raw.select(['person_id','birth_date']),on='person_id',how='inner')\
    .filter(F.col('visit_concept_id').isin([9201,2000000048,2000001532, 9202,2000000469,44814711, 9203, 2000000088]))\ 
    .filter(F.col('visit_start_date').between(cohort_start_date,cohort_end_date) &
            F.col('visit_end_date').between(cohort_start_date,cohort_end_date) &
            (F.col('visit_start_date')>=F.col('birth_date')))\
    .select('visit_occurrence_id','person_id','site','visit_concept_id','visit_start_date','visit_end_date')\
    .withColumn('op_ed_visit',F.when(F.col('visit_concept_id').isin([9202, 44814711, 9203, 2000000048]),1).otherwise(0))
#Limit to standard set of visits


persons_df = visits_df_intermediate\
    .join(person_input_df,on='person_id',how='inner')\
    .filter(F.col('op_ed_visit')>0)\
    .select('person_id','sex','date_of_birth')\
    .distinct()\
    .withColumn('age',F.floor(F.datediff(F.lit(cohort_end_date),F.col('date_of_birth'))/365.25).cast(T.IntegerType()))\
    .filter(F.col('age').between(0,34))\
    .persist()

persons_df\
    .select(F.col('person_id').alias('patient_id'),'sex','date_of_birth','age')\
    .write.csv('/data/pedsnet_dcc_v52/acg_input/patient_services',header=True,mode='overwrite')

visits_df = persons_df\
    .select('person_id')\
    .join(visits_df_intermediate, 'person_id', 'inner')\
    .persist()


#The below code chunk shows how many conditions are problem list conditions v not by site
'''
spark.read.parquet('/data/pedsnet_dcc_v52/dcc_pedsnet/condition_occurrence')\
    .withColumn('problem_list',F.when(F.col('condition_type_concept_id').isin([2000000089,2000000090]),1).otherwise(0))\
    .withColumn('not_pl',F.when(F.col('condition_type_concept_id').isin([32879, 2000000089,2000000090]),0).otherwise(1))\
    .groupBy('site').agg(F.sum('problem_list').alias('pl'),F.sum('not_pl').alias('pl_not')).withColumn('total',F.col('pl')+F.col('pl_not')).show()
'''

                                                                                

"\nspark.read.parquet('/data/pedsnet_dcc_v52/dcc_pedsnet/condition_occurrence')    .withColumn('problem_list',F.when(F.col('condition_type_concept_id').isin([2000000089,2000000090]),1).otherwise(0))    .withColumn('not_pl',F.when(F.col('condition_type_concept_id').isin([32879, 2000000089,2000000090]),0).otherwise(1))    .groupBy('site').agg(F.sum('problem_list').alias('pl'),F.sum('not_pl').alias('pl_not')).withColumn('total',F.col('pl')+F.col('pl_not')).show()\n"

In [3]:
print('All persons', person_raw.select('person_id').distinct().count()) 
print('All persons with M/F gender', person_input_df.select('person_id').distinct().count())
print('All persons with a op/ed visit in the study period',visits_df_intermediate.join(person_input_df,on='person_id',how='inner').filter(F.col('op_ed_visit')>0).select('person_id').distinct().count())
print('All persons aged between 0 and 34 at the end of the study period ', persons_df.select('person_id').distinct().count())

                                                                                

All persons 13876553


                                                                                

All persons with M/F gender 13861023


                                                                                

All persons with a op/ed visit in the study period 4429817




All persons aged between 0 and 34 at the end of the study period  4319115


                                                                                

In [4]:


conditions_df_raw = spark.read.parquet('/data/pedsnet_dcc_v52/dcc_pedsnet/condition_occurrence')
concept_df_raw = spark.read.parquet('/data/pedsnet_dcc_v52/vocabulary/concept') 
concept_relationship_df_raw = spark.read.parquet('/data/pedsnet_dcc_v52/vocabulary/concept_relationship') 

#map the vocabulary_ids to the ACG equivalent
#NOTE: ICD9CM may not be appropriate to map to 9, which is meant to be ICD9. check with Chris.
vocab_dict = {'ICD10':'10', 'ICD10CM':'10CM', 'ICD9CM':'9','SNOMED':'S'}
map_col = F.create_map([F.lit(x) for i in vocab_dict.items() for x in i])

concept_df = concept_df_raw\
    .filter(F.col('vocabulary_id').isin(['SNOMED','ICD9CM','ICD10CM','ICD10']))\
    .withColumn('dx_version', map_col[F.col('vocabulary_id')])\
    .select(concept_df_raw.concept_id.alias('condition_source_concept_id'),
            concept_df_raw.concept_code.alias('dx_cd'),
            'dx_version')

window = Window.partitionBy('visit_occurrence_id')

# Get all conditions with the our cohorts visits, remove those that are registry or problem list conditions,
# then limit conditions to those with the highest "certainty" for each visit 
# Most to least certain: Final Diagnosis, Clinical Diagnosis, Admitting Diagnosis, Null/No matching concept
# Avg conditions per visit anywhere from 2.8 (national) to 6.5 (Lurie)
medical_services_df_raw = visits_df.select("visit_occurrence_id")\
    .join(conditions_df_raw,on='visit_occurrence_id',how='inner')\
    .filter(~F.col('condition_type_concept_id').isin([32879, 2000000089,2000000090]))\
    .withColumn('condition_status_order', 
                F.when(F.col('condition_status_concept_id').eqNullSafe(4230359),1)\
                .when(F.col('condition_status_concept_id').eqNullSafe(4309119),2)\
                .when(F.col('condition_status_concept_id').eqNullSafe(4203942),3)\
                .otherwise(4))\
    .withColumn('condition_status_filter',F.rank().over(window.orderBy("condition_status_order")))\
    .filter(F.col('condition_status_filter').eqNullSafe(1))\
    .select('visit_occurrence_id','site','condition_status_concept_id','condition_source_concept_id')\
    .distinct()\
    .withColumn('dx_n',F.row_number().over(window.orderBy('site')))\
    .withColumn('dx_n_ceiling',F.when(~(F.col('dx_n')%10==0),F.col('dx_n')%10).otherwise(10))\
    .withColumn('visit_dummy',F.floor((F.col('dx_n')-1)/10))\
    .repartition('visit_occurrence_id')\
    .join(F.broadcast(concept_df),on='condition_source_concept_id',how='inner')\
    .persist()

In [5]:

pivoted_df= None
dx_n_list = sorted([int(row.dx_n_ceiling) for row in medical_services_df_raw.select('dx_n_ceiling').distinct().collect()])
for i in dx_n_list:
    dx_codes = medical_services_df_raw.filter(F.col('dx_n_ceiling')==i)\
                .select(medical_services_df_raw.visit_occurrence_id,
                        medical_services_df_raw.visit_dummy,
                        medical_services_df_raw.dx_cd.alias('dx_cd_'+str(i)),
                        medical_services_df_raw.dx_version.alias('dx_version_'+str(i)))
    if isinstance(pivoted_df, DataFrame):
        pivoted_df = pivoted_df.join(dx_codes,on=['visit_occurrence_id','visit_dummy'],how='left')
        
    else:
        pivoted_df = dx_codes


join_visits = visits_df\
    .withColumn('service_place',
                F.when(F.col('visit_concept_id').isin([9201,2000000048,2000001532]),'IP')\
                .when(F.col('visit_concept_id').isin([9202,2000000469,44814711]),'OP')\
                .when(F.col('visit_concept_id').eqNullSafe(9203),'ED').otherwise('OBS'))\
    .select(F.col("person_id").alias('patient_id'), "site", "visit_occurrence_id", 
            "service_place", F.col("visit_start_date").alias('service_begin_date'), F.col("visit_end_date").alias('service_end_date'))

medical_services_df = pivoted_df.join(join_visits,on='visit_occurrence_id',how='inner')

medical_services_df.drop('visit_occurrence_id','visit_dummy').write.csv('/data/pedsnet_dcc_v52/acg_input/medical_services',header=True,mode='overwrite')
medical_services_df_raw.unpersist()

                                                                                

DataFrame[condition_source_concept_id: int, visit_occurrence_id: bigint, site: string, condition_status_concept_id: int, dx_n: int, dx_n_ceiling: int, visit_dummy: bigint, dx_cd: string, dx_version: string]

In [22]:
#tester=medical_services_df.filter(F.col('patient_id').isin(['14610971','14601198','14601200','14601202','14601203','14601204','14601210','14601216','14601219','14601222'])).persist()
#tester = spark.read.option("header", True).csv('/data/pedsnet_dcc_v52/acg_input/medical_services').filter(F.col('site').isin(['national'])).persist()

visits_df.select("visit_occurrence_id")\
    .join(conditions_df_raw,on='visit_occurrence_id',how='inner')\
    .filter(F.col('site').eqNullSafe('national'))\
    .filter(~F.col('condition_type_concept_id').isin([32879, 2000000089,2000000090]))\
    .withColumn('condition_status_order', 
                F.when(F.col('condition_status_concept_id').eqNullSafe(4230359),1)\
                .when(F.col('condition_status_concept_id').eqNullSafe(4309119),2)\
                .when(F.col('condition_status_concept_id').eqNullSafe(4203942),3)\
                .otherwise(4))\
    .withColumn('condition_status_filter',F.rank().over(window.orderBy("condition_status_order")))\
    .filter(F.col('condition_status_filter').eqNullSafe(1))\
    .select('condition_source_concept_id').distinct().show(10)
    



+---------------------------+
|condition_source_concept_id|
+---------------------------+
|                          0|
+---------------------------+



                                                                                

In [10]:
tester.show(2)

[Stage 291:(22 + 2) / 200][Stage 293:>(0 + 0) / 200][Stage 295:>(0 + 0) / 200]

Py4JJavaError: An error occurred while calling o534.showString.
: java.util.concurrent.CancellationException
	at java.util.concurrent.FutureTask.report(FutureTask.java:121)
	at java.util.concurrent.FutureTask.get(FutureTask.java:206)
	at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec.doExecuteBroadcast(BroadcastExchangeExec.scala:209)
	at org.apache.spark.sql.execution.InputAdapter.doExecuteBroadcast(WholeStageCodegenExec.scala:517)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeBroadcast$1(SparkPlan.scala:208)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
	at org.apache.spark.sql.execution.SparkPlan.executeBroadcast(SparkPlan.scala:204)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.prepareBroadcast(BroadcastHashJoinExec.scala:206)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.prepareRelation(BroadcastHashJoinExec.scala:220)
	at org.apache.spark.sql.execution.joins.HashJoin.codegenInner(HashJoin.scala:390)
	at org.apache.spark.sql.execution.joins.HashJoin.codegenInner$(HashJoin.scala:389)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.codegenInner(BroadcastHashJoinExec.scala:40)
	at org.apache.spark.sql.execution.joins.HashJoin.doConsume(HashJoin.scala:356)
	at org.apache.spark.sql.execution.joins.HashJoin.doConsume$(HashJoin.scala:354)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.doConsume(BroadcastHashJoinExec.scala:40)
	at org.apache.spark.sql.execution.CodegenSupport.consume(WholeStageCodegenExec.scala:196)
	at org.apache.spark.sql.execution.CodegenSupport.consume$(WholeStageCodegenExec.scala:151)
	at org.apache.spark.sql.execution.ProjectExec.consume(basicPhysicalOperators.scala:42)
	at org.apache.spark.sql.execution.ProjectExec.doConsume(basicPhysicalOperators.scala:89)
	at org.apache.spark.sql.execution.CodegenSupport.consume(WholeStageCodegenExec.scala:196)
	at org.apache.spark.sql.execution.CodegenSupport.consume$(WholeStageCodegenExec.scala:151)
	at org.apache.spark.sql.execution.FilterExec.consume(basicPhysicalOperators.scala:216)
	at org.apache.spark.sql.execution.FilterExec.doConsume(basicPhysicalOperators.scala:265)
	at org.apache.spark.sql.execution.CodegenSupport.consume(WholeStageCodegenExec.scala:196)
	at org.apache.spark.sql.execution.CodegenSupport.consume$(WholeStageCodegenExec.scala:151)
	at org.apache.spark.sql.execution.InputAdapter.consume(WholeStageCodegenExec.scala:498)
	at org.apache.spark.sql.execution.InputRDDCodegen.doProduce(WholeStageCodegenExec.scala:485)
	at org.apache.spark.sql.execution.InputRDDCodegen.doProduce$(WholeStageCodegenExec.scala:458)
	at org.apache.spark.sql.execution.InputAdapter.doProduce(WholeStageCodegenExec.scala:498)
	at org.apache.spark.sql.execution.CodegenSupport.$anonfun$produce$1(WholeStageCodegenExec.scala:97)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
	at org.apache.spark.sql.execution.CodegenSupport.produce(WholeStageCodegenExec.scala:92)
	at org.apache.spark.sql.execution.CodegenSupport.produce$(WholeStageCodegenExec.scala:92)
	at org.apache.spark.sql.execution.InputAdapter.produce(WholeStageCodegenExec.scala:498)
	at org.apache.spark.sql.execution.FilterExec.doProduce(basicPhysicalOperators.scala:242)
	at org.apache.spark.sql.execution.CodegenSupport.$anonfun$produce$1(WholeStageCodegenExec.scala:97)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
	at org.apache.spark.sql.execution.CodegenSupport.produce(WholeStageCodegenExec.scala:92)
	at org.apache.spark.sql.execution.CodegenSupport.produce$(WholeStageCodegenExec.scala:92)
	at org.apache.spark.sql.execution.FilterExec.produce(basicPhysicalOperators.scala:216)
	at org.apache.spark.sql.execution.ProjectExec.doProduce(basicPhysicalOperators.scala:55)
	at org.apache.spark.sql.execution.CodegenSupport.$anonfun$produce$1(WholeStageCodegenExec.scala:97)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
	at org.apache.spark.sql.execution.CodegenSupport.produce(WholeStageCodegenExec.scala:92)
	at org.apache.spark.sql.execution.CodegenSupport.produce$(WholeStageCodegenExec.scala:92)
	at org.apache.spark.sql.execution.ProjectExec.produce(basicPhysicalOperators.scala:42)
	at org.apache.spark.sql.execution.joins.HashJoin.doProduce(HashJoin.scala:351)
	at org.apache.spark.sql.execution.joins.HashJoin.doProduce$(HashJoin.scala:350)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.doProduce(BroadcastHashJoinExec.scala:40)
	at org.apache.spark.sql.execution.CodegenSupport.$anonfun$produce$1(WholeStageCodegenExec.scala:97)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
	at org.apache.spark.sql.execution.CodegenSupport.produce(WholeStageCodegenExec.scala:92)
	at org.apache.spark.sql.execution.CodegenSupport.produce$(WholeStageCodegenExec.scala:92)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.produce(BroadcastHashJoinExec.scala:40)
	at org.apache.spark.sql.execution.ProjectExec.doProduce(basicPhysicalOperators.scala:55)
	at org.apache.spark.sql.execution.CodegenSupport.$anonfun$produce$1(WholeStageCodegenExec.scala:97)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
	at org.apache.spark.sql.execution.CodegenSupport.produce(WholeStageCodegenExec.scala:92)
	at org.apache.spark.sql.execution.CodegenSupport.produce$(WholeStageCodegenExec.scala:92)
	at org.apache.spark.sql.execution.ProjectExec.produce(basicPhysicalOperators.scala:42)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doCodeGen(WholeStageCodegenExec.scala:660)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:723)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:195)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:191)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.inputRDD$lzycompute(ShuffleExchangeExec.scala:135)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.inputRDD(ShuffleExchangeExec.scala:135)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.shuffleDependency$lzycompute(ShuffleExchangeExec.scala:169)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.shuffleDependency(ShuffleExchangeExec.scala:167)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.doExecute(ShuffleExchangeExec.scala:189)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:195)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:191)
	at org.apache.spark.sql.execution.InputAdapter.inputRDD(WholeStageCodegenExec.scala:527)
	at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs(WholeStageCodegenExec.scala:455)
	at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs$(WholeStageCodegenExec.scala:454)
	at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:498)
	at org.apache.spark.sql.execution.SortExec.inputRDDs(SortExec.scala:132)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:751)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:195)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:191)
	at org.apache.spark.sql.execution.InputAdapter.doExecute(WholeStageCodegenExec.scala:513)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:195)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:191)
	at org.apache.spark.sql.execution.joins.SortMergeJoinExec.inputRDDs(SortMergeJoinExec.scala:396)
	at org.apache.spark.sql.execution.ProjectExec.inputRDDs(basicPhysicalOperators.scala:51)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:751)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:195)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:191)
	at org.apache.spark.sql.execution.InputAdapter.doExecute(WholeStageCodegenExec.scala:513)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:195)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:191)
	at org.apache.spark.sql.execution.joins.SortMergeJoinExec.inputRDDs(SortMergeJoinExec.scala:396)
	at org.apache.spark.sql.execution.ProjectExec.inputRDDs(basicPhysicalOperators.scala:51)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:751)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:195)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:191)
	at org.apache.spark.sql.execution.InputAdapter.doExecute(WholeStageCodegenExec.scala:513)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:195)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:191)
	at org.apache.spark.sql.execution.joins.SortMergeJoinExec.inputRDDs(SortMergeJoinExec.scala:396)
	at org.apache.spark.sql.execution.ProjectExec.inputRDDs(basicPhysicalOperators.scala:51)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:751)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:195)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:191)
	at org.apache.spark.sql.execution.InputAdapter.doExecute(WholeStageCodegenExec.scala:513)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:195)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:191)
	at org.apache.spark.sql.execution.joins.SortMergeJoinExec.inputRDDs(SortMergeJoinExec.scala:396)
	at org.apache.spark.sql.execution.ProjectExec.inputRDDs(basicPhysicalOperators.scala:51)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:751)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:195)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:191)
	at org.apache.spark.sql.execution.InputAdapter.doExecute(WholeStageCodegenExec.scala:513)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:195)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:191)
	at org.apache.spark.sql.execution.joins.SortMergeJoinExec.inputRDDs(SortMergeJoinExec.scala:396)
	at org.apache.spark.sql.execution.ProjectExec.inputRDDs(basicPhysicalOperators.scala:51)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:751)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:195)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:191)
	at org.apache.spark.sql.execution.InputAdapter.doExecute(WholeStageCodegenExec.scala:513)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:195)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:191)
	at org.apache.spark.sql.execution.joins.SortMergeJoinExec.inputRDDs(SortMergeJoinExec.scala:396)
	at org.apache.spark.sql.execution.ProjectExec.inputRDDs(basicPhysicalOperators.scala:51)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:751)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:195)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:191)
	at org.apache.spark.sql.execution.InputAdapter.doExecute(WholeStageCodegenExec.scala:513)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:195)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:191)
	at org.apache.spark.sql.execution.joins.SortMergeJoinExec.inputRDDs(SortMergeJoinExec.scala:396)
	at org.apache.spark.sql.execution.ProjectExec.inputRDDs(basicPhysicalOperators.scala:51)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:751)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:195)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:191)
	at org.apache.spark.sql.execution.InputAdapter.doExecute(WholeStageCodegenExec.scala:513)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:195)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:191)
	at org.apache.spark.sql.execution.joins.SortMergeJoinExec.inputRDDs(SortMergeJoinExec.scala:396)
	at org.apache.spark.sql.execution.ProjectExec.inputRDDs(basicPhysicalOperators.scala:51)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:751)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:195)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:191)
	at org.apache.spark.sql.execution.InputAdapter.doExecute(WholeStageCodegenExec.scala:513)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:195)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:191)
	at org.apache.spark.sql.execution.joins.SortMergeJoinExec.inputRDDs(SortMergeJoinExec.scala:396)
	at org.apache.spark.sql.execution.ProjectExec.inputRDDs(basicPhysicalOperators.scala:51)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:751)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:195)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:191)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.inputRDD$lzycompute(ShuffleExchangeExec.scala:135)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.inputRDD(ShuffleExchangeExec.scala:135)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.shuffleDependency$lzycompute(ShuffleExchangeExec.scala:169)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.shuffleDependency(ShuffleExchangeExec.scala:167)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.doExecute(ShuffleExchangeExec.scala:189)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:195)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:191)
	at org.apache.spark.sql.execution.InputAdapter.inputRDD(WholeStageCodegenExec.scala:527)
	at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs(WholeStageCodegenExec.scala:455)
	at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs$(WholeStageCodegenExec.scala:454)
	at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:498)
	at org.apache.spark.sql.execution.SortExec.inputRDDs(SortExec.scala:132)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:751)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:195)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:191)
	at org.apache.spark.sql.execution.InputAdapter.doExecute(WholeStageCodegenExec.scala:513)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:195)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:191)
	at org.apache.spark.sql.execution.joins.SortMergeJoinExec.inputRDDs(SortMergeJoinExec.scala:396)
	at org.apache.spark.sql.execution.ProjectExec.inputRDDs(basicPhysicalOperators.scala:51)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:751)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:195)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:191)
	at org.apache.spark.sql.execution.columnar.CachedRDDBuilder.buildBuffers(InMemoryRelation.scala:273)
	at org.apache.spark.sql.execution.columnar.CachedRDDBuilder.cachedColumnBuffers(InMemoryRelation.scala:222)
	at org.apache.spark.sql.execution.columnar.InMemoryTableScanExec.filteredCachedBatches(InMemoryTableScanExec.scala:152)
	at org.apache.spark.sql.execution.columnar.InMemoryTableScanExec.inputRDD$lzycompute(InMemoryTableScanExec.scala:98)
	at org.apache.spark.sql.execution.columnar.InMemoryTableScanExec.inputRDD(InMemoryTableScanExec.scala:84)
	at org.apache.spark.sql.execution.columnar.InMemoryTableScanExec.doExecute(InMemoryTableScanExec.scala:163)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:195)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:191)
	at org.apache.spark.sql.execution.InputAdapter.inputRDD(WholeStageCodegenExec.scala:527)
	at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs(WholeStageCodegenExec.scala:455)
	at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs$(WholeStageCodegenExec.scala:454)
	at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:498)
	at org.apache.spark.sql.execution.ProjectExec.inputRDDs(basicPhysicalOperators.scala:51)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:751)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:195)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:191)
	at org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:364)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:498)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:483)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:61)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:4177)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:3161)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4167)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:526)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4165)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:118)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:195)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:103)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:827)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4165)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:3161)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:3382)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:284)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:323)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:750)


[Stage 291:(41 + 2) / 200][Stage 293:>(0 + 0) / 200][Stage 295:>(0 + 0) / 200]

In [5]:

concept_relationship_df_raw = spark.read.parquet('/data/pedsnet_dcc_v52/vocabulary/concept_relationship') 
rx_norm_concepts = concept_df_raw.filter(concept_df_raw.vocabulary_id.isin(['RxNorm','RxNorm Extension'])).select(F.col('concept_id').alias('concept_id_rxnorm'))
ndc_concepts= concept_df_raw.filter(concept_df_raw.vocabulary_id.eqNullSafe('NDC')).select('concept_id','concept_code')

w = Window().partitionBy("concept_id_rxnorm").orderBy("concept_id")
rxnorm_ndc_map = concept_relationship_df_raw\
    .filter(F.col('relationship_id').eqNullSafe('Maps to'))\
    .join(rx_norm_concepts,on=F.col('concept_id_1')==F.col('concept_id_rxnorm'),how='inner')\
    .join(ndc_concepts,
        on=F.col('concept_id_2')==F.col('concept_id'),how='inner')\
    .select(F.col('concept_id_rxnorm'),F.first('concept_id').over(w).alias('concept_id_ndc'),'concept_code')\
    .distinct()

drug_exposure_raw = spark.read.parquet('/data/pedsnet_dcc_v52/dcc_pedsnet/drug_exposure')
drug_exposure_input = visits_df\
    .select("visit_occurrence_id")\
    .join(drug_exposure_raw,on='visit_occurrence_id',how='inner')\
    .select('person_id','site','visit_occurrence_id','drug_exposure_start_date',F.col('drug_concept_id').alias('concept_id'))\
    .persist()

ndc_rx = drug_exposure_input\
    .join(ndc_concepts.select('concept_id','concept_code'),on='concept_id',how='inner')\
    .select('person_id','site','drug_exposure_start_date','concept_id','concept_code')
rxnorm_rx_mapped = drug_exposure_input\
    .join(rxnorm_ndc_map.select('concept_id_rxnorm','concept_id_ndc','concept_code'),
        on=F.col('concept_id')==F.col('concept_id_rxnorm'),
        how='inner').select('person_id','site','drug_exposure_start_date',F.col('concept_id_ndc').alias('concept_id'),'concept_code')

pharmacy = ndc_rx.union(rxnorm_rx_mapped)\
    .select(F.col('person_id').alias('patient_id'),
            F.col('concept_code').alias('rx_cd'),
            F.col('drug_exposure_start_date').alias('rx_fill_date'),
            'site')\
    .withColumn('rx_code_type',F.lit('N'))

pharmacy.write.csv('/data/pedsnet_dcc_v52/acg_input/pharmacy_services',header=True,mode='overwrite')

drug_exposure_input.unpersist()
visits_df.unpersist()
persons_df.unpersist()


                                                                                

DataFrame[person_id: bigint, sex: string, date_of_birth: date, age: int]

In [6]:
#about 27 minutes 12 seconds, minimal resource allocation (~1% of cluster)
print(datetime.now()-start_time)

0:27:09.497394


In [7]:
spark.stop()