In [133]:
# spark.stop()

In [134]:
import os
from pyspark.sql import SparkSession, types as t, functions as F

In [135]:
# Resolve the service-account key path up front and fail fast with a clear
# message when it is missing; otherwise a None value would be handed to the
# Hadoop configuration below and surface as a JVM-side error.
gcp_keyfile = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
if not gcp_keyfile:
    raise EnvironmentError(
        "GOOGLE_APPLICATION_CREDENTIALS is not set; it must point to the "
        "service-account JSON key file used by the GCS connector."
    )

# Build (or reuse) the Spark session with the GCS connector jar on the classpath.
# NOTE(review): the "-latest" jar is unpinned; consider pinning a connector
# version so re-runs are reproducible.
spark = (
    SparkSession
    .builder
    .appName("Testing Transformations")
    .config("spark.jars", "https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-hadoop3-latest.jar") # GCS Connector
    .getOrCreate()
)

# Google Cloud Service Account Credentials
spark._jsc.hadoopConfiguration().set("google.cloud.auth.service.account.json.keyfile", gcp_keyfile)

spark

In [136]:
# Declared schema for the patient dataset; every field is nullable.
# NOTE(review): the parquet schema printed below reports the seriousness*,
# receivedate, receiptdate and safetyreportid columns as integer, while some of
# them are declared as strings here — confirm which typing is intended.
patient_schema = t.StructType(
    [
        t.StructField(field_name, field_type, True)
        for field_name, field_type in (
            ('patientid', t.StringType()),
            ('patientagegroup', t.IntegerType()),
            ('patientonsetage', t.IntegerType()),
            ('patientonsetageunit', t.IntegerType()),
            ('patientsex', t.IntegerType()),
            ('patientweight', t.FloatType()),
            ('serious', t.IntegerType()),
            ('seriousnessdeath', t.IntegerType()),
            ('seriousnesshospitalization', t.StringType()),
            ('seriousnessdisabling', t.StringType()),
            ('seriousnesslifethreatening', t.StringType()),
            ('seriousnessother', t.StringType()),
            ('receivedate', t.StringType()),
            ('receiptdate', t.StringType()),
            ('safetyreportid', t.StringType()),
        )
    ]
)


# Declared schema for the drug dataset; every field is nullable.
# NOTE(review): the drug parquet schema printed further down also contains a
# `drugindication` column that is not listed here — confirm whether it should be.
drug_schema = t.StructType(
    [
        t.StructField(field_name, field_type, True)
        for field_name, field_type in (
            ('patientid', t.StringType()),
            ('medicinalproduct', t.StringType()),
            ('activesubstancename', t.StringType()),
            ('drugadministrationroute', t.StringType()),
            ('drugstartdate', t.StringType()),
            ('drugenddate', t.StringType()),
            ('drugdosagetext', t.StringType()),
            ('drugstructuredosagenumb', t.StringType()),
            ('drugstructuredosageunit', t.StringType()),
            ('drugtreatmentduration', t.StringType()),
            ('drugtreatmentdurationunit', t.StringType()),
            ('drugrecurreadministration', t.IntegerType()),
        )
    ]
)

# Declared schema for the reaction dataset; every field is nullable.
reaction_schema = t.StructType(
    [
        t.StructField(field_name, field_type, True)
        for field_name, field_type in (
            ('patientid', t.StringType()),
            ('reactionmeddrapt', t.StringType()),
            ('reactionoutcome', t.IntegerType()),
        )
    ]
)

In [137]:
# Root GCS path of the parquet data; dataset sub-folders are appended to it below.
bucket = 'gs://zoomcamp-454219-ade-pipeline/data/pq/'

In [138]:
# Load every patient parquet file under the bucket and show the schema Spark read.
patient = spark.read.parquet(f'{bucket}patient/*')
patient.printSchema()

root
 |-- patientid: string (nullable = true)
 |-- patientagegroup: string (nullable = true)
 |-- patientonsetage: string (nullable = true)
 |-- patientonsetageunit: string (nullable = true)
 |-- patientsex: string (nullable = true)
 |-- patientweight: string (nullable = true)
 |-- serious: integer (nullable = true)
 |-- seriousnessdeath: integer (nullable = true)
 |-- seriousnesshospitalization: integer (nullable = true)
 |-- seriousnessdisabling: integer (nullable = true)
 |-- seriousnesslifethreatening: integer (nullable = true)
 |-- seriousnessother: integer (nullable = true)
 |-- receivedate: integer (nullable = true)
 |-- receiptdate: integer (nullable = true)
 |-- safetyreportid: integer (nullable = true)



Data quality checks we can add:  
- Number of failed casts  
- Data usability metrics, such as null-value counts  

In [139]:
# TODO: count failed casts (deferred for now)

In [140]:
# Cast the patient columns that were read as strings to numeric types:
# four integer codes/values and one float weight.
patient = (
    patient
    .withColumn("patientagegroup", F.col("patientagegroup").cast("int"))
    .withColumn("patientonsetage", F.col("patientonsetage").cast("int"))
    .withColumn("patientonsetageunit", F.col("patientonsetageunit").cast("int"))
    .withColumn("patientsex", F.col("patientsex").cast("int"))
    .withColumn("patientweight", F.col("patientweight").cast("float"))
)
patient.printSchema()

root
 |-- patientid: string (nullable = true)
 |-- patientagegroup: integer (nullable = true)
 |-- patientonsetage: integer (nullable = true)
 |-- patientonsetageunit: integer (nullable = true)
 |-- patientsex: integer (nullable = true)
 |-- patientweight: float (nullable = true)
 |-- serious: integer (nullable = true)
 |-- seriousnessdeath: integer (nullable = true)
 |-- seriousnesshospitalization: integer (nullable = true)
 |-- seriousnessdisabling: integer (nullable = true)
 |-- seriousnesslifethreatening: integer (nullable = true)
 |-- seriousnessother: integer (nullable = true)
 |-- receivedate: integer (nullable = true)
 |-- receiptdate: integer (nullable = true)
 |-- safetyreportid: integer (nullable = true)



In [141]:
# Load every drug parquet file under the bucket and show the schema Spark read.
drug = spark.read.parquet(f'{bucket}drug/*')

drug.printSchema()

root
 |-- patientid: string (nullable = true)
 |-- medicinalproduct: string (nullable = true)
 |-- activesubstancename: string (nullable = true)
 |-- drugindication: string (nullable = true)
 |-- drugadministrationroute: string (nullable = true)
 |-- drugstartdate: string (nullable = true)
 |-- drugenddate: string (nullable = true)
 |-- drugdosagetext: string (nullable = true)
 |-- drugstructuredosagenumb: string (nullable = true)
 |-- drugstructuredosageunit: string (nullable = true)
 |-- drugtreatmentduration: string (nullable = true)
 |-- drugtreatmentdurationunit: string (nullable = true)
 |-- drugrecurreadministration: string (nullable = true)



In [142]:
# Cast the string-typed `drugrecurreadministration` column to integer.
drug = drug.withColumn(
    "drugrecurreadministration",
    F.col("drugrecurreadministration").cast("int"),
)
drug.printSchema()

root
 |-- patientid: string (nullable = true)
 |-- medicinalproduct: string (nullable = true)
 |-- activesubstancename: string (nullable = true)
 |-- drugindication: string (nullable = true)
 |-- drugadministrationroute: string (nullable = true)
 |-- drugstartdate: string (nullable = true)
 |-- drugenddate: string (nullable = true)
 |-- drugdosagetext: string (nullable = true)
 |-- drugstructuredosagenumb: string (nullable = true)
 |-- drugstructuredosageunit: string (nullable = true)
 |-- drugtreatmentduration: string (nullable = true)
 |-- drugtreatmentdurationunit: string (nullable = true)
 |-- drugrecurreadministration: integer (nullable = true)



In [143]:
# Load every reaction parquet file under the bucket and show the schema Spark read.
reaction = spark.read.parquet(f'{bucket}reaction/*')

reaction.printSchema()

root
 |-- patientid: string (nullable = true)
 |-- reactionmeddrapt: string (nullable = true)
 |-- reactionoutcome: string (nullable = true)



In [144]:
# Cast the string-typed `reactionoutcome` column to integer.
reaction = reaction.withColumn(
    "reactionoutcome",
    F.col("reactionoutcome").cast("int"),
)
reaction.printSchema()

root
 |-- patientid: string (nullable = true)
 |-- reactionmeddrapt: string (nullable = true)
 |-- reactionoutcome: integer (nullable = true)



### Transformations

In [None]:
# Patient column transformations

# Replace the numeric age-group code with a text label; any other code
# (or a null) becomes null.
age_group_code = F.col("patientagegroup")
age_group_label = (
    F.when(age_group_code == 1, "Neonate")
    .when(age_group_code == 2, "Infant")
    .when(age_group_code == 3, "Child")
    .when(age_group_code == 4, "Adolescent")
    .when(age_group_code == 5, "Adult")
    .when(age_group_code == 6, "Elderly")
    .otherwise(None)
)
patient = patient.withColumn("patientagegroup", age_group_label)

# Normalize the onset age into a single `patientage` column by scaling it
# according to the unit code; unknown unit codes yield null.
# NOTE(review): the factors (x10, x1, /12, /52.143, /365.25, /8766) suggest the
# codes 800-805 mean decades, years, months, weeks, days and hours, giving an
# age in years — confirm against the data dictionary.
onset_age = F.col("patientonsetage")
onset_unit = F.col("patientonsetageunit")
normalized_age = (
    F.when(onset_unit == 800, onset_age * 10)
    .when(onset_unit == 801, onset_age * 1)
    .when(onset_unit == 802, onset_age / 12)
    .when(onset_unit == 803, onset_age / 52.143)
    .when(onset_unit == 804, onset_age / 365.25)
    .when(onset_unit == 805, onset_age / 8766)
    .otherwise(None)
)

# The raw age and unit columns are dropped once the normalized column exists.
patient = (
    patient
    .withColumn("patientage", normalized_age)
    .drop("patientonsetageunit", "patientonsetage")
)



In [None]:
# Count rows per distinct normalized age and display the first rows.
# (The original cell ended in `.()`, which is a syntax error; the displayed
# `patientage | count` table shows the intended call was `.count().show()`.)
patient.groupBy('patientage').count().show()



+-------------------+-----+
|         patientage|count|
+-------------------+-----+
| 62.704996577686515|    1|
|  0.594518919126249|    6|
|  52.42436687200548|    1|
|              16.75|    3|
|  47.75359342915811|    1|
|  78.76522929500342|    1|
|  68.80492813141684|    1|
| 0.4166666666666667|  329|
|              14.75|    2|
|               70.0|20335|
|  72.12594113620807|    1|
|  54.98973305954826|    1|
|  69.99863107460644|    1|
|               67.0|18478|
|                8.0| 2124|
| 59.833333333333336|    6|
|  64.88432580424367|    1|
|0.19178029649233838|   16|
| 63.553730321697465|    1|
|               56.5|    5|
+-------------------+-----+
only showing top 20 rows



                                                                                

In [160]:
# Register the DataFrame as a temp view so the SQL below can resolve `patient`.
# No other cell in the notebook creates this view, so without this line the
# query fails on a fresh kernel.
patient.createOrReplaceTempView("patient")

# List the distinct safetyreportid values.
spark.sql(
"""
SELECT safetyreportid
FROM patient
GROUP BY safetyreportid
""").show()



+--------------+
|safetyreportid|
+--------------+
|          null|
+--------------+



                                                                                

In [None]:
# Round each age to a whole number, cast it to integer, and count rows per value.
(
    patient
    .groupBy(F.round("patientage").cast("int").alias("patientage"))
    .count()
    .show()
)



+----------+------+
|patientage| count|
+----------+------+
|        31|  7316|
|        85|  6666|
|        65| 20134|
|        53| 14620|
|        78| 13706|
|        34|  8380|
|       101|    43|
|        81| 10257|
|        28|  7148|
|        76| 15335|
|        27|  6780|
|        26|  6681|
|        44| 10049|
|       103|    24|
|        12|  3954|
|        91|  2250|
|        22|  5961|
|        93|  1394|
|        47| 11099|
|      null|705732|
+----------+------+
only showing top 20 rows



                                                                                

25/04/26 02:43:19 WARN JavaUtils: Attempt to delete using native Unix OS command failed for path = /tmp/blockmgr-9a92fc5f-cada-44e0-8913-dcec057c82e0. Falling back to Java IO way
java.io.IOException: Failed to delete: /tmp/blockmgr-9a92fc5f-cada-44e0-8913-dcec057c82e0
	at org.apache.spark.network.util.JavaUtils.deleteRecursivelyUsingUnixNative(JavaUtils.java:171)
	at org.apache.spark.network.util.JavaUtils.deleteRecursively(JavaUtils.java:110)
	at org.apache.spark.network.util.JavaUtils.deleteRecursively(JavaUtils.java:91)
	at org.apache.spark.util.Utils$.deleteRecursively(Utils.scala:1206)
	at org.apache.spark.storage.DiskBlockManager.$anonfun$doStop$1(DiskBlockManager.scala:374)
	at org.apache.spark.storage.DiskBlockManager.$anonfun$doStop$1$adapted(DiskBlockManager.scala:370)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofRef.foreach