In [2]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Create the SparkSession used by the cells below, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Pratice - 4")
    .getOrCreate()
)

In [3]:
# Display the SparkSession object to confirm it was created
spark

### Read CSV files in different ways

In [4]:
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, FloatType

In [5]:
# Explicit schema for the patients file; every column is declared nullable.

schema_patients = (
    StructType()
    .add("PatientID", StringType(), True)
    .add("PatientGender", StringType(), True)
    .add("PatientDateOfBirth", TimestampType(), True)
    .add("PatientRace", StringType(), True)
    .add("PatientMaritalStatus", StringType(), True)
    .add("PatientLanguage", StringType(), True)
    .add("PatientPopulationPercentageBelowPoverty", FloatType(), True)
)


In [6]:
# Load PatientCorePopulatedTable.csv (tab-separated, first line is the header)
# using the explicit schema instead of letting Spark guess the types.

df_patients = spark.read.csv(
    "/Users/sateeshreddypatlolla/Downloads/PatientCorePopulatedTable.csv",
    schema=schema_patients,
    sep="\t",
    header=True,
)

In [7]:
# List the column names of df_patients

df_patients.columns

['PatientID',
 'PatientGender',
 'PatientDateOfBirth',
 'PatientRace',
 'PatientMaritalStatus',
 'PatientLanguage',
 'PatientPopulationPercentageBelowPoverty']

In [8]:
# Count the columns (length of the column-name list)

len(df_patients.columns)

7

In [9]:
# Get (column name, data type) pairs for every column

df_patients.dtypes

[('PatientID', 'string'),
 ('PatientGender', 'string'),
 ('PatientDateOfBirth', 'timestamp'),
 ('PatientRace', 'string'),
 ('PatientMaritalStatus', 'string'),
 ('PatientLanguage', 'string'),
 ('PatientPopulationPercentageBelowPoverty', 'float')]

In [10]:
# Print the schema as a tree (column name, type, nullability)

df_patients.printSchema()

root
 |-- PatientID: string (nullable = true)
 |-- PatientGender: string (nullable = true)
 |-- PatientDateOfBirth: timestamp (nullable = true)
 |-- PatientRace: string (nullable = true)
 |-- PatientMaritalStatus: string (nullable = true)
 |-- PatientLanguage: string (nullable = true)
 |-- PatientPopulationPercentageBelowPoverty: float (nullable = true)



In [11]:
# Get the schema as a StructType object (rather than the printed tree)

df_patients.schema

StructType([StructField('PatientID', StringType(), True), StructField('PatientGender', StringType(), True), StructField('PatientDateOfBirth', TimestampType(), True), StructField('PatientRace', StringType(), True), StructField('PatientMaritalStatus', StringType(), True), StructField('PatientLanguage', StringType(), True), StructField('PatientPopulationPercentageBelowPoverty', FloatType(), True)])

`DataFrame.collect()` brings the distributed data back to the driver as local Python objects. Note that this can throw an out-of-memory error when the dataset is too large to fit in the driver's memory, because all the data is transferred from the executors to the driver.

In [12]:
# # Get results (WARNING: in-memory) as list of PySpark Rows

# df_patients.collect()

To avoid an out-of-memory exception, use `DataFrame.take(n)` or `DataFrame.tail(n)`, which return only the first or last `n` rows instead of the whole dataset.

In [13]:
# Bring only the first 10 rows to the driver, as a list of Row objects
df_patients.take(10)

[Row(PatientID='FB2ABB23-C9D0-4D09-8464-49BF0B982F0F', PatientGender='Male', PatientDateOfBirth=datetime.datetime(1947, 12, 28, 2, 45, 40, 547000), PatientRace='Unknown', PatientMaritalStatus='Married', PatientLanguage='Icelandic', PatientPopulationPercentageBelowPoverty=18.079999923706055),
 Row(PatientID='64182B95-EB72-4E2B-BE77-8050B71498CE', PatientGender='Male', PatientDateOfBirth=datetime.datetime(1952, 1, 18, 19, 51, 12, 917000), PatientRace='African American', PatientMaritalStatus='Separated', PatientLanguage='English', PatientPopulationPercentageBelowPoverty=13.029999732971191),
 Row(PatientID='DB22A4D9-7E4D-485C-916A-9CD1386507FB', PatientGender='Female', PatientDateOfBirth=datetime.datetime(1970, 7, 25, 13, 4, 20, 717000), PatientRace='Asian', PatientMaritalStatus='Married', PatientLanguage='English', PatientPopulationPercentageBelowPoverty=6.670000076293945),
 Row(PatientID='6E70D84D-C75F-477C-BC37-9177C3698C66', PatientGender='Male', PatientDateOfBirth=datetime.datetime(19

In [14]:
# # Get results (WARNING: in-memory) as list of Python dicts

# dicts = [row.asDict(recursive=True) for row in df_patients.collect()]
# print(dicts)

In [15]:
# Get the number of rows in df_patients

df_patients.count()

100

In [16]:
from pyspark.sql import functions as F, types as T

## Filtering

In [17]:
# Filter on an equality condition.
# The filtered DataFrame is stored first and displayed in a separate statement:
# show() only prints the rows and returns None, so chaining it into the
# assignment would leave df_asians bound to None instead of a DataFrame.

df_asians = (
    df_patients
    .filter(df_patients.PatientRace == "Asian")
    .select("PatientGender", "PatientRace", "PatientMaritalStatus")
)
df_asians.show(truncate=False)

+-------------+-----------+--------------------+
|PatientGender|PatientRace|PatientMaritalStatus|
+-------------+-----------+--------------------+
|Female       |Asian      |Married             |
|Male         |Asian      |Married             |
|Male         |Asian      |Divorced            |
|Male         |Asian      |Single              |
|Male         |Asian      |Married             |
|Male         |Asian      |Single              |
|Female       |Asian      |Divorced            |
|Female       |Asian      |Married             |
|Female       |Asian      |Unknown             |
|Male         |Asian      |Single              |
|Female       |Asian      |Separated           |
|Female       |Asian      |Unknown             |
|Female       |Asian      |Single              |
|Female       |Asian      |Single              |
|Female       |Asian      |Single              |
|Female       |Asian      |Unknown             |
|Female       |Asian      |Married             |
|Female       |Asian

In [18]:
# Compare against a list of allowed values with isin().
# The filtered DataFrame is stored first and displayed in a separate statement:
# show() only prints the rows and returns None, so chaining it into the
# assignment would leave df_aaw bound to None instead of a DataFrame.

df_aaw = (
    df_patients
    .filter(F.col("PatientRace").isin("Asian", "African American", "White"))
    .select("PatientGender", "PatientRace", "PatientMaritalStatus")
)
df_aaw.show(truncate=False)

+-------------+----------------+--------------------+
|PatientGender|PatientRace     |PatientMaritalStatus|
+-------------+----------------+--------------------+
|Male         |African American|Separated           |
|Female       |Asian           |Married             |
|Male         |White           |Married             |
|Female       |White           |Married             |
|Male         |White           |Married             |
|Male         |Asian           |Married             |
|Female       |White           |Married             |
|Female       |White           |Single              |
|Male         |White           |Married             |
|Male         |Asian           |Divorced            |
|Male         |White           |Single              |
|Male         |White           |Married             |
|Female       |African American|Single              |
|Female       |White           |Married             |
|Male         |African American|Unknown             |
|Male         |White        

In [19]:
# Keep rows whose PatientPopulationPercentageBelowPoverty lies between 10 and 15
# (both bounds included), then show the id, race and poverty percentage.

(
    df_patients
    .filter(F.col("PatientPopulationPercentageBelowPoverty").between(10, 15))
    .select("PatientID", "PatientRace", "PatientPopulationPercentageBelowPoverty")
    .show(truncate=False)
)

+------------------------------------+----------------+---------------------------------------+
|PatientID                           |PatientRace     |PatientPopulationPercentageBelowPoverty|
+------------------------------------+----------------+---------------------------------------+
|64182B95-EB72-4E2B-BE77-8050B71498CE|African American|13.03                                  |
|7FD13988-E58A-4A5C-8680-89AC200950FA|White           |12.41                                  |
|C60FE675-CA52-4C55-A233-F4B27E94987F|Asian           |12.8                                   |
|B7E9FC4C-5182-4A34-954E-CEF5FC07E96D|Unknown         |11.43                                  |
|1A40AF35-C6D4-4D46-B475-A15D84E8A9D5|White           |11.25                                  |
|0A9BA3E4-CF3C-49C4-9774-5EEA2EE7D123|White           |14.28                                  |
|7C788499-7798-484B-A027-9FCDC4C0DADB|White           |11.89                                  |
|6985D824-3269-4D12-A9DD-B932D640E26E|Wh

## GroupBy

In [20]:
# For each PatientRace, count the male and the female patients.
# Each count is a conditional count: only rows whose PatientGender matches are counted.

(
    df_patients
    .groupBy("PatientRace")
    .agg(*[
        F.count(F.when(F.col("PatientGender") == gender, 1)).alias(f"{gender}Count")
        for gender in ("Male", "Female")
    ])
    .orderBy("PatientRace")
    .show()
)

+----------------+---------+-----------+
|     PatientRace|MaleCount|FemaleCount|
+----------------+---------+-----------+
|African American|        9|          6|
|           Asian|        8|         15|
|         Unknown|        4|          9|
|           White|       27|         22|
+----------------+---------+-----------+



In [21]:
# For each PatientRace, count patients per (marital status, gender) combination
# for the Single, Separated and Married statuses.
# The comprehension yields the columns in the order
# Single{Male,Female}, Separated{Male,Female}, Married{Male,Female}.

(
    df_patients
    .groupBy("PatientRace")
    .agg(*[
        F.count(
            F.when(
                (F.col("PatientGender") == gender)
                & (F.col("PatientMaritalStatus") == status),
                1,
            )
        ).alias(f"{status}{gender}Count")
        for status in ("Single", "Separated", "Married")
        for gender in ("Male", "Female")
    ])
    .show()
)

+----------------+---------------+-----------------+------------------+--------------------+----------------+------------------+
|     PatientRace|SingleMaleCount|SingleFemaleCount|SeparatedMaleCount|SeparatedFemaleCount|MarriedMaleCount|MarriedFemaleCount|
+----------------+---------------+-----------------+------------------+--------------------+----------------+------------------+
|African American|              2|                2|                 2|                   0|               4|                 4|
|         Unknown|              2|                1|                 0|                   0|               2|                 7|
|           White|              8|                9|                 0|                   1|              12|                 8|
|           Asian|              3|                5|                 0|                   2|               4|                 4|
+----------------+---------------+-----------------+------------------+--------------------+-----

In [22]:
# Average PatientPopulationPercentageBelowPoverty per PatientRace,
# rounded to 2 decimal places.

(
    df_patients
    .groupBy("PatientRace")
    .agg(
        F.round(F.avg("PatientPopulationPercentageBelowPoverty"), 2)
        .alias("Average_PatientPopulationPercentageBelowPoverty")
    )
    .show()
)

+----------------+-----------------------------------------------+
|     PatientRace|Average_PatientPopulationPercentageBelowPoverty|
+----------------+-----------------------------------------------+
|African American|                                          19.49|
|         Unknown|                                          19.99|
|           White|                                          21.24|
|           Asian|                                          26.84|
+----------------+-----------------------------------------------+



In [23]:
# Average PatientPopulationPercentageBelowPoverty per (PatientRace, PatientGender),
# rounded to 2 decimal places and sorted by race, then gender.

(
    df_patients
    .groupBy("PatientRace", "PatientGender")
    .agg(
        F.round(F.avg("PatientPopulationPercentageBelowPoverty"), 2)
        .alias("Average_PatientPopulationPercentageBelowPoverty")
    )
    .orderBy("PatientRace", "PatientGender")
    .show()
)

+----------------+-------------+-----------------------------------------------+
|     PatientRace|PatientGender|Average_PatientPopulationPercentageBelowPoverty|
+----------------+-------------+-----------------------------------------------+
|African American|       Female|                                          26.59|
|African American|         Male|                                          14.75|
|           Asian|       Female|                                          28.93|
|           Asian|         Male|                                          22.92|
|         Unknown|       Female|                                          24.62|
|         Unknown|         Male|                                           9.58|
|           White|       Female|                                          18.65|
|           White|         Male|                                          23.35|
+----------------+-------------+-----------------------------------------------+



In [24]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType

In [25]:
# Explicit schema for the labs file; every column is declared nullable.

schema_labs = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("PatientID", StringType()),
        ("AdmissionID", IntegerType()),
        ("LabName", StringType()),
        ("LabValue", DoubleType()),
        ("LabUnits", StringType()),
        ("LabDateTime", TimestampType()),
    )
])

In [26]:
# Second way to read a CSV: the generic load() with the format given explicitly.
df_labs = spark.read.load(
    "/Users/sateeshreddypatlolla/Downloads/LabsCorePopulatedTable.csv",
    format="csv",
    schema=schema_labs,
    sep="\t",
    header="true",
)

In [27]:
# Print the labs schema as a tree to confirm the declared types were applied
df_labs.printSchema()

root
 |-- PatientID: string (nullable = true)
 |-- AdmissionID: integer (nullable = true)
 |-- LabName: string (nullable = true)
 |-- LabValue: double (nullable = true)
 |-- LabUnits: string (nullable = true)
 |-- LabDateTime: timestamp (nullable = true)



In [28]:
# Preview the first 10 lab rows without truncating long values
df_labs.show(10, truncate = False)

+------------------------------------+-----------+---------------------------+--------+--------+-----------------------+
|PatientID                           |AdmissionID|LabName                    |LabValue|LabUnits|LabDateTime            |
+------------------------------------+-----------+---------------------------+--------+--------+-----------------------+
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|1          |URINALYSIS: RED BLOOD CELLS|1.8     |rbc/hpf |1992-07-01 01:36:17.91 |
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|1          |METABOLIC: GLUCOSE         |103.3   |mg/dL   |1992-06-30 09:35:52.383|
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|1          |CBC: MCH                   |35.8    |pg      |1992-06-30 03:50:11.777|
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|1          |METABOLIC: CALCIUM         |8.9     |mg/dL   |1992-06-30 12:09:46.107|
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|1          |CBC: RED BLOOD CELL COUNT  |4.8     |m/cumm  |1992-07-01 01:31:08.677|
|1A8791E3-A61C-455A-8DEE-763EB90

## String Filters

#### Contains - col.contains(string)

In [29]:
# Keep rows whose LabName contains the substring "URINALYSIS"; show the first 10.

(
    df_labs
    .filter(F.col("LabName").contains("URINALYSIS"))
    .show(10, truncate=False)
)

+------------------------------------+-----------+-----------------------------+--------+--------+-----------------------+
|PatientID                           |AdmissionID|LabName                      |LabValue|LabUnits|LabDateTime            |
+------------------------------------+-----------+-----------------------------+--------+--------+-----------------------+
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|1          |URINALYSIS: RED BLOOD CELLS  |1.8     |rbc/hpf |1992-07-01 01:36:17.91 |
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|1          |URINALYSIS: PH               |4.9     |no unit |1992-07-01 01:25:54.887|
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|1          |URINALYSIS: SPECIFIC GRAVITY |1.0     |no unit |1992-07-01 02:40:06.887|
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|1          |URINALYSIS: WHITE BLOOD CELLS|2.0     |wbc/hpf |1992-06-30 14:34:52.29 |
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|1          |URINALYSIS: WHITE BLOOD CELLS|2.5     |wbc/hpf |1992-07-01 22:13:00.61 |
|1A8791E3-A61C-4

#### Starts With - col.startswith(string)

In [30]:
# Keep rows whose LabUnits starts with "rbc"; show the first 10.

(
    df_labs
    .filter(F.col("LabUnits").startswith("rbc"))
    .show(10, truncate=False)
)

+------------------------------------+-----------+---------------------------+--------+--------+-----------------------+
|PatientID                           |AdmissionID|LabName                    |LabValue|LabUnits|LabDateTime            |
+------------------------------------+-----------+---------------------------+--------+--------+-----------------------+
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|1          |URINALYSIS: RED BLOOD CELLS|1.8     |rbc/hpf |1992-07-01 01:36:17.91 |
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|1          |URINALYSIS: RED BLOOD CELLS|2.5     |rbc/hpf |1992-07-01 10:38:47.503|
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|2          |URINALYSIS: RED BLOOD CELLS|2.0     |rbc/hpf |2005-07-26 18:14:43.88 |
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|2          |URINALYSIS: RED BLOOD CELLS|2.7     |rbc/hpf |2005-07-31 20:50:27.503|
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|2          |URINALYSIS: RED BLOOD CELLS|3.0     |rbc/hpf |2005-08-01 13:28:14.757|
|1A8791E3-A61C-455A-8DEE-763EB90

#### Ends With - col.endswith(string)

In [31]:
# Keep rows whose LabName ends with "CYTES"; show the first 10.
(
    df_labs
    .filter(F.col("LabName").endswith("CYTES"))
    .show(10, truncate=False)
)

+------------------------------------+-----------+-------------------------+--------+--------+-----------------------+
|PatientID                           |AdmissionID|LabName                  |LabValue|LabUnits|LabDateTime            |
+------------------------------------+-----------+-------------------------+--------+--------+-----------------------+
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|1          |CBC: LYMPHOCYTES         |2.2     |k/cumm  |1992-07-01 02:42:24.957|
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|1          |CBC: ABSOLUTE LYMPHOCYTES|33.3    |%       |1992-06-30 09:39:02.83 |
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|1          |CBC: MONOCYTES           |0.8     |k/cumm  |1992-07-01 03:28:46.073|
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|1          |CBC: LYMPHOCYTES         |1.6     |k/cumm  |1992-07-01 09:43:24.983|
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|1          |CBC: ABSOLUTE LYMPHOCYTES|26.0    |%       |1992-07-01 03:38:07.08 |
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|2         

###  Example wildcard searches using PySpark SQL functions

In [32]:
# SQL LIKE with a trailing wildcard: LabName starts with "METABOLIC".

(
    df_labs
    .filter(F.col("LabName").like("METABOLIC%"))
    .show(10, truncate=False)
)

+------------------------------------+-----------+------------------------+--------+--------+-----------------------+
|PatientID                           |AdmissionID|LabName                 |LabValue|LabUnits|LabDateTime            |
+------------------------------------+-----------+------------------------+--------+--------+-----------------------+
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|1          |METABOLIC: GLUCOSE      |103.3   |mg/dL   |1992-06-30 09:35:52.383|
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|1          |METABOLIC: CALCIUM      |8.9     |mg/dL   |1992-06-30 12:09:46.107|
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|1          |METABOLIC: TOTAL PROTEIN|7.5     |gm/dL   |1992-06-30 17:58:08.513|
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|1          |METABOLIC: CHLORIDE     |111.5   |mmol/L  |1992-06-30 14:03:11.313|
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|1          |METABOLIC: SODIUM       |130.4   |mmol/L  |1992-06-30 14:25:18.26 |
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|1          |METABO

In [33]:
# SQL LIKE with wildcards on both sides: LabUnits contains "mmol".

(
    df_labs
    .filter(F.col("LabUnits").like("%mmol%"))
    .show(10, truncate=False)
)

+------------------------------------+-----------+-------------------------+--------+--------+-----------------------+
|PatientID                           |AdmissionID|LabName                  |LabValue|LabUnits|LabDateTime            |
+------------------------------------+-----------+-------------------------+--------+--------+-----------------------+
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|1          |METABOLIC: CHLORIDE      |111.5   |mmol/L  |1992-06-30 14:03:11.313|
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|1          |METABOLIC: SODIUM        |130.4   |mmol/L  |1992-06-30 14:25:18.26 |
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|1          |METABOLIC: POTASSIUM     |5.5     |mmol/L  |1992-07-01 07:20:54.017|
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|1          |METABOLIC: ANION GAP     |11.2    |mmol/L  |1992-07-01 04:25:57.077|
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|1          |METABOLIC: CHLORIDE      |105.3   |mmol/L  |1992-07-01 20:58:22.167|
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|1         

In [34]:
# SQL LIKE with a leading wildcard: LabName ends with "LYMPHOCYTES".
# The column is referenced with the exact casing declared in schema_labs ("LabName");
# the previous spelling "Labname" only resolved because column-name resolution is
# case-insensitive by default and would fail with spark.sql.caseSensitive=true.

(
    df_labs
    .filter(F.col("LabName").like("%LYMPHOCYTES"))
    .show(10, truncate=False)
)

+------------------------------------+-----------+-------------------------+--------+--------+-----------------------+
|PatientID                           |AdmissionID|LabName                  |LabValue|LabUnits|LabDateTime            |
+------------------------------------+-----------+-------------------------+--------+--------+-----------------------+
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|1          |CBC: LYMPHOCYTES         |2.2     |k/cumm  |1992-07-01 02:42:24.957|
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|1          |CBC: ABSOLUTE LYMPHOCYTES|33.3    |%       |1992-06-30 09:39:02.83 |
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|1          |CBC: LYMPHOCYTES         |1.6     |k/cumm  |1992-07-01 09:43:24.983|
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|1          |CBC: ABSOLUTE LYMPHOCYTES|26.0    |%       |1992-07-01 03:38:07.08 |
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|2          |CBC: ABSOLUTE LYMPHOCYTES|18.2    |%       |2005-07-27 12:03:25.64 |
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|2         

In [35]:
# Add NumericLabUnits: the first number (integer or decimal) found in LabUnits.
# Rows whose LabUnits has no digits get an empty string in the new column,
# which is what the preview below shows for units such as "rbc/hpf" or "mg/dL".

(
    df_labs
    .withColumn(
        "NumericLabUnits",
        F.regexp_extract(F.col("LabUnits"), r"(\d+\.?\d*)", 1),
    )
    .show(10, truncate=False)
)

+------------------------------------+-----------+---------------------------+--------+--------+-----------------------+---------------+
|PatientID                           |AdmissionID|LabName                    |LabValue|LabUnits|LabDateTime            |NumericLabUnits|
+------------------------------------+-----------+---------------------------+--------+--------+-----------------------+---------------+
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|1          |URINALYSIS: RED BLOOD CELLS|1.8     |rbc/hpf |1992-07-01 01:36:17.91 |               |
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|1          |METABOLIC: GLUCOSE         |103.3   |mg/dL   |1992-06-30 09:35:52.383|               |
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|1          |CBC: MCH                   |35.8    |pg      |1992-06-30 03:50:11.777|               |
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|1          |METABOLIC: CALCIUM         |8.9     |mg/dL   |1992-06-30 12:09:46.107|               |
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|1  

In [36]:
# Regex filter: keep rows whose LabName matches the alternation METABOLIC|CBC.

(
    df_labs
    .filter(F.col("LabName").rlike(r"METABOLIC|CBC"))
    .show(10, truncate=False)
)

+------------------------------------+-----------+-------------------------+--------+--------+-----------------------+
|PatientID                           |AdmissionID|LabName                  |LabValue|LabUnits|LabDateTime            |
+------------------------------------+-----------+-------------------------+--------+--------+-----------------------+
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|1          |METABOLIC: GLUCOSE       |103.3   |mg/dL   |1992-06-30 09:35:52.383|
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|1          |CBC: MCH                 |35.8    |pg      |1992-06-30 03:50:11.777|
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|1          |METABOLIC: CALCIUM       |8.9     |mg/dL   |1992-06-30 12:09:46.107|
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|1          |CBC: RED BLOOD CELL COUNT|4.8     |m/cumm  |1992-07-01 01:31:08.677|
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|1          |METABOLIC: TOTAL PROTEIN |7.5     |gm/dL   |1992-06-30 17:58:08.513|
|1A8791E3-A61C-455A-8DEE-763EB90C9B2C|1         

In [37]:
# Regex filter: keep rows whose LabUnits contains at least one digit.
# For this dataset the result is empty (see the output below).

(
    df_labs
    .filter(F.col("LabUnits").rlike(r"\d"))
    .show(10, truncate=False)
)

+---------+-----------+-------+--------+--------+-----------+
|PatientID|AdmissionID|LabName|LabValue|LabUnits|LabDateTime|
+---------+-----------+-------+--------+--------+-----------+
+---------+-----------+-------+--------+--------+-----------+



In [38]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [39]:
# Explicit schema for the admissions-diagnoses file; every column is declared nullable.

schema_diagnoses = (
    StructType()
    .add("PatientID", StringType(), True)
    .add("AdmissionID", IntegerType(), True)
    .add("PrimaryDiagnosisCode", StringType(), True)
    .add("PrimaryDiagnosisDescription", StringType(), True)
)


In [40]:
# Third way to read a CSV: format() + option() + load().
# The schema must be supplied through DataFrameReader.schema(). The previous
# .option("schema", "schema_diagnoses") only set a string option named "schema",
# so the StructType was never applied and the declared column types
# (e.g. AdmissionID as integer) were ignored.

df_diagnoses = (
    spark.read
    .format("csv")
    .schema(schema_diagnoses)
    .option("sep", "\t")
    .option("header", "true")
    .load("/Users/sateeshreddypatlolla/Downloads/AdmissionsDiagnosesCorePopulatedTable.csv")
)

In [41]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType

In [42]:
# Explicit schema for the admissions file; every column is declared nullable.
schema_admissions = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("PatientID", StringType()),
        ("AdmissionID", IntegerType()),
        ("AdmissionStartDate", TimestampType()),
        ("AdmissionEndDate", TimestampType()),
    )
])

In [43]:
# Fourth way to read a CSV: format("csv") with the remaining settings passed to load().
df_admissions = (
    spark.read
    .format("csv")
    .load(
        "/Users/sateeshreddypatlolla/Downloads/AdmissionsCorePopulatedTable.csv",
        schema=schema_admissions,
        sep="\t",
        header="true",
    )
)

#### What does df_patients contain?

In [44]:
# Column names of df_patients
df_patients.columns

['PatientID',
 'PatientGender',
 'PatientDateOfBirth',
 'PatientRace',
 'PatientMaritalStatus',
 'PatientLanguage',
 'PatientPopulationPercentageBelowPoverty']

In [45]:
# (column name, data type) pairs of df_patients
df_patients.dtypes

[('PatientID', 'string'),
 ('PatientGender', 'string'),
 ('PatientDateOfBirth', 'timestamp'),
 ('PatientRace', 'string'),
 ('PatientMaritalStatus', 'string'),
 ('PatientLanguage', 'string'),
 ('PatientPopulationPercentageBelowPoverty', 'float')]

In [46]:
# Full schema of df_patients as a StructType object
df_patients.schema

StructType([StructField('PatientID', StringType(), True), StructField('PatientGender', StringType(), True), StructField('PatientDateOfBirth', TimestampType(), True), StructField('PatientRace', StringType(), True), StructField('PatientMaritalStatus', StringType(), True), StructField('PatientLanguage', StringType(), True), StructField('PatientPopulationPercentageBelowPoverty', FloatType(), True)])