In [1]:
import os
import sys

In [2]:
# Point this kernel at the local Spark / Java installation before pyspark is imported.
# Raw strings keep the Windows backslashes literal; the previous plain literals only
# worked because "\S", "\P", "\J" etc. are invalid escape sequences that Python passes
# through unchanged (and warns about in newer versions).
# NOTE(review): PYSPARK_PYTHON is set to a directory here, not an interpreter
# executable - confirm this is what the local setup expects.
os.environ["PYSPARK_PYTHON"] = r"C:\Spark\spark-3.3.2-bin-hadoop2\python"
os.environ["JAVA_HOME"] = r"C:\Program Files\Java\jdk-18.0.2.1"
os.environ["SPARK_HOME"] = r"C:\Spark\spark-3.3.2-bin-hadoop2"
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"
# Put the py4j and pyspark archives from the Spark distribution at the front of sys.path.
sys.path.insert(0, os.environ["PYLIB"] + "/py4j-0.10.9.5-src.zip")
sys.path.insert(0, os.environ["PYLIB"] + "/pyspark.zip")

In [3]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook (or reuse one that already exists).
# Ending the cell on the session object displays it.
spark = (
    SparkSession.builder
    .appName("r_eda")
    .getOrCreate()
)
spark

In [None]:
# Load the demo dataset; the first CSV line supplies the column names.
data = (
    spark.read
    .option("header", "true")
    .csv("../data/r_hosp_demo_dataset.csv")
)
data.printSchema()

In [None]:
# Total number of rows in the loaded dataset.
data.count()

In [None]:
# Summary statistics for the columns, collected into pandas for display.
data.describe().toPandas()

- All rows have a subject, gender, age, admission id (hadm_id) and charlson_comorbidity_index.
- The min and max age values look good.

To Do
- Check for valid gender values and remove invalid genders (those other than 'M' and 'F').
- Remove columns with more than 50% missing values.
- Impute missing values with the average of the previous and next readings for the same subject.
- Remove columns with more than 25% missing values after imputation.
- Remove rows which have data in less than 25% of the columns.

#### Check for valid gender values and remove invalid genders (those other than 'M' and 'F').

In [None]:
# Count the rows for each distinct gender value so unexpected values stand out.
gender_counts = data.groupby("gender").count()
gender_counts.show()

Gender column values look good.

#### Remove columns with more than 50% missing values

In [None]:
# Collect the summary statistics into pandas; the 'count' statistic is what the
# missing-value percentages are derived from.
dt = data.describe().toPandas()
dt

In [None]:
# Identify the columns with < 50% missing values.
# Transposing the describe() summary turns every dataset column into a row; the first
# transposed row holds the statistic names, so it is promoted to the header and dropped.
dt_t = dt.T
dt_t.columns = dt_t.iloc[0]
dt_t = dt_t.drop(dt_t.index[0])

total_rows = data.count()
dt_t["count"] = dt_t["count"].astype(int)
dt_t["missing_percentage"] = 100 - (dt_t["count"] / total_rows) * 100
print(dt_t)

# Keep only the columns whose missing share is below the 50% cut-off.
below_cutoff = dt_t["missing_percentage"] < 50
dt_t = dt_t[below_cutoff]
print(dt_t)
print(dt_t.shape)

After removing all columns with a missing-value percentage >= 50, 45 fields remain; they are listed below.

In [None]:
# Shape of the filtered summary table and the names of the columns it retains.
print(dt_t.shape)
dt_t.index

In [None]:
# Restrict the Spark DataFrame to the columns that survived the missing-value filter.
kept_columns = dt_t.index.values.tolist()
data = data.select(kept_columns)

In [None]:
# Inspect the schema and summary statistics of the retained columns.
# printSchema() writes the schema to stdout itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" to the output.
data.printSchema()
data.describe().toPandas()

#### type conversion for the columns data

Except for gender, all columns contain numeric values, so every other column is cast to double and rounded to 2 decimal places.

In [None]:
# Cast every column except gender to double and round it to 2 decimal places.
import pyspark.sql.functions as F

# Every column name other than gender; gender is appended unchanged as the last column.
cols = [c for c in data.columns if c != "gender"]
data2 = data.select(*(F.round(F.col(c).cast("double"), 2).alias(c) for c in cols), "gender")
print(data2.columns)
# printSchema() prints by itself and returns None; print() around it only added "None".
data2.printSchema()
data2.describe().toPandas()

#### Impute missing values - Calculating the average of the values between the prev & next admission reading for the same subject.

In [None]:
# Sample subject used to verify the imputation by eye (state before imputing).
sample_columns = ["subject_id", "age", "basophils_abs", "db_wbc", "platelet", "ast"]
sample_rows = data2.filter(data2.subject_id == 10040025).select(*sample_columns)
sample_rows.orderBy("age").show()

In [None]:
# Before impute
# Snapshot the data as CSV; toPandas() collects the whole DataFrame to the driver first.
data2.toPandas().to_csv("../data/EDA/before_imputation.csv")

In [None]:
# Impute missing values per subject with window functions.
# https://sqlrelease.com/get-the-first-non-null-value-per-group-spark-dataframe
from pyspark.sql.window import Window

# Rows after the current one when the subject's rows are sorted by descending age,
# i.e. the readings taken at a lower age, nearest first.
subject_win_prev = (
    Window.partitionBy("subject_id")
    .orderBy(F.desc("age"))
    .rowsBetween(Window.currentRow + 1, Window.unboundedFollowing)
)
# Rows after the current one when sorted by ascending age, i.e. the later readings.
subject_win_next = (
    Window.partitionBy("subject_id")
    .orderBy("age")
    .rowsBetween(Window.currentRow + 1, Window.unboundedFollowing)
)

# Identifier / label columns that are never imputed.
not_imputed = ("subject_id", "age", "gender", "hadm_id", "charlson_comorbidity_index")

for c in data2.columns:
    if c in not_imputed:
        continue

    prev_name = 'temp_' + c + 'prev'
    next_name = 'temp_' + c + 'next'
    current, prev_val, next_val = F.col(c), F.col(prev_name), F.col(next_name)

    # Existing values are kept. A missing value takes the only available neighbour,
    # or the mean of both neighbours when both exist (it stays null when neither does).
    filled = (
        F.when(current.isNotNull(), current)
        .when(current.isNull() & prev_val.isNull(), next_val)
        .when(current.isNull() & next_val.isNull(), prev_val)
        .otherwise((prev_val + next_val) / 2)
    )

    data2 = (
        data2
        .withColumn(prev_name, F.first(c, ignorenulls=True).over(subject_win_prev))
        .withColumn(next_name, F.first(c, ignorenulls=True).over(subject_win_next))
        .withColumn(c, filled)
        .drop(prev_name, next_name)
    )

In [None]:
# Same sample subject as before, now showing the values after imputation.
sample_columns = ["subject_id", "age", "basophils_abs", "db_wbc", "platelet", "ast"]
sample_rows = data2.filter(data2.subject_id == 10040025).select(*sample_columns)
sample_rows.orderBy("age").show()

In [None]:
# Snapshot the imputed data as CSV; toPandas() collects the whole DataFrame to the driver first.
data2.toPandas().to_csv("../data/EDA/after_imputation.csv")

#### Remove columns with more than 25% missing values after imputation.

In [None]:
# Identify the columns with < 25% missing values after imputation.
dt = data2.describe().toPandas()
print(dt)
# Transpose so every dataset column becomes a row; the first transposed row holds the
# statistic names, so it is promoted to the header and then dropped.
dt_t = dt.T
dt_t.columns = dt_t.iloc[0]
dt_t.drop(dt_t.index[0], inplace=True)
dt_t["count"] = dt_t['count'].astype(int)
# The counts come from data2, so they are normalised by data2's own row count
# (previously data.count(), which tied this cell to a different DataFrame).
dt_t['missing_percentage'] = 100 - (dt_t["count"] / data2.count())*100
print(dt_t)
print(dt_t.shape)
# Keep only the columns below the 25% cut-off.
dt_t = dt_t[dt_t["missing_percentage"] < 25]
print(dt_t)
print(dt_t.shape)
dt_t.index

In [None]:
# Shape of the filtered summary table and the names of the columns it retains.
print(dt_t.shape)
dt_t.index

In [None]:
# Restrict data2 to the columns that passed the filter, printing the column list
# and its length before and after so any removal is visible.
kept_columns = dt_t.index.values.tolist()

print(data2.columns)
print(len(data2.columns))

data2 = data2.select(kept_columns)

print(data2.columns)
print(len(data2.columns))

No columns removed.

#### Retain rows which have missing values in less than 25% of the columns.

In [None]:
# Calculate, for every row, the percentage of feature columns that are missing.
from operator import add
from functools import reduce

# Identifier / label columns that do not take part in the missing-value tally.
excluded_cols = ("subject_id", "age", "gender", "hadm_id", "charlson_comorbidity_index")

# Fix the feature list BEFORE any helper column is added. Names containing 'missing_'
# are skipped so that re-running the cell does not treat earlier helper columns
# (including 'missing_percentage') as features.
feature_cols = [c for c in data2.columns if ('missing_' not in c) and (c not in excluded_cols)]

# One 0/1 indicator column per feature: 1 when the feature is null in that row.
for c in feature_cols:
    data2 = data2.withColumn('missing_' + c, F.when(F.col(c).isNull(), 1).otherwise(0))

# Divide by the number of features. The earlier denominator, len(data2.columns) - 5,
# was evaluated after the indicator columns had been added, so it was twice the
# feature count and every percentage came out at half its true value.
# NOTE(review): downstream counts and conclusions based on the 25% threshold should be
# re-checked against the corrected percentages.
flag_cols = ['missing_' + c for c in feature_cols]
data2 = data2.withColumn('missing_percentage', (reduce(add, [F.col(x) for x in flag_cols]) / len(flag_cols)) * 100)
data2.toPandas()

In [None]:
# Number of rows for each distinct missing_percentage value.
data2.groupBy("missing_percentage").count().toPandas()

In [None]:
# Keep only the rows whose missing_percentage is below 25.
data3 = data2.filter("missing_percentage < 25")

In [None]:
# Row counts before (data2) and after (data3) the row filter.
print(data2.count())
print(data3.count())


None of the rows have a missing % >= 25.

In [None]:
# Remove the helper columns (every column whose name contains "missing_") and
# show the column count before and after.
print(len(data3.columns))
condition = lambda x: ("missing_" in x)
helper_columns = [name for name in data3.columns if condition(name)]
data3 = data3.drop(*helper_columns)
len(data3.columns)

In [None]:
# Persist the cleaned data as CSV via pandas. to_csv() is called with its defaults, so
# the pandas index is written as an extra, unnamed first column.
data3.toPandas().to_csv("../data/EDA/after_eda.csv")

In [None]:
# Read the cleaned CSV back into Spark, taking column names from the header line.
data4 = (
    spark.read
    .option("header", "true")
    .csv("../data/EDA/after_eda.csv")
)
data4.printSchema()

In [None]:
# Keep each subject's first admission readings (the row with the lowest age) and drop
# the co-morbidity index, which is taken from the last admission separately.
from pyspark.sql.window import Window
import pyspark.sql.functions as F

# data4 is read from CSV without a schema or schema inference, so "age" is a string
# column. Cast it to double so rows are ranked numerically; ordering the raw strings is
# lexicographic (e.g. "100.0" sorts before "18.5") and can pick the wrong admission.
subject_win = Window.partitionBy("subject_id").orderBy(F.col("age").cast("double"))
base_data = data4.withColumn("row",F.row_number().over(subject_win)) \
  .filter(F.col("row") == 1).drop("row", "charlson_comorbidity_index")

In [None]:
# Number of rows kept: one per subject_id, since only row number 1 of each partition is retained.
base_data.count()

In [None]:
# For each subject keep the charlson_comorbidity_index of the row with the highest age
# (the last admission).
# data4 is read from CSV without a schema or schema inference, so "age" is a string
# column; it is cast to double so the descending order is numeric rather than
# lexicographic (as strings, "9.5" would rank above "85.0").
subject_win_predict = Window.partitionBy("subject_id").orderBy(F.col("age").cast("double").desc())
base_data_predict = data4.withColumn("row",F.row_number().over(subject_win_predict)) \
  .filter(F.col("row") == 1).select("subject_id", "charlson_comorbidity_index")
base_data_predict.count()

In [None]:
# Column names of the feature frame and of the prediction-target frame.
print(base_data.columns)
print(base_data_predict.columns)

In [None]:
# Display the first-admission rows (collected to the driver as a pandas DataFrame).
base_data.toPandas()

In [None]:
# Drop the "_c0" column.
# NOTE(review): presumably the unnamed pandas index column written to after_eda.csv - confirm.
base_data = base_data.drop("_c0")

In [None]:
# Display base_data again after the column drop.
base_data.toPandas()

In [None]:
# Display the per-subject charlson_comorbidity_index values.
base_data_predict.toPandas()

In [None]:
# Write the feature frame and the prediction target to separate CSV files via pandas.
# to_csv() is called with its defaults, so each file also gets the pandas index as an
# unnamed first column.
base_data.toPandas().to_csv("../data/EDA/clustering_data.csv")
base_data_predict.toPandas().to_csv("../data/EDA/prediction_value.csv")

In [4]:
# Reload the clustering data and cast every column except gender to double, rounded to
# 2 decimal places (the CSV is read without a schema, so the columns arrive as strings).
# NOTE: the stated goal of this section is to fill the missing values with the standard
# normal values in medical terms; this cell only reloads and casts the data.
import pyspark.sql.functions as F
cluster_data = spark.read.option("header","true").csv("../data/EDA/clustering_data.csv")
# Drop the "_c0" column before casting.
cluster_data = cluster_data.drop("_c0")

# Every column name other than gender; gender is appended unchanged as the last column.
cols = [c for c in cluster_data.columns if c != "gender"]
cluster_data = cluster_data.select(*(F.round(F.col(c).cast("double"), 2).alias(c) for c in cols), "gender")
# printSchema() prints by itself and returns None; print() around it only added "None".
cluster_data.printSchema()

root
 |-- subject_id: double (nullable = true)
 |-- hadm_id: double (nullable = true)
 |-- age: double (nullable = true)
 |-- db_wbc: double (nullable = true)
 |-- basophils_abs: double (nullable = true)
 |-- eosinophils_abs: double (nullable = true)
 |-- lymphocytes_abs: double (nullable = true)
 |-- monocytes_abs: double (nullable = true)
 |-- neutrophils_abs: double (nullable = true)
 |-- basophils: double (nullable = true)
 |-- eosinophils: double (nullable = true)
 |-- lymphocytes: double (nullable = true)
 |-- monocytes: double (nullable = true)
 |-- neutrophils: double (nullable = true)
 |-- aniongap: double (nullable = true)
 |-- bicarbonate: double (nullable = true)
 |-- bun: double (nullable = true)
 |-- calcium: double (nullable = true)
 |-- chloride: double (nullable = true)
 |-- creatinine: double (nullable = true)
 |-- glucose: double (nullable = true)
 |-- sodium: double (nullable = true)
 |-- potassium: double (nullable = true)
 |-- inr: double (nullable = true)
 |-- pt

In [5]:
# Summary statistics of the clustering data, printed as a text table.
cluster_data.describe().show()

+-------+-----------------+-----------------+------------------+-----------------+-------------------+-------------------+-----------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+-----------------+-----------------+-----------------+------------------+-----------------+------------------+------------------+-----------------+------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+-----------------+------------------+-------------------+-------------------+------------------+-----------------+------------------+------------------+------------------+------+
|summary|       subject_id|          hadm_id|               age|           db_wbc|      basophils_abs|    eosinophils_abs|  lymphocytes_abs|     monocytes_abs|   neutrophils_abs|

In [16]:
# Number of columns in the clustering data.
len(cluster_data.columns)

44

In [15]:
# Replace nulls with 0 in every listed measurement column and show the summary statistics.
# A single fillna() with a subset replaces 39 chained single-column calls. The result
# is only displayed: cluster_data itself is not reassigned.
# NOTE(review): 0 is the fill value for every column here - confirm this is intended
# rather than per-measurement reference values.
zero_fill_cols = [
    "db_wbc", "basophils_abs", "eosinophils_abs", "lymphocytes_abs",
    "monocytes_abs", "neutrophils_abs", "basophils", "eosinophils",
    "lymphocytes", "monocytes", "neutrophils", "aniongap", "bicarbonate",
    "bun", "calcium", "chloride", "creatinine", "glucose", "sodium",
    "potassium", "inr", "pt", "ptt", "hematocrit", "hemoglobin", "mch",
    "mchc", "mcv", "platelet", "rdw", "wbc", "scr_min", "ckd", "mdrd_est",
    "scr_baseline", "alt", "alp", "ast", "bilirubin_total",
]
cluster_data.fillna(0, subset=zero_fill_cols) \
        .describe().show()

+-------+-----------------+-----------------+------------------+-----------------+--------------------+-------------------+-----------------+------------------+-----------------+------------------+------------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+------------------+-----------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+-----------------+------------------+-------------------+-------------------+------------------+-----------------+-----------------+------------------+------------------+------+
|summary|       subject_id|          hadm_id|               age|           db_wbc|       basophils_abs|    eosinophils_abs|  lymphocytes_abs|     monocytes_abs|  neutrophils_abs|  