In [1]:
import os
import sys

# Point PySpark at the local Spark/Java installation before pyspark is imported.
# NOTE(review): these are machine-specific absolute Windows paths; adjust them when
# running the notebook on another machine.
# NOTE(review): PYSPARK_PYTHON is normally the path of a Python executable, but this value
# looks like the Spark distribution's `python` folder - confirm it resolves to an interpreter.
os.environ["PYSPARK_PYTHON"] = "C:\\Spark\\spark-3.3.2-bin-hadoop2\\python"
#os.environ["PYSPARK_PYTHON"]="C:\\Users\\snksh\\OneDrive\\Desktop\\Research\\sw\\python\\python"

# Raw strings keep the backslashes literal. Sequences such as "\P", "\J" and "\S" are not
# valid escapes, so the non-raw form only worked by accident and triggers a warning on
# newer Python versions. The resulting values are unchanged.
os.environ["JAVA_HOME"] = r"C:\Program Files\Java\jdk-18.0.2.1"
os.environ["SPARK_HOME"] = r"C:\Spark\spark-3.3.2-bin-hadoop2"
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"

# Make the py4j and pyspark archives bundled with Spark importable; they are inserted at
# the front of sys.path so they take precedence over other entries.
sys.path.insert(0, os.environ["PYLIB"] + "/py4j-0.10.9.5-src.zip")
sys.path.insert(0, os.environ["PYLIB"] + "/pyspark.zip")

# Memory given to the Spark driver for this session.
MAX_MEMORY = "14G"

# Imported only after sys.path has been extended above.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("data_model")
    .config("spark.driver.memory", MAX_MEMORY)
    .getOrCreate()
)
spark

In [2]:
import pandas as pd

# Do not cap the number of columns shown when a pandas frame is displayed.
pd.options.display.max_columns = None

In [3]:
# Read the modelling dataset with a header row and inferred column types, then discard
# the "_c0" column (presumably an unnamed index column from an earlier export - TODO confirm).
cluster_data = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("../data/EDA/final_data_for_modelling.csv")
    .drop("_c0")
)

# Show the inferred schema, then the row count as the cell's output.
cluster_data.printSchema()
cluster_data.count()

root
 |-- age_imputed: double (nullable = true)
 |-- bg2_lactate_imputed: double (nullable = true)
 |-- basophils_imputed: double (nullable = true)
 |-- eosinophils_imputed: double (nullable = true)
 |-- lymphocytes_imputed: double (nullable = true)
 |-- monocytes_imputed: double (nullable = true)
 |-- neutrophils_imputed: double (nullable = true)
 |-- albumin_imputed: double (nullable = true)
 |-- aniongap_imputed: double (nullable = true)
 |-- bicarbonate_imputed: double (nullable = true)
 |-- bun_imputed: double (nullable = true)
 |-- calcium_imputed: double (nullable = true)
 |-- chloride_imputed: double (nullable = true)
 |-- creatinine_imputed: double (nullable = true)
 |-- glucose_imputed: double (nullable = true)
 |-- sodium_imputed: double (nullable = true)
 |-- potassium_imputed: double (nullable = true)
 |-- inr_imputed: double (nullable = true)
 |-- pt_imputed: double (nullable = true)
 |-- ptt_imputed: double (nullable = true)
 |-- hematocrit_imputed: double (nullable = tr

63887

In [None]:
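# Disabled (kept for reference): convert all values to double. Note that `model_data` is not defined at this point in the notebook.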
#import pyspark.sql.functions as F
#numeric_cols = model_data.columns
#numeric_cols.remove("gender")
#model_data = model_data.select(*(F.round(F.col(c).cast("double"), 2).alias(c) for c in numeric_cols), "gender")
#print(model_data.printSchema())

#### Data model building

#### Data pre-processing

- datatype conversion
- outlier handling

In [4]:
# Model inputs: every column of the dataset except the regression label and the raw
# `gender` column, plus "gender_imputed" (the name the one-hot encoder stage writes to).
feature_cols = cluster_data.columns
for excluded in ("charlson_comorbidity_index_imputed", "gender"):
    feature_cols.remove(excluded)
feature_cols.append("gender_imputed")

# Columns that go through StringIndexer + OneHotEncoder before assembly.
categorical_cols = ["gender"]

print(feature_cols)
print(categorical_cols)

['age_imputed', 'bg2_lactate_imputed', 'basophils_imputed', 'eosinophils_imputed', 'lymphocytes_imputed', 'monocytes_imputed', 'neutrophils_imputed', 'albumin_imputed', 'aniongap_imputed', 'bicarbonate_imputed', 'bun_imputed', 'calcium_imputed', 'chloride_imputed', 'creatinine_imputed', 'glucose_imputed', 'sodium_imputed', 'potassium_imputed', 'inr_imputed', 'pt_imputed', 'ptt_imputed', 'hematocrit_imputed', 'hemoglobin_imputed', 'mch_imputed', 'mchc_imputed', 'mcv_imputed', 'platelet_imputed', 'rbc_imputed', 'rdw_imputed', 'wbc_imputed', 'scr_min_imputed', 'ckd_imputed', 'mdrd_est_imputed', 'scr_baseline_imputed', 'alt_imputed', 'alp_imputed', 'ast_imputed', 'bilirubin_total_imputed', 'gender_imputed']
['gender']


In [5]:
# Encoding stages for the categorical features: index each column, then one-hot encode
# the index. Both stages are configured with handleInvalid="keep".
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

indexed_cols = [name + '_idx' for name in categorical_cols]
encoded_cols = [name + '_imputed' for name in categorical_cols]

si = StringIndexer(
    inputCols=categorical_cols,
    outputCols=indexed_cols,
    handleInvalid="keep",
)
ohe = OneHotEncoder(
    inputCols=indexed_cols,
    outputCols=encoded_cols,
    handleInvalid="keep",
)

In [6]:
from pyspark.ml.feature import StandardScaler

# Pack all feature columns into a single vector column named "features" ...
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol='features',
)

# ... and scale that vector into "standardized", the column the regressions read from.
scaler = StandardScaler(
    inputCol='features',
    outputCol='standardized',
)

In [7]:
from pyspark.ml import Pipeline

# Preprocessing stages in execution order: index -> one-hot encode -> assemble -> scale.
preprocessing_stages = [si, ohe, assembler, scaler]
pipeline = Pipeline(stages=preprocessing_stages)

# Fit the preprocessing pipeline on the full dataset; `data_model` is the fitted
# transformer applied to the train/test splits further down.
data_model = pipeline.fit(cluster_data)

In [8]:
# Persist the fitted preprocessing pipeline, replacing any previous save at this path.
data_model_writer = data_model.write().overwrite()
data_model_writer.save("../model/data_model.pkl")

#### Prediction model

In [None]:
# Dataset used for the prediction model. `cluster_data` is the frame loaded at the top of
# the notebook and the one the preprocessing pipeline was fitted on; the name `data` is
# never defined in this notebook, so referring to it fails on a fresh kernel.
model_data = cluster_data

In [None]:
# Split the data into train (70%) and test (30%) sets; the test set is used later to
# compare the models.
# A fixed seed keeps the split, and every metric computed from it, reproducible across
# kernel restarts.
SPLIT_SEED = 42
train, test = model_data.randomSplit([0.7, 0.3], seed=SPLIT_SEED)
print("Size of training data {0}".format(train.count()))
print("Size of test data {0}".format(test.count()))

In [None]:
# Baseline model: linear regression on the scaled feature vector.
from pyspark.ml.regression import LinearRegression

lr = LinearRegression(
    featuresCol="standardized",
    labelCol="charlson_comorbidity_index_imputed",
)

# Run the fitted preprocessing pipeline over the training split, then fit on the result.
model_input = data_model.transform(train)
model = lr.fit(model_input)

# Persist the fitted regression model, replacing any previous save at this path.
# NOTE(review): the file name mentions gender although the label is the Charlson
# comorbidity index, and the path is relative to the working directory rather than
# "../model/" - confirm whether that is intended.
model_writer = model.write().overwrite()
model_writer.save("gender_predict_model.pkl")

#### Model evaluation

In [None]:
# Score the preprocessed training rows with the fitted model; the transform adds the
# "prediction" column that later cells select.
predictionoutput = model.transform(model_input)
# Display the scored rows as a pandas frame.
# NOTE(review): toPandas() collects the whole frame to the driver - consider limiting the
# rows first if this is only meant as a preview.
predictionoutput.toPandas()

In [None]:
# Show the actual label next to the fitted value for the training rows.
# The label column in this dataset is "charlson_comorbidity_index_imputed"; there is no
# column named "charlson_comorbidity_index", so selecting that name fails.
predictionoutput.select("charlson_comorbidity_index_imputed", "prediction").toPandas()

In [None]:
# Evaluate the baseline model on the held-out test split.

# Apply the same fitted preprocessing pipeline that was used for the training rows.
model_input_test = data_model.transform(test)

# `evaluate` returns a summary object; its `predictions` frame holds the scored test rows.
pred_results = model.evaluate(model_input_test)

# Show the actual label next to the prediction. The label column is
# "charlson_comorbidity_index_imputed"; the un-suffixed name does not exist in this dataset.
pred_results.predictions.select("charlson_comorbidity_index_imputed", "prediction").toPandas()

#### Model from hyperparameter tuning


In [None]:
# We use a ParamGridBuilder to construct a grid of parameters to search over.
# TrainValidationSplit will try all combinations of values and determine the best model
# using the evaluator.
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import RegressionEvaluator

lr2 = LinearRegression(featuresCol="standardized", labelCol="charlson_comorbidity_index_imputed")

# Fixed parameters go into every candidate via baseOn; the addGrid calls span
# 2 x 2 x 3 = 12 combinations of regParam, fitIntercept and elasticNetParam.
paramGrid = (
    ParamGridBuilder()
    .baseOn({
        lr2.labelCol: 'charlson_comorbidity_index_imputed',
        lr2.predictionCol: 'prediction',
    })
    .addGrid(lr2.regParam, [0.1, 0.01])
    .addGrid(lr2.fitIntercept, [False, True])
    .addGrid(lr2.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)

# In this case the estimator is simply the linear regression.
# A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
eva = RegressionEvaluator(labelCol="charlson_comorbidity_index_imputed")

# A fixed seed makes the internal train/validation split, and therefore the validation
# metrics and the selected best model, reproducible across runs.
TUNING_SEED = 42
tvs = TrainValidationSplit(estimator=lr2,
                           estimatorParamMaps=paramGrid,
                           evaluator=eva,
                           # 80% of the data will be used for training, 20% for validation.
                           trainRatio=0.8,
                           seed=TUNING_SEED)

# Run TrainValidationSplit, and choose the best set of parameters.
# Note: `model_input` and `model` are reassigned here; from this cell on, `model` is the
# fitted TrainValidationSplit result rather than the baseline regression.
model_input = data_model.transform(train)
model = tvs.fit(model_input)

In [None]:
# Score the training rows with the tuned model and show label vs. prediction.
predictionoutput = model.transform(model_input)

label_and_prediction = ["charlson_comorbidity_index_imputed", "prediction"]
predictionoutput.select(*label_and_prediction).toPandas()

In [None]:

# Preprocess the held-out test split and score it with the tuned model.
model_input_test = data_model.transform(test)
pred_results = model.transform(model_input_test)

# Show the actual label next to the prediction for the test rows.
label_and_prediction = ["charlson_comorbidity_index_imputed", "prediction"]
pred_results.select(*label_and_prediction).toPandas()

In [None]:
# Validation metrics recorded by the tuning run; the table cell below pairs them
# one-to-one with the candidate parameter maps.
model.validationMetrics

In [None]:
# One plain dict per candidate parameter map, keyed by parameter name instead of Param object.
params = [
    {param.name: value for param, value in param_map.items()}
    for param_map in model.getEstimatorParamMaps()
]

# Table with one row per candidate: the evaluator's metric followed by the parameter values.
metric_name = model.getEvaluator().getMetricName()
pd.DataFrame(
    [
        {metric_name: score, **settings}
        for settings, score in zip(params, model.validationMetrics)
    ]
)

In [None]:
# Show the parameter map of the model selected by the tuning run.
model.bestModel.extractParamMap()

In [None]:
# Test-set performance of the best model found by the tuning run.
bm = model.bestModel
model_input_test = data_model.transform(test)
pred_results = bm.evaluate(model_input_test)

# Printed in order: MSE, R^2, RMSE, adjusted R^2.
for metric_attr in ("meanSquaredError", "r2", "rootMeanSquaredError", "r2adj"):
    print(getattr(pred_results, metric_attr))

In [None]:
# Same test-set evaluation of the best model as the preceding cell (duplicate cell).
bm = model.bestModel
model_input_test = data_model.transform(test)
pred_results = bm.evaluate(model_input_test)

# Printed in order: MSE, R^2, RMSE, adjusted R^2.
for metric_attr in ("meanSquaredError", "r2", "rootMeanSquaredError", "r2adj"):
    print(getattr(pred_results, metric_attr))

In [None]:
# Training-set performance of the best model, for comparison with the test-set figures.
bm = model.bestModel
model_input_train = data_model.transform(train)
pred_results = bm.evaluate(model_input_train)

# Printed in order: MSE, R^2, RMSE, adjusted R^2.
for metric_attr in ("meanSquaredError", "r2", "rootMeanSquaredError", "r2adj"):
    print(getattr(pred_results, metric_attr))

In [None]:
# Same training-set evaluation of the best model as the preceding cell (duplicate cell).
bm = model.bestModel
model_input_train = data_model.transform(train)
pred_results = bm.evaluate(model_input_train)

# Printed in order: MSE, R^2, RMSE, adjusted R^2.
for metric_attr in ("meanSquaredError", "r2", "rootMeanSquaredError", "r2adj"):
    print(getattr(pred_results, metric_attr))

In [None]:
# Residuals from the evaluation summary currently held in `pred_results`, i.e. whichever
# evaluation cell ran last (the training-set evaluation when cells are run top to bottom).
pred_results.residuals.toPandas()