In [1]:
import findspark
findspark.init()

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np
from pyspark.ml import Pipeline
from __future__ import print_function
import seaborn as sns
import imblearn
import pandas as pd
from imblearn.over_sampling import SMOTE
import math 
import pyspark.sql.functions as F
from sklearn.preprocessing import LabelEncoder

In [2]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
sc = SparkContext.getOrCreate();
spark = SparkSession \
    .builder \
    .appName("HealthCarePrediction") \
    .getOrCreate()

In [3]:
rawDF = spark.read.csv('healthcare-dataset-stroke-data.csv', header=True, inferSchema=True)

In [4]:
rawDF = rawDF.dropna()
rawDF = rawDF.filter(rawDF['bmi'] != "N/A")
rawDF = rawDF.filter(rawDF['gender'] != "Other")
rawDF = rawDF.withColumn("bmi",rawDF.bmi.cast('double'))

In [5]:
from pyspark.ml.feature import OneHotEncoder, StandardScaler, StringIndexer, VectorAssembler, MinMaxScaler
cat_features = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
stringIndexedDF = rawDF
for features in cat_features:
    # Index Categorical Features
    string_indexer = StringIndexer(inputCol=features, outputCol=features + "_index")
    stringIndexedDF = string_indexer.fit(stringIndexedDF).transform(stringIndexedDF)
for features in cat_features:     
    stringIndexedDF = stringIndexedDF.withColumn(features+"_index",stringIndexedDF[features+"_index"].cast('int'))

stringIndexedDF = stringIndexedDF.drop(*cat_features)

In [12]:
stringIndexedDF.groupBy('ever_married_index').count().show()

+------------------+-----+
|ever_married_index|count|
+------------------+-----+
|                 1| 1704|
|                 0| 3204|
+------------------+-----+



In [17]:
stringIndexedDF.filter(stringIndexedDF.id == 64778).show()

+-----+----+------------+-------------+-----------------+----+------+------------+------------------+---------------+--------------------+--------------------+
|   id| age|hypertension|heart_disease|avg_glucose_level| bmi|stroke|gender_index|ever_married_index|work_type_index|Residence_type_index|smoking_status_index|
+-----+----+------------+-------------+-----------------+----+------+------------+------------------+---------------+--------------------+--------------------+
|64778|82.0|           0|            1|            208.3|32.5|     1|           1|                 0|              0|                   1|                   1|
+-----+----+------------+-------------+-----------------+----+------+------------+------------------+---------------+--------------------+--------------------+



In [6]:
X = stringIndexedDF.drop('stroke')
Y = stringIndexedDF.select('stroke')
stk = SMOTE(random_state=42)
X_res,y_res = stk.fit_resample(X.toPandas(),Y.toPandas())
joinDF = pd.concat([X_res, y_res], axis=1, join="inner")
balancedData = spark.createDataFrame(joinDF)

In [7]:
def select_features_to_scale(df=balancedData, lower_skew=-2, upper_skew=2, dtypes='double'):
    
    # Empty Selected Feature List for Output
    selected_features = []
    
    # Select Features to Scale based on Inputs ('in32' type, drop 'ID' columns or others, skew bounds)
    feature_list = list(df.toPandas().select_dtypes(include=[dtypes]).columns)
    
    # Loop through 'feature_list' to select features based on Kurtosis / Skew
    for feature in feature_list:

        if df.toPandas()[feature].kurtosis() < -2 or df.toPandas()[feature].kurtosis() > 2:
            
            selected_features.append(feature)
    
    # Return feature list to scale
    return selected_features

In [46]:
index_features = ['gender_index', 'ever_married_index', 'work_type_index', 'Residence_type_index', 'smoking_status_index']

encoderDF = balancedData

for features in index_features:
    encoder = OneHotEncoder(inputCols=[features],
                                    outputCols=[features + "_class_vec"])
    encoderDF = encoder.fit(encoderDF).transform(encoderDF)

# encoderDF = encoderDF.drop(*index_features)

In [47]:
label = 'stroke'
stages = []
num_features = ['age','hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']
label_str_index =  StringIndexer(inputCol=label, outputCol="label_index")

# Scale Feature: Select the Features to Scale using helper 'select_features_to_scale' function above and Standardize 
unscaled_features = select_features_to_scale(df=encoderDF, lower_skew=-2, upper_skew=2, dtypes='double')

unscaled_assembler = VectorAssembler(inputCols=unscaled_features, outputCol="unscaled_features")
scaler = StandardScaler(inputCol="unscaled_features", outputCol="scaled_features")

stages += [unscaled_assembler, scaler]

# Create list of Numeric Features that Are Not Being Scaled
num_unscaled_diff_list = list(set(num_features) - set(unscaled_features))

# Assemble or Concat the Categorical Features and Numeric Features
assembler_inputs = [feature + "_class_vec" for feature in index_features] + num_unscaled_diff_list

assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="assembled_inputs") 

stages += [label_str_index, assembler]

# Assemble Final Training Data of Scaled, Numeric, and Categorical Engineered Features
assembler_final = VectorAssembler(inputCols=["scaled_features","assembled_inputs"], outputCol="features")

stages += [assembler_final]

In [48]:
pipeline = Pipeline(stages=stages)

pipeline_model = pipeline.fit(encoderDF)

df_transform = pipeline_model.transform(encoderDF)

In [60]:
df_transform.select('gender_index_class_vec','ever_married_index_class_vec','work_type_index_class_vec','Residence_type_index_class_vec','smoking_status_index_class_vec','unscaled_features','scaled_features','assembled_inputs').filter(df_transform.id == 64778).show(1, False)

+----------------------+----------------------------+-------------------------+------------------------------+------------------------------+-----------------+-------------------+--------------------------------------------------+
|gender_index_class_vec|ever_married_index_class_vec|work_type_index_class_vec|Residence_type_index_class_vec|smoking_status_index_class_vec|unscaled_features|scaled_features    |assembled_inputs                                  |
+----------------------+----------------------------+-------------------------+------------------------------+------------------------------+-----------------+-------------------+--------------------------------------------------+
|(1,[],[])             |(1,[0],[1.0])               |(4,[0],[1.0])            |(1,[],[])                     |(3,[1],[1.0])                 |[32.5]           |[4.826355838293409]|(14,[1,2,8,10,11,12],[1.0,1.0,1.0,82.0,208.3,1.0])|
+----------------------+----------------------------+-----------------------

In [58]:
# df_transform.select('features').show(truncate=False)
from pyspark.ml.classification import RandomForestClassificationModel,GBTClassificationModel,DecisionTreeClassificationModel

# rfModel = RandomForestClassificationModel.load('model/random_forest')
gbtLoadedModel = GBTClassificationModel.load('model/gbt')
# dtModel = DecisionTreeClassificationModel.load('model/decision_tree')

predict = gbtLoadedModel.transform(df_transform)
# gbtModel.transform(df_transform)
# dtModel.transform(df_transform)


In [59]:
# predict.filter(predict.id == 64778).select('id','gender_index','age','hypertension','heart_disease','ever_married_index','work_type_index','Residence_type_index','avg_glucose_level','bmi','smoking_status_index').show()
# predict.filter(predict.id == 64778).select('prediction').show()
predict.filter(predict.id == 64778).select('features','rawPrediction','probability','prediction').show(1, False)

+----------------------------------------------------------------------+----------------------------------------+----------------------------------------+----------+
|features                                                              |rawPrediction                           |probability                             |prediction|
+----------------------------------------------------------------------+----------------------------------------+----------------------------------------+----------+
|(15,[0,2,3,9,11,12,13],[4.826355838293409,1.0,1.0,1.0,82.0,208.3,1.0])|[-0.5633275137631617,0.5633275137631617]|[0.24477893187401564,0.7552210681259843]|1.0       |
+----------------------------------------------------------------------+----------------------------------------+----------------------------------------+----------+



In [35]:
sql_row = encoderDF.filter(predict.id == 64778)
row_transform = pipeline_model.transform(sql_row)
predict = gbtModel.transform(row_transform)
x = predict.select('id','prediction').rdd.collect()
predictMess = ";".join([",".join(map(str, item)) for item in x])
print(predictMess)

64778,1.0


In [51]:
df_transform_fin = df_transform.select('features','label_index')
# df_transform_fin.show()
# df_transform_fin.count()
print ("So lan xuat hien cua stroke la 0: ",df_transform_fin.filter(df_transform_fin['label_index'] == 0).count())
print ("So lan xuat hien cua stroke la 1: ",df_transform_fin.filter(df_transform_fin['label_index'] == 1).count())

So lan xuat hien cua stroke la 0:  4699
So lan xuat hien cua stroke la 1:  4699


In [107]:
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(labelCol="label_index", featuresCol="features")
train_data, test_data = df_transform_fin.randomSplit([.7, .3])
test_data.drop("label_index")
model = dt.fit(train_data)
predictions = model.transform(test_data)
accuracy = evaluator.evaluate(predictions)
# model.save('model/decision_tree')
print(accuracy)

0.8169515669515669


In [93]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(labelCol="label_index", featuresCol="features", numTrees=10)
train_data, test_data = df_transform_fin.randomSplit([.7, .3])
rfModel = rf.fit(train_data)
predictions = rfModel.transform(test_data)
accuracy = evaluator.evaluate(predictions)
# rfModel.save('model/random_forest')
print(accuracy)

0.805401405845357


In [54]:
from pyspark.ml.classification import GBTClassifier
gbt = GBTClassifier(labelCol="label_index", featuresCol="features", maxIter=10)
train_data, test_data = df_transform_fin.randomSplit([.7, .3])
gbtModel = gbt.fit(train_data)
gbtPredictions = gbtModel.transform(test_data)
accuracy = evaluator.evaluate(gbtPredictions)
gbtModel.save('model/gbt')
print("Test Error = %g" % (1.0 - accuracy))

Test Error = 0.15229
