In [1]:
import findspark
# findspark.init('/home/ubuntu/spark-3.2.1-bin-hadoop2.7')
findspark.init('../spark-3.2.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('multiple_iterations').getOrCreate()

from pyspark.ml import Pipeline
from pyspark.ml.classification import (RandomForestClassifier, GBTClassifier, DecisionTreeClassifier,\
                                       LogisticRegression, MultilayerPerceptronClassifier)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


from pyspark.sql.functions import col, when
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import ChiSqSelector

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/23 14:22:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


#### Read the previouly cleaned dataset

In [6]:
df = spark.read.csv('cleaned.csv', header=True)
# turn type string into double
cols = ['HighBP', 'HighChol', 'CholCheck',
       'Smoker', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits',
       'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost',
       'DiffWalk', 'Sex', 'BMI', 'GenHlth', 'MentHlth', 'PhysHlth', 'Age',
       'Education', 'Income']
df = df.withColumn('Diabetes',col('Diabetes').cast('double'))
for c in cols:
    df = df.withColumn(c,col(c).cast('double'))
    
assembler = VectorAssembler(inputCols=cols, outputCol="features")
df = assembler.transform(df)

from pyspark.sql.functions import col, explode, array, lit
major_df = df.filter(col("Diabetes") == 0)
minor_df = df.filter(col("Diabetes") == 1)
ratio = int(major_df.count()/minor_df.count())
sampled_majority_df = major_df.sample(False, 1/ratio)
balanced_data = sampled_majority_df.unionAll(minor_df)



#### Instead of keeping 15 features, we can iterate with more or less number of features.

13 Features:

In [7]:
selector_13 = ChiSqSelector(featuresCol="features", outputCol="selected_features", \
                         labelCol="Diabetes", numTopFeatures=13)
model_13 = selector_13.fit(balanced_data)
df_13 = model_13.transform(balanced_data)
selected_indices_13 = model_13.selectedFeatures
selected_feature_names_13 = [df_13.columns[index] for index in selected_indices_13]
selected_feature_names_13

                                                                                

['Diabetes',
 'HighBP',
 'HighChol',
 'CholCheck',
 'Smoker',
 'Stroke',
 'HeartDiseaseorAttack',
 'PhysActivity',
 'Fruits',
 'Veggies',
 'AnyHealthcare',
 'NoDocbcCost',
 'DiffWalk']

18 Features:

In [8]:
selector_18 = ChiSqSelector(featuresCol="features", outputCol="selected_features", \
                         labelCol="Diabetes", numTopFeatures=18)
model_18 = selector_18.fit(balanced_data)
df_18 = model_18.transform(balanced_data)
selected_indices_18 = model_18.selectedFeatures
selected_feature_names_18 = [df_18.columns[index] for index in selected_indices_18]
selected_feature_names_18

                                                                                

['Diabetes',
 'HighBP',
 'HighChol',
 'CholCheck',
 'Smoker',
 'Stroke',
 'HeartDiseaseorAttack',
 'PhysActivity',
 'Fruits',
 'Veggies',
 'AnyHealthcare',
 'NoDocbcCost',
 'DiffWalk',
 'Sex',
 'BMI',
 'GenHlth',
 'MentHlth',
 'PhysHlth']

In [10]:
df_18.printSchema()

root
 |-- Diabetes: double (nullable = true)
 |-- HighBP: double (nullable = true)
 |-- HighChol: double (nullable = true)
 |-- CholCheck: double (nullable = true)
 |-- Smoker: double (nullable = true)
 |-- Stroke: double (nullable = true)
 |-- HeartDiseaseorAttack: double (nullable = true)
 |-- PhysActivity: double (nullable = true)
 |-- Fruits: double (nullable = true)
 |-- Veggies: double (nullable = true)
 |-- HvyAlcoholConsump: double (nullable = true)
 |-- AnyHealthcare: double (nullable = true)
 |-- NoDocbcCost: double (nullable = true)
 |-- DiffWalk: double (nullable = true)
 |-- Sex: double (nullable = true)
 |-- BMI: double (nullable = true)
 |-- GenHlth: double (nullable = true)
 |-- MentHlth: double (nullable = true)
 |-- PhysHlth: double (nullable = true)
 |-- Age: double (nullable = true)
 |-- Education: double (nullable = true)
 |-- Income: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- selected_features: vector (nullable = true)



Train test data set split:

In [11]:
train_data_13,test_data_13 = df_13.select(['Diabetes', 'features']).randomSplit([0.8,0.2])
train_data_18,test_data_18 = df_18.select(['Diabetes', 'features']).randomSplit([0.8,0.2])

Fit Random Forest models:

In [14]:
rfc_13 = RandomForestClassifier(labelCol='Diabetes',featuresCol='features',numTrees=5)
rfc_model_13 = rfc_13.fit(train_data_13)
rfc_predictions_13 = rfc_model_13.transform(test_data_13)

rfc_18 = RandomForestClassifier(labelCol='Diabetes',featuresCol='features',numTrees=5)
rfc_model_18 = rfc_18.fit(train_data_18)
rfc_predictions_18 = rfc_model_18.transform(test_data_18)

                                                                                

In [19]:
# Random Forest Feature Importances
rfc_importances_13 = rfc_model_13.featureImportances
print("Random Forest Feature Importances with 13 features selected:")
for i, (col, importance) in enumerate(zip(selected_feature_names_13, rfc_importances_13)):
    print(f"{col}: {importance}")
    
rfc_importances_18 = rfc_model_18.featureImportances
print("\nRandom Forest Feature Importances with 18 features selected:")
for i, (col, importance) in enumerate(zip(selected_feature_names_18, rfc_importances_18)):
    print(f"{col}: {importance}")

Random Forest Feature Importances with 13 features selected:
Diabetes: 0.18336728882452255
HighBP: 0.17514792065220497
HighChol: 0.0003839592809668961
CholCheck: 0.0
Smoker: 0.0012287003797629764
Stroke: 0.011562354891281778
HeartDiseaseorAttack: 0.0021514216384969165
PhysActivity: 0.0
Fruits: 0.0
Veggies: 0.0005246152380332262
AnyHealthcare: 0.0
NoDocbcCost: 0.0
DiffWalk: 0.008593583657584279

Random Forest Feature Importances with 18 features selected:
Diabetes: 0.2640150234610442
HighBP: 0.1748094618204812
HighChol: 0.0006902259828370694
CholCheck: 0.0007931794568797951
Smoker: 0.0007975972721726623
Stroke: 0.00960876473339492
HeartDiseaseorAttack: 0.0015539061682457566
PhysActivity: 0.0
Fruits: 0.0
Veggies: 0.0
AnyHealthcare: 0.0
NoDocbcCost: 0.0002962330983736624
DiffWalk: 0.007349837468864783
Sex: 0.0
BMI: 0.07137310864550205
GenHlth: 0.36872025446699663
MentHlth: 0.0002591567620108128
PhysHlth: 0.002035654977815754
