### pyspark -RANDOMFOREST Classifictaion model for Stroke prediction

In [50]:
#set environment
import os
import sys
 
os.environ["SPARK_HOME"] = "/usr/hdp/current/spark2-client"
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"
# In below two lines, use /usr/bin/python2.7 if you want to use Python 2
os.environ["PYSPARK_PYTHON"] = "/usr/local/anaconda/bin/python" 
os.environ["PYSPARK_DRIVER_PYTHON"] = "/usr/local/anaconda/bin/python"
sys.path.insert(0, os.environ["PYLIB"] +"/py4j-0.10.4-src.zip")
sys.path.insert(0, os.environ["PYLIB"] +"/pyspark.zip")

In [51]:
#import Sparksession driver
from pyspark.sql import SparkSession
spark = SparkSession \
    .builder \
    .appName("Classification of Car Dataset") \
    .getOrCreate()

In [52]:
df = spark.read.csv('new_classification2_try.csv',inferSchema=True,header=True)
df.show()

+------+----+------------+-------------+--------------+-------------+--------------+-----------------+-----------+---------------+------+
|gender| age|hypertension|heart_disease|Marital_status|    work_type|Residence_type|avg_glucose_level|        bmi| smoking_status|stroke|
+------+----+------------+-------------+--------------+-------------+--------------+-----------------+-----------+---------------+------+
|  Male|67.0|           0|            1|           Yes|      Private|         Urban|           228.69|       36.6|formerly smoked|     1|
|Female|61.0|           0|            0|           Yes|Self-employed|         Rural|           202.21|28.89323691|   never smoked|     1|
|  Male|80.0|           0|            1|           Yes|      Private|         Rural|           105.92|       32.5|   never smoked|     1|
|Female|49.0|           0|            0|           Yes|      Private|         Urban|           171.23|       34.4|         smokes|     1|
|Female|79.0|           1|        

In [53]:
df= df.drop("_c0")

In [54]:
df.printSchema()

root
 |-- gender: string (nullable = true)
 |-- age: double (nullable = true)
 |-- hypertension: integer (nullable = true)
 |-- heart_disease: integer (nullable = true)
 |-- Marital_status: string (nullable = true)
 |-- work_type: string (nullable = true)
 |-- Residence_type: string (nullable = true)
 |-- avg_glucose_level: double (nullable = true)
 |-- bmi: double (nullable = true)
 |-- smoking_status: string (nullable = true)
 |-- stroke: integer (nullable = true)



In [55]:
#Check for missing values
for col in df.columns:
    print("no. of cells in column", col, "with null values:", df.filter(df[col].isNull()).count())

no. of cells in column gender with null values: 0
no. of cells in column age with null values: 0
no. of cells in column hypertension with null values: 0
no. of cells in column heart_disease with null values: 0
no. of cells in column Marital_status with null values: 0
no. of cells in column work_type with null values: 0
no. of cells in column Residence_type with null values: 0
no. of cells in column avg_glucose_level with null values: 0
no. of cells in column bmi with null values: 0
no. of cells in column smoking_status with null values: 0
no. of cells in column stroke with null values: 0


In [56]:
#Label encoder
from pyspark.ml.feature import StringIndexer
indexed = df
for col in ['gender','Marital_status','work_type','Residence_type','smoking_status']:
    stringIndexer = StringIndexer(inputCol=col, outputCol=col+"_encoded")
    indexed = stringIndexer.fit(indexed).transform(indexed)
indexed.show()

+------+----+------------+-------------+--------------+-------------+--------------+-----------------+-----------+---------------+------+--------------+----------------------+-----------------+----------------------+----------------------+
|gender| age|hypertension|heart_disease|Marital_status|    work_type|Residence_type|avg_glucose_level|        bmi| smoking_status|stroke|gender_encoded|Marital_status_encoded|work_type_encoded|Residence_type_encoded|smoking_status_encoded|
+------+----+------------+-------------+--------------+-------------+--------------+-----------------+-----------+---------------+------+--------------+----------------------+-----------------+----------------------+----------------------+
|  Male|67.0|           0|            1|           Yes|      Private|         Urban|           228.69|       36.6|formerly smoked|     1|           1.0|                   0.0|              0.0|                   0.0|                   2.0|
|Female|61.0|           0|            0|

In [57]:
#One hot encoder
from pyspark.ml.feature import OneHotEncoder
encoder = OneHotEncoder(inputCol="work_type_encoded",outputCol="work_type_vec",dropLast=True)
encoded = encoder.transform(indexed)
encoded.show()

+------+----+------------+-------------+--------------+-------------+--------------+-----------------+-----------+---------------+------+--------------+----------------------+-----------------+----------------------+----------------------+-------------+
|gender| age|hypertension|heart_disease|Marital_status|    work_type|Residence_type|avg_glucose_level|        bmi| smoking_status|stroke|gender_encoded|Marital_status_encoded|work_type_encoded|Residence_type_encoded|smoking_status_encoded|work_type_vec|
+------+----+------------+-------------+--------------+-------------+--------------+-----------------+-----------+---------------+------+--------------+----------------------+-----------------+----------------------+----------------------+-------------+
|  Male|67.0|           0|            1|           Yes|      Private|         Urban|           228.69|       36.6|formerly smoked|     1|           1.0|                   0.0|              0.0|                   0.0|                   2.0

In [58]:
#One hot encoder
from pyspark.ml.feature import OneHotEncoder
encoder = OneHotEncoder(inputCol="smoking_status_encoded",outputCol="smoking_status_vec",dropLast=True)
encoded = encoder.transform(encoded)
encoded.show()

+------+----+------------+-------------+--------------+-------------+--------------+-----------------+-----------+---------------+------+--------------+----------------------+-----------------+----------------------+----------------------+-------------+------------------+
|gender| age|hypertension|heart_disease|Marital_status|    work_type|Residence_type|avg_glucose_level|        bmi| smoking_status|stroke|gender_encoded|Marital_status_encoded|work_type_encoded|Residence_type_encoded|smoking_status_encoded|work_type_vec|smoking_status_vec|
+------+----+------------+-------------+--------------+-------------+--------------+-----------------+-----------+---------------+------+--------------+----------------------+-----------------+----------------------+----------------------+-------------+------------------+
|  Male|67.0|           0|            1|           Yes|      Private|         Urban|           228.69|       36.6|formerly smoked|     1|           1.0|                   0.0|      

In [59]:
df.printSchema()

root
 |-- gender: string (nullable = true)
 |-- age: double (nullable = true)
 |-- hypertension: integer (nullable = true)
 |-- heart_disease: integer (nullable = true)
 |-- Marital_status: string (nullable = true)
 |-- work_type: string (nullable = true)
 |-- Residence_type: string (nullable = true)
 |-- avg_glucose_level: double (nullable = true)
 |-- bmi: double (nullable = true)
 |-- smoking_status: string (nullable = true)
 |-- stroke: integer (nullable = true)



In [60]:
#all the independent variables need to be packed into one column of vector type
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=["gender_encoded","age","hypertension","heart_disease","Marital_status_encoded","work_type_vec","Residence_type_encoded","avg_glucose_level","bmi","smoking_status_vec"], 
                            outputCol="features")
feature_vec=assembler.transform(encoded).select('features','stroke')
feature_vec.show(5)

+--------------------+------+
|            features|stroke|
+--------------------+------+
|(15,[0,1,3,5,10,1...|     1|
|(15,[1,6,9,10,11,...|     1|
|(15,[0,1,3,5,9,10...|     1|
|(15,[1,5,10,11],[...|     1|
|(15,[1,2,6,9,10,1...|     1|
+--------------------+------+
only showing top 5 rows



In [61]:
#Count of target classes
feature_vec.groupBy('stroke').count().show()
#there is data imbalance

+------+-----+
|stroke|count|
+------+-----+
|     1|  249|
|     0| 4861|
+------+-----+



In [62]:
# Split the data into train and test sets
train_data, test_data = feature_vec.randomSplit([.75,.25],seed=0)

In [63]:
from pyspark.ml.classification import RandomForestClassifier
#Grid Search
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
rf = RandomForestClassifier( labelCol='stroke',seed=0)
paramGrid = (ParamGridBuilder()\
             .addGrid(rf.maxDepth,[10,11,12])\
             .addGrid(rf.numTrees,[20,30,40])\
             .build())

evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='stroke', metricName='f1')
# Create 4-fold CrossValidator
cv = CrossValidator(estimator=rf, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=4)

cvModel = cv.fit(train_data)

In [64]:
#Best Model Params
score_params_list = list(zip(cvModel.avgMetrics, cvModel.getEstimatorParamMaps()))
max(score_params_list,key=lambda item:item[0])

(0.9273151304798342,
 {Param(parent='RandomForestClassifier_42d29efe77b84a7387a1', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 10,
  Param(parent='RandomForestClassifier_42d29efe77b84a7387a1', name='numTrees', doc='Number of trees to train (>= 1).'): 20})

In [65]:
predictions = cvModel.transform(test_data)
evaluator.evaluate(predictions) 

0.9314948577847166

#### Interpretation
Here we can see that we have an accuracy of 93% for testing ,which indicates a good fit model.Tis is a a score which is almost similar to the score that i got in sklearn Randomforest.