In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=bc0b06ae3b3e1482760ece0aedfdcee24297ce0ebfaebc9cabfe9500c73f0589
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [None]:
# importing pyspark
import pyspark
from pyspark.sql import SparkSession
# SparkSession is the entry point for working with DataFrames in Spark; it is created in the next cell

In [None]:
# Build the SparkSession for this notebook under the application name
# "Healthcare"; getOrCreate() returns an already-running session if one exists.
spark = (
    SparkSession.builder
    .appName("Healthcare")
    .getOrCreate()
)

In [5]:
# Load the appointments CSV: the first row supplies the column names and the
# column types are inferred from the data.
appointmentdf = spark.read.csv(
    "/content/drive/MyDrive/colab/KaggleV2-May-2016.csv",
    header=True,
    inferSchema=True,
)

In [7]:
appointmentdf.show(5) # prints the first 5 rows as a text table

+-------------------+-------------+------+-------------------+-------------------+---+-----------------+-----------+------------+--------+----------+-------+------------+-------+
|          PatientId|AppointmentID|Gender|       ScheduledDay|     AppointmentDay|Age|    Neighbourhood|Scholarship|Hipertension|Diabetes|Alcoholism|Handcap|SMS_received|No-show|
+-------------------+-------------+------+-------------------+-------------------+---+-----------------+-----------+------------+--------+----------+-------+------------+-------+
| 2.9872499824296E13|      5642903|     F|2016-04-29 18:38:08|2016-04-29 00:00:00| 62|  JARDIM DA PENHA|          0|           1|       0|         0|      0|           0|     No|
|5.58997776694438E14|      5642503|     M|2016-04-29 16:08:27|2016-04-29 00:00:00| 56|  JARDIM DA PENHA|          0|           0|       0|         0|      0|           0|     No|
|  4.262962299951E12|      5642549|     F|2016-04-29 16:19:04|2016-04-29 00:00:00| 62|    MATA DA PRAIA| 

In [8]:
appointmentdf.describe().show() # summary statistics (count, mean, ...) per column; the two timestamp columns do not appear in this output

+-------+--------------------+-----------------+------+------------------+-------------+-------------------+-------------------+-------------------+--------------------+--------------------+-------------------+-------+
|summary|           PatientId|    AppointmentID|Gender|               Age|Neighbourhood|        Scholarship|       Hipertension|           Diabetes|          Alcoholism|             Handcap|       SMS_received|No-show|
+-------+--------------------+-----------------+------+------------------+-------------+-------------------+-------------------+-------------------+--------------------+--------------------+-------------------+-------+
|  count|              110527|           110527|110527|            110527|       110527|             110527|             110527|             110527|              110527|              110527|             110527| 110527|
|   mean|1.474962657103946...|5675305.123426855|  NULL| 37.08887421173107|         NULL|0.09826558216544373| 0.1972459218109

In [None]:
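# every column in the summary above has count 110527, which equals the total number of rows, so those columns contain no nulls
# (ScheduledDay and AppointmentDay are not part of the describe() output, so they are not covered by this check)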

In [9]:
appointmentdf.columns # list of the column names

['PatientId',
 'AppointmentID',
 'Gender',
 'ScheduledDay',
 'AppointmentDay',
 'Age',
 'Neighbourhood',
 'Scholarship',
 'Hipertension',
 'Diabetes',
 'Alcoholism',
 'Handcap',
 'SMS_received',
 'No-show']

In [10]:
appointmentdf.dtypes # (column name, Spark type) pairs, as inferred when the CSV was read

[('PatientId', 'double'),
 ('AppointmentID', 'int'),
 ('Gender', 'string'),
 ('ScheduledDay', 'timestamp'),
 ('AppointmentDay', 'timestamp'),
 ('Age', 'int'),
 ('Neighbourhood', 'string'),
 ('Scholarship', 'int'),
 ('Hipertension', 'int'),
 ('Diabetes', 'int'),
 ('Alcoholism', 'int'),
 ('Handcap', 'int'),
 ('SMS_received', 'int'),
 ('No-show', 'string')]

In [11]:
appointmentdf.groupBy('No-show').count().show() # number of rows for each value of 'No-show'

+-------+-----+
|No-show|count|
+-------+-----+
|     No|88208|
|    Yes|22319|
+-------+-----+



In [17]:
# Exploratory Data Analysis

In [12]:
appointmentdf.select("Age").describe().show()  # summary statistics for Age only; note the minimum of -1 in the output, which is not a valid age

+-------+------------------+
|summary|               Age|
+-------+------------------+
|  count|            110527|
|   mean| 37.08887421173107|
| stddev|23.110204963682584|
|    min|                -1|
|    max|               115|
+-------+------------------+



In [13]:
from pyspark.sql.functions import skewness,kurtosis # importing skewness and kurtosis

In [14]:
appointmentdf.select(skewness("Age"),kurtosis("Age")).show() # skewness and kurtosis of the Age column

+-------------------+------------------+
|      skewness(Age)|     kurtosis(Age)|
+-------------------+------------------+
|0.12165636682687768|-0.952278601592428|
+-------------------+------------------+



In [18]:
appointmentdf.groupBy('No-show').mean('Age').show() # average Age per No-show group: patients who came to their
# appointment versus patients who did not

+-------+------------------+
|No-show|          avg(Age)|
+-------+------------------+
|     No|37.790064393252315|
|    Yes| 34.31766656212196|
+-------+------------------+



In [19]:
appointmentdf.crosstab('No-show','SMS_received').show() # contingency table: one row per No-show value, one column per SMS_received value

+--------------------+-----+-----+
|No-show_SMS_received|    0|    1|
+--------------------+-----+-----+
|                  No|62510|25698|
|                 Yes|12535| 9784|
+--------------------+-----+-----+



In [87]:
# Data Preprocessing

In [20]:
# Encoding object variables

In [21]:
# importing StringIndexer and Pipeline (a Pipeline chains several stages so they can be fitted and applied as one unit)
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

In [22]:
# Label-index the categorical columns: each StringIndexer writes a numeric
# index for its input column into a new column. This is an index per distinct
# value, not a set of one-hot / dummy columns.
(
    gender_dummy,
    neighbor_dummy,
    scholar_dummy,
    bp_dummy,
    sugar_dummy,
    alcohol_dummy,
    handicap_dummy,
    sms_dummy,
    noshow_dummy,
) = (
    StringIndexer(inputCol=source_col, outputCol=indexed_col)
    for source_col, indexed_col in (
        ('Gender', 'genderdummy'),
        ('Neighbourhood', 'neighbordummy'),
        ('Scholarship', 'scholardummy'),
        ('Hipertension', 'bpdummy'),
        ('Diabetes', 'sugardummy'),
        ('Alcoholism', "alcoholdummy"),
        ('Handcap', 'handicapdummy'),
        ('SMS_received', 'smsdummy'),
        ('No-show', 'noshowdummy'),
    )
)

In [23]:
# Chain the nine indexers into a single Pipeline so they are fitted and
# applied together.
pipeline = Pipeline(
    stages=[
        gender_dummy,
        neighbor_dummy,
        scholar_dummy,
        bp_dummy,
        sugar_dummy,
        alcohol_dummy,
        handicap_dummy,
        sms_dummy,
        noshow_dummy,
    ]
)

In [25]:
# Fit all indexers on the full data and append the indexed columns.
# Despite its name, appointmentRDD is used as a DataFrame below (.show, .columns, .drop).
appointmentRDD=pipeline.fit(appointmentdf).transform(appointmentdf)

In [26]:
appointmentRDD.show(5) # the indexed ("dummy") columns are appended after the original columns

+-------------------+-------------+------+-------------------+-------------------+---+-----------------+-----------+------------+--------+----------+-------+------------+-------+-----------+-------------+------------+-------+----------+------------+-------------+--------+-----------+
|          PatientId|AppointmentID|Gender|       ScheduledDay|     AppointmentDay|Age|    Neighbourhood|Scholarship|Hipertension|Diabetes|Alcoholism|Handcap|SMS_received|No-show|genderdummy|neighbordummy|scholardummy|bpdummy|sugardummy|alcoholdummy|handicapdummy|smsdummy|noshowdummy|
+-------------------+-------------+------+-------------------+-------------------+---+-----------------+-----------+------------+--------+----------+-------+------------+-------+-----------+-------------+------------+-------+----------+------------+-------------+--------+-----------+
| 2.9872499824296E13|      5642903|     F|2016-04-29 18:38:08|2016-04-29 00:00:00| 62|  JARDIM DA PENHA|          0|           1|       0|       

In [28]:
appointmentRDD.columns # the original columns followed by the nine indexed columns

['PatientId',
 'AppointmentID',
 'Gender',
 'ScheduledDay',
 'AppointmentDay',
 'Age',
 'Neighbourhood',
 'Scholarship',
 'Hipertension',
 'Diabetes',
 'Alcoholism',
 'Handcap',
 'SMS_received',
 'No-show',
 'genderdummy',
 'neighbordummy',
 'scholardummy',
 'bpdummy',
 'sugardummy',
 'alcoholdummy',
 'handicapdummy',
 'smsdummy',
 'noshowdummy']

In [29]:
# Columns to drop: the two ID columns, the two timestamp columns, and the raw
# version of every column that now has an indexed counterpart. Age is kept.
colsdrop = [
    'PatientId',
    'AppointmentID',
    'Gender',
    'ScheduledDay',
    'AppointmentDay',
    'Neighbourhood',
    'Scholarship',
    'Hipertension',
    'Diabetes',
    'Alcoholism',
    'Handcap',
    'SMS_received',
    'No-show',
]

In [30]:
# drop() returns a new DataFrame without the listed columns; appointmentRDD itself is not modified
appointmentRDDDF=appointmentRDD.drop(*colsdrop)

In [31]:
appointmentRDDDF.show(5) # the columns listed in colsdrop are gone; Age and the indexed columns remain

+---+-----------+-------------+------------+-------+----------+------------+-------------+--------+-----------+
|Age|genderdummy|neighbordummy|scholardummy|bpdummy|sugardummy|alcoholdummy|handicapdummy|smsdummy|noshowdummy|
+---+-----------+-------------+------------+-------+----------+------------+-------------+--------+-----------+
| 62|        0.0|          3.0|         0.0|    1.0|       0.0|         0.0|          0.0|     0.0|        0.0|
| 56|        1.0|          3.0|         0.0|    0.0|       0.0|         0.0|          0.0|     0.0|        0.0|
| 62|        0.0|         49.0|         0.0|    0.0|       0.0|         0.0|          0.0|     0.0|        0.0|
|  8|        0.0|         75.0|         0.0|    0.0|       0.0|         0.0|          0.0|     0.0|        0.0|
| 56|        0.0|          3.0|         0.0|    1.0|       1.0|         0.0|          0.0|     0.0|        0.0|
+---+-----------+-------------+------------+-------+----------+------------+-------------+--------+-----

In [32]:
 # importing RFormula
 from pyspark.ml.feature import RFormula

In [33]:
# 'noshowdummy~.' declares noshowdummy as the dependent variable and every
# other column as an independent variable. The independent variables are
# combined into a single vector column named 'features'; the dependent
# variable is exposed in a column named 'label'.
formula = RFormula(
    formula='noshowdummy~.',
    featuresCol='features',
    labelCol='label',
)

In [34]:
# NOTE: this rebinds appointmentRDD (previously the indexing-pipeline output) to the
# RFormula output, i.e. appointmentRDDDF plus the 'features' and 'label' columns
appointmentRDD=formula.fit(appointmentRDDDF).transform(appointmentRDDDF)

In [35]:
appointmentRDD.select('features','label').show(5)
# the independent variables are now packed into one vector column ('features') and the dependent variable is in 'label'

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(9,[0,2,4],[62.0,...|  0.0|
|(9,[0,1,2],[56.0,...|  0.0|
|(9,[0,2],[62.0,49...|  0.0|
|(9,[0,2],[8.0,75.0])|  0.0|
|(9,[0,2,4,5],[56....|  0.0|
+--------------------+-----+
only showing top 5 rows



In [36]:
appointmentRDDDF.show(5) # appointmentRDDDF is unchanged by the RFormula step (no 'features' / 'label' columns here)

+---+-----------+-------------+------------+-------+----------+------------+-------------+--------+-----------+
|Age|genderdummy|neighbordummy|scholardummy|bpdummy|sugardummy|alcoholdummy|handicapdummy|smsdummy|noshowdummy|
+---+-----------+-------------+------------+-------+----------+------------+-------------+--------+-----------+
| 62|        0.0|          3.0|         0.0|    1.0|       0.0|         0.0|          0.0|     0.0|        0.0|
| 56|        1.0|          3.0|         0.0|    0.0|       0.0|         0.0|          0.0|     0.0|        0.0|
| 62|        0.0|         49.0|         0.0|    0.0|       0.0|         0.0|          0.0|     0.0|        0.0|
|  8|        0.0|         75.0|         0.0|    0.0|       0.0|         0.0|          0.0|     0.0|        0.0|
| 56|        0.0|          3.0|         0.0|    1.0|       1.0|         0.0|          0.0|     0.0|        0.0|
+---+-----------+-------------+------------+-------+----------+------------+-------------+--------+-----

In [38]:
# Model Building

In [40]:
# Logistic Regression

In [39]:
# importing LogisticRegression,BinaryClassificationEvaluator,MulticlassClassificationEvaluator
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator,MulticlassClassificationEvaluator

In [41]:
# logistic regression reading the RFormula output columns; no other hyperparameters are set
logit=LogisticRegression(featuresCol='features',labelCol='label')

In [42]:
# fitted on all rows of appointmentRDD; no train/test split is made before this point
logitmodel=logit.fit(appointmentRDD)

In [43]:
logitmodel.summary.accuracy # training-set accuracy is about 0.798

0.798067440534892

In [44]:
logitmodel.summary.areaUnderROC # training-set AUC is about 0.596

0.5964327533383069

In [45]:
# score the same rows the model was fitted on; adds rawPrediction, probability and prediction columns
logitpredict=logitmodel.transform(appointmentRDD)

In [47]:
logitpredict.show(5) # the last column, prediction, holds the predicted label for each row

+---+-----------+-------------+------------+-------+----------+------------+-------------+--------+-----------+--------------------+-----+--------------------+--------------------+----------+
|Age|genderdummy|neighbordummy|scholardummy|bpdummy|sugardummy|alcoholdummy|handicapdummy|smsdummy|noshowdummy|            features|label|       rawPrediction|         probability|prediction|
+---+-----------+-------------+------------+-------+----------+------------+-------------+--------+-----------+--------------------+-----+--------------------+--------------------+----------+
| 62|        0.0|          3.0|         0.0|    1.0|       0.0|         0.0|          0.0|     0.0|        0.0|(9,[0,2,4],[62.0,...|  0.0|[1.84923515524927...|[0.86403727644512...|       0.0|
| 56|        1.0|          3.0|         0.0|    0.0|       0.0|         0.0|          0.0|     0.0|        0.0|(9,[0,1,2],[56.0,...|  0.0|[1.76093057588088...|[0.85332616995024...|       0.0|
| 62|        0.0|         49.0|         

In [48]:
# LogisticRegression reports accuracy and AUC through model.summary (used above);
# for the other models the same two metrics are computed with these evaluators.
accuracy=MulticlassClassificationEvaluator(metricName='accuracy')
# no metricName given, so the evaluator's default is used (areaUnderROC per the PySpark documentation)
auc=BinaryClassificationEvaluator()

In [49]:
# Decision Tree

In [50]:
from pyspark.ml.classification import DecisionTreeClassifier

In [53]:
# Spark requires maxBins to be at least the number of categories of any categorical feature.
# NOTE(review): 81 presumably equals the number of distinct Neighbourhood values — confirm.
tree=DecisionTreeClassifier(maxBins=81)

In [54]:
# fitted on all rows of appointmentRDD, relying on the estimator's default 'features' / 'label' column names
treemodel=tree.fit(appointmentRDD)

In [55]:
# predictions for the same rows the tree was fitted on (training-set evaluation)
treepredict=treemodel.transform(appointmentRDD)

In [56]:
accuracy.evaluate(treepredict) # training-set accuracy is about 0.798, identical to the share of 'No' rows (88208/110527)

0.798067440534892

In [57]:
auc.evaluate(treepredict) # AUC is 0.5, i.e. no better than chance at separating the two classes

0.5

In [58]:
# Random Forest

In [59]:
from pyspark.ml.classification import RandomForestClassifier

In [60]:
# same maxBins setting as the decision tree.
# NOTE(review): no seed is passed, so results may differ between sessions — consider setting seed for reproducibility.
rf=RandomForestClassifier(maxBins=81)

In [61]:
# fitted on all rows of appointmentRDD (no held-out data)
rfmodel=rf.fit(appointmentRDD)

In [62]:
# predictions for the same rows the forest was fitted on
rfpredict=rfmodel.transform(appointmentRDD)

In [63]:
accuracy.evaluate(rfpredict) # training-set accuracy is about 0.798, again equal to the share of 'No' rows

0.798067440534892

In [64]:
auc.evaluate(rfpredict) # training-set AUC is about 0.605

0.6047476048470436

In [65]:
# Gradient Boosting

In [66]:
from pyspark.ml.classification import GBTClassifier

In [67]:
# gradient-boosted trees with the same maxBins setting as the other tree models
gbm=GBTClassifier(maxBins=81)

In [68]:
# fitted on all rows of appointmentRDD (no held-out data)
gbmmodel=gbm.fit(appointmentRDD)

In [69]:
# predictions for the same rows the model was fitted on
gbmpredict=gbmmodel.transform(appointmentRDD)

In [70]:
accuracy.evaluate(gbmpredict) # training-set accuracy is about 0.7986, only marginally above the 0.798 share of 'No' rows

0.7986012467541868

In [71]:
auc.evaluate(gbmpredict) # training-set AUC is about 0.645

0.6454879968792955

In [72]:
# Support Vector Machine

In [73]:
from pyspark.ml.classification import LinearSVC

In [74]:
# linear support vector classifier with every hyperparameter left at its default
svc=LinearSVC()

In [75]:
# fitted on all rows of appointmentRDD (no held-out data)
svcmodel=svc.fit(appointmentRDD)

In [76]:
# predictions for the same rows the model was fitted on
svcpredict=svcmodel.transform(appointmentRDD)

In [77]:
accuracy.evaluate(svcpredict) # training-set accuracy is about 0.798, equal to the share of 'No' rows

0.798067440534892

In [78]:
auc.evaluate(svcpredict) # training-set AUC is about 0.541

0.540733973376347

| Model Name | Accuracy | ROC-AUC |
|--|--|--|
| Logistic Regression | 0.798067440534892 | 0.5964327533383069 |
| Decision Tree |0.798067440534892 |0.5 |
| Random Forest |0.798067440534892|0.6047476048470436 |
| **Gradient Boosting Machine** | **0.7986012467541868** | **0.6454879968792955**|
| Support Vector Machine | 0.798067440534892 |0.540733973376347|

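From the accuracy and AUC values above, Gradient Boosting is the best of the five models. Note that all metrics were computed on the same data the models were trained on, and that four of the five accuracies are identical to the share of the majority class (88208/110527 ≈ 0.798), so the accuracy differences are negligible and AUC is the more informative comparison.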