# Importing Pyspark

In [1]:
import pyspark

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Ubung").getOrCreate()

spark

In [2]:
import numpy as np
import pandas as pd

# Reading the Dataset

In [3]:
path = "Dataset/"
df = spark.read.csv(path + "Toddler Autism dataset July 2018.csv", header = True, inferSchema = True)

In [4]:
df.limit(5).toPandas()

Unnamed: 0,Case_No,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,Age_Mons,Qchat-10-Score,Sex,Ethnicity,Jaundice,Family_mem_with_ASD,Who completed the test,Class/ASD Traits
0,1,0,0,0,0,0,0,1,1,0,1,28,3,f,middle eastern,yes,no,family member,No
1,2,1,1,0,0,0,1,1,0,0,0,36,4,m,White European,yes,no,family member,Yes
2,3,1,0,0,0,0,0,1,1,0,1,36,4,m,middle eastern,yes,no,family member,Yes
3,4,1,1,1,1,1,1,1,1,1,1,24,10,m,Hispanic,no,no,family member,Yes
4,5,1,1,0,1,1,1,1,1,1,1,20,9,f,White European,no,yes,family member,Yes


In [5]:
df.printSchema()

root
 |-- Case_No: integer (nullable = true)
 |-- A1: integer (nullable = true)
 |-- A2: integer (nullable = true)
 |-- A3: integer (nullable = true)
 |-- A4: integer (nullable = true)
 |-- A5: integer (nullable = true)
 |-- A6: integer (nullable = true)
 |-- A7: integer (nullable = true)
 |-- A8: integer (nullable = true)
 |-- A9: integer (nullable = true)
 |-- A10: integer (nullable = true)
 |-- Age_Mons: integer (nullable = true)
 |-- Qchat-10-Score: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Ethnicity: string (nullable = true)
 |-- Jaundice: string (nullable = true)
 |-- Family_mem_with_ASD: string (nullable = true)
 |-- Who completed the test: string (nullable = true)
 |-- Class/ASD Traits : string (nullable = true)



In [6]:
df.count()

1054

In [7]:
df.groupBy("Class/ASD Traits ").count().limit(10).toPandas()

Unnamed: 0,Class/ASD Traits,count
0,No,326
1,Yes,728


In [8]:
df = df.drop('Case_No')

# Importing required modules for preprocessing

In [9]:
from pyspark.ml.feature import VectorAssembler, StringIndexer, MinMaxScaler
from pyspark.sql.types import *
from pyspark.sql.functions import *

# One Hot Encoding

In [10]:
indexer = StringIndexer(inputCol = "Class/ASD Traits ", outputCol = "label") # instantiate the class
df = indexer.fit(df).transform(df)
df = df.drop("Class/ASD Traits ")

In [11]:
for column in df.columns:
    if str(df.schema[column].dataType) == "StringType":
        indexer = StringIndexer(inputCol = column, outputCol = column+"_num")
        df = indexer.fit(df).transform(df)
        df = df.drop(column)

In [12]:
df.printSchema()

root
 |-- A1: integer (nullable = true)
 |-- A2: integer (nullable = true)
 |-- A3: integer (nullable = true)
 |-- A4: integer (nullable = true)
 |-- A5: integer (nullable = true)
 |-- A6: integer (nullable = true)
 |-- A7: integer (nullable = true)
 |-- A8: integer (nullable = true)
 |-- A9: integer (nullable = true)
 |-- A10: integer (nullable = true)
 |-- Age_Mons: integer (nullable = true)
 |-- Qchat-10-Score: integer (nullable = true)
 |-- label: double (nullable = false)
 |-- Sex_num: double (nullable = false)
 |-- Ethnicity_num: double (nullable = false)
 |-- Jaundice_num: double (nullable = false)
 |-- Family_mem_with_ASD_num: double (nullable = false)
 |-- Who completed the test_num: double (nullable = false)



# Vectorizing the features

In [13]:
feature_cols = df.columns
feature_cols.remove("label")

assembler = VectorAssembler(inputCols = feature_cols, outputCol = "features") # instantiate the VectorAssembler class
df = assembler.transform(df).select('features','label')

# MinMax Scaling

In [14]:
scaler = MinMaxScaler(inputCol = 'features', outputCol = 'scaled_features', min = 0, max = 1000)
df = scaler.fit(df).transform(df).drop('features').withColumnRenamed('scaled_features','features').select('features','label')

# Train-Test-Split

In [15]:
train, test = df.randomSplit([0.7,0.3])

# Importing required modules for ML models

In [16]:
from pyspark.ml.classification import *
from pyspark.ml.evaluation import *
from pyspark.sql.functions import *
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Evaluators

In [17]:
binary_evaluator = BinaryClassificationEvaluator(rawPredictionCol = "prediction")
MC_evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

# Logistic Regression (Simple)

In [18]:
classifier_lr = LogisticRegression() # instantiate the model
fitModel_lr = classifier_lr.fit(train)
prediction_lr = fitModel_lr.transform(test)
accuracy_lr = binary_evaluator.evaluate(prediction_lr)

# Logistic Regression + Cross Validation

In [19]:
classifier = LogisticRegression() # instantiate the model
param_grid = ParamGridBuilder().addGrid(classifier.maxIter,[10,15,20]).build()
cross_val = CrossValidator(estimator = classifier, 
                           estimatorParamMaps = param_grid, 
                           evaluator = MC_evaluator,
                           numFolds = 2)
fitModel_lr_cv = cross_val.fit(train)
bestModel_lr_cv = fitModel_lr_cv.bestModel
prediction_lr_cv = bestModel_lr_cv.transform(test)
accuracy_lr_cv = binary_evaluator.evaluate(prediction_lr_cv)

## Model parameters

In [20]:
print("intercept vector : {}".format(bestModel_lr_cv.interceptVector)) # b
print("coefficient matrix : {}".format(bestModel_lr_cv.coefficientMatrix.values)) # w
print("size of coefficient matrix: {}".format(bestModel_lr_cv.coefficientMatrix.values.size)) # y = X*w + b

intercept vector : [30.060869474027093]
coefficient matrix : [-6.28275248e-03 -6.46228179e-03 -6.52060181e-03 -6.57498595e-03
 -6.57408183e-03 -6.59697182e-03 -6.57001553e-03 -6.38239866e-03
 -6.90670261e-03 -6.51317152e-03 -5.92170795e-04 -1.87971824e-02
  8.12299853e-05 -9.72707493e-04 -3.63403933e-04  5.46948541e-04
 -4.63015354e-03]
size of coefficient matrix: 17


In [21]:
coeff_dict = {}
for i in range(17):
    coeff_dict[feature_cols[i]] = bestModel_lr_cv.coefficientMatrix.values[i]
coeff_df = pd.DataFrame(coeff_dict.values(), index = coeff_dict.keys(),columns = ["Coefficient"])
coeff_df.head(20)

Unnamed: 0,Coefficient
A1,-0.006283
A2,-0.006462
A3,-0.006521
A4,-0.006575
A5,-0.006574
A6,-0.006597
A7,-0.00657
A8,-0.006382
A9,-0.006907
A10,-0.006513


In [22]:
bestModel_lr_cv.summary.predictions.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|          (17,[],[])|  1.0|[-30.060869474027...|[8.80501836673220...|       1.0|
|(17,[0,1,2,3,4,5,...|  0.0|[45.7273871772886...|           [1.0,0.0]|       0.0|
|(17,[0,1,2,3,4,5,...|  0.0|[37.5898886293620...|           [1.0,0.0]|       0.0|
|(17,[0,1,2,3,4,5,...|  0.0|[29.5585593148317...|[0.99999999999985...|       0.0|
|(17,[0,1,2,3,4,5,...|  0.0|[29.3682909659193...|[0.99999999999982...|       0.0|
|(17,[0,1,2,3,4,6,...|  0.0|[29.3439861600389...|[0.99999999999981...|       0.0|
|(17,[0,1,2,3,4,6,...|  0.0|[37.3002399899054...|           [1.0,0.0]|       0.0|
|(17,[0,1,2,3,4,6,...|  0.0|[29.6780217573680...|[0.99999999999987...|       0.0|
|(17,[0,1,2,3,4,9,...|  0.0|[21.0196600966124...|[0.99999999925650...|       0.0|
|(17,[0,1,2,3,5,

In [23]:
bestModel_lr_cv.summary.objectiveHistory

[0.616580115847775,
 0.5115461139458507,
 0.4463288722949394,
 0.27818593629176064,
 0.2521447969907854,
 0.21423165163412697,
 0.19429642293276117,
 0.1726094498882126,
 0.13890882003001176,
 0.08497330773467081,
 0.03879752528276039,
 0.021816564529232037,
 0.014295431281499942,
 0.0074907102283146565,
 0.005952551671438286,
 0.002979112365691491]

In [24]:
bestModel_lr_cv.summary.accuracy

1.0

In [25]:
for i, rate in enumerate(bestModel_lr_cv.summary.falsePositiveRateByLabel):
    print("false positive of label_{} : {}".format(i,rate))
    
for i, rate in enumerate(bestModel_lr_cv.summary.truePositiveRateByLabel):
    print("true positive of label_{} : {}".format(i,rate))
    
for i, prec in enumerate(bestModel_lr_cv.summary.precisionByLabel):
    print("precision by label for label_{} : {}".format(i,prec))
    
for i, rec in enumerate(bestModel_lr_cv.summary.recallByLabel):
    print("recall by label for label_{} : {}".format(i,rec))
    
for i, f in enumerate(bestModel_lr_cv.summary.fMeasureByLabel()):
    print("F-measure by label for label_{} : {}".format(i,f))
    
print("The accuracy of this regression model was: {0:.0%}".format(bestModel_lr_cv.summary.accuracy))

false positive of label_0 : 0.0
false positive of label_1 : 0.0
true positive of label_0 : 1.0
true positive of label_1 : 1.0
precision by label for label_0 : 1.0
precision by label for label_1 : 1.0
recall by label for label_0 : 1.0
recall by label for label_1 : 1.0
F-measure by label for label_0 : 1.0
F-measure by label for label_1 : 1.0
The accuracy of this regression model was: 100%


# MLP

## extracting the number of features and classes

In [26]:
num_features = len(df.select(['features']).collect()[0][0])
num_classes = df.select(countDistinct('label')).collect()[0][0]
print("The number of features = {}".format(num_features))
print("The number of classes = {}".format(num_classes))

The number of features = 17
The number of classes = 2


In [27]:
N1 = 128
N2 = 32
layers = [num_features,N1,N2,num_classes]
classifier_mlp = MultilayerPerceptronClassifier(maxIter = 100, layers = layers, seed = 1234, blockSize= 128)
fitModel_mlp = classifier_mlp.fit(train)
predictions_mlp = fitModel_mlp.transform(test)
accuracy_mlp = MC_evaluator.evaluate(predictions_mlp)
print("The accuracy of this model was : {0:.0%}".format(accuracy_mlp))

The accuracy of this model was : 96%


# Naive Bayes

In [28]:
classifier_nb = NaiveBayes()
fitModel_nb = classifier_nb.fit(train)
predictions_nb = fitModel_nb.transform(test)
accuracy_nb = MC_evaluator.evaluate(predictions_nb)
print("The accuracy of this model was : {0:.0%}".format(accuracy_nb))

The accuracy of this model was : 86%


# Linear Support Vector Machine

In [29]:
classifier_svc = LinearSVC()
fitModel_svc = classifier_svc.fit(train)
predictions_svc = fitModel_svc.transform(test)
accuracy_svc = MC_evaluator.evaluate(predictions_svc)
print("The accuracy of this model was {0:.0%}".format(accuracy_svc))

The accuracy of this model was 100%


# Decision Tree

In [30]:
classifier_dt = DecisionTreeClassifier()
fitModel_dt = classifier_dt.fit(train)
predictions_dt = fitModel_dt.transform(test)
accuracy_dt = binary_evaluator.evaluate(predictions_dt)
print("The accuracy of this model is {0:.0%}".format(accuracy_dt))

The accuracy of this model is 100%


# Random Forest

In [31]:
classifier_rf = RandomForestClassifier()
fitModel_rf = classifier_rf.fit(train)
predictions_rf = fitModel_rf.transform(test)
accuracy_rf = binary_evaluator.evaluate(predictions_rf)
print("The accuracy of this model was {0:.0%}".format(accuracy_rf))

The accuracy of this model was 100%


# Gradient Boost Tree

In [32]:
classifier_gbt = GBTClassifier()
fitModel = classifier.fit(train)
predictions_gbt = fitModel.transform(test)
accuracy_gbt = binary_evaluator.evaluate(predictions_gbt)
print("The accuracy of this model was {0:.0%}".format(accuracy_gbt))

The accuracy of this model was 100%


# Confusion Matrix

In [44]:
from pyspark.mllib.evaluation import MulticlassMetrics
preds_and_labels = predictions_gbt.select('prediction',predictions_gbt.label.cast(FloatType())).orderBy('prediction')
metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))
cm = metrics.confusionMatrix().toArray()