In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *

In [2]:
spark = SparkSession.builder.appName("demo").getOrCreate()

In [3]:
schema = StructType([
    StructField("age", IntegerType(),nullable=True),
    StructField("sex", IntegerType(),nullable=True),
    StructField("cp", IntegerType(),nullable=True),
    StructField("trestbps", IntegerType(),nullable=True),
    StructField("chol", IntegerType(),nullable=True),
    StructField("fbs", IntegerType(),nullable=True),
    StructField("restecg", IntegerType(),nullable=True),
    StructField("thalach", IntegerType(),nullable=True),
    StructField("exang", IntegerType(),nullable=True),
    StructField("oldpeak", FloatType(),nullable=True),
    StructField("slope", IntegerType(),nullable=True),
    StructField("ca", IntegerType(),nullable=True),
    StructField("thal", IntegerType(),nullable=True),
    StructField("target", IntegerType(),nullable=True),
])

In [4]:
path = "heart.csv"
df = spark.read.csv(path=path,schema=schema,header=True)

In [5]:
df.show(5)

+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
|age|sex| cp|trestbps|chol|fbs|restecg|thalach|exang|oldpeak|slope| ca|thal|target|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
| 63|  1|  3|     145| 233|  1|      0|    150|    0|    2.3|    0|  0|   1|     1|
| 37|  1|  2|     130| 250|  0|      1|    187|    0|    3.5|    0|  0|   2|     1|
| 41|  0|  1|     130| 204|  0|      0|    172|    0|    1.4|    2|  0|   2|     1|
| 56|  1|  1|     120| 236|  0|      1|    178|    0|    0.8|    2|  0|   2|     1|
| 57|  0|  0|     120| 354|  0|      1|    163|    1|    0.6|    2|  0|   2|     1|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
only showing top 5 rows



In [6]:
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- sex: integer (nullable = true)
 |-- cp: integer (nullable = true)
 |-- trestbps: integer (nullable = true)
 |-- chol: integer (nullable = true)
 |-- fbs: integer (nullable = true)
 |-- restecg: integer (nullable = true)
 |-- thalach: integer (nullable = true)
 |-- exang: integer (nullable = true)
 |-- oldpeak: float (nullable = true)
 |-- slope: integer (nullable = true)
 |-- ca: integer (nullable = true)
 |-- thal: integer (nullable = true)
 |-- target: integer (nullable = true)



In [7]:
for i in df.columns:
    print(i," ",":",df.stat.corr(i,"target"))

age   : -0.22543871587483838
sex   : -0.28093657550176687
cp   : 0.4337982615068946
trestbps   : -0.14493112849775
chol   : -0.08523910513756904
fbs   : -0.02804576027271302
restecg   : 0.1372295028737732
thalach   : 0.4217409338106742
exang   : -0.43675708335330315
oldpeak   : -0.4306960030062106
slope   : 0.34587707824172464
ca   : -0.39172399235125244
thal   : -0.34402926803830997
target   : 1.0


In [8]:
data = df[['age', 'trestbps', 'chol', 'thalach', 'oldpeak','sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal','target']]

In [9]:
data.show(5)

+---+--------+----+-------+-------+---+---+---+-------+-----+-----+---+----+------+
|age|trestbps|chol|thalach|oldpeak|sex| cp|fbs|restecg|exang|slope| ca|thal|target|
+---+--------+----+-------+-------+---+---+---+-------+-----+-----+---+----+------+
| 63|     145| 233|    150|    2.3|  1|  3|  1|      0|    0|    0|  0|   1|     1|
| 37|     130| 250|    187|    3.5|  1|  2|  0|      1|    0|    0|  0|   2|     1|
| 41|     130| 204|    172|    1.4|  0|  1|  0|      0|    0|    2|  0|   2|     1|
| 56|     120| 236|    178|    0.8|  1|  1|  0|      1|    0|    2|  0|   2|     1|
| 57|     120| 354|    163|    0.6|  0|  0|  0|      1|    1|    2|  0|   2|     1|
+---+--------+----+-------+-------+---+---+---+-------+-----+-----+---+----+------+
only showing top 5 rows



In [10]:
from pyspark.ml.feature import *

In [11]:
feature = VectorAssembler(inputCols = df.columns[:len(df.columns)-1],outputCol="features")
feature_vector= feature.transform(df)

In [12]:
feature_vector.show(5)

+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+--------------------+
|age|sex| cp|trestbps|chol|fbs|restecg|thalach|exang|oldpeak|slope| ca|thal|target|            features|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+--------------------+
| 63|  1|  3|     145| 233|  1|      0|    150|    0|    2.3|    0|  0|   1|     1|[63.0,1.0,3.0,145...|
| 37|  1|  2|     130| 250|  0|      1|    187|    0|    3.5|    0|  0|   2|     1|[37.0,1.0,2.0,130...|
| 41|  0|  1|     130| 204|  0|      0|    172|    0|    1.4|    2|  0|   2|     1|[41.0,0.0,1.0,130...|
| 56|  1|  1|     120| 236|  0|      1|    178|    0|    0.8|    2|  0|   2|     1|[56.0,1.0,1.0,120...|
| 57|  0|  0|     120| 354|  0|      1|    163|    1|    0.6|    2|  0|   2|     1|[57.0,0.0,0.0,120...|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+--------------------+
only showing top 5 rows



In [13]:
pandasDF = feature_vector.toPandas()

In [76]:
feature_vector_select = feature_vector.select(['features','target'])

In [77]:
(x_train, x_test) = feature_vector_select.randomSplit([0.8, 0.2])

In [85]:
type(x_train)

pyspark.sql.dataframe.DataFrame

In [78]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

In [79]:
rf = RandomForestClassifier(labelCol='target',featuresCol="features")

In [80]:
model = rf.fit(x_train)

In [81]:
prediction = model.transform(x_test)
prediction.show(5)

+--------------------+------+--------------------+--------------------+----------+
|            features|target|       rawPrediction|         probability|prediction|
+--------------------+------+--------------------+--------------------+----------+
|(13,[0,2,3,4,7,10...|     1|[1.14760664292285...|[0.05738033214614...|       1.0|
|(13,[0,3,4,7,10,1...|     0|[0.98355852369041...|[0.04917792618452...|       1.0|
|[34.0,0.0,1.0,118...|     1|[1.05972505180352...|[0.05298625259017...|       1.0|
|[34.0,1.0,3.0,118...|     1|[2.91670752319277...|[0.14583537615963...|       1.0|
|[37.0,0.0,2.0,120...|     1|[1.05972505180352...|[0.05298625259017...|       1.0|
+--------------------+------+--------------------+--------------------+----------+
only showing top 5 rows



In [82]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [83]:
print('Accuracy: ', MulticlassClassificationEvaluator(labelCol='target',predictionCol="prediction",metricName='accuracy').evaluate(prediction))

Accuracy:  0.8676470588235294


In [20]:
paramGrid = ParamGridBuilder()\
                                .addGrid(rf.maxDepth, [5, 10, 20]) \
                                .addGrid(rf.maxBins, [20, 32, 50]) \
                                .addGrid(rf.numTrees, [20, 40, 60 ]) \
                                .addGrid(rf.impurity, ["gini", "entropy"]) \
                                .addGrid(rf.minInstancesPerNode, [1, 5, 10]) \
                                .build()

In [23]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
tvs = TrainValidationSplit( estimator=rf
                           ,estimatorParamMaps=paramGrid
                           ,evaluator=MulticlassClassificationEvaluator(labelCol='target')
                           ,trainRatio=0.8)
model = tvs.fit(x_train)
model_predictions= model.transform(x_test)


print('Accuracy: ', MulticlassClassificationEvaluator(labelCol='target',predictionCol="prediction",metricName='accuracy').evaluate(model_predictions))

Py4JJavaError: An error occurred while calling o64752.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 4243.0 failed 1 times, most recent failure: Lost task 0.0 in stage 4243.0 (TID 4030) (DESKTOP-ONEK9CT executor driver): java.io.IOException: fail to rename file C:\Users\LENOVO\AppData\Local\Temp\blockmgr-40784e38-8652-41bc-ad5a-0b9ec51b9f58\1a\shuffle_1798_4030_0.data.d219e88a-a39c-4696-9ed6-be734edb9fb7 to C:\Users\LENOVO\AppData\Local\Temp\blockmgr-40784e38-8652-41bc-ad5a-0b9ec51b9f58\1a\shuffle_1798_4030_0.data
	at org.apache.spark.shuffle.IndexShuffleBlockResolver.writeMetadataFileAndCommit(IndexShuffleBlockResolver.scala:385)
	at org.apache.spark.shuffle.sort.io.LocalDiskShuffleMapOutputWriter.commitAllPartitions(LocalDiskShuffleMapOutputWriter.java:119)
	at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:71)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	at java.lang.Thread.run(Unknown Source)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2454)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2403)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2402)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2402)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1160)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1160)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1160)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2642)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2584)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2573)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:938)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2214)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2235)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2254)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2279)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1030)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1029)
	at org.apache.spark.rdd.PairRDDFunctions.$anonfun$collectAsMap$1(PairRDDFunctions.scala:737)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.PairRDDFunctions.collectAsMap(PairRDDFunctions.scala:736)
	at org.apache.spark.ml.tree.impl.RandomForest$.findBestSplits(RandomForest.scala:663)
	at org.apache.spark.ml.tree.impl.RandomForest$.runBagged(RandomForest.scala:208)
	at org.apache.spark.ml.tree.impl.RandomForest$.run(RandomForest.scala:302)
	at org.apache.spark.ml.classification.RandomForestClassifier.$anonfun$train$1(RandomForestClassifier.scala:161)
	at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
	at org.apache.spark.ml.classification.RandomForestClassifier.train(RandomForestClassifier.scala:138)
	at org.apache.spark.ml.classification.RandomForestClassifier.train(RandomForestClassifier.scala:46)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:151)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:115)
	at sun.reflect.GeneratedMethodAccessor162.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Unknown Source)
Caused by: java.io.IOException: fail to rename file C:\Users\LENOVO\AppData\Local\Temp\blockmgr-40784e38-8652-41bc-ad5a-0b9ec51b9f58\1a\shuffle_1798_4030_0.data.d219e88a-a39c-4696-9ed6-be734edb9fb7 to C:\Users\LENOVO\AppData\Local\Temp\blockmgr-40784e38-8652-41bc-ad5a-0b9ec51b9f58\1a\shuffle_1798_4030_0.data
	at org.apache.spark.shuffle.IndexShuffleBlockResolver.writeMetadataFileAndCommit(IndexShuffleBlockResolver.scala:385)
	at org.apache.spark.shuffle.sort.io.LocalDiskShuffleMapOutputWriter.commitAllPartitions(LocalDiskShuffleMapOutputWriter.java:119)
	at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:71)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	... 1 more
