In [1]:
import findspark
findspark.init()
findspark.find()
import pyspark
findspark.find()

'C:\\spark\\spark-3.0.3-bin-hadoop2.7'

In [2]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [3]:
sc = SparkContext(appName = "MedicalData")

In [4]:
! echo $PYSPARK_SUBMIT_ARGS

$PYSPARK_SUBMIT_ARGS


In [5]:
spark = SparkSession.builder.appName('MedicalData').getOrCreate()

In [6]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StringIndexer

In [7]:
#Load a text file and convert each line to a Row

data = []
with open("wdbc.data") as infile:
    for line in infile:
        tokens = line.rstrip("\n").split(",")
        y = tokens[1]
        features = Vectors.dense([float(x) for x in tokens[2:]])
        
        data.append((y, features))

In [8]:
inputDF = spark.createDataFrame(data, ["label", "features"])

In [9]:
inputDF.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    M|[17.99,10.38,122....|
|    M|[20.57,17.77,132....|
|    M|[19.69,21.25,130....|
|    M|[11.42,20.38,77.5...|
|    M|[20.29,14.34,135....|
|    M|[12.45,15.7,82.57...|
|    M|[18.25,19.98,119....|
|    M|[13.71,20.83,90.2...|
|    M|[13.0,21.82,87.5,...|
|    M|[12.46,24.04,83.9...|
|    M|[16.02,23.24,102....|
|    M|[15.78,17.89,103....|
|    M|[19.17,24.8,132.4...|
|    M|[15.85,23.95,103....|
|    M|[13.73,22.61,93.6...|
|    M|[14.54,27.54,96.7...|
|    M|[14.68,20.13,94.7...|
|    M|[16.13,20.68,108....|
|    M|[19.81,22.15,130....|
|    B|[13.54,14.36,87.4...|
+-----+--------------------+
only showing top 20 rows



In [10]:
stringIndexer = StringIndexer(inputCol = "label", outputCol = "labelIndexed")
si_model = stringIndexer.fit(inputDF)
inputDF2 = si_model.transform(inputDF)

In [11]:
inputDF2.show()

+-----+--------------------+------------+
|label|            features|labelIndexed|
+-----+--------------------+------------+
|    M|[17.99,10.38,122....|         1.0|
|    M|[20.57,17.77,132....|         1.0|
|    M|[19.69,21.25,130....|         1.0|
|    M|[11.42,20.38,77.5...|         1.0|
|    M|[20.29,14.34,135....|         1.0|
|    M|[12.45,15.7,82.57...|         1.0|
|    M|[18.25,19.98,119....|         1.0|
|    M|[13.71,20.83,90.2...|         1.0|
|    M|[13.0,21.82,87.5,...|         1.0|
|    M|[12.46,24.04,83.9...|         1.0|
|    M|[16.02,23.24,102....|         1.0|
|    M|[15.78,17.89,103....|         1.0|
|    M|[19.17,24.8,132.4...|         1.0|
|    M|[15.85,23.95,103....|         1.0|
|    M|[13.73,22.61,93.6...|         1.0|
|    M|[14.54,27.54,96.7...|         1.0|
|    M|[14.68,20.13,94.7...|         1.0|
|    M|[16.13,20.68,108....|         1.0|
|    M|[19.81,22.15,130....|         1.0|
|    B|[13.54,14.36,87.4...|         0.0|
+-----+--------------------+------

In [12]:
#train/test split
(trainingData, testData) = inputDF2.randomSplit([0.7,0.3], seed = 23)

In [13]:
#Training Decision Tree
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [14]:
decisionTree = DecisionTreeClassifier(labelCol = "labelIndexed")

In [15]:
dtModel = decisionTree.fit(trainingData)

In [16]:
dtModel.numNodes

27

In [17]:
dtModel.depth

5

In [18]:
dtModel.featureImportances

SparseVector(30, {0: 0.0073, 1: 0.0565, 2: 0.0082, 3: 0.0131, 5: 0.0219, 10: 0.0105, 20: 0.7509, 21: 0.018, 24: 0.0134, 26: 0.0106, 27: 0.0897})

In [19]:
dtModel.numFeatures

30

In [20]:
print (dtModel.toDebugString)

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_b7047f737c92, depth=5, numNodes=27, numClasses=2, numFeatures=30
  If (feature 20 <= 16.765)
   If (feature 27 <= 0.15635)
    If (feature 3 <= 696.25)
     If (feature 24 <= 0.179)
      Predict: 0.0
     Else (feature 24 > 0.179)
      If (feature 0 <= 9.7485)
       Predict: 0.0
      Else (feature 0 > 9.7485)
       Predict: 1.0
    Else (feature 3 > 696.25)
     If (feature 1 <= 13.295)
      Predict: 0.0
     Else (feature 1 > 13.295)
      Predict: 1.0
   Else (feature 27 > 0.15635)
    If (feature 21 <= 23.215)
     If (feature 2 <= 86.945)
      Predict: 0.0
     Else (feature 2 > 86.945)
      Predict: 1.0
    Else (feature 21 > 23.215)
     Predict: 1.0
  Else (feature 20 > 16.765)
   If (feature 1 <= 14.805)
    If (feature 5 <= 0.13035)
     Predict: 0.0
    Else (feature 5 > 0.13035)
     Predict: 1.0
   Else (feature 1 > 14.805)
    If (feature 10 <= 0.18159999999999998)
     Predict: 0.0
    Else (feature 10 > 

In [21]:
predictions = dtModel.transform(testData)

In [22]:
predictions.select('label','labelIndexed','probability','prediction').show()

+-----+------------+--------------------+----------+
|label|labelIndexed|         probability|prediction|
+-----+------------+--------------------+----------+
|    B|         0.0|[0.97609561752988...|       0.0|
|    B|         0.0|[0.97609561752988...|       0.0|
|    B|         0.0|[0.97609561752988...|       0.0|
|    B|         0.0|[0.97609561752988...|       0.0|
|    B|         0.0|[0.97609561752988...|       0.0|
|    B|         0.0|[0.97609561752988...|       0.0|
|    B|         0.0|[0.97609561752988...|       0.0|
|    B|         0.0|[0.97609561752988...|       0.0|
|    B|         0.0|[0.97609561752988...|       0.0|
|    B|         0.0|[0.97609561752988...|       0.0|
|    B|         0.0|[0.97609561752988...|       0.0|
|    B|         0.0|[0.97609561752988...|       0.0|
|    B|         0.0|[0.97609561752988...|       0.0|
|    B|         0.0|[0.97609561752988...|       0.0|
|    B|         0.0|[0.97609561752988...|       0.0|
|    B|         0.0|[0.97609561752988...|     

In [23]:
evaluator = MulticlassClassificationEvaluator(
    labelCol="labelIndexed", predictionCol="prediction", metricName="accuracy")

In [24]:
accuracy = evaluator.evaluate(predictions)
print("Accuracy = %g " % (accuracy))

Accuracy = 0.948052 


In [25]:
print("Test Error = %g " % (1.0 - accuracy))

Test Error = 0.0519481 


In [26]:
# Random Forest
from pyspark.ml.classification import RandomForestClassifier

In [27]:
rfClassifier = RandomForestClassifier(labelCol="labelIndexed", numTrees = 100)

In [28]:
rfModel = rfClassifier.fit(trainingData)

In [29]:
rfModel.featureImportances

SparseVector(30, {0: 0.0712, 1: 0.0192, 2: 0.0454, 3: 0.0332, 4: 0.0054, 5: 0.0047, 6: 0.0342, 7: 0.0808, 8: 0.0034, 9: 0.0037, 10: 0.0215, 11: 0.0033, 12: 0.0096, 13: 0.0247, 14: 0.0057, 15: 0.0021, 16: 0.0075, 17: 0.0037, 18: 0.0031, 19: 0.004, 20: 0.1208, 21: 0.0165, 22: 0.1459, 23: 0.1374, 24: 0.0107, 25: 0.006, 26: 0.0427, 27: 0.1198, 28: 0.0082, 29: 0.0052})

In [30]:
rfModel.toDebugString

'RandomForestClassificationModel: uid=RandomForestClassifier_aceb7e7656fe, numTrees=100, numClasses=2, numFeatures=30\n  Tree 0 (weight 1.0):\n    If (feature 22 <= 113.15)\n     If (feature 27 <= 0.11065)\n      If (feature 13 <= 48.565)\n       Predict: 0.0\n      Else (feature 13 > 48.565)\n       If (feature 7 <= 0.03758)\n        Predict: 1.0\n       Else (feature 7 > 0.03758)\n        Predict: 0.0\n     Else (feature 27 > 0.11065)\n      If (feature 29 <= 0.07428)\n       Predict: 1.0\n      Else (feature 29 > 0.07428)\n       If (feature 26 <= 0.31375)\n        Predict: 0.0\n       Else (feature 26 > 0.31375)\n        If (feature 1 <= 19.875)\n         Predict: 0.0\n        Else (feature 1 > 19.875)\n         Predict: 1.0\n    Else (feature 22 > 113.15)\n     If (feature 7 <= 0.04289)\n      If (feature 7 <= 0.03419)\n       If (feature 24 <= 0.09376000000000001)\n        Predict: 0.0\n       Else (feature 24 > 0.09376000000000001)\n        Predict: 1.0\n      Else (feature 7 > 

In [31]:
predictions = rfModel.transform(testData)

In [32]:
predictions.select('label','labelIndexed','prediction').show()

+-----+------------+----------+
|label|labelIndexed|prediction|
+-----+------------+----------+
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
+-----+------------+----------+
only showing top 20 rows



In [33]:
evaluator = MulticlassClassificationEvaluator(
    labelCol="labelIndexed", predictionCol="prediction", metricName="accuracy")

In [34]:
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g " % (1.0 - accuracy))

Test Error = 0.025974 
