In [1]:
from pyspark.mllib.regression import LabeledPoint
from pyspark import SparkContext
import time

sc = SparkContext(appName="mlModels")

Helper Functions

In [2]:
# Load and parse the data
def parsePoint(line):
    values = [float(x) for x in line.split(' ')]
    return LabeledPoint(values[0], values[1:])
#Evaluate
def evaluate(model,trainingData,testData):
    predictions = model.predict(trainingData.map(lambda x: x.features))
    labelsAndPredictions = trainingData.map(lambda lp: lp.label).zip(predictions)
    trainErr = labelsAndPredictions.filter(
        lambda lp: lp[0] != lp[1]).count() / float(trainingData.count())
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(
        lambda lp: lp[0] != lp[1]).count() / float(testData.count())
    return (trainErr,testErr)

Process Data

In [3]:
data = sc.textFile("blood_data.txt")
parsedData = data.map(parsePoint)
(trainingData, testData) = parsedData.randomSplit([0.8, 0.2])

SVM with SGD

In [4]:
from pyspark.mllib.classification import SVMWithSGD, SVMModel
start=time.time()
model = SVMWithSGD.train(trainingData, iterations=100)
print(time.time()-start)
evaluate(model,trainingData,testData)

10.08607792854309


(0.22857142857142856, 0.27450980392156865)

Decisiont Tree

In [5]:
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
start=time.time()
model = DecisionTree.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)
print(time.time()-start)
evaluate(model,trainingData,testData)

5.810112476348877


(0.1949579831932773, 0.20261437908496732)

Random Forest

In [6]:
from pyspark.mllib.tree import RandomForest, RandomForestModel
model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                     numTrees=3, featureSubsetStrategy="auto",
                                     impurity='gini', maxDepth=4, maxBins=32)
evaluate(model,trainingData,testData)

(0.19642857142857142, 0.25757575757575757)

Gradient Boosted Trees

In [7]:
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
model = GradientBoostedTrees.trainClassifier(parsedData,
                                             categoricalFeaturesInfo={}, numIterations=3)
evaluate(model,trainingData,testData)

(0.19155844155844157, 0.22727272727272727)

In [8]:
dat=evaluate(model,trainingData,testData)

In [9]:
import pandas as pd
pd.DataFrame(dat).to_csv('resutls.txt',index=False,header=False)