<a href="https://colab.research.google.com/github/ValenciaN6/ADH_ClassTest/blob/master/Copy_of_trainDataAndVisualizeItUsingPyspark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler
#from pyspark.ml.regression import LinearRegression
#from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import when, col
from pyspark.ml.classification import LinearSVC, LinearSVCModel
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Start a Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Breast cancer")
    .getOrCreate()
)

# Load the CSV into a DataFrame. The first row is treated as the header and
# column types are inferred from the values.
# NOTE(review): this is an absolute path on one developer's machine; it has to
# be edited before the notebook can run anywhere else.
filePath = "/Users/valenciamagagane/Documents/pythond/breast-cancer.csv"
data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(filePath)
)

# Sanity check: preview the first five rows and print the inferred schema.
data.show(n=5)
data.printSchema()









In [None]:
# Drop unnecessary columns: anything named "id" (case-insensitive) is removed
# so that it cannot be selected as a feature below.
drop_cols = list(filter(lambda name: name.lower() == 'id', data.columns))
data = data.drop(*drop_cols)

# Build the binary target: 1.0 where diagnosis == 'M', 0.0 for every other
# value (presumably 'M' marks the positive class -- confirm against the data).
diagnosis_is_m = col('diagnosis') == 'M'
data = data.withColumn('label', when(diagnosis_is_m, 1.0).otherwise(0.0))

# Feature columns are all int/double/float columns, excluding the target
# ('label') and the column it was derived from ('diagnosis').
numeric_types = ('int', 'double', 'float')
non_feature_cols = ('label', 'diagnosis')
feature_cols = [
    name
    for name, dtype in data.dtypes
    if dtype in numeric_types and name not in non_feature_cols
]


In [None]:
# Stage 1: assemble the selected feature columns into one vector column.
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol='feature_unscaled',
)

# Stage 2: standardize the assembled vector (mean-centering and scaling by the
# standard deviation are both enabled) and write it to 'features'.
scaler = StandardScaler(
    inputCol='feature_unscaled',
    outputCol='features',
    withMean=True,
    withStd=True,
)

# Stage 3: linear support-vector classifier trained on the scaled features.
svc = LinearSVC(
    featuresCol='features',
    labelCol='label',
    maxIter=100,
)

# Chain the three stages so they are always fitted and applied together.
pipeline_stages = [assembler, scaler, svc]
pipeline = Pipeline(stages=pipeline_stages)

In [None]:
# Split the data into training and held-out test sets, weighted 0.8 / 0.2.
# The fixed seed keeps the split reproducible between runs.
split_weights = [0.8, 0.2]
train, test = data.randomSplit(split_weights, seed=42)

In [None]:
# Hyper-parameter grid for the SVC stage: every combination of the four
# regParam values and the two maxIter values (8 candidates in total).
grid = (
    ParamGridBuilder()
    .addGrid(svc.regParam, [0.001, 0.01, 0.1, 1.0])
    .addGrid(svc.maxIter, [50, 100])
    .build()
)


In [None]:
# Model selection and evaluation.
#
# Cross-validate the pipeline over `grid` using area under the ROC curve as
# the selection metric, then score the winning model on the held-out `test`
# split and print a summary of the metrics.
evaluator = BinaryClassificationEvaluator(labelCol='label', metricName='areaUnderROC')

cv = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=grid,
    evaluator=evaluator,
    numFolds=5,
    parallelism=2,  # evaluate up to two candidate models at a time
)

cvModel = cv.fit(train)

# bestModel is the fitted pipeline for the winning parameter combination;
# pull out its fitted LinearSVC stage to report the chosen hyper-parameters.
bestModel = cvModel.bestModel
bestSVC = next(s for s in bestModel.stages if isinstance(s, LinearSVCModel))

print("Best regParam:", bestSVC.getRegParam())
print("Best maxIter:", bestSVC.getMaxIter())

# Cache the scored test set: it is consumed by six separate actions below
# (one show plus five metric evaluations). Without caching, each action would
# re-run the whole pipeline transform on `test`.
predictions = bestModel.transform(test).cache()
predictions.select("label", "prediction", "rawPrediction").show(10, truncate=False)

# compute evaluation metrics
auc = evaluator.evaluate(predictions)

# One evaluator per metric, built from a single template instead of four
# copy-pasted constructor calls.
multiclass_metrics = {
    metric: MulticlassClassificationEvaluator(
        labelCol="label",
        predictionCol="prediction",
        metricName=metric,
    ).evaluate(predictions)
    for metric in ("accuracy", "weightedPrecision", "weightedRecall", "f1")
}
acc = multiclass_metrics["accuracy"]
# NOTE: the values printed as "Precision" and "Recall" are the *weighted*
# variants (weightedPrecision / weightedRecall), not the scores of the
# positive class alone.
prec = multiclass_metrics["weightedPrecision"]
rec = multiclass_metrics["weightedRecall"]
f1 = multiclass_metrics["f1"]

print(f"AUC: {auc:.4f}, Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, F1: {f1:.4f}")

# Shut the session down; nothing that needs Spark can run after this line.
spark.stop()