# Classification via Neural Network

In [None]:
from helpers.helper_functions import translate_to_file_string
from pyspark.ml import Pipeline
from pyspark.ml.classification import MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import IndexToString, StringIndexer, VectorAssembler, VectorIndexer, StandardScaler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql import DataFrameReader
from pyspark.sql import SparkSession

In [None]:
# Location of the churn data set, passed through the project helper
# translate_to_file_string (presumably it turns the relative path into a form
# Spark can read -- confirm in helpers.helper_functions).
inputFile = translate_to_file_string("../data/churn.csv")

In [None]:
# Create (or reuse) the SparkSession for this notebook. The application name
# now reflects what the notebook does (neural-network classification) instead
# of the copy-pasted "ChurnClustering".
spark = (
    SparkSession.builder
    .appName("ChurnNeuralNetworkClassification")
    .getOrCreate()
)

# Create a DataFrame using an inferred schema: the CSV has a header row and
# uses ';' as the field delimiter.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ";")
    .csv(inputFile)
)

In [None]:
# Data preparation: 60/40 train/test split with a fixed seed
splits = df.randomSplit([0.6, 0.4], 1234)
train, test = splits

# Turn the string label column into a numeric index; this indexer is fitted
# right away on the complete DataFrame
labelIndexer = StringIndexer(inputCol="LEAVE", outputCol="label").fit(df)

# Unfitted indexers for the categorical string columns
collegeIndexer = StringIndexer(
    inputCol="COLLEGE",
    outputCol="COLLEGE_NUM",
)
satIndexer = StringIndexer(
    inputCol="REPORTED_SATISFACTION",
    outputCol="REPORTED_SATISFACTION_NUM",
)
usageIndexer = StringIndexer(
    inputCol="REPORTED_USAGE_LEVEL",
    outputCol="REPORTED_USAGE_LEVEL_NUM",
)
changeIndexer = StringIndexer(
    inputCol="CONSIDERING_CHANGE_OF_PLAN",
    outputCol="CONSIDERING_CHANGE_OF_PLAN_NUM",
)

In [None]:
# Build the list of feature columns: start from every column of the
# DataFrame, drop the label and the raw categorical columns, then append the
# indexed (numeric) versions of the categorical columns.
featureCols = list(df.columns)
for droppedCol in (
    "LEAVE",
    "COLLEGE",
    "REPORTED_SATISFACTION",
    "REPORTED_USAGE_LEVEL",
    "CONSIDERING_CHANGE_OF_PLAN",
):
    # list.remove raises ValueError if the column is missing from the data
    featureCols.remove(droppedCol)
featureCols += [
    "COLLEGE_NUM",
    "REPORTED_SATISFACTION_NUM",
    "REPORTED_USAGE_LEVEL_NUM",
    "CONSIDERING_CHANGE_OF_PLAN_NUM",
]

In [None]:
# Combine all feature columns into a single vector column "features"
# (a copy of the list is handed over, as before)
assembler = VectorAssembler(
    inputCols=featureCols.copy(),
    outputCol="features",
)

# Scale by standard deviation only (withStd=True), no mean-centering
scaler = StandardScaler(
    withMean=False,
    withStd=True,
    inputCol="features",
    outputCol="scaledFeatures",
)

# Translate predicted label indices back into the original label strings
# using the labels learned by labelIndexer
predConverter = IndexToString(
    labels=labelIndexer.labels,
    inputCol="prediction",
    outputCol="predictedLabel",
)

In [None]:
# Multilayer perceptron trained on the standardized feature vector
nn = MultilayerPerceptronClassifier(seed=1234, featuresCol="scaledFeatures")

# Network topology. The input layer needs one node per assembled feature, so
# its width is derived from featureCols instead of the former hard-coded 11;
# this keeps the network valid when columns are added or removed.
# Two hidden layers (12 and 5 nodes) and a 2-node output layer are unchanged.
networkLayers = [len(featureCols), 12, 5, 2]

# Build the network parameter grid.
# TODO: add further candidate values to each list to actually tune the model;
# right now every parameter has exactly one value, so the grid holds a single
# combination.
paramGrid = (
    ParamGridBuilder()
    .addGrid(nn.layers, [networkLayers])
    .addGrid(nn.blockSize, [128])
    .addGrid(nn.maxIter, [100])
    .addGrid(nn.stepSize, [0.3])
    .addGrid(nn.tol, [0.05])
    .build()
)

In [None]:
# One pipeline for the whole workflow: label and categorical indexing, vector
# assembly, scaling, the neural network, and finally the conversion of the
# predicted index back to a label string.
pipeline = Pipeline(stages=[
    labelIndexer,
    collegeIndexer,
    satIndexer,
    usageIndexer,
    changeIndexer,
    assembler,
    scaler,
    nn,
    predConverter,
])

# Candidate models are scored by area under the ROC curve
evaluator = BinaryClassificationEvaluator(
    labelCol="label",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)

# 2-fold cross-validation over paramGrid, evaluating up to 2 models in
# parallel. The seed fixes the fold assignment so reruns give the same
# result, in line with the fixed seed (1234) already used for the train/test
# split and for the network itself.
cv = CrossValidator(
    estimator=pipeline,
    evaluator=evaluator,
    estimatorParamMaps=paramGrid,
    numFolds=2,
    parallelism=2,
    seed=1234,
)

# Fit on the training split only; the test split stays untouched
cvModel = cv.fit(train)

In [None]:
# Pick the trained network out of the best pipeline by its type instead of a
# fixed stage index, so reordering or adding pipeline stages cannot silently
# select the wrong stage.
bestModel = next(
    stage for stage in cvModel.bestModel.stages
    if isinstance(stage, MultilayerPerceptronClassificationModel)
)
print("Layers: ", bestModel.layers)
print(bestModel.explainParams())

# Apply the best pipeline to the held-out test split
predictions = cvModel.transform(test)

predictions.show()

# The evaluator computes area under the ROC curve, not accuracy, so
# "1 - value" is not a test error. Report the metric under its real name.
testMetric = evaluator.evaluate(predictions)
print("Test", evaluator.getMetricName(), "=", testMetric)

In [None]:
# Shut down the SparkSession now that the notebook is finished
spark.stop()