In [38]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [39]:
#pyspark init
builder = SparkSession.Builder().appName('creditcard_highAPI')
spark = builder.getOrCreate()

In [40]:
#Read input file
raw_df = spark.read.csv('creditcard.csv', header=True, inferSchema=True)

                                                                                

## Preprocessing

In [41]:
#Extract features name
cols = [col for col in raw_df.columns if col != 'Class']

#Use VectorAssembler to transform feature columns into a single vector
assembler = VectorAssembler(inputCols=cols, outputCol= 'features')
data = assembler.transform(raw_df).select('features', 'Class')

In [42]:
#Split data into training and testing sets

###training set proportion parameter:
train_size = 0.7
###

train, test = data.randomSplit([train_size, 1 - train_size], seed=24) #Fixed with seed for reproductivity

## Fit the model

In [43]:
classifier = LogisticRegression(featuresCol= 'features', labelCol= 'Class')
model = classifier.fit(train)

                                                                                

In [44]:
# Display model coefficients and intercept
print("Coefficients: ", model.coefficients)
print("Intercept: ", model.intercept)

Coefficients:  [-1.4205842552948074e-06,0.06448671102025771,0.07343322668750614,-0.018577980628522246,0.7664195873034164,0.18968501658537054,-0.14423536452744626,-0.05080773689778858,-0.1531071872669441,-0.2145123404076433,-0.8965157118511374,-0.09796312071675183,0.14389493400055703,-0.43606232896089364,-0.4819243168315607,-0.050624993435994775,-0.23137287549235477,-0.011095142200674944,-0.02333055472085141,0.015229224764578385,-0.5039467472247684,0.3997853455514391,0.6247652360674825,-0.07474196829812284,0.18621389095132587,-0.09849214283232235,-0.0445375136385123,-0.86646899960172,-0.2722885401250183,0.0010506580217886793]
Intercept:  -8.74985105016428


In [45]:
summary = model.summary
print('Accuracy: ', summary.accuracy)
print('AUC: ', summary.areaUnderROC)
print('Precision: ', summary.precisionByLabel)
print('Recall: ', summary.recallByLabel)

                                                                                

Accuracy:  0.999202063594026


                                                                                

AUC:  0.9808917897987934
Precision:  [0.9993568582525638, 0.8713692946058091]
Recall:  [0.9998441631561485, 0.621301775147929]


## Evaluate the model on the test set

In [46]:
eval = model.transform(test)

In [None]:
binary_evaluator = BinaryClassificationEvaluator(labelCol="Class", rawPredictionCol="rawPrediction")
multi_evaluator = MulticlassClassificationEvaluator(labelCol="Class", predictionCol="prediction")

metrics = {
    "AUC": binary_evaluator.evaluate(eval),
    "Accuracy": multi_evaluator.setMetricName("accuracy").evaluate(eval),
    "Precision": multi_evaluator.setMetricName("weightedPrecision").evaluate(eval),
    "Recall": multi_evaluator.setMetricName("weightedRecall").evaluate(eval)
}

for name, value in metrics.items():
    print(f"{name}: {value:.4f}")

[Stage 695:===>                                                   (1 + 15) / 16]

AUC: 0.9689
Accuracy: 0.9991
Precision: 0.9991
Recall: 0.9991


                                                                                

In [48]:
eval.groupby('Class', 'prediction').count().sort('Class', 'prediction').show()

[Stage 697:===>                                                   (1 + 15) / 16]

+-----+----------+-----+
|Class|prediction|count|
+-----+----------+-----+
|    0|       0.0|85378|
|    0|       1.0|   11|
|    1|       0.0|   64|
|    1|       1.0|   90|
+-----+----------+-----+



                                                                                