In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [25]:
#pyspark init
builder = SparkSession.Builder().appName('creditcard_highAPI')
spark = builder.getOrCreate()

In [26]:
#Read input file
raw_df = spark.read.csv('creditcard.csv', header=True, inferSchema=True)

                                                                                

## Preprocessing

In [27]:
#Extract features name
cols = [col for col in raw_df.columns if col != 'Class']

#Use VectorAssembler to transform feature columns into a single vector
assembler = VectorAssembler(inputCols=cols, outputCol= 'features')
data = assembler.transform(raw_df).select('features', 'Class')

In [28]:
#Split data into training and testing sets

###training set proportion parameter:
train_size = 0.7
###

train, test = data.randomSplit([train_size, 1 - train_size], seed=24) #Fixed with seed for reproductivity

## Fit the model

In [29]:
classifier = LogisticRegression(featuresCol= 'features', labelCol= 'Class')
model = classifier.fit(train)

                                                                                

In [30]:
# Display model coefficients and intercept
print("Coefficients: ", model.coefficients)
print("Intercept: ", model.intercept)

Coefficients:  [-1.420584255290576e-06,0.0644867110201878,0.07343322668777727,-0.01857798062830314,0.7664195873036443,0.1896850165855631,-0.14423536452753677,-0.05080773689777933,-0.15310718726690467,-0.21451234040733136,-0.896515711851349,-0.09796312071671975,0.1438949340007252,-0.4360623289609669,-0.48192431683149745,-0.05062499343587694,-0.23137287549281907,-0.011095142200480482,-0.02333055472087488,0.015229224764392445,-0.503946747224968,0.39978534555112405,0.6247652360668583,-0.07474196829786554,0.18621389095128627,-0.09849214283246698,-0.044537513638330084,-0.8664689996020296,-0.2722885401249795,0.0010506580217897828]
Intercept:  -8.749851050164766


In [31]:
summary = model.summary
print('Accuracy: ', summary.accuracy)
print('AUC: ', summary.areaUnderROC)
print('Precision: ', summary.precisionByLabel)
print('Recall: ', summary.recallByLabel)

                                                                                

Accuracy:  0.999202063594026


                                                                                

AUC:  0.9808942884235088
Precision:  [0.9993568582525638, 0.8713692946058091]
Recall:  [0.9998441631561485, 0.621301775147929]


## Evaluate the model on the test set

In [32]:
eval = model.transform(test)

In [None]:
evaluator = MulticlassClassificationEvaluator(labelCol="Class", predictionCol="prediction")

# Calculate metrics
accuracy = evaluator.setMetricName("accuracy").evaluate(eval)
precision = evaluator.setMetricName("weightedPrecision").evaluate(eval)
recall = evaluator.setMetricName("weightedRecall").evaluate(eval)

# Print the metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")

[Stage 520:===>                                                   (1 + 15) / 16]

Accuracy: 0.9991
Precision: 0.9991
Recall: 0.9991


                                                                                

In [34]:
eval.groupby('Class', 'prediction').count().sort('Class', 'prediction').show()

[Stage 522:===>                                                   (1 + 15) / 16]

+-----+----------+-----+
|Class|prediction|count|
+-----+----------+-----+
|    0|       0.0|85378|
|    0|       1.0|   11|
|    1|       0.0|   64|
|    1|       1.0|   90|
+-----+----------+-----+



                                                                                