# Import necessary packages

In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark import SparkFiles
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import VectorAssembler
# Build the Spark session used by every later cell (getOrCreate returns an
# already-running session if one exists).
spark = (
    SparkSession.builder
    .appName("LogisticRegression with PySpark MLlib")
    .getOrCreate()
)

# Load data

In [None]:
# Register the remote CSV with the SparkContext, then read the copy that
# SparkFiles resolves for the file name "data.csv".
url = "https://raw.githubusercontent.com/pkmklong/Breast-Cancer-Wisconsin-Diagnostic-DataSet/master/data.csv"
spark.sparkContext.addFile(url)
csv_path = SparkFiles.get("data.csv")
df = spark.read.csv(csv_path, header=True, inferSchema=True)
# Preview the first two rows.
df.show(2)

# Create feature column

In [None]:
# Replace the original header with positional names: 'id', 'diagnosis',
# then feature_1 .. feature_31 (33 names in total, so df must have 33 columns).
feature_names = [f'feature_{i}' for i in range(1, 32)]
columns = ['id', 'diagnosis', *feature_names]
data = df.toDF(*columns)

# Map 'M' (malignant) to 1 and 'B' (benign) to 0, then drop the text column.
is_malignant = data["diagnosis"] == "M"
data = data.withColumn("label", is_malignant.cast("integer"))
data = data.drop("diagnosis")

# NOTE(review): only feature_1 .. feature_24 are assembled into the feature
# vector; the remaining renamed columns are left out -- confirm that this
# subset is intentional.
feature_columns = feature_names[:24]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
data = assembler.transform(data)

# Random train/test split with weights 0.8 / 0.2 and a fixed seed.
split_weights = [0.8, 0.2]
train_data, test_data = data.randomSplit(split_weights, seed=42)

In [None]:
# Preview the first two rows of the training split, then of the test split.
for split in (train_data, test_data):
    split.show(2)

# Create a Logistic Regression model and fit it to the training data

In [13]:
# Configure the estimator on the assembled "features" vector and the integer
# "label" column, then fit it on the training split.
logistic_regression = LogisticRegression(
    featuresCol="features",
    labelCol="label",
)
model = logistic_regression.fit(train_data)

# Inspect the model coefficients and intercept

In [None]:
# Pull the fitted parameters off the model and print them; the intercept is
# shown with three decimal places.
coefficients, intercept = model.coefficients, model.intercept
print("Coefficients: ", coefficients)
print("Intercept: {:.3f}".format(intercept))

In [None]:
# Show the first five rows of the predictions DataFrame held by the model's
# summary, restricted to the feature vector and the prediction outputs.
lrn_summary = model.summary
summary_columns = ["features", "rawPrediction", "probability", "prediction"]
lrn_summary.predictions.select(*summary_columns).show(5)

# Evaluating the model on test data

In [None]:
# Score the test split with the fitted model.
predictions = model.transform(test_data)

# AUC-ROC: no metricName is passed, so the evaluator's default metric is used.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="label",
)
auc = evaluator.evaluate(predictions)

# Accuracy, precision and recall. The precision and recall values are the
# *weighted* variants, although they are printed as plain
# "Precision" / "Recall" below.
multi_evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
)
accuracy, precision, recall = [
    multi_evaluator.evaluate(predictions, {multi_evaluator.metricName: metric})
    for metric in ("accuracy", "weightedPrecision", "weightedRecall")
]

print(f"AUC-ROC: {auc:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")