# In [0]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
import pandas as pd
from sklearn.datasets import load_breast_cancer

# Reuse the active SparkSession when one already exists (Databricks and the
# pyspark shell pre-create one named 'spark'); otherwise start a new session
# so this script also runs outside those environments.
spark = SparkSession.builder.getOrCreate()

# Load the dataset with scikit-learn into a pandas DataFrame: one column per
# feature name, plus the label in a "target" column.
data = load_breast_cancer()
df_pd = pd.DataFrame(data.data, columns=data.feature_names)
df_pd["target"] = data.target

# Convert to Spark DataFrame
df_spark = spark.createDataFrame(df_pd)

# Assemble the first 5 feature columns into a single vector column named
# "features", which is the input LogisticRegression is configured to read.
features = data.feature_names[:5]  # Use first 5 features
assembler = VectorAssembler(
    # scikit-learn returns the names as NumPy string scalars; hand Spark
    # plain Python strings instead.
    inputCols=[str(name) for name in features],
    outputCol="features",
)
df_features = assembler.transform(df_spark)

# Train-test split (80/20); the fixed seed keeps the split repeatable.
train_df, test_df = df_features.randomSplit([0.8, 0.2], seed=42)

# Logistic regression model
lr = LogisticRegression(labelCol="target", featuresCol="features")
model = lr.fit(train_df)

# Predictions on the held-out rows
predictions = model.transform(test_df)

# Evaluate the model using area under the ROC curve. The metric and the
# raw-prediction column are spelled out (they match the evaluator defaults)
# so the printed "AUC" label cannot silently drift from what is computed.
evaluator = BinaryClassificationEvaluator(
    labelCol="target",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)
auc = evaluator.evaluate(predictions)
print(f"AUC on test data: {auc:.4f}")
