In [0]:
# # Notebook 3: Train and evaluate Logistic Regression using selected features

# from pyspark.ml import PipelineModel
# from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
# from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# from pyspark.ml.classification import LogisticRegression
# from pyspark.ml import Pipeline
# from pyspark.sql import functions as F

# # Load slicing stage (top-10 features)
# slicer_model = PipelineModel.load("/FileStore/models/slicer_top10")

# # Load preprocessed datasets
# train_ready = spark.read.format("delta").load("/FileStore/data/train_ready")
# val_ready   = spark.read.format("delta").load("/FileStore/data/val_ready")

# # Apply slicer to keep only top-10 features
# train_topk = slicer_model.transform(train_ready)
# val_topk   = slicer_model.transform(val_ready)

In [0]:
# # Split into majority and minority classes
# minority_df = train_topk.filter("label = 1")
# majority_df = train_topk.filter("label = 0")

# # Sample only majority class (e.g., keep 80%)
# majority_sampled = majority_df.sample(withReplacement=False, fraction=0.6, seed=42)

# # Combine back together
# train_sample = minority_df.union(majority_sampled)

# # Optional: Shuffle the dataset (if desired)
# train_sample = train_sample.orderBy(F.rand(seed=42))

# # Define LR
# lr = LogisticRegression(labelCol="label", featuresCol="features_topK")

# # Param grid (keep it small!)
# param_grid = ParamGridBuilder() \
#     .addGrid(lr.regParam, [0.01, 0.1, 1.0]) \
#     .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
#     .build()

# # Evaluator
# evaluator = MulticlassClassificationEvaluator(
#     labelCol="label", 
#     predictionCol="prediction", 
#     metricName="f1"
# )

# train_split, val_split = train_sample.randomSplit([0.8, 0.2], seed=42)
# tvs = TrainValidationSplit(
#     estimator=lr,
#     estimatorParamMaps=param_grid,
#     evaluator=evaluator,
#     trainRatio=1.0,  # Porque já dividiste
#     parallelism=1
# )
# tvs_model = tvs.fit(train_split)

# # Apply best model on validation
# val_preds = tvs_model.transform(val_topk)
# f1_score = evaluator.evaluate(val_preds)

# print(f"Best model F1-score on validation set: {f1_score:.4f}")

# # Save the best model
# tvs_model.bestModel.write().overwrite().save("/FileStore/models/lr_top10_model_grid")

In [0]:
from pyspark.ml import PipelineModel
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.sql import functions as F

# Load slicing stage (top-10 features)
slicer_model = PipelineModel.load("/FileStore/models/slicer_top10")

# Load preprocessed datasets
train_ready = spark.read.format("delta").load("/FileStore/data/train_ready")
val_ready   = spark.read.format("delta").load("/FileStore/data/val_ready")

# Apply slicer to keep only top-10 features
train_topk = slicer_model.transform(train_ready)
val_topk   = slicer_model.transform(val_ready)

# Efficient class balancing using sampleBy
fractions = {0: 0.6, 1: 1.0}  # 60% of majority, 100% of minority
train_sample = train_topk.sampleBy("label", fractions=fractions, seed=42)

# Define Logistic Regression
lr = LogisticRegression(labelCol="label", featuresCol="features_topK")

# Small param grid to limit overhead
param_grid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.01, 0.1, 1.0]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
    .build()

# Evaluator (can switch to BinaryClassificationEvaluator if binary)
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="f1"
)

# TrainValidationSplit (no internal split since we did it manually)
tvs = TrainValidationSplit(
    estimator=lr,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    trainRatio=1.0,
    parallelism=1  # Required for Databricks CE
)

# Train the model with grid search
tvs_model = tvs.fit(train_sample)

# Evaluate on external validation set
val_preds = tvs_model.transform(val_topk)
f1_score = evaluator.evaluate(val_preds)

print(f"Best model F1-score on validation set: {f1_score:.4f}")

# Save best model
tvs_model.bestModel.write().overwrite().save("/FileStore/models/lr_top10_model_grid")

Best model F1-score on validation set: 0.9463
