## Create Label Column
### Convert business event into ML label

In [0]:
from pyspark.sql.functions import *

import os

# NOTE(review): presumably the DFS staging directory that mlflow.spark uses
# while saving Spark models — confirm against the MLflow deployment.
os.environ["MLFLOW_DFS_TMP"] = "/Volumes/workspace/ecommerce/ecommerce_data"

# Load every column of the gold-layer analytics view.
df = spark.sql("SELECT * FROM ecommerce_catalog.gold.vw_events_nov_analytics")

# Binary label: 1 when the event is a purchase, 0 for everything else
# (rows with a null event_type also fall through to 0).
is_purchase = df["event_type"] == "purchase"
df_labeled = df.withColumn("label", when(is_purchase, 1).otherwise(0))

## Feature Engineering

We will engineer meaningful features from the raw columns.

### Numeric Features
- price

### Categorical Features
- brand
- category_code

### Time Feature
- hour (from event_time)

In [0]:
# Time feature: the hour component of each event's timestamp.
df_features = df_labeled.withColumn(
    "event_hour",
    hour(df_labeled["event_time"]),
)

## Spark ML Feature Pipeline

In [0]:
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline

# Index the two string columns. handleInvalid="keep" assigns invalid/unseen
# values to an extra index instead of raising during transform.
brand_indexer = StringIndexer(
    inputCol="brand",
    outputCol="brand_idx",
    handleInvalid="keep",
)

category_indexer = StringIndexer(
    inputCol="category_code",
    outputCol="category_idx",
    handleInvalid="keep",
)

# Only the numeric columns go into the feature vector; brand_idx and
# category_idx are produced by the pipeline but deliberately left out.
assembler_tree = VectorAssembler(
    inputCols=["price", "event_hour"],  # NO brand / category
    outputCol="features",
)

# The final stage must be the assembler defined above (`assembler_tree`);
# no variable named `assembler` exists in this notebook.
feature_pipeline = Pipeline(stages=[
    brand_indexer,
    category_indexer,
    assembler_tree,
])

# NOTE(review): classifiers are configured in the training cell, after their
# import. If brand_idx / category_idx are ever added to the assembler inputs,
# tree models will likely need a larger maxBins (e.g. 5000) to cover the
# category cardinality — confirm against the data before enabling.

## Prepare Train/Test Data

In [0]:
# Fit the feature stages on the full feature frame, then apply the fitted
# pipeline to that same frame.
fitted_feature_pipeline = feature_pipeline.fit(df_features)
final_df = fitted_feature_pipeline.transform(df_features)

# 80/20 random split with a fixed seed.
split_weights = [0.8, 0.2]
train_df, test_df = final_df.randomSplit(split_weights, seed=42)


## Training 3 Models

In [0]:
from pyspark.ml.classification import (
    LogisticRegression,
    RandomForestClassifier,
    GBTClassifier
)
from pyspark.ml.evaluation import BinaryClassificationEvaluator
import mlflow
import mlflow.spark

from pyspark.ml.feature import FeatureHasher

# Candidate classifiers, keyed by the name later used as the MLflow run name.
# The stochastic tree ensembles get the same fixed seed as the train/test
# split so repeated runs are comparable.
models = {
    "LogisticRegression": LogisticRegression(labelCol="label"),
    "RandomForest": RandomForestClassifier(labelCol="label", numTrees=50, seed=42),
    "GBT": GBTClassifier(labelCol="label", seed=42),
}

# Hashes the raw string columns into a fixed-width (256) sparse vector.
# NOTE(review): `hasher` is not referenced by the feature pipeline or by any
# model in this section — confirm whether it should be wired in or dropped.
hasher = FeatureHasher(
    inputCols=["brand", "category_code"],
    outputCol="hashed_features",
    numFeatures=256,
)

# Area under the ROC curve (the evaluator's default metric, stated explicitly).
evaluator = BinaryClassificationEvaluator(
    labelCol="label",
    metricName="areaUnderROC",
)

# Model name -> AUC on the held-out split; filled in by the training loop.
results = {}


## Compare Models Using MLflow

In [0]:
# Train and score each candidate inside its own MLflow run, named after the
# model's key in `models`.
for name, model in models.items():
    with mlflow.start_run(run_name=name):
        # Fit on the training split, then score the held-out split.
        fitted_model = model.fit(train_df)
        preds = fitted_model.transform(test_df)
        auc = evaluator.evaluate(preds)

        # Record the model type, its score and the fitted model on the run.
        mlflow.log_param("model_type", name)
        mlflow.log_metric("AUC", auc)
        mlflow.spark.log_model(fitted_model, "model")

    # Keep the score for the side-by-side comparison after the loop.
    results[name] = auc
