In [0]:
# Read the summary table through the ambient Spark session, then preview
# the first five rows and print the column schema.
SOURCE_TABLE = "gold.customer_summary"
df_spark = spark.table(SOURCE_TABLE)

df_spark.show(n=5)
df_spark.printSchema()

+------------+---------------+------------------+
|     country|total_customers|        avg_income|
+------------+---------------+------------------+
|     Germany|           1008|102271.41964285714|
|       Japan|            495|106217.04242424242|
|      Canada|            828|103189.84299516908|
|South Africa|            394| 54797.36040609137|
|      France|            640|     55375.9109375|
+------------+---------------+------------------+
only showing top 5 rows
root
 |-- country: string (nullable = true)
 |-- total_customers: long (nullable = true)
 |-- avg_income: double (nullable = true)



In [0]:
# Convert the Spark DataFrame to a pandas DataFrame; the scikit-learn cells
# that follow read their feature and target columns from `df`.
df = df_spark.toPandas()

In [0]:
from sklearn.model_selection import train_test_split

# One predictor column (selected with a list so X stays two-dimensional)
# and the column the models are asked to predict.
feature_columns = ["avg_income"]
target_column = "total_customers"

X = df[feature_columns]
y = df[target_column]

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# partition the same from run to run.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [0]:
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Candidate regressors, keyed by the label used for the MLflow run name,
# the logged "model_type" param, and the printed summary line.
models = {
    "LinearRegression": LinearRegression(),
    # Seeded like the forest so tie-breaking between equally good splits
    # is repeatable across runs.
    "DecisionTree": DecisionTreeRegressor(max_depth=5, random_state=42),
    "RandomForest": RandomForestRegressor(n_estimators=50, random_state=42)
}

# Test-set R2 per model. Kept so the winner is chosen in code instead of
# being read off the printed output by eye.
r2_scores = {}

for name, model in models.items():
    # One MLflow run per candidate: params, metric and the fitted model
    # are all recorded under the same run.
    with mlflow.start_run(run_name=name):

        mlflow.log_param("model_type", name)
        mlflow.log_param("feature_used", "avg_income")

        model.fit(X_train, y_train)
        # score() returns R2 on the held-out split; it is negative when
        # the model does worse than always predicting the mean of y.
        r2 = model.score(X_test, y_test)
        r2_scores[name] = r2

        mlflow.log_metric("r2_score", r2)
        mlflow.sklearn.log_model(model, "model")

        print(f"{name} R2 score: {r2:.4f}")

# Highest R2 wins; when every score is negative that is the one closest
# to zero, not the one with the largest magnitude.
best_model_name = max(r2_scores, key=r2_scores.get)
best_model = models[best_model_name]




LinearRegression R2 score: -1.3213




DecisionTree R2 score: -1.4794




RandomForest R2 score: -1.5543


In [0]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression as SparkLR
from pyspark.ml import Pipeline

In [0]:
# Name of the vector column that links the two stages: the assembler
# writes it and the regressor reads it.
features_col = "features"

# Stage 1: pack the single predictor column into a vector column.
assembler = VectorAssembler(inputCols=["avg_income"], outputCol=features_col)

# Stage 2: linear regression on that vector, predicting total_customers.
spark_lr = SparkLR(featuresCol=features_col, labelCol="total_customers")

# Chain the stages so one fit() call runs assembly followed by regression.
pipeline_stages = [assembler, spark_lr]
pipeline = Pipeline(stages=pipeline_stages)


In [0]:
# Randomly partition the Spark DataFrame with 0.8 / 0.2 weights; the seed
# is fixed for repeatability.
split_weights = [0.8, 0.2]
train_df, test_df = df_spark.randomSplit(split_weights, seed=42)

# Fit the assembler + linear-regression pipeline on the training partition.
spark_model = pipeline.fit(train_df)

Model selection was based on the highest R² score observed in MLflow.
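Linear Regression performed best among tested models (R² = -1.3213), ahead of Decision Tree (-1.4794) and Random Forest (-1.5543); all three scores are negative, so none of the models outperforms a constant mean prediction on the test set.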