In [0]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [0]:
from sklearn.linear_model import LinearRegression
# Candidate regressors to compare, keyed by the label that the training loop
# uses for the MLflow run name and the "model_type" param.
# The tree-based estimators are seeded so that re-running the notebook yields
# the same fitted models and the same R² scores; LinearRegression takes no
# random_state.
models = {
    "linear": LinearRegression(),
    "decision_tree": DecisionTreeRegressor(max_depth=5, random_state=42),
    "random_forest": RandomForestRegressor(n_estimators=100, random_state=42),
}

In [0]:
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

# Synthetic regression problem: 200 samples, 5 features, noise=0.1.
# Both the generator and the split are seeded so the data is reproducible.
X, y = make_regression(
    n_samples=200,
    n_features=5,
    noise=0.1,
    random_state=42,
)

# Hold out 20% of the samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [0]:
import mlflow
# Fit and evaluate every candidate in `models`, recording each one in its own
# MLflow run (named "<key>_model") with its type, test-set R² and the fitted
# estimator as a logged artifact.
for name, model in models.items():
    with mlflow.start_run(run_name=f"{name}_model"):
        mlflow.log_param("model_type", name)

        model.fit(X_train, y_train)
        # Scored on the held-out split; reported below as R².
        score = model.score(X_test, y_test)

        mlflow.log_metric("r2_score", score)
        # Supplying an input example lets MLflow infer and store a model
        # signature, so the logged model carries its expected input schema
        # instead of being saved without one.
        mlflow.sklearn.log_model(model, "model", input_example=X_train[:5])

        print(f"{name}: R² = {score:.4f}")



linear: R² = 1.0000




decision_tree: R² = 0.8393




random_forest: R² = 0.9010


In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression as SparkLR

In [0]:
# Spark ML pipeline with two stages:
#   1. pack the "views" column into a "features" vector column,
#   2. linear regression of "purchases" on that vector.
assembler = VectorAssembler(
    inputCols=["views"],
    outputCol="features",
)
lr = SparkLR(
    featuresCol="features",
    labelCol="purchases",
)
pipeline = Pipeline(
    stages=[assembler, lr],
)

In [0]:
# `assembler`, `lr` and `pipeline` are already built by the preceding cell with
# exactly these settings, so they are reused here rather than constructed a
# second time.

# Read the Delta table that the pipeline is trained on.
spark_df = spark.read.format('delta').load('/Volumes/workspace/ecommerce/ecommerce_data/gold_products')

# 80/20 train/test split. The seed keeps the split from changing on every run;
# without it randomSplit picks a fresh random seed each time.
# NOTE(review): a fixed seed only gives identical splits if the underlying
# data and its partitioning are unchanged between runs.
train, test = spark_df.randomSplit([0.8, 0.2], seed=42)

# NOTE: this rebinds `model`, which the scikit-learn training loop earlier in
# the notebook also uses as its loop variable; from here on `model` is the
# fitted Spark pipeline.
model = pipeline.fit(train)

In [0]:
# Preview the loaded table, capped at 20 rows so the full DataFrame is not rendered.
display(spark_df.limit(20))

product_id,views,purchases,revenue,conversion_rate
8500290,1,1,10602.3,100.0
1005159,1,1,524295.44,100.0
17300014,1,1,968.8,100.0
12400055,1,1,116.09,100.0
29502246,1,1,361.36,100.0
6902812,1,1,325.36,100.0
15200176,1,1,293.11,100.0
26404407,1,1,1268.28,100.0
7004004,1,1,1544.1600000000003,100.0
9800341,1,1,287.3,100.0
