In [0]:
# Read the `gold.customer_summary` table through the `spark` handle and pull
# it into a pandas DataFrame for local modelling.
# NOTE(review): `spark` is not defined in this notebook; presumably it is the
# session injected by the runtime -- confirm.
df = (
    spark.table("gold.customer_summary")
    .toPandas()
)

# Show the first rows as the cell output.
df.head()

Unnamed: 0,country,total_customers,avg_income
0,Germany,1008,102271.419643
1,Japan,495,106217.042424
2,Canada,828,103189.842995
3,South Africa,394,54797.360406
4,France,640,55375.910937


In [0]:
from sklearn.model_selection import train_test_split

# Predictor matrix: a one-column DataFrame (list selector keeps it 2-D, which
# is the shape the estimator's fit() is given later).
X = df.loc[:, ["avg_income"]]

# Target vector: a Series with the per-row customer count.
y = df.loc[:, "total_customers"]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [0]:
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LinearRegression

In [0]:
with mlflow.start_run(run_name="linear_regression_v1"):
    # Record the run configuration in a single batch call.
    mlflow.log_params(
        {
            "model_type": "LinearRegression",
            "feature": "avg_income",
            "test_size": 0.2,
        }
    )

    # Fit ordinary least squares on the training split; fit() returns the
    # estimator itself, so construction and fitting can be chained.
    model = LinearRegression().fit(X_train, y_train)

    # score() gives R^2 on the held-out split. It is negative when the model
    # does worse than always predicting the mean of the test target.
    r2 = model.score(X_test, y_test)
    mlflow.log_metric("r2_score", r2)

    # Store the fitted estimator with the run under the "model" path.
    mlflow.sklearn.log_model(model, "model")

# Report the metric once the run has been closed.
print(f"R² Score: {r2:.4f}")




R² Score: -1.3213


In [0]:
with mlflow.start_run(run_name="linear_regression_v2"):

    # Log parameters (test_size is recorded here too so this run can be
    # compared like-for-like with linear_regression_v1).
    mlflow.log_param("model_type", "LinearRegression")
    mlflow.log_param("feature", "avg_income_scaled")
    mlflow.log_param("test_size", 0.2)

    # Split the *raw* feature first. Same test_size / random_state as the
    # first experiment, so both runs are evaluated on the same rows.
    X_train2, X_test2, y_train2, y_test2 = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Feature scaling: derive the divisor from the training rows only.
    # Using X.max() over the full dataset would let a test-set statistic
    # influence preprocessing (data leakage). Test values may therefore end
    # up slightly above 1.0, which is expected.
    train_max = X_train2.max()
    X_train2 = X_train2 / train_max
    X_test2 = X_test2 / train_max

    # Keep the full scaled frame available under its original name for any
    # later cell that reads it; it uses the same training-derived divisor.
    X_scaled = X / train_max

    # The logged model expects inputs divided by this value, so record it
    # with the run (X has a single column, hence .iloc[0]).
    mlflow.log_param("scale_max", float(train_max.iloc[0]))

    # Train model.
    # NOTE: dividing a feature by a constant does not change an ordinary
    # least-squares fit's predictions (the coefficient simply rescales), so
    # the R^2 of this run is expected to equal the unscaled run's R^2.
    model2 = LinearRegression()
    model2.fit(X_train2, y_train2)

    # Evaluate model on the held-out split.
    r2_v2 = model2.score(X_test2, y_test2)
    mlflow.log_metric("r2_score", r2_v2)

    # Log model artifact
    mlflow.sklearn.log_model(model2, "model")

print(f"R² Score v2: {r2_v2:.4f}")




R² Score v2: -1.3213
