# DAY 12 : MLflow Basics

In [0]:
import pandas as pd
from pyspark.sql import functions as F
from sklearn.model_selection import train_test_split

# Tunable settings for this cell, kept together so they are easy to change.
DATA_PATH = "/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv"
SAMPLE_SIZE = 100000   # maximum number of rows pulled into pandas
TEST_FRACTION = 0.2    # share of the sample held out for evaluation
SEED = 42              # fixes the train/test shuffle so the split is repeatable

# Load only a sample to keep it fast and avoid memory issues.
# Reading directly from the Volume path.
# inferSchema is deliberately NOT enabled: it makes Spark read the whole CSV an
# extra time just to guess column types, which defeats the purpose of sampling.
# Every column therefore arrives as a string; the one numeric feature used
# below is cast explicitly with pd.to_numeric.
# NOTE(review): limit() without an ordering returns whichever rows Spark
# produces first -- confirm that is acceptable for this sample.
raw_df = (
    spark.read.csv(DATA_PATH, header=True)
    .limit(SAMPLE_SIZE)
    .toPandas()
)

# Quick preprocessing in Pandas for ML (work on a copy so raw_df stays intact).
df = raw_df.copy()
# Unparseable or missing prices become NaN and are then replaced by 0.
df['price'] = pd.to_numeric(df['price'], errors='coerce').fillna(0)
# Binary target: 1 for 'purchase' events, 0 for every other event type.
df['event_type_encoded'] = (df['event_type'] == 'purchase').astype(int)

# Use 'price' as the single feature to predict 'purchase' intent.
X = df[['price']]
y = df['event_type_encoded']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=SEED
)

In [0]:
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Train a model on the prepared split and record the run in MLflow.
# Everything inside the `with` block is attached to a single MLflow run.
with mlflow.start_run(run_name="fast_csv_sample_v1"):
    # Log metadata
    mlflow.log_param("data_source", "2019-Nov.csv_sample")
    # Record the number of rows actually used rather than a hard-coded figure,
    # so the logged value stays correct if the source has fewer rows than the
    # requested limit or the limit is changed upstream.
    mlflow.log_param("sample_size", len(X_train) + len(X_test))

    # Train
    # NOTE(review): the target built upstream is a 0/1 purchase flag, so this
    # fits a linear regressor to a binary label; a classifier may be a better
    # fit -- confirm the intent before relying on this model.
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Evaluate on the held-out split
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    mlflow.log_metric("mse", mse)

    # Save model to MLflow
    mlflow.sklearn.log_model(model, "regression_model")

    print(f"Model trained and logged. MSE: {mse:.6f}")



Model trained and logged. MSE: 0.013998


In [0]:
# List the last few runs to confirm logging
recent_runs = mlflow.search_runs(max_results=3)

# The params.* / metrics.* columns come from what the returned runs logged, so
# a column may be absent when none of those runs recorded that key. reindex()
# fills such columns with NaN, whereas plain [[...]] selection would raise
# KeyError.
summary_columns = ['run_id', 'params.data_source', 'metrics.mse']
display(recent_runs.reindex(columns=summary_columns))

run_id,params.data_source,metrics.mse
c4297c4c25b14defab6e4bb612db6275,2019-Nov.csv_sample,0.0139983773012054
c20ead386350447ea2d0269aa2fab9d7,,
