In [0]:
import mlflow
import mlflow.spark


In [0]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql import functions as F

# Obtain the Spark session explicitly rather than relying on a `spark` global
# injected by the notebook runtime. getOrCreate() hands back the session that
# is already active when there is one, so the cell also runs on a fresh
# PySpark kernel where no `spark` variable has been predefined.
spark = SparkSession.builder.getOrCreate()

# Toy training set of (x, y) pairs; every row satisfies y = 10 * x.
data = [
    (1.0, 10.0),
    (2.0, 20.0),
    (3.0, 30.0),
    (4.0, 40.0),
    (5.0, 50.0),
]

df = spark.createDataFrame(data, ["x", "y"])

# Feature engineering: pack the single predictor column "x" into the vector
# column "features", which is the column the LinearRegression estimator in the
# training cell is configured to read (featuresCol="features").
assembler = VectorAssembler(inputCols=["x"], outputCol="features")
df_features = assembler.transform(df)


In [0]:
%sql
-- Provision the catalog objects used by the training cell. Every statement uses
-- IF NOT EXISTS, so re-running this cell is safe. Order matters: each object
-- lives inside the one created before it (catalog -> schema -> volume).
CREATE CATALOG IF NOT EXISTS mlflow_catalog;
-- Schema that will hold the volume below.
CREATE SCHEMA IF NOT EXISTS mlflow_catalog.mlflow_schema;

-- Volume whose path, /Volumes/mlflow_catalog/mlflow_schema/mlflow_tmp, is the
-- value the training cell assigns to the MLFLOW_DFS_TMP environment variable.
CREATE VOLUME IF NOT EXISTS mlflow_catalog.mlflow_schema.mlflow_tmp;


In [0]:
import os
import mlflow
import mlflow.spark

from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Single source of truth for the run configuration, so the values logged to
# MLflow cannot drift from the values the estimator is actually built with.
MAX_ITER = 10
# Path of the volume created by the %sql cell
# (mlflow_catalog.mlflow_schema.mlflow_tmp).
MLFLOW_DFS_TMP_DIR = "/Volumes/mlflow_catalog/mlflow_schema/mlflow_tmp"

# REQUIRED for UC + Spark ML: point MLflow's Spark-model temp directory at the
# volume above. Must be set before mlflow.spark.log_model() is called.
os.environ["MLFLOW_DFS_TMP"] = MLFLOW_DFS_TMP_DIR

with mlflow.start_run(run_name="linear_regression_run_1"):
    # Fit y ~ features on the assembled DataFrame from the feature cell.
    lr = LinearRegression(
        featuresCol="features",
        labelCol="y",
        maxIter=MAX_ITER,
    )
    model = lr.fit(df_features)

    # NOTE: predictions are made on the same rows the model was fitted on, so
    # the RMSE below is an in-sample (training) error, not a held-out estimate.
    predictions = model.transform(df_features)

    evaluator = RegressionEvaluator(
        labelCol="y",
        predictionCol="prediction",
        metricName="rmse",
    )
    rmse = evaluator.evaluate(predictions)

    # Log params & metrics. Values are read back from the estimator instead of
    # being retyped as literals, so the run always records what was trained.
    mlflow.log_param("maxIter", lr.getMaxIter())
    mlflow.log_param("model_type", type(lr).__name__)
    mlflow.log_metric("rmse", rmse)

    # Log the fitted Spark ML model as a run artifact.
    mlflow.spark.log_model(
        model,
        artifact_path="linear_regression_model",
    )

    print("RMSE:", rmse)




RMSE: 1.5828525626165987e-14
