In [1]:
# Azure Blob Storage connection settings -- replace the placeholders before running.
# NOTE(review): prefer loading the account key from a secret store rather than
# pasting it into the notebook, so it is never saved with the notebook source.
container = "<Container name>"
storage_account_name = "<Storage account name>"
storage_account_key = "<Storage account key>"

In [2]:
# Register the storage account key in the Spark session configuration so that
# the wasbs:// read of this account further down can authenticate.
account_key_conf = "fs.azure.account.key.{0}.blob.core.windows.net".format(storage_account_name)
spark.conf.set(account_key_conf, storage_account_key)

In [3]:
# Full wasbs:// URL of housing.csv inside the configured container/account.
housing_csv_path = "wasbs://{0}@{1}.blob.core.windows.net/housing.csv".format(container, storage_account_name)

# Comma-delimited file with a header row; column types are inferred by Spark.
csv_reader = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ",")
)
data = csv_reader.csv(housing_csv_path)

# Preview the first five rows as a sanity check on the load.
data.show(5)

In [4]:
# Install MLflow (with its "extras" optional dependency group) from PyPI through
# the Databricks library utility, so that `import mlflow` in the next cell resolves.
# NOTE(review): no version is pinned, so the installed release may change between runs.
dbutils.library.installPyPI(
    "mlflow",
    extras="extras",
)

In [5]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

import mlflow

In [6]:
# Input columns that are packed into the single 'features' vector column.
feature_columns = [
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
]

# handleInvalid="skip" makes the assembler filter out rows containing invalid
# (null/NaN) inputs instead of raising an error during transform.
vectors = VectorAssembler(
    inputCols=feature_columns,
    outputCol='features',
    handleInvalid="skip",
)

vector_data = vectors.transform(data)

In [7]:
# Keep only what the regression needs: the assembled feature vector and the
# label column (median_house_value).
features = vector_data.select("features", "median_house_value")

# Preview the training frame.
features.show()

In [8]:
# Sweep over regularisation strengths and track each fit in MLflow.
#
# The outer run groups the sweep. Every regParam value gets its own nested
# child run that records BOTH the hyper-parameter and the resulting R2, so
# each score can be attributed to the setting that produced it. (Logging only
# "R2" repeatedly into one run leaves the values indistinguishable.)
with mlflow.start_run() as run:
  for param in [0.1, 0.00001]:
      with mlflow.start_run(nested=True):
          lr = LinearRegression(labelCol="median_house_value", featuresCol="features", regParam=param)

          model = lr.fit(features)

          # Record which regularisation strength this child run used.
          mlflow.log_param("regParam", param)

          # R2 comes from the summary of the fit on `features`; no held-out
          # split is made in this cell, so treat it as a fit-quality score,
          # not a generalisation estimate.
          mlflow.log_metric("R2", model.summary.r2)