In [1]:
# Azure Blob Storage connection settings. Replace each placeholder before
# running the cells below, which read these three names.
container = "<Container name>"
storage_account_name = "<Storage account name>"
# NOTE(review): the account key is a credential. Prefer loading it from a
# secret store (for example a Databricks secret scope) over pasting it here.
storage_account_key = "<Storage account key>"

In [2]:
# Store the account key in the Spark session configuration under the
# per-account "fs.azure.account.key.<account>.blob.core.windows.net" entry.
spark.conf.set(
    "fs.azure.account.key.{0}.blob.core.windows.net".format(storage_account_name),
    storage_account_key,
)

In [3]:
# Mount the blob container at /mnt/data so later cells can read it through a
# DBFS path. dbutils.fs.mount fails when the mount point is already in use,
# so the mount is skipped if it already exists; this keeps the cell safe to
# re-run (for example on Restart & Run All).
MOUNT_POINT = "/mnt/data"

if not any(mount.mountPoint == MOUNT_POINT for mount in dbutils.fs.mounts()):
    dbutils.fs.mount(
        source="wasbs://{0}@{1}.blob.core.windows.net".format(container, storage_account_name),
        mount_point=MOUNT_POINT,
        # The account key is handed to the mount itself via extra_configs.
        extra_configs={
            "fs.azure.account.key.{0}.blob.core.windows.net".format(storage_account_name): storage_account_key
        },
    )

In [4]:
# List the contents of the mounted container to confirm the mount worked.
display(dbutils.fs.ls("/mnt/data"))

path,name,size
dbfs:/mnt/data/housing.csv,housing.csv,1423529


In [5]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator

In [6]:
# Read the housing CSV from the mounted container. The first row is treated
# as the header and column types are inferred from the data.
# The path is a fixed literal: the previous .format(container,
# storage_account_name) call had no placeholders to fill and did nothing.
data = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ",")
    .csv("/mnt/data/housing.csv")
)

# Preview the first rows as a sanity check on the parsed columns.
data.show()

In [7]:
# Combine the eight predictor columns into a single vector column named
# 'features'. handleInvalid="skip" makes the assembler leave out rows it
# considers invalid instead of raising, so vector_data may have fewer rows
# than data.
vectors = VectorAssembler(
    inputCols=[
        'longitude',
        'latitude',
        'housing_median_age',
        'total_rooms',
        'total_bedrooms',
        'population',
        'households',
        'median_income',
    ],
    outputCol='features',
    handleInvalid="skip",
)

vector_data = vectors.transform(data)

In [8]:
# Keep only the assembled feature vector and the regression target.
features = vector_data.select("features", "median_house_value")

# Preview the two-column modelling frame.
features.show()

In [9]:
# 70/30 train/test split. A fixed seed is passed so repeated runs use the
# same seed for the split; without one, the split and every metric computed
# below changed from run to run.
(trainingData, testData) = features.randomSplit([0.7, 0.3], seed=42)

In [10]:
# Linear regression estimator: predicts median_house_value from the
# assembled "features" vector column.
lr = LinearRegression(
    featuresCol="features",
    labelCol="median_house_value",
)

In [11]:
# Fit the estimator on the training split only; testData is kept aside for
# the evaluation at the end of the notebook.
model = lr.fit(dataset=trainingData)

In [12]:
# Report the fitted parameters, one labelled line each.
for param_label, param_value in (
    ("Coefficients:", model.coefficients),
    ("Intercept:", model.intercept),
):
    print(param_label, param_value)

In [13]:
# Fit statistics taken from the fitted model's own summary object
# (presumably computed on the training data; confirm against the PySpark docs).
model_summary = model.summary

for metric_label, metric_value in (
    ("RMSE", model_summary.rootMeanSquaredError),
    ("R^2:", model_summary.r2),
):
    print(metric_label, metric_value)

In [14]:
# Evaluator that scores the "prediction" column against the true
# median_house_value using the r2 metric.
evaluator = RegressionEvaluator(
    labelCol="median_house_value",
    predictionCol="prediction",
    metricName="r2",
)

In [15]:
# Run the fitted model over the held-out split. Note the naming: testData is
# the input split, test_data is the model's output for it.
test_data = model.transform(testData)

# Left as the cell's last expression so the notebook displays the score.
evaluator.evaluate(dataset=test_data)