In [1]:
# Azure Blob Storage connection settings; replace the placeholders before running.
storage_account_name = "<Storage account name>"
container = "<Container name>"

# Read the account key from a Databricks secret scope instead of pasting it
# into the notebook source, where it would be saved and shared along with
# the notebook itself.
storage_account_key = dbutils.secrets.get(scope="<Secret scope name>", key="<Secret key name>")

In [2]:
# Store the account key in the Spark session configuration under the
# account-specific setting name.
account_key_setting = "fs.azure.account.key.{0}.blob.core.windows.net".format(storage_account_name)
spark.conf.set(account_key_setting, storage_account_key)

In [3]:
from pyspark.ml.regression import LinearRegression, LinearRegressionModel
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator

In [4]:
# Location of housing.csv inside the configured blob container.
housing_csv_path = "wasbs://{0}@{1}.blob.core.windows.net/housing.csv".format(container, storage_account_name)

# Reader configured for a comma-delimited file with a header row; column
# types are inferred from the data rather than declared up front.
csv_reader = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ",")
)
data = csv_reader.csv(housing_csv_path)

data.show()

In [5]:
# Input columns that are packed into the single 'features' vector column.
feature_columns = [
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
]

# handleInvalid="skip" makes the assembler filter out rows with invalid
# (null/NaN) inputs instead of raising an error during transform.
vectors = VectorAssembler(inputCols=feature_columns, outputCol='features', handleInvalid="skip")

vector_data = vectors.transform(data)

In [6]:
# Keep only the assembled feature vector and the median_house_value column.
features = vector_data.select("features", "median_house_value")

features.show()

In [7]:
# 70/30 train/test split. The fixed seed keeps the split (and therefore the
# fitted model and its metrics) repeatable across runs on the same input data.
(trainingData, testData) = features.randomSplit([0.7, 0.3], seed=42)

In [8]:
# Linear regression estimator: "features" is the input vector column and
# "median_house_value" is the label column to predict.
lr = LinearRegression(
    featuresCol="features",
    labelCol="median_house_value",
)

In [9]:
# Fit the estimator on the training split to obtain the trained model.
model = lr.fit(dataset=trainingData)

In [10]:
# Print the fitted parameters of the trained model, one labelled line each.
for label, value in (
    ("Coefficients:", model.coefficients),
    ("Intercept:", model.intercept),
):
    print(label, value)

In [11]:
# Training summary: these metrics are computed on the data the model was fit on.
model_summary = model.summary

print("RMSE", model_summary.rootMeanSquaredError)
print("R^2:", model_summary.r2)

# Evaluate on the held-out split as well, so the reported quality is not
# based solely on rows the model has already seen during fitting.
test_summary = model.evaluate(testData)

print("Test RMSE:", test_summary.rootMeanSquaredError)
print("Test R^2:", test_summary.r2)

In [12]:
# Persist the trained model to DBFS. A plain save() raises if the target path
# already exists, which breaks every re-run of the notebook; going through the
# writer with overwrite() replaces the previously saved model instead.
model.write().overwrite().save("dbfs:/FileStore/lr_model.model")

In [13]:
# List the FileStore directory (presumably to confirm the saved model is
# there); the bare name on the last line renders the listing as cell output.
filestore_entries = dbutils.fs.ls("dbfs:/FileStore")
filestore_entries

In [14]:
# Load the persisted linear regression model back from DBFS.
saved_model_path = "dbfs:/FileStore/lr_model.model"
loaded_model = LinearRegressionModel.load(saved_model_path)

In [15]:
# Print the parameters of the reloaded model, one labelled line each.
for label, value in (
    ("Coefficients:", loaded_model.coefficients),
    ("Intercept:", loaded_model.intercept),
):
    print(label, value)