### Accessing data lake data from container

In [0]:
# Connection settings for the ADLS Gen2 account that holds the input data.
# NOTE(review): the account key is an inline literal here; a real key should
# be read from a secret store rather than committed with the notebook.
storage_account_name = "storage_account_name"
storage_account_key = "storage_account_key"
lake_container = "container_name"

# Host name of the account's DFS endpoint, shared by the path and the config key.
dfs_host = f"{storage_account_name}.dfs.core.windows.net"
file_location = f"abfss://{lake_container}@{dfs_host}/input/housing.csv"

# Register the account key on the Spark session so abfss:// paths can be read.
spark.conf.set(f"fs.azure.account.key.{dfs_host}", storage_account_key)

In [0]:
# Load the CSV at `file_location` into the Spark DataFrame `housing`:
# first row is the header, column types are inferred, fields are comma-separated.
csv_options = {"inferSchema": "true", "header": "true", "delimiter": ","}
housing = spark.read.format("csv").options(**csv_options).load(file_location)
 

In [0]:
# Show the column names of the `housing` DataFrame.
print(housing.columns)

In [0]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

# Column positions (within the 2-D array passed to the transformer) of the
# inputs used to build the ratio features.
rooms_ix = 3
bedrooms_ix = 4
population_ix = 5
households_ix = 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio columns derived from existing columns of a 2-D array.

    Always adds ``X[:, rooms_ix] / X[:, households_ix]`` and
    ``X[:, population_ix] / X[:, households_ix]``; when
    ``add_bedrooms_per_room`` is true it also adds
    ``X[:, bedrooms_ix] / X[:, rooms_ix]`` as a final column.
    """

    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio columns appended on the right."""
        rooms = X[:, rooms_ix]
        households = X[:, households_ix]

        extra_columns = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)

        # np.c_ stacks the original array and each 1-D ratio as columns.
        return np.c_[(X, *extra_columns)]

In [0]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

# Numeric preprocessing, applied in order: fill missing values with the column
# median, append the ratio features, then standardise every column.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

In [0]:
# Column roles used by the preprocessing pipeline and the model.
target_col = "median_house_value"
cat_attribs = ["ocean_proximity"]

# Numeric *feature* columns: every column that is neither categorical nor the
# target. The target must be excluded, otherwise the label itself is passed to
# the model as an input and the evaluation metrics become meaningless.
# NOTE(review): the hard-coded rooms/bedrooms/population/households indices
# assume the target column comes after those columns in the file - confirm.
non_feature_cols = {*cat_attribs, target_col}
num_attribs = [col for col in housing.columns if col not in non_feature_cols]

In [0]:
from sklearn.compose import ColumnTransformer

# Route each group of columns to its own preprocessing: the numeric pipeline
# for `num_attribs`, one-hot encoding for `cat_attribs`.
numeric_branch = ("numerical", num_pipeline, num_attribs)
categorical_branch = ("categorical", OneHotEncoder(), cat_attribs)

full_pipeline = ColumnTransformer(transformers=[numeric_branch, categorical_branch])

In [0]:
from sklearn.model_selection import train_test_split

# Collect the Spark DataFrame to pandas exactly once, so features and target
# are taken from the same materialised rows. Two independent collections
# (one for features, one for the target) are not guaranteed to return rows in
# the same order, which would silently misalign X and y.
housing_pdf = housing.toPandas()

# Split the raw rows first and fit the preprocessing on the training part only,
# so imputer medians, scaler statistics and one-hot categories are not learned
# from the held-out rows.
train_pdf, test_pdf = train_test_split(housing_pdf, random_state=0, test_size=0.2)

X_train = full_pipeline.fit_transform(train_pdf)
# NOTE: with the default OneHotEncoder settings, a category that appears only
# in the test rows makes transform() raise; use handle_unknown="ignore" on the
# encoder if that can happen with your data.
X_test = full_pipeline.transform(test_pdf)

# Targets stay single-column DataFrames, matching what the split produced before.
y_train = train_pdf[[target_col]]
y_test = test_pdf[[target_col]]

In [0]:
import mlflow

# Loading workspace

from azureml.core import Workspace
# Identify the Azure ML workspace by name, subscription and resource group,
# then fetch a handle to it.
workspace_lookup = {
    "name": "workspace",
    "subscription_id": "subscription_id",
    "resource_group": "resource_group",
}
ws = Workspace.get(**workspace_lookup)

In [0]:
experiment_name = 'experiment_name'

# Point MLflow at the workspace's tracking server, then select the experiment
# that subsequent runs are recorded under.
tracking_uri = ws.get_mlflow_tracking_uri()
mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment(experiment_name)

In [0]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

# Unfitted linear-regression estimator with default settings; it is fitted
# inside the MLflow run.
lin_reg = LinearRegression()

In [0]:
# Fit the linear model, score it on the held-out split, and record both the
# metrics and the fitted model in a single MLflow run.
with mlflow.start_run() as parent_run:
    # fit() returns the estimator itself, so `model` and `lin_reg` are the same object.
    model = lin_reg.fit(X_train, y_train)

    y_pred = lin_reg.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    # Derive RMSE from the MSE instead of passing `squared=False`: that
    # parameter was deprecated in scikit-learn 1.4 and is scheduled for removal
    # in 1.6. For this single-target model the two are the same value.
    rmse = mse ** 0.5
    mae = mean_absolute_error(y_test, y_pred)

    mlflow.log_metric("mse", mse)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("mae", mae)

    # Store the fitted estimator as a run artifact under "LR_model".
    mlflow.sklearn.log_model(model, "LR_model")