In [2]:
import azureml.core

from azureml.core import Experiment, Workspace, Dataset, Datastore
from azureml.train.automl import AutoMLConfig
from notebookutils import mssparkutils
from azureml.data.dataset_factory import TabularDatasetFactory

In [3]:
# Name of the Synapse linked service used to reach the Azure ML workspace,
# and of the experiment that the AutoML run is tracked under.
linkedService_name = "AzureML_telemetryexperiments"
experiment_name = "Predict_Core_Temp"

# Resolve the workspace through the linked service, then bind the experiment to it.
ws = mssparkutils.azureML.getWorkspace(linkedService_name)
experiment = Experiment(workspace=ws, name=experiment_name)

In [7]:
# Import only the Spark SQL functions this cell uses; a wildcard import would
# shadow Python builtins such as min, max, sum and round for the rest of the notebook.
from pyspark.sql.functions import date_format, dayofmonth, dayofweek

# Read the telemetry table from the Data Explorer pool.
# Make sure you change this code to use YOUR Data Explorer pool,
# database, and table.
df = (
    spark.read
    .format("com.microsoft.kusto.spark.synapse.datasource")
    .option("spark.synapse.linkedService", "kustoPool")
    .option("kustoCluster", "https://droneanalyticsadx.drone-analytics.kusto.azuresynapse.net")
    .option("kustoDatabase", "drone-telemetry")
    .option("kustoQuery", "['fleet data']")
    .load()
)

# Downsample to roughly 1% of the rows (sampling with replacement, fixed seed).
df = df.sample(withReplacement=True, fraction=0.01, seed=1234)

# Keep the telemetry columns used for training and derive calendar features
# from the LocalDateTime column.
df = df.select(
    'DeviceState',
    'Engine1Status', 'Engine2Status', 'Engine3Status', 'Engine4Status',
    'Engine1RPM', 'Engine2RPM', 'Engine3RPM', 'Engine4RPM',
    'Engine1Temp', 'Engine2Temp', 'Engine3Temp', 'Engine4Temp',
    'CoreTemp', 'BatteryTemp', 'CoreStatus', 'MemoryAvailable', 'BatteryLevel',
    'Altitude', 'Speed', 'DistanceFromBase', 'RFSignal', 'PayloadWeight',
    # 'HH' is the 24-hour hour-of-day (00-23). The lowercase 'hh' pattern is the
    # 12-hour clock hour (01-12), which would give e.g. 03:00 and 15:00 the same value.
    date_format('LocalDateTime', 'HH').alias('HourOfDay'),
    dayofmonth('LocalDateTime').alias('DayOfMonth'),
    dayofweek('LocalDateTime').alias('DayOfWeek'),
)

# Create the training and validation dataframes (80/20 split, fixed seed)
training_data, validation_data = df.randomSplit([0.8, 0.2], seed=1234)

# Create a dataset from the training dataframe,
# using the default workspace data store
datastore = Datastore.get_default(ws)
dataset = TabularDatasetFactory.register_spark_dataframe(
    training_data,
    datastore,
    name=experiment_name + "-dataset",
)

In [8]:
# Show the prepared DataFrame (sampled rows, selected columns) in the notebook.
# NOTE(review): summary=True presumably adds per-column summary statistics to the
# rendered output — confirm against the Synapse display() documentation.
display(df, summary=True)

In [9]:
import logging

# Settings forwarded to AutoMLConfig as keyword arguments:
# time limits, early stopping, featurization mode, log level and CV folds.
automl_settings = dict(
    iteration_timeout_minutes=10,
    experiment_timeout_minutes=15,
    enable_early_stopping=True,
    featurization='auto',
    verbosity=logging.INFO,
    n_cross_validations=2,
)

# Regression on the registered training dataset, predicting the CoreTemp column
# and ranking models by R^2.
automl_config = AutoMLConfig(
    spark_context=sc,
    task='regression',
    training_data=dataset,
    label_column_name='CoreTemp',
    primary_metric='r2_score',
    max_concurrent_iterations=2,
    enable_onnx_compatible_models=False,
    **automl_settings,
)

In [10]:
# Submit the AutoML configuration to the experiment and keep the returned run
# handle in `run`; show_output=True asks submit() to show output for the run.
run = experiment.submit(automl_config, show_output=True)

In [11]:
# Block until the AutoML run submitted earlier has finished
run.wait_for_completion()

import mlflow

# Get best model from automl run
best_run, fitted_model = run.get_output()

artifact_path = experiment_name + "_artifact"

# Point MLflow at the workspace's tracking server and at the same experiment name
mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())
mlflow.set_experiment(experiment_name)

# The MLflow run gets its own name so it does not overwrite `run`, the AutoML
# run handle. Rebinding `run` here would make this cell fail when re-executed
# (the MLflow run object has no wait_for_completion / get_output).
with mlflow.start_run() as mlflow_run:
    # Log the fitted model as an artifact of this MLflow run
    mlflow.sklearn.log_model(fitted_model, artifact_path)

    # Register the model to AML model registry
    mlflow.register_model("runs:/" + mlflow_run.info.run_id + "/" + artifact_path, "drone-analytics-fleet_data-20221026113234-Best")

In [12]:
import pandas as pd

# Convert the validation split to a Pandas DataFrame
validation_data_pd = validation_data.toPandas()

# Remove the REAL CoreTemp column from the features and keep it in its own
# single-column Pandas DataFrame
actual_core_temp = validation_data_pd.pop('CoreTemp')
df_compare = actual_core_temp.to_frame()

# Ask the fitted model for CoreTemp predictions from the remaining feature columns
y_predict = fitted_model.predict(validation_data_pd)

# Put the predictions next to the actual values for comparison
df_compare = df_compare.assign(Predicted=y_predict)
display(df_compare)

In [15]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

# Compute the R2 score by using the predicted and actual CoreTemp.
# The actual values are taken from df_compare (built in the prediction cell);
# the notebook never defines a `y_test` variable.
y_test_actual = df_compare['CoreTemp']
r2 = r2_score(y_test_actual, y_predict)

# Range of the actual values, used for the reference diagonal below
actual_min = np.min(y_test_actual)
actual_max = np.max(y_test_actual)

# Plot the actual versus predicted CoreTemp values
plt.style.use('ggplot')
plt.figure(figsize=(10, 7))
plt.scatter(y_test_actual, y_predict)
# Diagonal y = x: points on this line are perfect predictions
plt.plot([actual_min, actual_max], [actual_min, actual_max], color='lightblue')
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.title("Actual vs Predicted CoreTemp R^2={}".format(r2))
plt.show()

In [16]:
# Register the model file at 'outputs/model.pkl' of the best AutoML run
# under the name 'DeviceTelemetry', then report the registered name and version.
description = 'My automated ML model'
model_path = 'outputs/model.pkl'
model = best_run.register_model(
    model_name='DeviceTelemetry',
    model_path=model_path,
    description=description,
)
print(model.name, model.version)