In [2]:
import mlflow
from pyspark.ml import Pipeline
from synapse.ml.core.platform import *
from synapse.ml.lightgbm import LightGBMRegressor

## Define model_uri to load the model with its specified version
mlexperiment_name = "nyc_yellowtaxi_predict_tripduration"
mlalgorithm_name = "lightgbm"
mlmodel_name = f"{mlexperiment_name}_{mlalgorithm_name}"
# Registered model version to score with. Kept as a named value so a different
# version can be selected by editing one line instead of the URI string.
mlmodel_version = 2
# Model Registry URI of the form models:/<registered model name>/<version>
mlmodel_uri = f"models:/{mlmodel_name}/{mlmodel_version}"
# Load the registered Spark ML model; dfs_tmpdir is the scratch path handed to
# MLflow for staging model files on the distributed file system.
loaded_model = mlflow.spark.load_model(mlmodel_uri, dfs_tmpdir="Files/mlflow/tmp/")

StatementMeta(, e3ea4c39-d35b-4e52-9d12-bb1fba6b3dfe, 4, Finished, Available)

2023/09/14 00:43:46 INFO mlflow.spark: 'models:/nyc_yellowtaxi_predict_tripduration_lightgbm/2' resolved as 'abfss://04fb8c5b-81dd-4501-847b-e6edca0dc50d@onelake.dfs.fabric.microsoft.com/6c6bcfc1-c9e8-413a-813a-6c209ce9fcad/6a1fcbcc-61e4-4458-9e16-025a53e06200/artifacts'
2023/09/14 00:43:47 INFO mlflow.spark: File 'abfss://04fb8c5b-81dd-4501-847b-e6edca0dc50d@onelake.dfs.fabric.microsoft.com/6c6bcfc1-c9e8-413a-813a-6c209ce9fcad/6a1fcbcc-61e4-4458-9e16-025a53e06200/artifacts/sparkml' is already on DFS, copy is not necessary.


In [3]:
SEED = 1234  # Fixed random seed passed to the sampling step below

# Read the cleansed trip data, keep only June 2016 pickups, and draw a random
# sample with fraction 0.25. Sampling is done WITH replacement
# (withReplacement=True), so a source row may appear more than once.
nyc_yellowtaxi_prediction_input_df = (
    spark.read.table("nyc_yellowtaxi_clean")
    .filter("puYear = 2016 AND puMonth = 6")
    .sample(withReplacement=True, fraction=0.25, seed=SEED)
)

StatementMeta(, e3ea4c39-d35b-4e52-9d12-bb1fba6b3dfe, 5, Finished, Available)

In [4]:
# Intermediate columns to strip from the scored output (the *StrIdx / *OHEnc
# columns and the assembled 'features' column); only the original input
# columns and the prediction are kept.
cols_toremove = [
    'storeAndFwdFlagStrIdx',
    'timeBinsStrIdx',
    'vendorIDStrIdx',
    'paymentTypeStrIdx',
    'vendorIDOHEnc',
    'rateCodeIdOHEnc',
    'paymentTypeOHEnc',
    'weekDayOHEnc',
    'pickupHourOHEnc',
    'storeAndFwdFlagOHEnc',
    'timeBinsOHEnc',
    'features',
    'weekDayNameStrIdx',
    'pickupHourStrIdx',
    'rateCodeIdStrIdx',
    'weekDayNameOHEnc',
]

# Score the input with the loaded model, rename the model's "prediction"
# column to "predictedtripDuration", then drop the intermediate columns.
nyc_yellowtaxi_prediction_output_df = (
    loaded_model
    .transform(nyc_yellowtaxi_prediction_input_df)
    .withColumnRenamed("prediction", "predictedtripDuration")
    .drop(*cols_toremove)
)

StatementMeta(, e3ea4c39-d35b-4e52-9d12-bb1fba6b3dfe, 6, Finished, Available)

In [5]:
# Render the scored DataFrame in the notebook for a visual check of the predictions
display(nyc_yellowtaxi_prediction_output_df)

StatementMeta(, e3ea4c39-d35b-4e52-9d12-bb1fba6b3dfe, 7, Finished, Available)

SynapseWidget(Synapse.DataFrame, 9ceb44fe-eb6c-49da-9107-f7ab5b8d4d5e)

In [6]:
# Persist the batch predictions as a Delta table. mode("overwrite") replaces
# the table's existing contents, so re-running this cell does not append
# duplicate rows.
table_name = "nyc_yellowtaxi_prediction"
(
    nyc_yellowtaxi_prediction_output_df.write
    .mode("overwrite")
    .format("delta")
    .saveAsTable(table_name)
)
print(f"Trip duration batch predictions saved to the delta table: {table_name}")

StatementMeta(, e3ea4c39-d35b-4e52-9d12-bb1fba6b3dfe, 8, Finished, Available)

Trip duration batch redictions saved to the delta table: nyc_yellowtaxi_prediction


In [7]:
%%sql
-- Preview 20 saved rows, with actual and predicted trip duration listed first
-- ahead of every column in the table
SELECT
    tripDuration,
    predictedtripDuration,
    *
FROM nyc_yellowtaxi_prediction
LIMIT 20

StatementMeta(, e3ea4c39-d35b-4e52-9d12-bb1fba6b3dfe, 9, Finished, Available)

<Spark SQL result set with 20 rows and 33 fields>