## ModelTrainingPipeline
This notebook builds the ModelTrainingPipeline, submits it to Azure ML, and then shows how to fetch and analyse the outputs of a completed run.

In [None]:
# Personal prefix: it is interpolated into the experiment name and into the
# `model_name` pipeline parameter further down, so set it to your own name.
your_name = "RuggieroS"

In [None]:
import os

# GitPython reads GIT_PYTHON_REFRESH while the `git` package is being imported
# (that is when it looks for the git executable), so the variable must be set
# *before* `import git`; setting it afterwards has no effect.
os.environ["GIT_PYTHON_REFRESH"] = "quiet"

import git

# Find the repository that contains the current directory (searching parent
# directories) and make its root the working directory, so that repo-relative
# imports and paths used in the following cells resolve.
repo = git.Repo(os.getcwd(), search_parent_directories=True)
os.chdir(repo.working_tree_dir)

print(os.getcwd())

In [None]:
from azureml.core.compute import ComputeTarget
from azureml.core import Workspace, Experiment, Environment, RunConfiguration

from Pipelines.AML_PipelinesFactory import AML_PipelinesFactory

In [None]:
# Workspace: loaded from the local Azure ML config file.
workspace = Workspace.from_config()

# Environment: built from the repository's conda specification and
# registered in the workspace.
env = Environment.from_conda_specification(name=".venv", file_path="./environment.yml")
env.register(workspace=workspace)

# Compute target on which the pipeline steps will run.
compute_target = ComputeTarget(workspace=workspace, name="EF-MLOPS-Course")

# Run configuration: ties the environment and the compute target together.
run_config = RunConfiguration()
run_config.environment = env
run_config.target = compute_target

# Datastore: the workspace default.
datastore = workspace.get_default_datastore()

## Define the pipeline

In [None]:
# Arguments handed to the factory; collected in one place so they are easy to
# inspect or tweak before the pipeline is built.
pipeline_factory_kwargs = {
    "pipeline_type": "ModelTrainingPipeline",
    "run_config": run_config,
    "datastore": datastore,
    "workspace": workspace,
    "allow_reuse_all_components": True,
}

pipeline = AML_PipelinesFactory.execute(**pipeline_factory_kwargs)

## Run the pipeline

In [None]:
# Pipeline parameters, grouped by the step that consumes them.
training_pipeline_parameters = {
    # DataFetcher
    "input_dataset_name": "BikeSharingPredictionsHours",
    "target_feat": "count",
    # DataSplitter
    "test_size": 0.25,
    "random_state_splitting": 1,
    # PipelineTrainer
    "method_missing_value": "mean",
    "categorical_class_minimum_occurrences": 0.25,
    "model_params": "{'n_estimators':100}",
    "random_state_training": 1,
    # ModelPusher
    "model_name": f"{your_name}_Model-EF-MLOPS",
    "skip_pushing_model": False,
}

# The experiment is namespaced with `your_name`.
experiment = Experiment(workspace, f"{your_name}_ModelTrainingPipeline")

# Submit the pipeline and block until the run has finished.
pipeline_experiment = experiment.submit(
    pipeline,
    tags={"war_time": False},
    pipeline_parameters=training_pipeline_parameters,
)
pipeline_experiment.wait_for_completion()

## Fetching and processing results from an Azure Run

In [None]:
import os

from Notebooks.utils import fetch_uploaded_files_from_run
import pandas as pd

# Build the local output path with os.path.join instead of a hard-coded
# backslash: 'data\output_folder_data' contains the invalid escape sequence
# '\o' and, on non-Windows systems, would name a single entry containing a
# literal backslash rather than a folder inside `data`.
output_file_path = os.path.join('data', 'output_folder_data')

# Download the pickled test-results frame uploaded by the given run.
path_to_data = fetch_uploaded_files_from_run(run_id='368c596b-1f3f-4b22-a606-b1331826452f',
                                           dir_data_to_fetch='ModelEvaluator_plots/df_results_test.pickle',
                                           output_file_path=output_file_path)

# NOTE(review): unpickling executes arbitrary code; only load pickles that come
# from runs you trust.
df_results_test = pd.read_pickle(path_to_data)
df_results_test.head()

In [None]:
# Example: run some analysis on the fetched data
import plotly.express as px

# Box plot of the prediction error per year.
fig = px.box(data_frame=df_results_test, x='year', y='prediction_error')

# Horizontal reference line at zero; the returned figure is the cell output.
fig.add_hline(y=0)

In [None]:
# Example of how to fetch the files a step wrote to its output
import os

from Notebooks.utils import fetch_output_from_run_id

# Build the local output path with os.path.join instead of a hard-coded
# backslash: 'data\output_folder_model' contains the invalid escape sequence
# '\o' and, on non-Windows systems, would name a single entry containing a
# literal backslash rather than a folder inside `data`.
output_dir = os.path.join('data', 'output_folder_model')

model_folder = fetch_output_from_run_id(run_id='40f67d87-dd14-485c-8a67-acd7d797c77a',
                                        data_to_fetch='model_pipeline',
                                        output_dir=output_dir)