3. <b>Experimentation Workflow:</b><br>
    Here is a basic workflow for conducting model experiments using MLflow:<br>
    -  <b>Import Libraries:</b><br>
       Import the necessary libraries and MLflow.

In [33]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score
from sklearn.metrics import accuracy_score, f1_score
# plot_confusion_matrix,

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
import argparse

In [34]:
# Read the training table from disk and preview its first rows.
# The preview below shows four columns: "Unnamed: 0", "PC1", "PC2" and "price".
df = pd.read_csv(filepath_or_buffer=r"../data/train_v6.csv")
df.head(5)

Unnamed: 0,PC1,PC2,price
0,-0.366039,0.476383,9279.0
1,4.970104,0.26879,22563.0
2,-0.78673,0.023457,9995.0
3,-0.441181,0.414536,11259.0
4,1.863872,0.750249,15750.0


In [35]:
# Split the frame into the feature matrix X and the target vector y.
# "Unnamed: 0" (visible in the df.head() preview) is the row index that was saved
# into the CSV; it is an identifier, not a predictor, so it is excluded from the
# features together with the target. errors="ignore" keeps this cell working if
# the CSV is later re-exported without that column.
X = df.drop(columns=["price", "Unnamed: 0"], errors="ignore")
y = df["price"]

In [36]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=33,
)

- **Experiment Tracking**:<br>
  Start a new MLflow experiment to track your modeling efforts. You can create a new experiment or use an existing one.

In [37]:
# Select the active MLflow experiment by name; per the log output below, it did
# not exist yet and was created by this call.
# NOTE(review): the name is spelled "PricingStartegy" (sic). Every model cell
# further down calls set_experiment() with its own name before starting a run,
# so no run in the visible cells is started under this experiment -- confirm
# whether it is still needed.
mlflow.set_experiment("PricingStartegy")

2023/11/07 15:28:47 INFO mlflow.tracking.fluent: Experiment with name 'PricingStartegy' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///C:/Users/SRA/Desktop/backup/C/MLgrit/Generative_Artificial_Intelligence/Car_Price_Strategy/Car_Price_LinearRegression_Case_Study/b_Model/Experimentation/code/mlruns/922765404485738083', creation_time=1699349327439, experiment_id='922765404485738083', last_update_time=1699349327439, lifecycle_stage='active', name='PricingStartegy', tags={}>

- **Model Training and Logging**:<br>
    Train your regression model while logging relevant parameters and metrics.

# LinearRegression

In [56]:
mlflow.set_experiment("LinearRegression")

with mlflow.start_run():
    model = LinearRegression()

    # One value per hyper-parameter, so exactly one candidate is evaluated.
    # Add more values to any list to actually tune the model.
    param_grid = {
        "fit_intercept": [True],
        "copy_X": [True],
        "n_jobs": [None],
        "positive": [False],
    }

    # GridSearchCV tries every combination in the grid.
    # RandomizedSearchCV tries a random subset of combinations and is the
    # cheaper choice for large search spaces or large data.
    # This whole step is hyper-parameter tuning / optimisation.
    grid = GridSearchCV(
        model,
        param_grid=param_grid,
        scoring="neg_mean_squared_error",
        cv=5,  # Number of cross-validation folds
    )
    grid.fit(X_train, y_train)

    # Log the winning hyper-parameters
    mlflow.log_params(grid.best_params_)

    # Predict on the held-out test set using the best estimator from the search
    y_pred = grid.best_estimator_.predict(X_test)

    # Evaluation metrics on the test set.
    # RMSE is computed as sqrt(MSE): the `squared=False` keyword of
    # mean_squared_error is deprecated since scikit-learn 1.4 and rejected by
    # later releases, while this form works on every version.
    rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
    mae = mean_absolute_error(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    mlflow.log_metrics({
        "RMSE": rmse,
        "MAE": mae,
        "MAPE": mape,
        "R2_SCORE": r2,
    })

    # Log the fitted best model as an MLflow artifact
    mlflow.sklearn.log_model(grid.best_estimator_, "LinearRegression")


In [42]:
# LinearRegression?

# KNeighborsRegressor

In [59]:
mlflow.set_experiment("KNeighborsRegressor")

with mlflow.start_run():
    model = KNeighborsRegressor()

    param_distributions = {
        "n_neighbors": [3],
        # 'uniform': every neighbour's vote counts equally;
        # 'distance': closer neighbours get a larger weight.
        "weights": ['uniform', 'distance'],
        "algorithm": ['auto'],
        "leaf_size": [30],
        "p": [2],
        "metric": ['minkowski'],
        "metric_params": [None],
        "n_jobs": [None],
    }

    # GridSearchCV tries every combination.
    # RandomizedSearchCV samples random combinations; it is the cheaper choice
    # for large search spaces or large data.
    grid = RandomizedSearchCV(
        model,
        param_distributions=param_distributions,
        scoring="neg_mean_squared_error",
        cv=5,  # Number of cross-validation folds
        random_state=33,  # seed the sampling so reruns pick the same candidates
    )
    grid.fit(X_train, y_train)

    # Log the winning hyper-parameters
    mlflow.log_params(grid.best_params_)

    # Predict on the held-out test set using the best estimator from the search
    y_pred = grid.best_estimator_.predict(X_test)

    # Best mean cross-validated score (negative MSE, so closer to 0 is better)
    b_score = grid.best_score_
    print(b_score)

    # Evaluation metrics on the test set.
    # RMSE is computed as sqrt(MSE): the `squared=False` keyword of
    # mean_squared_error is deprecated since scikit-learn 1.4 and rejected by
    # later releases, while this form works on every version.
    rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
    mae = mean_absolute_error(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # "MAPE" (not "MAPE0") so the key matches the LinearRegression experiment
    # and runs can be compared on the same metric name.
    mlflow.log_metrics({
        "RMSE": rmse,
        "MAE": mae,
        "MAPE": mape,
        "R2_SCORE": r2,
    })

    # Log the *fitted* best estimator. `model` itself is never fitted: the
    # search trains clones of it, so logging `model` would store an unusable,
    # untrained estimator.
    mlflow.sklearn.log_model(grid.best_estimator_, 'knn')


In [44]:
# KNeighborsRegressor?

# Decision Tree Regression

In [55]:
mlflow.set_experiment("DecisionTreeRegressor")

with mlflow.start_run():
    model = DecisionTreeRegressor()

    param_distributions = {
        "criterion": ["squared_error", "friedman_mse", "absolute_error"],
        "splitter": ["best", "random"],
        "max_depth": [None],
        "min_samples_split": [2],
        "min_samples_leaf": [1],
        "min_weight_fraction_leaf": [0.0],
        # "auto" was dropped from this list: current scikit-learn releases
        # reject it for decision trees (candidates using it fail to fit), and
        # for regressors it meant the same as None (use all features).
        "max_features": [None, "sqrt", "log2"],
        "random_state": [None],
        "max_leaf_nodes": [None],
        "min_impurity_decrease": [0.0],
        "ccp_alpha": [0.0],
    }

    # GridSearchCV tries every combination.
    # RandomizedSearchCV samples random combinations; it is the cheaper choice
    # for large search spaces or large data.
    grid = RandomizedSearchCV(
        model,
        param_distributions=param_distributions,
        scoring="neg_mean_squared_error",
        cv=5,  # Number of cross-validation folds
        random_state=33,  # seed the sampling so reruns pick the same candidates
    )
    grid.fit(X_train, y_train)

    # Log the winning hyper-parameters
    mlflow.log_params(grid.best_params_)

    # Predict on the held-out test set using the best estimator from the search
    y_pred = grid.best_estimator_.predict(X_test)

    # Best mean cross-validated score (negative MSE, so closer to 0 is better)
    b_score = grid.best_score_
    print(b_score)

    # Evaluation metrics on the test set.
    # RMSE is computed as sqrt(MSE): the `squared=False` keyword of
    # mean_squared_error is deprecated since scikit-learn 1.4 and rejected by
    # later releases, while this form works on every version.
    rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
    mae = mean_absolute_error(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # "MAPE" (not "MAPE0") so the key matches the LinearRegression experiment
    # and runs can be compared on the same metric name.
    mlflow.log_metrics({
        "RMSE": rmse,
        "MAE": mae,
        "MAPE": mape,
        "R2_SCORE": r2,
    })

    # Log the *fitted* best estimator. `model` itself is never fitted: the
    # search trains clones of it, so logging `model` would store an unusable,
    # untrained estimator.
    mlflow.sklearn.log_model(grid.best_estimator_, 'DecisionTreeRegressor')

In [54]:
# DecisionTreeRegressor?

# RandomForest

In [57]:
mlflow.set_experiment("RandomForestRegressor")

with mlflow.start_run():
    model = RandomForestRegressor()

    # Only n_estimators has more than one value, so the search compares four
    # forest sizes with every other setting held fixed.
    param_distributions = {
        "n_estimators": [64, 100, 200, 300],
        "criterion": ['squared_error'],
        "max_depth": [None],
        "min_samples_split": [2],
        "min_samples_leaf": [1],
        "min_weight_fraction_leaf": [0.0],
        "max_features": [1.0],
        "max_leaf_nodes": [None],
        "min_impurity_decrease": [0.0],
        "bootstrap": [True],
        "oob_score": [False],
        "n_jobs": [None],
        "random_state": [None],
        "verbose": [0],
        "warm_start": [False],
        "ccp_alpha": [0.0],
        "max_samples": [None],
    }

    # GridSearchCV tries every combination.
    # RandomizedSearchCV samples random combinations; it is the cheaper choice
    # for large search spaces or large data.
    grid = RandomizedSearchCV(
        model,
        param_distributions=param_distributions,
        scoring="neg_mean_squared_error",
        cv=5,  # Number of cross-validation folds
        random_state=33,  # seed the sampling so reruns pick the same candidates
    )
    grid.fit(X_train, y_train)

    # Log the winning hyper-parameters
    mlflow.log_params(grid.best_params_)

    # Predict on the held-out test set using the best estimator from the search
    y_pred = grid.best_estimator_.predict(X_test)

    # Best mean cross-validated score (negative MSE, so closer to 0 is better)
    b_score = grid.best_score_
    print(b_score)

    # Evaluation metrics on the test set.
    # RMSE is computed as sqrt(MSE): the `squared=False` keyword of
    # mean_squared_error is deprecated since scikit-learn 1.4 and rejected by
    # later releases, while this form works on every version.
    rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
    mae = mean_absolute_error(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # "MAPE" (not "MAPE0") so the key matches the LinearRegression experiment
    # and runs can be compared on the same metric name.
    mlflow.log_metrics({
        "RMSE": rmse,
        "MAE": mae,
        "MAPE": mape,
        "R2_SCORE": r2,
    })

    # Log the *fitted* best estimator. `model` itself is never fitted: the
    # search trains clones of it, so logging `model` would store an unusable,
    # untrained estimator.
    mlflow.sklearn.log_model(grid.best_estimator_, 'random_forest')


In [18]:
# RandomForestRegressor?

# Support Vector Regression (SVR)

In [58]:
mlflow.set_experiment("SVR")

with mlflow.start_run():
    model = SVR()

    param_distributions = {
        "kernel": ['linear', 'poly', 'rbf', 'sigmoid'],
        "degree": [3, 5, 6, 7, 8],
        "gamma": ['scale', 'auto'],
        "coef0": [0.0],
        "tol": [0.001],
        "C": [1.0],
        "epsilon": [0.1],
        "shrinking": [True],
        "cache_size": [200],
        "verbose": [False],
        "max_iter": [-1],
    }

    # GridSearchCV tries every combination.
    # RandomizedSearchCV samples random combinations; it is the cheaper choice
    # for large search spaces or large data.
    grid = RandomizedSearchCV(
        model,
        param_distributions=param_distributions,
        # "accuracy" is a classification metric and is undefined for a
        # continuous target such as price; use the same regression score as
        # the other model cells so candidates are ranked meaningfully.
        scoring="neg_mean_squared_error",
        cv=5,  # Number of cross-validation folds
        random_state=33,  # seed the sampling so reruns pick the same candidates
    )
    grid.fit(X_train, y_train)

    # Log the winning hyper-parameters
    mlflow.log_params(grid.best_params_)

    # Predict on the held-out test set using the best estimator from the search
    y_pred = grid.best_estimator_.predict(X_test)

    # Best mean cross-validated score (negative MSE, so closer to 0 is better)
    b_score = grid.best_score_
    print(b_score)

    # Evaluation metrics on the test set.
    # RMSE is computed as sqrt(MSE): the `squared=False` keyword of
    # mean_squared_error is deprecated since scikit-learn 1.4 and rejected by
    # later releases, while this form works on every version.
    rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
    mae = mean_absolute_error(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # "MAPE" (not "MAPE0") so the key matches the LinearRegression experiment
    # and runs can be compared on the same metric name.
    mlflow.log_metrics({
        "RMSE": rmse,
        "MAE": mae,
        "MAPE": mape,
        "R2_SCORE": r2,
    })

    # Log the *fitted* best estimator (the search trains clones, so `model`
    # itself is never fitted). The artifact path is 'svr'; the previous
    # 'random_forest' was a copy-paste leftover that mislabelled the model.
    mlflow.sklearn.log_model(grid.best_estimator_, 'svr')

In [55]:
# SVR?