# **MLflow**

In [21]:
import pandas as pd 
from sklearn import datasets 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import mlflow
from mlflow.models import infer_signature
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

In [22]:
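# The data-preparation cell below imports the `holidays` package.
# Uncomment to install it into the kernel's environment if it is missing:
# %pip install holidays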

In [23]:
## Set the tracking URI of MLflow
# Pass the bare URL (with port) as the `uri` argument. Writing the keyword name
# inside the string literal would make "uri = http://..." the URI itself, which
# does not point at the local tracking server used by the logging cells below.
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")

In [24]:
import holidays

# --- Load -------------------------------------------------------------------
df = pd.read_csv("Cleaned_Sample_removed_cols.csv")

# --- Drop columns that are not used as model inputs --------------------------
# All unwanted columns are removed in a single pass.
columns_to_drop = [
    "ArrTime", "DepTime", "WheelsOff", "WheelsOn",
    "ActualElapsedTime", "SchdArrTime", "Quarter", "DistanceGroup",
    "Cancelled", "Diverted", "OriginCityName", "OriginStateName",
    "DestCityName", "DestState", "DestStateName", "ArrDelay",
    "DepartureDelayGroups",
    "Airline", "OriginAirport", "DestAirport", "DepDelay",
    "ArrivalDelayGroups", "ArrDel15",
    "DepDel15",
]
df = df.drop(columns=columns_to_drop)

# --- Time features -----------------------------------------------------------
# Zero-pad SchdDepTime to 4 characters so the first two are the hour and the
# last two are the minutes.
df['SchdDepTime'] = df['SchdDepTime'].astype(str).str.zfill(4)
df['SchdDepHour'] = df['SchdDepTime'].str[:2].astype(int)

# Bucket the hour into left-closed intervals: [0,6) Night, [6,12) Morning,
# [12,18) Afternoon, [18,24) Evening.
bins = [0, 6, 12, 18, 24]
labels = ['Night', 'Morning', 'Afternoon', 'Evening']
df['SchdDepTimeOfDay'] = pd.cut(df['SchdDepHour'], bins=bins, labels=labels, right=False)

# --- Holiday flag (1 if FlightDate is a US holiday, else 0) -------------------
us_holidays = holidays.US()
df['IsHoliday'] = (
    df['FlightDate']
    .apply(lambda flight_date: flight_date in us_holidays)
    .astype(int)
)

df['SchdDepMinute'] = df['SchdDepTime'].str[2:].astype(int)

# The raw date/time columns are no longer needed once the features exist.
df = df.drop(columns=['FlightDate', 'SchdDepTime'])

# --- Encoding ----------------------------------------------------------------
# One-hot encode 'SchdDepTimeOfDay' and 'Operating_Airline' (first level of
# each dropped), then cast every column to int.
df = pd.get_dummies(df, columns=['SchdDepTimeOfDay', 'Operating_Airline'], drop_first=True)
df = df.astype(int)

# --- Feature / target selection ----------------------------------------------
# NOTE(review): TaxiIn/TaxiOut look like values measured during the flight —
# confirm they are available at prediction time before relying on these scores.
selected_features = [
    'DepDelayMinutes',  # Strongest predictor
    'TaxiOut',          # Moderate impact
    'TaxiIn',           # Moderate impact
    'SchdDepHour',      # Time of day
    'SchdDepTimeOfDay_Evening',  # Evening flights
    'IsHoliday'         # Holiday indicator
]

x = df[selected_features]
y = df['ArrDelayMinutes']  # Target variable

# Preview the model inputs
print(x.head())

# The scaler is instantiated but deliberately not applied: the models below are
# trained on the unscaled features.
scaler = StandardScaler()

# Train-Test Split (80% training, 20% testing), seeded for reproducibility
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


   DepDelayMinutes  TaxiOut  TaxiIn  SchdDepHour  SchdDepTimeOfDay_Evening  \
0                0        6       7           19                         1   
1               16       22      23           20                         1   
2               38       16       4           16                         0   
3                6        9       7           18                         1   
4                1        7       2           12                         0   

   IsHoliday  
0          0  
1          0  
2          0  
3          0  
4          0  


## **RandomForest**

In [25]:
# Build a seeded 100-tree random forest and train it on the training split.
# `fit` returns the estimator itself, so construction and training are chained.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(x_train, y_train)

# Predict arrival delay for the held-out test split
y_pred = rf_model.predict(x_test)

In [26]:

# Score the random-forest predictions against the test targets
mse = mean_squared_error(y_test, y_pred)   # Mean Squared Error
mae = mean_absolute_error(y_test, y_pred)  # Mean Absolute Error
r2 = r2_score(y_test, y_pred)              # R-squared
rmse = np.sqrt(mse)                        # Root Mean Squared Error

# Report every metric on its own line
for metric_label, metric_value in (
    ("Mean Absolute Error (MAE)", mae),
    ("Mean Squared Error (MSE)", mse),
    ("Root Mean Squared Error (RMSE)", rmse),
    ("R-squared (R²)", r2),
):
    print(f"{metric_label}: {metric_value}")


Mean Absolute Error (MAE): 3.3570554856078556
Mean Squared Error (MSE): 50.125326083936045
Root Mean Squared Error (RMSE): 7.079924158063845
R-squared (R²): 0.9756644603181026


In [27]:
## MLflow tracking
mlflow.set_tracking_uri(uri = "http://127.0.0.1:5000")

## Select the MLflow experiment the run is recorded under
mlflow.set_experiment("Project Test 1")

## Start an MLflow run
with mlflow.start_run():
    # Log the evaluation metrics computed on the test split
    mlflow.log_metric("R-squared-R2", r2)
    mlflow.log_metrics({"MAE": mae, "MSE": mse, "RMSE": rmse})

    # Set a tag that we can use to remind ourselves what this run was for
    mlflow.set_tag("Training Info", "Training using Random forest without X-Scaled")

    # A handful of rows is enough to document the expected input; passing the
    # whole training split would store every training row as the example artifact.
    input_example = x_train.head(5)

    ## Infer the model signature (input/output schema) from the sample rows
    signature = infer_signature(input_example, rf_model.predict(input_example))

    ## Log the model and register it in the model registry
    model_info = mlflow.sklearn.log_model(
        sk_model = rf_model,
        artifact_path = "FlightDelayData",
        signature = signature,
        input_example = input_example,
        registered_model_name = "tracking-RandomForest-without-Scaled",
    )



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Successfully registered model 'tracking-RandomForest-without-Scaled'.
2025/02/06 11:14:57 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: tracking-RandomForest-without-Scaled, version 1
Created version '1' of model 'tracking-RandomForest-without-Scaled'.


🏃 View run upset-cow-910 at: http://127.0.0.1:5000/#/experiments/824477963141232602/runs/ab3b35d18a8a4b129d9201556d469ae9
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/824477963141232602


In [28]:
# Show the URI of the model logged above; the load-back cell below passes this
# same URI to mlflow.pyfunc.load_model.
print(model_info.model_uri)

runs:/ab3b35d18a8a4b129d9201556d469ae9/FlightDelayData


In [29]:
# Keep the column index of the feature frame `x`; the bare last expression
# displays it as the cell output.
features =x.columns
features

Index(['DepDelayMinutes', 'TaxiOut', 'TaxiIn', 'SchdDepHour',
       'SchdDepTimeOfDay_Evening', 'IsHoliday'],
      dtype='object')

In [30]:
## Load the logged model back as a generic Python function (pyfunc) model
loaded_model = mlflow.pyfunc.load_model(model_info.model_uri)
predictions = loaded_model.predict(x_test)
features = x.columns

# Comparison table: the test features plus the true target (aligned on the row
# index) and the reloaded model's prediction for each row.
result = x_test.reindex(columns=features).assign(
    actual_class=y_test,
    predicted1_class=predictions,
)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

In [32]:
# Preview the first rows of the actual-vs-predicted comparison table.
result.head(25)

Unnamed: 0,DepDelayMinutes,TaxiOut,TaxiIn,SchdDepHour,SchdDepTimeOfDay_Evening,IsHoliday,actual_class,predicted1_class
24129,0,8,6,14,0,0,0,0.0
45793,0,19,12,8,0,0,0,0.133333
20318,0,10,5,20,1,0,0,0.0
11839,50,17,7,9,0,0,34,39.76
15541,5,11,6,7,0,0,0,1.51
13040,45,81,6,18,1,0,98,109.78
1175,8,14,6,18,1,0,0,5.470667
9647,0,21,2,6,0,0,6,0.028333
46445,0,34,6,16,0,0,0,8.028333
47022,214,11,9,18,1,0,182,203.89


## **Gradient Boosting Regressor**

In [33]:
from sklearn.ensemble import GradientBoostingRegressor

# Initialize the model with default hyperparameters. The seed matches the one
# used for the train/test split and the random forest so re-running the notebook
# reproduces the same fitted model and metrics.
gbr = GradientBoostingRegressor(random_state=42)

# Train the model
gbr.fit(x_train, y_train)

# Make predictions on the held-out test split
y_pred_gbr = gbr.predict(x_test)

# Evaluate the model
mae_gbr = mean_absolute_error(y_test, y_pred_gbr)
mse_gbr = mean_squared_error(y_test, y_pred_gbr)
rmse_gbr = np.sqrt(mse_gbr)
r2_gbr = r2_score(y_test, y_pred_gbr)

print(f"Gradient Boosting Regressor - MAE: {mae_gbr}, MSE: {mse_gbr}, RMSE: {rmse_gbr}, R²: {r2_gbr}")


Gradient Boosting Regressor - MAE: 3.1320220948937068, MSE: 41.39666108871121, RMSE: 6.434023709057281, R²: 0.9799021738644561


In [36]:
## MLflow tracking
mlflow.set_tracking_uri(uri = "http://127.0.0.1:5000")

## Select the MLflow experiment the run is recorded under
mlflow.set_experiment("Project Test 1")

## Start an MLflow run
with mlflow.start_run():
    # Log the evaluation metrics computed on the test split
    mlflow.log_metric("R-squared-R2", r2_gbr)
    mlflow.log_metrics({"MAE": mae_gbr, "MSE": mse_gbr, "RMSE": rmse_gbr})

    # Set a tag that we can use to remind ourselves what this run was for
    mlflow.set_tag("Training Info", "Training using GradientBoosting regressor")

    # A handful of rows is enough to document the expected input; passing the
    # whole training split would store every training row as the example artifact.
    input_example = x_train.head(5)

    ## Infer the model signature (input/output schema) from the sample rows
    signature = infer_signature(input_example, gbr.predict(input_example))

    ## Log the model and register it in the model registry
    model_info = mlflow.sklearn.log_model(
        sk_model = gbr,
        artifact_path = "FlightDelayData",
        signature = signature,
        input_example = input_example,
        registered_model_name = "tracking-GradientBoosting regressor",
    )



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'tracking-GradientBoosting regressor' already exists. Creating a new version of this model...
2025/02/06 11:22:33 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: tracking-GradientBoosting regressor, version 2


🏃 View run righteous-horse-503 at: http://127.0.0.1:5000/#/experiments/824477963141232602/runs/2c08d9a7b6b54a319ac47cbdf60bfbbc
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/824477963141232602


Created version '2' of model 'tracking-GradientBoosting regressor'.


In [38]:
## Load the logged model back as a generic Python function (pyfunc) model
loaded_model = mlflow.pyfunc.load_model(model_info.model_uri)
predictions = loaded_model.predict(x_test)
features = x.columns

# Comparison table: the test features plus the true target (aligned on the row
# index) and the reloaded model's prediction for each row.
result = x_test.reindex(columns=features).assign(
    actual_class=y_test,
    predicted1_class=predictions,
)
result.head(20)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Unnamed: 0,DepDelayMinutes,TaxiOut,TaxiIn,SchdDepHour,SchdDepTimeOfDay_Evening,IsHoliday,actual_class,predicted1_class
24129,0,8,6,14,0,0,0,0.024363
45793,0,19,12,8,0,0,0,1.203843
20318,0,10,5,20,1,0,0,-0.014069
11839,50,17,7,9,0,0,34,40.854284
15541,5,11,6,7,0,0,0,1.456845
13040,45,81,6,18,1,0,98,100.608695
1175,8,14,6,18,1,0,0,2.760737
9647,0,21,2,6,0,0,6,0.836109
46445,0,34,6,16,0,0,0,5.168248
47022,214,11,9,18,1,0,182,204.334765


### **Gradient Boosting Regressor: Hyperparameter Tuning**

In [47]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

# Define the hyperparameter grid (3 * 4 * 3 * 3 * 3 * 3 = 972 candidates)
param_grid = {
    'n_estimators': [100, 300, 500],  # Number of boosting stages
    'learning_rate': [0.01, 0.05, 0.1, 0.2],  # Controls shrinkage
    'max_depth': [3, 5, 7],  # Maximum depth of trees
    'min_samples_split': [2, 5, 10],  # Minimum samples to split an internal node
    'min_samples_leaf': [1, 3, 5],  # Minimum samples per leaf
    'subsample': [0.7, 0.8, 1.0]  # Fraction of samples used for training each tree
}

# Initialize the base estimator with a fixed seed. The grid includes
# subsample < 1.0, which draws random row subsets per tree; without a seed the
# selected parameters and scores could change from run to run.
gbr = GradientBoostingRegressor(random_state=42)

# Exhaustive 5-fold cross-validated search scored by R², using all CPU cores.
# This is the expensive cell of the notebook: 972 candidates x 5 folds.
grid_search = GridSearchCV(gbr, param_grid, scoring='r2', cv=5, n_jobs=-1, verbose=1)
grid_search.fit(x_train, y_train)

# Best parameter combination and the estimator refit with it
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Make predictions on the held-out test split
y_pred_gbr = best_model.predict(x_test)

# Evaluate the tuned model
mae_gbr = mean_absolute_error(y_test, y_pred_gbr)
mse_gbr = mean_squared_error(y_test, y_pred_gbr)
rmse_gbr = np.sqrt(mse_gbr)
r2_gbr = r2_score(y_test, y_pred_gbr)

print(f"Best Parameters: {best_params}")
print(f"Gradient Boosting Regressor (Tuned) - MAE: {mae_gbr}, MSE: {mse_gbr}, RMSE: {rmse_gbr}, R²: {r2_gbr}")

Fitting 5 folds for each of 972 candidates, totalling 4860 fits
Best Parameters: {'learning_rate': 0.05, 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 300, 'subsample': 0.7}
Gradient Boosting Regressor (Tuned) - MAE: 3.091091926659406, MSE: 41.133844702439696, RMSE: 6.413567236915794, R²: 0.9800297696148849


In [48]:
## MLflow tracking
mlflow.set_tracking_uri(uri = "http://127.0.0.1:5000")

## Select the MLflow experiment the run is recorded under
mlflow.set_experiment("Project Test 1")

## Start an MLflow run
with mlflow.start_run():
    # Log the best hyperparameters found by the grid search
    mlflow.log_params(grid_search.best_params_)

    # Log the evaluation metrics computed on the test split
    mlflow.log_metric("R-squared-R2", r2_gbr)
    mlflow.log_metrics({"MAE": mae_gbr, "MSE": mse_gbr, "RMSE": rmse_gbr})

    # Set a tag that we can use to remind ourselves what this run was for
    mlflow.set_tag("Training Info", "Training using Hyperparam GradientBoostingRegressor")

    # A handful of rows is enough to document the expected input; passing the
    # whole training split would store every training row as the example artifact.
    input_example = x_train.head(5)

    ## Infer the model signature (input/output schema) from the sample rows
    signature = infer_signature(input_example, best_model.predict(input_example))

    ## Log the model and register it in the model registry
    model_info = mlflow.sklearn.log_model(
        sk_model = best_model,
        artifact_path = "FlightDelayData",
        signature = signature,
        input_example = input_example,
        registered_model_name = "GradientBoostingRegressor-Hyperparam",
    )



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Successfully registered model 'GradientBoostingRegressor-Hyperparam'.
2025/02/06 14:30:33 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: GradientBoostingRegressor-Hyperparam, version 1


🏃 View run melodic-smelt-245 at: http://127.0.0.1:5000/#/experiments/824477963141232602/runs/174fe5791faf4bc485313df4e90d069a
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/824477963141232602


Created version '1' of model 'GradientBoostingRegressor-Hyperparam'.


## **ElasticNet**

In [39]:
from sklearn.linear_model import ElasticNet

# Construct an ElasticNet with default settings and train it in one step
# (`fit` returns the estimator itself).
elastic_net = ElasticNet().fit(x_train, y_train)

# Predict arrival delay for the held-out test split
y_pred_en = elastic_net.predict(x_test)

# Regression metrics on the test split
mse_en = mean_squared_error(y_test, y_pred_en)
mae_en = mean_absolute_error(y_test, y_pred_en)
r2_en = r2_score(y_test, y_pred_en)
rmse_en = np.sqrt(mse_en)

print(f"ElasticNet - MAE: {mae_en}, MSE: {mse_en}, RMSE: {rmse_en}, R²: {r2_en}")


ElasticNet - MAE: 4.80919800880946, MSE: 57.77904995118301, RMSE: 7.601253182941811, R²: 0.9719486241243633


In [40]:
## MLflow tracking
mlflow.set_tracking_uri(uri = "http://127.0.0.1:5000")

## Select the MLflow experiment the run is recorded under
mlflow.set_experiment("Project Test 1")

## Start an MLflow run
with mlflow.start_run():
    # Log the evaluation metrics computed on the test split
    mlflow.log_metric("R-squared-R2", r2_en)
    mlflow.log_metrics({"MAE": mae_en, "MSE": mse_en, "RMSE": rmse_en})

    # Set a tag that we can use to remind ourselves what this run was for
    mlflow.set_tag("Training Info", "Training using ElasticNet")

    # A handful of rows is enough to document the expected input; passing the
    # whole training split would store every training row as the example artifact.
    input_example = x_train.head(5)

    ## Infer the model signature (input/output schema) from the sample rows
    signature = infer_signature(input_example, elastic_net.predict(input_example))

    ## Log the model and register it in the model registry
    model_info = mlflow.sklearn.log_model(
        sk_model = elastic_net,
        artifact_path = "FlightDelayData",
        signature = signature,
        input_example = input_example,
        registered_model_name = "tracking-ElasticNet",
    )



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Successfully registered model 'tracking-ElasticNet'.
2025/02/06 11:24:40 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: tracking-ElasticNet, version 1


🏃 View run abrasive-mule-309 at: http://127.0.0.1:5000/#/experiments/824477963141232602/runs/b222168482b64fd2aa487ed58de632d3
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/824477963141232602


Created version '1' of model 'tracking-ElasticNet'.


In [41]:
## Load the logged model back as a generic Python function (pyfunc) model
loaded_model = mlflow.pyfunc.load_model(model_info.model_uri)
predictions = loaded_model.predict(x_test)
features = x.columns

# Comparison table: the test features plus the true target (aligned on the row
# index) and the reloaded model's prediction for each row.
result = x_test.reindex(columns=features).assign(
    actual_class=y_test,
    predicted1_class=predictions,
)
result.head(20)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Unnamed: 0,DepDelayMinutes,TaxiOut,TaxiIn,SchdDepHour,SchdDepTimeOfDay_Evening,IsHoliday,actual_class,predicted1_class
24129,0,8,6,14,0,0,0,-4.766959
45793,0,19,12,8,0,0,0,3.74832
20318,0,10,5,20,1,0,0,-4.262405
11839,50,17,7,9,0,0,34,48.822404
15541,5,11,6,7,0,0,0,1.77604
13040,45,81,6,18,1,0,98,77.073617
1175,8,14,6,18,1,0,0,6.047704
9647,0,21,2,6,0,0,6,0.484791
46445,0,34,6,16,0,0,0,8.881218
47022,214,11,9,18,1,0,182,204.86377


### **ElasticNet: Hyperparameter Tuning**

In [42]:
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Search space: 5 regularization strengths x 5 L1/L2 mixing ratios
param_grid = dict(
    alpha=[0.01, 0.1, 1, 10, 100],        # Regularization strength
    l1_ratio=[0.1, 0.5, 0.7, 0.9, 1.0],   # Mix of L1 and L2 regularization
)

# Base estimator handed to the search
elastic_net = ElasticNet()

# 5-fold cross-validated grid search scored by R², using all CPU cores
# (`fit` returns the search object itself).
grid_search = GridSearchCV(
    estimator=elastic_net,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
).fit(x_train, y_train)

# Winning parameter combination and the estimator refit with it
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Predict arrival delay for the held-out test split
y_pred_en = best_model.predict(x_test)

# Regression metrics on the test split
mse_en = mean_squared_error(y_test, y_pred_en)
mae_en = mean_absolute_error(y_test, y_pred_en)
r2_en = r2_score(y_test, y_pred_en)
rmse_en = np.sqrt(mse_en)

print(f"Best Parameters: {best_params}")
print(f"ElasticNet (Tuned) - MAE: {mae_en}, MSE: {mse_en}, RMSE: {rmse_en}, R²: {r2_en}")


Best Parameters: {'alpha': 0.01, 'l1_ratio': 0.1}
ElasticNet (Tuned) - MAE: 4.844377907497688, MSE: 57.63226826060817, RMSE: 7.591591945080305, R²: 0.9720198857386934


In [45]:
## MLflow tracking
mlflow.set_tracking_uri(uri = "http://127.0.0.1:5000")

## Select the MLflow experiment the run is recorded under
mlflow.set_experiment("Project Test 1")

## Start an MLflow run
with mlflow.start_run():
    # Log the best hyperparameters found by the grid search
    mlflow.log_params(grid_search.best_params_)

    # Log the evaluation metrics computed on the test split
    mlflow.log_metric("R-squared-R2", r2_en)
    mlflow.log_metrics({"MAE": mae_en, "MSE": mse_en, "RMSE": rmse_en})

    # Set a tag that we can use to remind ourselves what this run was for
    mlflow.set_tag("Training Info", "Training using Hyperparam ElasticNet")

    # A handful of rows is enough to document the expected input; passing the
    # whole training split would store every training row as the example artifact.
    input_example = x_train.head(5)

    ## Infer the model signature (input/output schema) from the sample rows
    signature = infer_signature(input_example, best_model.predict(input_example))

    ## Log the model and register it in the model registry
    model_info = mlflow.sklearn.log_model(
        sk_model = best_model,
        artifact_path = "FlightDelayData",
        signature = signature,
        input_example = input_example,
        registered_model_name = "ElasticNet-Hyperparam",
    )



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Successfully registered model 'ElasticNet-Hyperparam'.
2025/02/06 11:32:15 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: ElasticNet-Hyperparam, version 1


🏃 View run masked-dove-846 at: http://127.0.0.1:5000/#/experiments/824477963141232602/runs/7ba897410d8b4efba5a2192c970f5af2
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/824477963141232602


Created version '1' of model 'ElasticNet-Hyperparam'.


In [46]:
## Load the logged model back as a generic Python function (pyfunc) model
loaded_model = mlflow.pyfunc.load_model(model_info.model_uri)
predictions = loaded_model.predict(x_test)
features = x.columns

# Comparison table: the test features plus the true target (aligned on the row
# index) and the reloaded model's prediction for each row.
result = x_test.reindex(columns=features).assign(
    actual_class=y_test,
    predicted1_class=predictions,
)
result.head(20)

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Unnamed: 0,DepDelayMinutes,TaxiOut,TaxiIn,SchdDepHour,SchdDepTimeOfDay_Evening,IsHoliday,actual_class,predicted1_class
24129,0,8,6,14,0,0,0,-4.780607
45793,0,19,12,8,0,0,0,3.925112
20318,0,10,5,20,1,0,0,-4.63905
11839,50,17,7,9,0,0,34,48.919911
15541,5,11,6,7,0,0,0,1.779467
13040,45,81,6,18,1,0,98,77.324015
1175,8,14,6,18,1,0,0,5.723429
9647,0,21,2,6,0,0,6,0.494949
46445,0,34,6,16,0,0,0,9.086098
47022,214,11,9,18,1,0,182,204.675555
