In [37]:
import mlflow
import mlflow.sklearn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import shapiro

In [4]:
df = pd.read_csv('/workspaces/mlops-zoomcamp/project/data/flight_dataset.csv')

In [6]:
df.head()

Unnamed: 0,Airline,Source,Destination,Total_Stops,Price,Date,Month,Year,Dep_hours,Dep_min,Arrival_hours,Arrival_min,Duration_hours,Duration_min
0,IndiGo,Banglore,New Delhi,0,3897,24,3,2019,22,20,1,10,2,50
1,Air India,Kolkata,Banglore,2,7662,1,5,2019,5,50,13,15,7,25
2,Jet Airways,Delhi,Cochin,2,13882,9,6,2019,9,25,4,25,19,0
3,IndiGo,Kolkata,Banglore,1,6218,12,5,2019,18,5,23,30,5,25
4,IndiGo,Banglore,New Delhi,1,13302,1,3,2019,16,50,21,35,4,45


Check Is null values

In [8]:
df.isnull().sum()

Airline           0
Source            0
Destination       0
Total_Stops       0
Price             0
Date              0
Month             0
Year              0
Dep_hours         0
Dep_min           0
Arrival_hours     0
Arrival_min       0
Duration_hours    0
Duration_min      0
dtype: int64

Check Duplicate values

In [10]:
df.duplicated().sum()

np.int64(222)

Drop Duplicates

In [13]:
df.drop_duplicates(inplace = True)

In [23]:
df.count()

Airline           10461
Source            10461
Destination       10461
Total_Stops       10461
Price             10461
Date              10461
Month             10461
Year              10461
Dep_hours         10461
Dep_min           10461
Arrival_hours     10461
Arrival_min       10461
Duration_hours    10461
Duration_min      10461
dtype: int64

Remove Outliers


            Outliers were detected and removed using the Interquartile Range (IQR) method for numerical columns

In [20]:
def remove_outliers_iqr(df,numerical_columns):
    for column in numerical_columns:
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df=df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return df

numerical_columns = ['Price', 'Dep_hours', 'Dep_min', 'Arrival_hours', 'Arrival_min', 'Duration_hours', 'Duration_min']
df_clean = remove_outliers_iqr(df, numerical_columns)

In [22]:
df_clean.count()

Airline           10301
Source            10301
Destination       10301
Total_Stops       10301
Price             10301
Date              10301
Month             10301
Year              10301
Dep_hours         10301
Dep_min           10301
Arrival_hours     10301
Arrival_min       10301
Duration_hours    10301
Duration_min      10301
dtype: int64

In [38]:
import mlflow
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("nyc-taxi-experiment")

2024/07/02 08:18:14 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2024/07/02 08:18:14 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migration] Running upgrade 7ac759974ad8 -> 89d4b8295536, create latest metrics table
INFO  [89d4b8295536_create_latest_metrics_table_py] Migration complete!
INFO  

<Experiment: artifact_location='/workspaces/mlops-zoomcamp/project/mlruns/1', creation_time=1719908297003, experiment_id='1', last_update_time=1719908297003, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

check for data normality


Columns that were not normally distributed were transformed using log, square root, or Box-Cox transformations

In [26]:
# Check normal distribution and apply transformations
def check_normality_transform(df, column):
    stat, p = shapiro(df[column])
    if p < 0.05:
        # Not normally distributed
        # Apply log transformation if all values are positive
        if all(df[column] > 0):
            df[column] = np.log(df[column])
        # Apply square root transformation if values are non-negative
        elif all(df[column] >= 0):
            df[column] = np.sqrt(df[column])
        else:
            # Apply Box-Cox transformation
            pt = PowerTransformer(method='box-cox')
            df[column] = pt.fit_transform(df[[column]])
    return df

for column in numerical_columns:
    df = check_normality_transform(df, column)

In [27]:
# Features and target
X = df.drop(columns=['Price'])
y = df['Price']

In [28]:
# Encoding categorical variables
categorical_features = ['Airline', 'Source', 'Destination']
encoder = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_features)
    ],
    remainder='passthrough'
)

In [29]:
# Scaling
scaler = StandardScaler()

In [30]:
# Preprocessing pipeline
preprocessor = Pipeline(steps=[
    ('encoder', encoder),
    ('scaler', scaler)
])

In [31]:
# Preprocess the features
X_processed = preprocessor.fit_transform(X)

In [32]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

In [33]:
# Model setup
model = RandomForestRegressor(
    n_estimators=200,  # Number of trees
    max_depth=20,      # Maximum depth of the tree
    min_samples_split=5,  # Minimum number of samples required to split an internal node
    min_samples_leaf=2,   # Minimum number of samples required to be at a leaf node
    random_state=42
)

In [39]:
# Start an MLflow run
with mlflow.start_run():
    # Fit the model
    model.fit(X_train, y_train)
    
    # Log model
    mlflow.sklearn.log_model(model, "model")

    # Predictions
    y_pred = model.predict(X_test)

    # Evaluate
    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # Mean Absolute Percentage Error
    def mean_absolute_percentage_error(y_true, y_pred): 
        return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

    mape = mean_absolute_percentage_error(y_test, y_pred)

    # Log metrics
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("r2", r2)
    mlflow.log_metric("mape", mape)

    # Log parameters
    mlflow.log_param("n_estimators", 200)
    mlflow.log_param("max_depth", 20)
    mlflow.log_param("min_samples_split", 5)
    mlflow.log_param("min_samples_leaf", 2)
    
    print(f"RMSE: {rmse}")
    print(f"MAE: {mae}")
    print(f"R²: {r2}")
    print(f"MAPE: {mape}%")

RMSE: 0.020225815481500774
MAE: 0.013463981948057127
R²: 0.8779253231918751
MAPE: 0.613919246731386%


In [35]:
# Predictions
y_pred = model.predict(X_test)

In [36]:
# Evaluate
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Mean Absolute Percentage Error
def mean_absolute_percentage_error(y_true, y_pred): 
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

mape = mean_absolute_percentage_error(y_test, y_pred)

print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R²: {r2}")
print(f"MAPE: {mape}%")

RMSE: 0.020225815481500774
MAE: 0.013463981948057127
R²: 0.8779253231918751
MAPE: 0.613919246731386%
