In [1]:
import pandas as pd
import sys
import os
from pathlib import Path

# Make the repository root (the parent of the notebook's working directory)
# importable so that the `src.*` imports in this cell resolve.
project_root = str(Path.cwd().parent)
# Guard the append: without it, every re-run of this cell would push another
# duplicate entry onto sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)
from sklearn.model_selection import train_test_split
from src.data.make_dataset import extract_departure_time
from src.data.make_dataset import impute_distance_by_average
from src.data.make_dataset import clean_cabin_type
from src.data.make_dataset import clean_data
from src.models.train_model import train_model
from src.models.predict_model import predict_model
from scipy.stats.mstats import winsorize

In [2]:
# CSV with the combined airport fare records; the path is relative to the
# notebook's working directory.
airport_combined_path = "../data/processed/all_airports.csv"

In [3]:
# Read the combined airport CSV into a DataFrame (pandas defaults for parsing).
airport_df = pd.read_csv(filepath_or_buffer=airport_combined_path)

In [4]:
# Run the project's cleaning helper on the raw frame. The columns displayed
# further down (Departure_Year/Month/Day, depatureTimeCategory, Cabin_Type,
# Cabin_Score, ...) are what this call produced in the recorded run.
# NOTE(review): the exact cleaning steps live in src.data.make_dataset and are
# not visible from this notebook.
Airport_df_cleaned = clean_data(airport_df)

In [5]:
# Winsorize the target: the lowest 5% and highest 5% of fares are replaced by
# the nearest retained value, limiting the influence of extreme prices.
# NOTE(review): this runs on the full dataset before the train/test split, so
# the hold-out target is clipped as well and the reported test metrics are
# measured against capped fares. Consider splitting first and winsorizing
# only the training target.
Airport_df_cleaned['totalFare'] = winsorize(
    Airport_df_cleaned['totalFare'],
    limits=(0.05, 0.05),
)

In [6]:
# Preview the first five cleaned rows as a quick sanity check.
Airport_df_cleaned.head(n=5)

Unnamed: 0,startingAirport,destinationAirport,Departure_Year,Departure_Month,Departure_Day,depatureTimeCategory,Cabin_Type,Cabin_Score,totalFare,totalTravelDistance
0,ATL,BOS,2022,4,17,Afternoon,coach,1,248.6,947.0
1,ATL,BOS,2022,4,17,Morning,coach,1,248.6,947.0
2,ATL,BOS,2022,4,17,Afternoon,coach,1,248.6,947.0
3,ATL,BOS,2022,4,17,Evening,coach,1,248.6,947.0
4,ATL,BOS,2022,4,17,Afternoon,coach,1,248.6,947.0


In [7]:
# List the column labels of the cleaned frame; everything except 'totalFare'
# is used as a model feature in the cells below.
Airport_df_cleaned.columns

Index(['startingAirport', 'destinationAirport', 'Departure_Year',
       'Departure_Month', 'Departure_Day', 'depatureTimeCategory',
       'Cabin_Type', 'Cabin_Score', 'totalFare', 'totalTravelDistance'],
      dtype='object')

In [8]:
# Split the cleaned frame into the feature matrix and the regression target.
X = Airport_df_cleaned.drop(columns=['totalFare'])  # every column but the target
y = Airport_df_cleaned['totalFare']                 # fare to predict

In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [10]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [11]:
# LightGBM regressor with library-default hyperparameters:
#   objective='regression' -> standard regression objective
#   random_state=42        -> reproducible fits
#   n_jobs=-1              -> use all available cores
model = lgb.LGBMRegressor(objective='regression', random_state=42, n_jobs=-1)

In [12]:
# Fit the LightGBM model through the project's training helper. In the recorded
# run the helper prints the training shape, feature names and a data sample.
# NOTE(review): presumably the fitted pipeline is also saved under
# `output_path` using `model_name` — confirm in src.models.train_model.
pipeline = train_model(
    model=model, X_train=X_train, y_train=y_train,
    model_name='lgb_model', output_path='../models/',
)

Training Data Shape: (10815999, 9)

Feature Names: ['startingAirport', 'destinationAirport', 'Departure_Year', 'Departure_Month', 'Departure_Day', 'depatureTimeCategory', 'Cabin_Type', 'Cabin_Score', 'totalTravelDistance']

Sample of training data:
         startingAirport destinationAirport  Departure_Year  Departure_Month  \
1921258              CLT                JFK            2022                5   
3522597              DFW                ATL            2022                6   
6499223              JFK                IAD            2022                6   
13034622             SFO                EWR            2022                5   
9590693              MIA                PHL            2022                6   

          Departure_Day depatureTimeCategory Cabin_Type  Cabin_Score  \
1921258              18              Morning      coach            3   
3522597               3              Evening      coach            1   
6499223               6              Evening      coac

In [13]:
# Score the fitted LightGBM pipeline on the hold-out split; the helper prints
# RMSE, MAE, R2 and MAPE (see the recorded output below).
results = predict_model(X_test, y_test, pipeline)


Model Performance:
RMSE: $101.78
MAE: $80.09
R2 Score: 0.644
MAPE: 27.34%


In [14]:
import xgboost as xgb

In [15]:
# XGBoost regressor trained with squared-error loss, RMSE as the evaluation
# metric and the histogram-based tree builder.
# The seed is pinned to 42, matching the LightGBM model and the train/test
# split, so that fits which involve sampling (e.g. subsample < 1 during
# tuning) are repeatable.
model = xgb.XGBRegressor(
    objective='reg:squarederror',
    eval_metric='rmse',
    tree_method='hist',
    random_state=42,
)



In [16]:
# Fit the XGBoost model through the same training helper used for LightGBM.
# Arguments are passed by keyword and the artifact is given an explicit name,
# mirroring the LightGBM call, so the saved XGBoost pipeline is not written
# under the helper's default model name.
# NOTE: this rebinds `pipeline`; from here on it refers to the XGBoost pipeline.
pipeline = train_model(
    model=model,
    X_train=X_train,
    y_train=y_train,
    model_name='xgb_model',
    output_path='../models/'
)

Training Data Shape: (10815999, 9)

Feature Names: ['startingAirport', 'destinationAirport', 'Departure_Year', 'Departure_Month', 'Departure_Day', 'depatureTimeCategory', 'Cabin_Type', 'Cabin_Score', 'totalTravelDistance']

Sample of training data:
         startingAirport destinationAirport  Departure_Year  Departure_Month  \
1921258              CLT                JFK            2022                5   
3522597              DFW                ATL            2022                6   
6499223              JFK                IAD            2022                6   
13034622             SFO                EWR            2022                5   
9590693              MIA                PHL            2022                6   

          Departure_Day depatureTimeCategory Cabin_Type  Cabin_Score  \
1921258              18              Morning      coach            3   
3522597               3              Evening      coach            1   
6499223               6              Evening      coac

In [17]:
# Score the fitted XGBoost pipeline on the same hold-out split; in the recorded
# run it beats the LightGBM baseline on every printed metric.
results = predict_model(X_test, y_test, pipeline)


Model Performance:
RMSE: $91.53
MAE: $70.53
R2 Score: 0.712
MAPE: 23.64%


In [18]:
from src.models.train_model import tune_pipeline

In [19]:
# Search space for the pipeline being tuned. At this point `pipeline` is the
# XGBoost pipeline fitted above, so the grid uses XGBoost parameter names:
#   max_leaves       - cap on leaves per tree (LightGBM calls this num_leaves)
#   max_depth        - 0 means "no depth limit" in XGBoost (LightGBM uses -1,
#                      which XGBoost does not accept)
#   min_child_weight - minimum hessian sum per child; with squared-error loss
#                      each row contributes 1, so this is a minimum row count
#                      (LightGBM's min_child_samples)
# NOTE(review): the 'model__' prefix assumes the estimator step inside the
# pipeline is named 'model' — confirm in src.models.train_model.
# NOTE(review): learning_rate=0.001 with at most 2500 trees may underfit.
param_grid = {
    'model__learning_rate': [0.1, 0.001],
    'model__n_estimators': [1000, 1500, 2000, 2500],
    'model__max_leaves': [31, 62, 127, 200],
    'model__max_depth': [0, 5, 10],
    'model__min_child_weight': [20, 50, 100],
    'model__subsample': [0.6, 0.8, 1.0],
    'model__colsample_bytree': [0.6, 0.8, 1.0]
}

# Randomized search: 5 sampled candidates, each evaluated with 5-fold CV
# (25 fits in total).
# NOTE(review): the recorded run aborted with a PicklingError while sending
# tasks to parallel workers; the cause is inside tune_pipeline / the pipeline
# object and is not visible from this notebook.
results = tune_pipeline(
    pipeline=pipeline,
    param_grid=param_grid,
    X_train=X_train,
    y_train=y_train,
    n_iter=5,
    search_type='random',
    cv=5
)

# Keep the best pipeline found by the search.
best_pipeline = results['best_pipeline']


Starting random search CV...
Fitting 5 folds for each of 5 candidates, totalling 25 fits




Error during pipeline tuning: Could not pickle the task to send it to the workers.


PicklingError: Could not pickle the task to send it to the workers.

In [1]:
import joblib

In [4]:
import sys
from pathlib import Path

# Put the repository root (parent of the working directory) on the import path
# exactly once. NOTE(review): presumably un-pickling the pipeline needs the
# project's `src` package to be importable — confirm.
project_root = str(Path.cwd().parent)
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

# Reload a tuned pipeline saved by an earlier search run.
# NOTE(review): joblib.load un-pickles arbitrary objects; only load artifacts
# produced by this project.
best_pipeline = joblib.load('../models/best_pipeline_random_20241101_184558.joblib')

In [14]:
# Score the reloaded, tuned pipeline on the hold-out split.
# NOTE(review): in the recorded run the tuned pipeline is far worse than both
# untuned models (R2 0.062 vs 0.644 / 0.712) — the search space or the saved
# artifact should be re-checked before using it.
# NOTE(review): execution counts restart above, so X_test, y_test and
# predict_model must be re-created in the new kernel before this cell can run.
results_tune = predict_model(X_test, y_test, best_pipeline)


Model Performance:
RMSE: $201.00
MAE: $148.83
R2 Score: 0.062
MAPE: 57.87%
