## 1. Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import os
import warnings
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
import joblib

## 2. Display Setting

In [2]:
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns',None)

## 3. Getting Data

In [3]:
# Function to get the data
path = r'D:\FLIGHT_PRICE_PREDICTION\DATA'
def get_data(name):
    filename = f'{name}.csv'
    filepath = os.path.join(path,filename)
    return pd.read_csv(filepath)

In [4]:
# Reading the training data
train_df = get_data('transformed_train')
train_df

Unnamed: 0,air__airline_Indigo,air__airline_Jet Airways,air__airline_other,journey__date_of_journey_month,journey__date_of_journey_week,journey__date_of_journey_day_of_year,location__source_Delhi,location__source_other,location__destination_Cochin,location__destination_Delhi,time__arrival_time_hour,flight__duration_category,flight__over_1000,flight__duration,break__total_stops,break__is_direct_flight,price
0,0.0,0.0,0.0,0.333333,0.294118,0.262712,0.0,0.0,0.0,0.0,1.000000,2.0,1,2.441302,3,0,8607
1,0.0,0.0,0.0,1.000000,0.764706,0.779661,1.0,0.0,1.0,0.0,0.913043,1.0,0,-0.301855,1,0,13587
2,0.0,1.0,0.0,0.666667,0.705882,0.686441,0.0,0.0,0.0,0.0,1.000000,2.0,0,-0.118978,1,0,10844
3,0.0,1.0,0.0,0.000000,0.058824,0.067797,1.0,0.0,1.0,0.0,0.173913,2.0,1,1.526916,2,0,16914
4,0.0,1.0,0.0,0.666667,0.647059,0.661017,0.0,0.0,0.0,0.0,0.347826,2.0,0,0.643010,1,0,8586
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7318,1.0,0.0,0.0,1.000000,0.823529,0.796610,0.0,0.0,0.0,1.0,0.478261,0.0,0,-0.931765,0,1,4823
7319,0.0,1.0,0.0,0.000000,0.176471,0.169492,0.0,0.0,0.0,0.0,0.913043,2.0,0,0.470293,1,0,7832
7320,1.0,0.0,0.0,0.333333,0.294118,0.279661,1.0,0.0,1.0,0.0,0.652174,1.0,0,-0.667610,1,0,5073
7321,1.0,0.0,0.0,0.333333,0.470588,0.483051,1.0,0.0,1.0,0.0,0.347826,1.0,0,-0.880966,0,1,6015


In [5]:
# Reading the test data
test_df = get_data('transformed_test')
test_df

Unnamed: 0,air__airline_Indigo,air__airline_Jet Airways,air__airline_other,journey__date_of_journey_month,journey__date_of_journey_week,journey__date_of_journey_day_of_year,location__source_Delhi,location__source_other,location__destination_Cochin,location__destination_Delhi,time__arrival_time_hour,time__arrival_time_minute,flight__duration_category,flight__over_1000,flight__duration,break__total_stops,break__is_direct_flight,price
0,0.0,1.0,0.0,0.000000,0.058824,0.042373,0.0,0.0,0.0,0.0,0.347826,0.272727,2.0,1,1.645847,1,0,17996
1,0.0,0.0,1.0,1.000000,0.823529,0.822034,0.0,0.0,0.0,0.0,0.000000,0.727273,0.0,0,-0.969282,0,1,3873
2,1.0,0.0,0.0,0.000000,0.176471,0.144068,0.0,0.0,0.0,0.0,0.347826,0.363636,0.0,0,-0.909621,0,1,4462
3,1.0,0.0,0.0,1.000000,1.000000,1.000000,0.0,1.0,0.0,0.0,0.913043,1.000000,0.0,0,-0.969282,0,1,3597
4,1.0,0.0,0.0,0.666667,0.588235,0.559322,0.0,0.0,0.0,0.0,0.739130,0.818182,0.0,0,-0.949395,0,1,4804
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3134,0.0,1.0,0.0,1.000000,0.823529,0.822034,1.0,0.0,1.0,0.0,0.826087,0.000000,1.0,0,-0.651091,1,0,10262
3135,0.0,0.0,0.0,0.000000,0.235294,0.220339,1.0,0.0,1.0,0.0,0.304348,0.727273,2.0,1,0.969692,2,0,8892
3136,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.956522,1.000000,2.0,1,2.322002,2,0,14887
3137,1.0,0.0,0.0,1.000000,0.764706,0.779661,0.0,0.0,0.0,1.0,0.434783,0.090909,0.0,0,-0.899677,0,1,4823


- We need to drop the column 'air__airline_other' from train_df at the model training time as it is absent in test data

## 4. Data Spliting

In [6]:
# Train data spliting
X_train = train_df.drop(['price'],axis=1)
y_train = train_df.price

In [7]:
# Test data spliting
X_test = test_df.drop(['price','time__arrival_time_minute'],axis=1)
y_test = test_df.price

## 5. Model Training & Hyper-parameter Tuning

- Now we are going to train Random Forest and XGBoost models with hyper-parameter tuning using GridSearchcv to get the best possible model

### 5.1 Random Forest

In [15]:
# Defining the model
rf_model = RandomForestRegressor()

# Defining the param grid
param_grid = {
    'n_estimators' : [100,150,200],
    'criterion' : ['squared_error','friedman_mse'],
    'max_depth' : [3,4,5],
    'max_features' : [0.5,0.75],
    'max_samples' : [0.5,0.75],
    'n_jobs' : [-1],
    'random_state' : [42]
}

# Using GridSearchcv
grid = GridSearchCV(estimator = rf_model, param_grid = param_grid, n_jobs = -1, cv = 10)

# Fitting the grid
grid.fit(X_train,y_train)

# Printing the best parameters
print(f"The best parameters for Random Forest Regressor are : {grid.best_params_}")

The best parameters for Random Forest Regressor are : {'criterion': 'squared_error', 'max_depth': 5, 'max_features': 0.75, 'max_samples': 0.5, 'n_estimators': 200, 'n_jobs': -1, 'random_state': 42}


- So we will be making our final Random Forest model based on these parameters and then evaluate it on the test data

### 5.2 XGBoost

In [18]:
# Defining the model
xgb_model = xgb.XGBRegressor()

# Defining the param grid
param_grid = {
    'n_estimators' : [100,150,200],
    'eta' : [0.1,0.2],
    'gamma' : [0.08,0.1],
    'max_depth' : [3,4,5],
    'lambda' : [10,12,14],
    'colsample_bynode' : [0.5,0.75,1],
    'subsample' : [0.5,0.75],
    'tree_method' : ['auto','approx'],
    'n_jobs' : [-1],
    'random_state' : [42]
}

# Using GridSearchcv
grid = GridSearchCV(estimator = xgb_model, param_grid = param_grid, n_jobs = -1, cv = 10)

# Fitting the grid
grid.fit(X_train,y_train)

# Printing the best parameters
print(f"The best parameters for XGBoost Regressor are : {grid.best_params_}")

The best parameters for XGBoost Regressor are : {'colsample_bynode': 0.75, 'eta': 0.2, 'gamma': 0.08, 'lambda': 12, 'max_depth': 5, 'n_estimators': 200, 'n_jobs': -1, 'random_state': 42, 'subsample': 0.75, 'tree_method': 'approx'}


- So we will be making our final XGBoost model based on these parameters and then evaluate it on the test data

## 6. Model Evaluation

- Now we are going to do prediction using our two models with the best parameters and will calculate test score using r2_score 

### 6.1 Random Forest

In [8]:
# Making the final model
rf_final_model = RandomForestRegressor(criterion='squared_error',max_depth=5,max_features=0.75,max_samples=0.5,n_estimators=200,
                                      n_jobs=-1,random_state=42)
# Fitting our model
rf_final_model.fit(X_train,y_train)

# Prediction on test data and getting test score
y_test_pred = rf_final_model.predict(X_test)
print(f'Testing score for Random Forest Regressor is : {r2_score(y_test,y_test_pred)}')
print(f'Mean Absolute Error for Random Forest Regressor is : {mean_absolute_error(y_test,y_test_pred)}')

Testing score for Random Forest Regressor is : 0.6984347003044997
Mean Absolute Error for Random Forest Regressor is : 1726.8760791818668


### 6.2 XGBoost

In [9]:
# Making the final model
xgb_final_model = xgb.XGBRegressor(colsample_bynode=0.75,reg_lambda=12,eta=0.2,max_depth=5,gamma=0.08,n_estimators=200,subsample=0.75,
                                  tree_method='approx',n_jobs=-1,random_sate=42)
# Fitting our model
xgb_final_model.fit(X_train,y_train)

# Prediction on test data and getting test score
y_test_pred = xgb_final_model.predict(X_test)
print(f'Testing score for XGBoost Regressor is : {r2_score(y_test,y_test_pred)}')
print(f'Mean Absolute Error for XGBoost Regressor is : {mean_absolute_error(y_test,y_test_pred)}')

Testing score for XGBoost Regressor is : 0.7521722912788391
Mean Absolute Error for XGBoost Regressor is : 1435.8908435132482


- So we are going to choose our XGBoost Regressor model for deployment

## 7. Saving Model

In [10]:
joblib.dump(xgb_final_model,"XGBoost.joblib")

['XGBoost.joblib']