In [275]:
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt

import sklearn
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, learning_curve, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score

from feature_engine.datetime import DatetimeFeatures

from catboost import CatBoostRegressor
from xgboost import XGBRegressor

In [276]:
# Show every column when a DataFrame is displayed (no column truncation),
# so wide frames can be inspected in full.
pd.set_option("display.max_columns", None)

In [277]:
# Have scikit-learn transformers return pandas DataFrames (with column names).
sklearn.set_config(transform_output="pandas")

In [278]:
# Load the training split.
train_df = pd.read_csv('data/train.csv')

In [279]:
# Load the validation split.
val_df = pd.read_csv('data/val.csv')

In [280]:
# Load the test split.
test_df = pd.read_csv('data/test.csv')

In [281]:
def split_data(data):
    """Split a flights DataFrame into model inputs and the target.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains a ``price`` column.

    Returns
    -------
    tuple
        ``(features, target)`` where ``features`` is ``data`` without the
        ``price`` column and ``target`` is an independent copy of ``price``.
    """
    features = data.drop(columns='price')
    target = data['price'].copy()
    return features, target

In [282]:
# Separate the training frame into features and the `price` target.
X_train, y_train = split_data(train_df)

In [283]:
# Preview the training features (the `price` column has been removed).
X_train

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info
0,Jet Airways,2019-06-21,Mumbai,Hyderabad,10:20:00,11:50:00,90,0.0,In-flight meal not included
1,Air India,2019-05-18,Delhi,Cochin,09:00:00,07:40:00,1360,1.0,No Info
2,Air India,2019-06-12,Kolkata,Banglore,09:10:00,11:05:00,1555,2.0,No Info
3,Vistara,2019-04-01,Kolkata,Banglore,20:20:00,22:55:00,1595,1.0,No Info
4,Vistara,2019-06-06,Kolkata,Banglore,17:00:00,10:45:00,1065,1.0,No Info
...,...,...,...,...,...,...,...,...,...
635,Air Asia,2019-04-12,Banglore,Delhi,04:55:00,07:45:00,170,0.0,No Info
636,Jet Airways,2019-05-09,Kolkata,Banglore,09:35:00,21:05:00,690,1.0,No Info
637,Indigo,2019-05-15,Banglore,Delhi,06:05:00,08:50:00,165,0.0,No Info
638,Multiple Carriers,2019-05-15,Delhi,Cochin,08:45:00,21:00:00,735,1.0,No Info


In [284]:
# Split the test and validation frames into features / `price` target as well.
X_test, y_test = split_data(test_df)
X_val, y_val = split_data(val_df)

In [285]:
# (rows, columns) of the test feature frame.
X_test.shape

(200, 9)

In [286]:
# Column dtypes and non-null counts of the training features.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 640 entries, 0 to 639
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   airline          640 non-null    object 
 1   date_of_journey  640 non-null    object 
 2   source           640 non-null    object 
 3   destination      640 non-null    object 
 4   dep_time         640 non-null    object 
 5   arrival_time     640 non-null    object 
 6   duration         640 non-null    int64  
 7   total_stops      640 non-null    float64
 8   additional_info  640 non-null    object 
dtypes: float64(1), int64(1), object(7)
memory usage: 45.1+ KB


In [287]:
# First rows of the full training frame, including the `price` target.
train_df.head()

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-06-21,Mumbai,Hyderabad,10:20:00,11:50:00,90,0.0,In-flight meal not included,4995
1,Air India,2019-05-18,Delhi,Cochin,09:00:00,07:40:00,1360,1.0,No Info,8372
2,Air India,2019-06-12,Kolkata,Banglore,09:10:00,11:05:00,1555,2.0,No Info,6117
3,Vistara,2019-04-01,Kolkata,Banglore,20:20:00,22:55:00,1595,1.0,No Info,7770
4,Vistara,2019-06-06,Kolkata,Banglore,17:00:00,10:45:00,1065,1.0,No Info,9187


In [288]:
# NOTE(review): this repeats the X_train.info() call made a few cells earlier;
# X_train is not modified in between, so the output is identical.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 640 entries, 0 to 639
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   airline          640 non-null    object 
 1   date_of_journey  640 non-null    object 
 2   source           640 non-null    object 
 3   destination      640 non-null    object 
 4   dep_time         640 non-null    object 
 5   arrival_time     640 non-null    object 
 6   duration         640 non-null    int64  
 7   total_stops      640 non-null    float64
 8   additional_info  640 non-null    object 
dtypes: float64(1), int64(1), object(7)
memory usage: 45.1+ KB


In [289]:
# Parse the string date/time columns of every split in place:
# the journey date becomes a datetime64 column, the clock columns become
# datetime.time objects.
for frame in (X_train, X_test, X_val):
    frame['date_of_journey'] = pd.to_datetime(frame['date_of_journey'])
    for time_col in ('dep_time', 'arrival_time'):
        parsed_times = pd.to_datetime(frame[time_col], format='%H:%M:%S')
        frame[time_col] = parsed_times.dt.time

In [290]:
# Extract features from date_of_journey
for dataset in [X_train, X_test, X_val]:
    dataset['journey_day_of_week'] = dataset['date_of_journey'].dt.dayofweek
    dataset['journey_day_of_month'] = dataset['date_of_journey'].dt.day
    dataset['journey_month'] = dataset['date_of_journey'].dt.month

In [291]:
# Extract features from dep_time and arrival_time
for dataset in [X_train, X_test, X_val]:
    dataset['dep_hour'] = pd.to_datetime(dataset['dep_time'], format='%H:%M:%S').dt.hour
    dataset['dep_minute'] = pd.to_datetime(dataset['dep_time'], format='%H:%M:%S').dt.minute
    dataset['arrival_hour'] = pd.to_datetime(dataset['arrival_time'], format='%H:%M:%S').dt.hour
    dataset['arrival_minute'] = pd.to_datetime(dataset['arrival_time'], format='%H:%M:%S').dt.minute

In [292]:
# Drop original datetime columns if not needed
for dataset in [X_train, X_test, X_val]:
    dataset.drop(columns=['date_of_journey', 'dep_time', 'arrival_time'], inplace=True)

# Define numerical and categorical columns
num_cols = ['duration', 'total_stops', 'journey_day_of_week', 'journey_day_of_month', 'journey_month', 'dep_hour', 'dep_minute', 'arrival_hour', 'arrival_minute']
cat_cols = ['airline', 'source', 'destination', 'additional_info']

In [293]:
# Create numerical transformer
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

In [294]:
# Create categorical transformer
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

In [295]:
# Create the preprocessor
preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, num_cols),
    ('cat', cat_transformer, cat_cols)
])

In [296]:
# Apply transformations to training data
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)
X_val_transformed = preprocessor.transform(X_val)

In [297]:
# Inspect the preprocessed training features (scaled numerics + one-hot columns).
X_train_transformed

Unnamed: 0,num__duration,num__total_stops,num__journey_day_of_week,num__journey_day_of_month,num__journey_month,num__dep_hour,num__dep_minute,num__arrival_hour,num__arrival_minute,cat__airline_Air Asia,cat__airline_Air India,cat__airline_Goair,cat__airline_Indigo,cat__airline_Jet Airways,cat__airline_Multiple Carriers,cat__airline_Spicejet,cat__airline_Vistara,cat__source_Banglore,cat__source_Chennai,cat__source_Delhi,cat__source_Kolkata,cat__source_Mumbai,cat__destination_Banglore,cat__destination_Cochin,cat__destination_Delhi,cat__destination_Hyderabad,cat__destination_Kolkata,cat__destination_New Delhi,cat__additional_info_1 Long layover,cat__additional_info_In-flight meal not included,cat__additional_info_No Info,cat__additional_info_No check-in baggage included
0,-1.095918,-1.212132,0.566118,0.963554,1.151587,-0.462055,-0.140057,-0.345231,1.493859,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1.435699,0.317975,1.050498,0.604465,0.306386,-0.640951,-1.229863,-0.935607,0.891041,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1.824412,1.848082,-0.402640,-0.113711,1.151587,-0.640951,-0.684960,-0.345231,-1.218823,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.904148,0.317975,-1.371399,-1.430369,-0.538816,1.326906,-0.140057,1.278301,1.795268,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.847647,0.317975,0.081739,-0.831888,1.151587,0.790218,-1.229863,-0.492825,1.192450,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
635,-0.936446,-1.212132,0.566118,-0.113711,-0.538816,-1.535432,1.767103,-0.935607,1.192450,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
636,0.100122,0.317975,0.081739,-0.472800,0.306386,-0.640951,0.677297,1.130707,-1.218823,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
637,-0.946413,-1.212132,-0.402640,0.245377,0.306386,-1.177640,-0.957412,-0.788013,1.493859,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
638,0.189825,0.317975,-0.402640,0.245377,0.306386,-0.819847,1.222200,1.130707,-1.520232,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [298]:
# Inspect the engineered (pre-scaling) training features for comparison.
X_train

Unnamed: 0,airline,source,destination,duration,total_stops,additional_info,journey_day_of_week,journey_day_of_month,journey_month,dep_hour,dep_minute,arrival_hour,arrival_minute
0,Jet Airways,Mumbai,Hyderabad,90,0.0,In-flight meal not included,4,21,6,10,20,11,50
1,Air India,Delhi,Cochin,1360,1.0,No Info,5,18,5,9,0,7,40
2,Air India,Kolkata,Banglore,1555,2.0,No Info,2,12,6,9,10,11,5
3,Vistara,Kolkata,Banglore,1595,1.0,No Info,0,1,4,20,20,22,55
4,Vistara,Kolkata,Banglore,1065,1.0,No Info,3,6,6,17,0,10,45
...,...,...,...,...,...,...,...,...,...,...,...,...,...
635,Air Asia,Banglore,Delhi,170,0.0,No Info,4,12,4,4,55,7,45
636,Jet Airways,Kolkata,Banglore,690,1.0,No Info,3,9,5,9,35,21,5
637,Indigo,Banglore,Delhi,165,0.0,No Info,2,15,5,6,5,8,50
638,Multiple Carriers,Delhi,Cochin,735,1.0,No Info,2,15,5,8,45,21,0


In [306]:
def evaluate_regressor(label, model, X_fit, y_fit, X_holdout, y_holdout, cv=5):
    """Fit a regressor and report its cross-validated and hold-out R-squared.

    Parameters
    ----------
    label : str
        Heading printed above the scores.
    model : estimator
        Regressor exposing ``fit`` / ``predict``; it is fitted in place on
        ``X_fit`` / ``y_fit``.
    X_fit, y_fit
        Training features and target, also used for cross-validation.
    X_holdout, y_holdout
        Hold-out features and target used for the final R-squared.
    cv : int, default 5
        Number of folds passed to ``cross_val_score``.

    Returns
    -------
    tuple
        ``(cv_scores, holdout_preds, holdout_r2)``.
    """
    model.fit(X_fit, y_fit)
    # cross_val_score fits clones of the estimator, so the fit above is kept.
    cv_scores = cross_val_score(model, X_fit, y_fit, cv=cv, scoring='r2')
    holdout_preds = model.predict(X_holdout)
    holdout_r2 = r2_score(y_holdout, holdout_preds)

    print(f"{label}:")
    print(f"CV R-squared scores: {cv_scores}")
    print(f"Mean CV R-squared: {np.mean(cv_scores):.2f}")
    print(f"R-squared on validation set: {holdout_r2:.2f}")
    print("\n")
    return cv_scores, holdout_preds, holdout_r2


random_forest = RandomForestRegressor(random_state=42)
xgboost = XGBRegressor(random_state=42)

# Same evaluation protocol for both models: fit, 5-fold CV on the training
# split, then score on the validation split.
rf_cv_scores, rf_val_preds, rf_val_r2 = evaluate_regressor(
    "Random Forest", random_forest,
    X_train_transformed, y_train, X_val_transformed, y_val,
)
xgb_cv_scores, xgb_val_preds, xgb_val_r2 = evaluate_regressor(
    "XGBoost", xgboost,
    X_train_transformed, y_train, X_val_transformed, y_val,
)


Random Forest:
CV R-squared scores: [0.80348177 0.53918243 0.83408248 0.61936731 0.71172233]
Mean CV R-squared: 0.70
R-squared on validation set: 0.80


XGBoost:
CV R-squared scores: [0.83163503 0.54299131 0.82706945 0.63885103 0.73203052]
Mean CV R-squared: 0.71
R-squared on validation set: 0.79




In [310]:
# Initialize CatBoost model (CatBoostRegressor and r2_score come from the
# notebook's import cell).
catboost = CatBoostRegressor(random_state=42, verbose=0)

catboost.fit(X_train_transformed, y_train)

# R-squared on the data the model was fitted on; compare it with the
# cross-validation / validation scores below to gauge overfitting.
y_train_pred = catboost.predict(X_train_transformed)
train_r2 = r2_score(y_train, y_train_pred)
print(f"R-squared on training set: {train_r2:.2f}")

# NOTE(review): cv=10 here, whereas Random Forest / XGBoost are evaluated with
# cv=5, so the mean CV scores are not computed under the same protocol.
catboost_cv_scores = cross_val_score(catboost, X_train_transformed, y_train, cv=10, scoring='r2')
print("CatBoost:")
print(f"CV R-squared scores: {catboost_cv_scores}")
print(f"Mean CV R-squared: {np.mean(catboost_cv_scores):.2f}")

catboost_val_preds = catboost.predict(X_val_transformed)
catboost_val_r2 = r2_score(y_val, catboost_val_preds)
print(f"R-squared on validation set: {catboost_val_r2:.2f}")

# Spot-check one validation row; the double brackets keep a one-row DataFrame
# so predict receives 2-D input.
sample_index = 10
sample_data = X_val_transformed.iloc[[sample_index]]
predicted_price = catboost.predict(sample_data)
print(f"Predicted price for the sample: {predicted_price[0]}")

R-squared on training set: 0.99
CatBoost:
CV R-squared scores: [0.82227465 0.80961247 0.56412297 0.76097578 0.86290571 0.85348761
 0.70562825 0.70134577 0.73891277 0.83917614]
Mean CV R-squared: 0.77
R-squared on validation set: 0.81
Predicted price for the sample: 6879.074332177648


In [307]:
# Actual price of the validation row that was spot-checked with
# X_val_transformed.iloc[[sample_index]]; positional lookup keeps the two
# selections aligned regardless of the index labels.
y_val.iloc[sample_index]

6347

In [311]:
# Model persistence: save the trained model to disk

In [312]:
# The model was fitted on the preprocessor's output, so the fitted preprocessor
# is needed to prepare new data before calling predict. Persist it next to the
# model; "model.joblib" itself is written exactly as before.
joblib.dump(preprocessor, "preprocessor.joblib")
joblib.dump(catboost, "model.joblib")

['model.joblib']