In [310]:
import numpy as np
import pandas as pd
import joblib 
import matplotlib.pyplot as plt
import datetime as dt

import sklearn
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, learning_curve, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score

from feature_engine.datetime import DatetimeFeatures

from xgboost import XGBRegressor

In [311]:
pd.set_option("display.max_columns", None)

In [312]:
sklearn.set_config(transform_output="default")

In [313]:
train_df = pd.read_csv('data/train.csv')

In [314]:
val_df = pd.read_csv('data/val.csv')

In [315]:
test_df = pd.read_csv('data/test.csv')

In [316]:
train_df = pd.concat([train_df, val_df, test_df])

In [317]:
train_df.shape

(1000, 10)

In [318]:
def split_data(data):
    X=data.drop(columns='price')
    y=data.price.copy()
    return (X,y)

In [319]:
train_df

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-06-21,Mumbai,Hyderabad,10:20:00,11:50:00,90,0.0,In-flight meal not included,4995
1,Air India,2019-05-18,Delhi,Cochin,09:00:00,07:40:00,1360,1.0,No Info,8372
2,Air India,2019-06-12,Kolkata,Banglore,09:10:00,11:05:00,1555,2.0,No Info,6117
3,Vistara,2019-04-01,Kolkata,Banglore,20:20:00,22:55:00,1595,1.0,No Info,7770
4,Vistara,2019-06-06,Kolkata,Banglore,17:00:00,10:45:00,1065,1.0,No Info,9187
...,...,...,...,...,...,...,...,...,...,...
195,Spicejet,2019-04-21,Kolkata,Banglore,06:55:00,09:30:00,155,0.0,No Info,4174
196,Multiple Carriers,2019-06-01,Delhi,Cochin,11:40:00,19:15:00,455,1.0,No Info,10261
197,Indigo,2019-03-21,Mumbai,Hyderabad,21:20:00,22:45:00,85,0.0,No Info,2227
198,Jet Airways,2019-06-03,Delhi,Cochin,16:00:00,12:35:00,1235,1.0,In-flight meal not included,10262


In [320]:
date_columns = ['date_of_journey', 'dep_time', 'arrival_time']
train_df[date_columns] = train_df[date_columns].apply(pd.to_datetime)
test_df[date_columns] = test_df[date_columns].apply(pd.to_datetime)

In [321]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 199
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   airline          1000 non-null   object        
 1   date_of_journey  1000 non-null   datetime64[ns]
 2   source           1000 non-null   object        
 3   destination      1000 non-null   object        
 4   dep_time         1000 non-null   datetime64[ns]
 5   arrival_time     1000 non-null   datetime64[ns]
 6   duration         1000 non-null   int64         
 7   total_stops      1000 non-null   float64       
 8   additional_info  1000 non-null   object        
 9   price            1000 non-null   int64         
dtypes: datetime64[ns](3), float64(1), int64(2), object(4)
memory usage: 85.9+ KB


In [322]:
def timing(time):
    if time<=6:
        return "12 midnight to  6am"
    elif 6<time<=12:
        return "6am to 12 noon"
    elif 12<time<=18:
        return "12 noon to 6pm"
    else:
        return "6pm to 12 midnight"

In [323]:
train_df['dep_time'] = train_df['dep_time'].dt.hour.apply(timing)
train_df['arrival_time'] = train_df['arrival_time'].dt.hour.apply(timing)

In [324]:
train_df

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-06-21,Mumbai,Hyderabad,6am to 12 noon,6am to 12 noon,90,0.0,In-flight meal not included,4995
1,Air India,2019-05-18,Delhi,Cochin,6am to 12 noon,6am to 12 noon,1360,1.0,No Info,8372
2,Air India,2019-06-12,Kolkata,Banglore,6am to 12 noon,6am to 12 noon,1555,2.0,No Info,6117
3,Vistara,2019-04-01,Kolkata,Banglore,6pm to 12 midnight,6pm to 12 midnight,1595,1.0,No Info,7770
4,Vistara,2019-06-06,Kolkata,Banglore,12 noon to 6pm,6am to 12 noon,1065,1.0,No Info,9187
...,...,...,...,...,...,...,...,...,...,...
195,Spicejet,2019-04-21,Kolkata,Banglore,12 midnight to 6am,6am to 12 noon,155,0.0,No Info,4174
196,Multiple Carriers,2019-06-01,Delhi,Cochin,6am to 12 noon,6pm to 12 midnight,455,1.0,No Info,10261
197,Indigo,2019-03-21,Mumbai,Hyderabad,6pm to 12 midnight,6pm to 12 midnight,85,0.0,No Info,2227
198,Jet Airways,2019-06-03,Delhi,Cochin,12 noon to 6pm,6am to 12 noon,1235,1.0,In-flight meal not included,10262


In [325]:
# day, month, weekend

In [326]:
train_df['month'] = train_df['date_of_journey'].dt.month

In [327]:
train_df['day'] = train_df['date_of_journey'].dt.day

In [328]:
train_df

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price,month,day
0,Jet Airways,2019-06-21,Mumbai,Hyderabad,6am to 12 noon,6am to 12 noon,90,0.0,In-flight meal not included,4995,6,21
1,Air India,2019-05-18,Delhi,Cochin,6am to 12 noon,6am to 12 noon,1360,1.0,No Info,8372,5,18
2,Air India,2019-06-12,Kolkata,Banglore,6am to 12 noon,6am to 12 noon,1555,2.0,No Info,6117,6,12
3,Vistara,2019-04-01,Kolkata,Banglore,6pm to 12 midnight,6pm to 12 midnight,1595,1.0,No Info,7770,4,1
4,Vistara,2019-06-06,Kolkata,Banglore,12 noon to 6pm,6am to 12 noon,1065,1.0,No Info,9187,6,6
...,...,...,...,...,...,...,...,...,...,...,...,...
195,Spicejet,2019-04-21,Kolkata,Banglore,12 midnight to 6am,6am to 12 noon,155,0.0,No Info,4174,4,21
196,Multiple Carriers,2019-06-01,Delhi,Cochin,6am to 12 noon,6pm to 12 midnight,455,1.0,No Info,10261,6,1
197,Indigo,2019-03-21,Mumbai,Hyderabad,6pm to 12 midnight,6pm to 12 midnight,85,0.0,No Info,2227,3,21
198,Jet Airways,2019-06-03,Delhi,Cochin,12 noon to 6pm,6am to 12 noon,1235,1.0,In-flight meal not included,10262,6,3


In [329]:
train_df['day_name'] = train_df['date_of_journey'].dt.day_name()

In [330]:
train_df

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price,month,day,day_name
0,Jet Airways,2019-06-21,Mumbai,Hyderabad,6am to 12 noon,6am to 12 noon,90,0.0,In-flight meal not included,4995,6,21,Friday
1,Air India,2019-05-18,Delhi,Cochin,6am to 12 noon,6am to 12 noon,1360,1.0,No Info,8372,5,18,Saturday
2,Air India,2019-06-12,Kolkata,Banglore,6am to 12 noon,6am to 12 noon,1555,2.0,No Info,6117,6,12,Wednesday
3,Vistara,2019-04-01,Kolkata,Banglore,6pm to 12 midnight,6pm to 12 midnight,1595,1.0,No Info,7770,4,1,Monday
4,Vistara,2019-06-06,Kolkata,Banglore,12 noon to 6pm,6am to 12 noon,1065,1.0,No Info,9187,6,6,Thursday
...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,Spicejet,2019-04-21,Kolkata,Banglore,12 midnight to 6am,6am to 12 noon,155,0.0,No Info,4174,4,21,Sunday
196,Multiple Carriers,2019-06-01,Delhi,Cochin,6am to 12 noon,6pm to 12 midnight,455,1.0,No Info,10261,6,1,Saturday
197,Indigo,2019-03-21,Mumbai,Hyderabad,6pm to 12 midnight,6pm to 12 midnight,85,0.0,No Info,2227,3,21,Thursday
198,Jet Airways,2019-06-03,Delhi,Cochin,12 noon to 6pm,6am to 12 noon,1235,1.0,In-flight meal not included,10262,6,3,Monday


In [331]:
train_df['is_weekend'] = np.where(train_df['day_name'].isin(["Saturday","Sunday"]),1,0)

In [332]:
train_df = train_df.drop(['date_of_journey','day_name', 'additional_info'], axis=1 )

In [333]:
train_df

Unnamed: 0,airline,source,destination,dep_time,arrival_time,duration,total_stops,price,month,day,is_weekend
0,Jet Airways,Mumbai,Hyderabad,6am to 12 noon,6am to 12 noon,90,0.0,4995,6,21,0
1,Air India,Delhi,Cochin,6am to 12 noon,6am to 12 noon,1360,1.0,8372,5,18,1
2,Air India,Kolkata,Banglore,6am to 12 noon,6am to 12 noon,1555,2.0,6117,6,12,0
3,Vistara,Kolkata,Banglore,6pm to 12 midnight,6pm to 12 midnight,1595,1.0,7770,4,1,0
4,Vistara,Kolkata,Banglore,12 noon to 6pm,6am to 12 noon,1065,1.0,9187,6,6,0
...,...,...,...,...,...,...,...,...,...,...,...
195,Spicejet,Kolkata,Banglore,12 midnight to 6am,6am to 12 noon,155,0.0,4174,4,21,1
196,Multiple Carriers,Delhi,Cochin,6am to 12 noon,6pm to 12 midnight,455,1.0,10261,6,1,1
197,Indigo,Mumbai,Hyderabad,6pm to 12 midnight,6pm to 12 midnight,85,0.0,2227,3,21,0
198,Jet Airways,Delhi,Cochin,12 noon to 6pm,6am to 12 noon,1235,1.0,10262,6,3,0


In [334]:
num_cols=['duration','total_stops','month','day', 'is_weekend']
cat_cols=['airline', 'source', 'destination', 'dep_time', 'arrival_time']

In [335]:
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

In [336]:
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

In [337]:
preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, num_cols),
    ('cat', cat_transformer, cat_cols)
])

In [338]:
X=train_df.drop('price',axis=1)
y=train_df['price']

In [339]:
X

Unnamed: 0,airline,source,destination,dep_time,arrival_time,duration,total_stops,month,day,is_weekend
0,Jet Airways,Mumbai,Hyderabad,6am to 12 noon,6am to 12 noon,90,0.0,6,21,0
1,Air India,Delhi,Cochin,6am to 12 noon,6am to 12 noon,1360,1.0,5,18,1
2,Air India,Kolkata,Banglore,6am to 12 noon,6am to 12 noon,1555,2.0,6,12,0
3,Vistara,Kolkata,Banglore,6pm to 12 midnight,6pm to 12 midnight,1595,1.0,4,1,0
4,Vistara,Kolkata,Banglore,12 noon to 6pm,6am to 12 noon,1065,1.0,6,6,0
...,...,...,...,...,...,...,...,...,...,...
195,Spicejet,Kolkata,Banglore,12 midnight to 6am,6am to 12 noon,155,0.0,4,21,1
196,Multiple Carriers,Delhi,Cochin,6am to 12 noon,6pm to 12 midnight,455,1.0,6,1,1
197,Indigo,Mumbai,Hyderabad,6pm to 12 midnight,6pm to 12 midnight,85,0.0,3,21,0
198,Jet Airways,Delhi,Cochin,12 noon to 6pm,6am to 12 noon,1235,1.0,6,3,0


In [340]:
y

0       4995
1       8372
2       6117
3       7770
4       9187
       ...  
195     4174
196    10261
197     2227
198    10262
199     4049
Name: price, Length: 1000, dtype: int64

In [341]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [342]:
X_train

Unnamed: 0,airline,source,destination,dep_time,arrival_time,duration,total_stops,month,day,is_weekend
29,Indigo,Delhi,Cochin,6pm to 12 midnight,12 midnight to 6am,195,0.0,4,3,0
535,Spicejet,Kolkata,Banglore,6am to 12 noon,12 noon to 6pm,520,1.0,4,1,0
55,Air India,Kolkata,Banglore,6am to 12 noon,12 midnight to 6am,1210,2.0,6,9,1
557,Jet Airways,Kolkata,Banglore,12 noon to 6pm,6am to 12 noon,1155,1.0,5,6,0
36,Indigo,Kolkata,Banglore,12 noon to 6pm,12 noon to 6pm,155,0.0,4,9,0
...,...,...,...,...,...,...,...,...,...,...
106,Jet Airways,Delhi,Cochin,12 noon to 6pm,6pm to 12 midnight,1485,1.0,6,6,0
270,Air India,Delhi,Cochin,6pm to 12 midnight,6pm to 12 midnight,1380,2.0,6,27,0
60,Air India,Mumbai,Hyderabad,6pm to 12 midnight,6pm to 12 midnight,80,0.0,5,24,0
435,Goair,Delhi,Cochin,6am to 12 noon,6am to 12 noon,355,1.0,5,15,0


In [343]:
y_train

29      5021
535     6013
55     11642
557     9663
36      4174
       ...  
106    10262
270    13591
60      3100
435     7682
102    12681
Name: price, Length: 800, dtype: int64

In [344]:
X_train_transformed = preprocessor.fit_transform(X_train)

In [345]:
X_train_transformed

array([[-0.89385661, -1.21241796, -0.55800682, ...,  0.        ,
         0.        ,  0.        ],
       [-0.23097574,  0.30074018, -0.55800682, ...,  1.        ,
         0.        ,  0.        ],
       [ 1.17637134,  1.81389833,  1.15564175, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-1.12841446, -1.21241796,  0.29881747, ...,  0.        ,
         0.        ,  1.        ],
       [-0.56751526,  0.30074018,  0.29881747, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.62509069,  0.30074018, -0.55800682, ...,  1.        ,
         0.        ,  0.        ]])

In [346]:
X_test_transformed = preprocessor.transform(X_test)

In [347]:
X_test_transformed

array([[-0.63890243,  0.30074018,  1.15564175, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.60527397,  0.30074018,  1.15564175, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.32934385,  0.30074018, -0.55800682, ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [-0.92445111, -1.21241796, -0.55800682, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.80865771,  0.30074018,  0.29881747, ...,  0.        ,
         1.        ,  0.        ],
       [ 2.03301738,  0.30074018,  0.29881747, ...,  0.        ,
         0.        ,  1.        ]])

In [356]:
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score

# Initialize CatBoost model
catboost = CatBoostRegressor(random_state=42, verbose=0)

catboost.fit(X_train_transformed, y_train)

y_train_pred = catboost.predict(X_train_transformed)
train_r2 = r2_score(y_train, y_train_pred)
print(f"R-squared on training set: {train_r2:.2f}")

catboost_cv_scores = cross_val_score(catboost, X_train_transformed, y_train, cv=10, scoring='r2')
print("CatBoost:")
print(f"CV R-squared scores: {catboost_cv_scores}")
print(f"Mean CV R-squared: {np.mean(catboost_cv_scores):.2f}")

catboost_test_preds = catboost.predict(X_test_transformed)
catboost_test_r2 = r2_score(y_test, catboost_test_preds)
print(f"R-squared on test set: {catboost_test_r2:.2f}")

sample_index = 0
sample_data = X_test_transformed[sample_index].reshape(1, -1)  # Reshape for single sample prediction
predicted_price = catboost.predict(sample_data)
print(f"Predicted price for the sample: {predicted_price[0]}")

R-squared on training set: 0.95
CatBoost:
CV R-squared scores: [0.78375067 0.75681634 0.57130541 0.71769012 0.80123901 0.72203882
 0.61030517 0.71555916 0.66784058 0.69141992]
Mean CV R-squared: 0.70
R-squared on test set: 0.75
Predicted price for the sample: 7342.784190212054


In [358]:
y_test.head(10)

521     7095
97      9508
100    12681
20      4423
411     6297
38     11873
626     1965
513    15400
59      7677
136     7252
Name: price, dtype: int64

In [359]:
joblib.dump({'preprocessor': preprocessor, 'model': catboost}, 'pipeline2.joblib')

['pipeline2.joblib']