# Part 1: Feature Engineering

# What is Feature Engineering?
* `Feature Engineering` is the process of creating new features from the existing features.
* The primary aim is to create such features that can boost the performance of the model.
* This is totally experimental and depends on the creativity of the programmer.

# Data Preprocessing
`Data Preprocessing` is the process of converting the dataset into a form suitable for the model 

In [174]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.5-py3-none-manylinux2014_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-3.0.5-py3-none-manylinux2014_x86_64.whl (4.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.9/4.9 MB[0m [31m102.5 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-3.0.5


In [1]:
!pip install feature_engine

Collecting feature_engine
  Downloading feature_engine-1.9.3-py3-none-any.whl.metadata (10 kB)
Downloading feature_engine-1.9.3-py3-none-any.whl (229 kB)
Installing collected packages: feature_engine
Successfully installed feature_engine-1.9.3


In [5]:
import sys

print(sys.executable)

C:\Users\jugal\AppData\Local\Programs\Python\Python311\python.exe


In [6]:
import pandas as pd
import numpy as np
import sklearn
from IPython.display import display


from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer

from feature_engine.encoding import RareLabelEncoder
from feature_engine.outliers import Winsorizer
from feature_engine.encoding import MeanEncoder

import xgboost


# Display settings
pd.set_option("display.max_columns", None)
sklearn.set_config(transform_output = 'pandas') # To display sklearn outputs as pandas DataFrames

# 1. Reading the training data

In [4]:
path = r"train.csv"
df = pd.read_csv(path)

df.head()

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Spicejet,2019-05-01,Banglore,Delhi,21:10:00,00:05:00,175,0.0,No Info,4991
1,Vistara,2019-06-01,Chennai,Kolkata,17:45:00,20:05:00,140,0.0,No Info,11982
2,Jet Airways,2019-06-09,Delhi,Cochin,17:30:00,12:35:00,1145,1.0,In-flight meal not included,10262
3,Air India,2019-05-18,Kolkata,Banglore,12:00:00,18:30:00,1830,2.0,No Info,10361
4,Jet Airways,2019-03-24,Mumbai,Hyderabad,15:50:00,17:20:00,90,0.0,In-flight meal not included,2228


In [5]:
def check_missing(df):
    total = df.isnull().sum().sort_values(ascending = False)
    percent = (df.isnull().sum()/df.isnull().count()*100).sort_values(ascending = False)
    missing_data = pd.concat([total, percent], axis = 1, keys = ['Total', 'Percent'])
    
    display(missing_data)

    user_input = input("Do you want to drop rows with missing values? (y/n): ")
    if user_input.lower() == 'y':
        df.dropna(inplace = True)
        print()
        
        print("Rows with missing values dropped.")
    return df


In [6]:
df = check_missing(df)

Unnamed: 0,Total,Percent
total_stops,1,0.013656
airline,0,0.0
date_of_journey,0,0.0
source,0,0.0
dep_time,0,0.0
destination,0,0.0
arrival_time,0,0.0
duration,0,0.0
additional_info,0,0.0
price,0,0.0


Do you want to drop rows with missing values? (y/n):  y



Rows with missing values dropped.


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7322 entries, 0 to 7322
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   airline          7322 non-null   object 
 1   date_of_journey  7322 non-null   object 
 2   source           7322 non-null   object 
 3   destination      7322 non-null   object 
 4   dep_time         7322 non-null   object 
 5   arrival_time     7322 non-null   object 
 6   duration         7322 non-null   int64  
 7   total_stops      7322 non-null   float64
 8   additional_info  7322 non-null   object 
 9   price            7322 non-null   int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 629.2+ KB


In [12]:
# Splittinng into x and y
x = df.drop(columns = 'price')
y = df['price'].copy()

# 2. Transformation Operations (Column wise)

In [13]:
x.columns.to_list()

['airline',
 'date_of_journey',
 'source',
 'destination',
 'dep_time',
 'arrival_time',
 'duration',
 'total_stops',
 'additional_info']

# 2.1 Airline

* Impute missing values  
* Group Rare Categories:
    * Some categories may have very few occurrences.
    * Group these into a single category called 'Other' to avoid overfitting.
* One-Hot Encoding

In [14]:
steps = [
    ('inputing', SimpleImputer(strategy = 'most_frequent')),
    ('rare_label_grouping', RareLabelEncoder(tol = 0.1, replace_with = "other", n_categories = 2)),
    ('ohe', OneHotEncoder(sparse_output = False, handle_unknown = 'ignore'))
]

airline_pipeline = Pipeline(steps = steps)

airline_pipeline.fit_transform(x[['airline']])

Unnamed: 0,airline_Air India,airline_Indigo,airline_Jet Airways,airline_Multiple Carriers,airline_other
0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,1.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...
7318,0.0,0.0,1.0,0.0,0.0
7319,0.0,1.0,0.0,0.0,0.0
7320,0.0,0.0,1.0,0.0,0.0
7321,0.0,0.0,0.0,1.0,0.0


# 2.2 date_of_journey
* Extract some date features
    


In [15]:
from feature_engine.datetime import DatetimeFeatures
from sklearn.preprocessing import MinMaxScaler

features_to_extract = ['month', 'week', 'day_of_week', 'day_of_month', 'weekend', 'month_start', 'month_end']

doj_steps = [
    ('extract_doj', DatetimeFeatures(features_to_extract = features_to_extract, format = "mixed", yearfirst = True)),
    ('scaling', MinMaxScaler())

]

doj_pipeline = Pipeline(steps = doj_steps)

doj_pipeline.fit_transform(x[['date_of_journey']])

Unnamed: 0,date_of_journey_month,date_of_journey_week,date_of_journey_day_of_week,date_of_journey_day_of_month,date_of_journey_weekend,date_of_journey_month_start,date_of_journey_month_end
0,0.666667,0.529412,0.333333,0.000000,0.0,1.0,0.0
1,1.000000,0.764706,0.833333,0.000000,1.0,1.0,0.0
2,1.000000,0.823529,1.000000,0.307692,1.0,0.0,0.0
3,0.666667,0.647059,0.833333,0.653846,1.0,0.0,0.0
4,0.000000,0.176471,1.000000,0.884615,1.0,0.0,0.0
...,...,...,...,...,...,...,...
7318,0.666667,0.647059,0.333333,0.538462,0.0,0.0,0.0
7319,0.666667,0.705882,0.166667,0.769231,0.0,0.0,0.0
7320,0.666667,0.529412,0.333333,0.000000,0.0,1.0,0.0
7321,1.000000,0.823529,0.000000,0.076923,0.0,0.0,0.0


# 2.3 `source` and `destination`: Both are having same type of values.
* Group rare values
* Mean encoding
* Power transformer

In [16]:
from feature_engine.encoding import MeanEncoder
from sklearn.preprocessing import PowerTransformer

In [17]:
df[['source', 'destination']]

Unnamed: 0,source,destination
0,Banglore,Delhi
1,Chennai,Kolkata
2,Delhi,Cochin
3,Kolkata,Banglore
4,Mumbai,Hyderabad
...,...,...
7318,Kolkata,Banglore
7319,Delhi,Cochin
7320,Kolkata,Banglore
7321,Delhi,Cochin


In [18]:
steps = [
    ('rare_label_grouping', RareLabelEncoder(tol = 0.1, replace_with = "other", n_categories = 2)),
    ('mean_encoding', MeanEncoder()),    # Mean Encoder needs target variable 'price' also, so we will pass y in fit_transform.
    ('power_transformer', PowerTransformer(standardize = True))
]

source_dest_pipeline = Pipeline(steps= steps)

source_dest_pipeline.fit_transform(
    X = x[['source', 'destination']],
    y = y)

Unnamed: 0,source,destination
0,-0.922934,-1.825340
1,-1.864720,-0.844206
2,1.047774,1.046261
3,-0.162824,-0.174735
4,-1.864720,-0.844206
...,...,...
7318,-0.162824,-0.174735
7319,1.047774,1.046261
7320,-0.162824,-0.174735
7321,1.047774,1.046261


# 2.4 `dep_time` and `arrival_time`: Having values of same types

In [19]:
time_columns = x[['dep_time', 'arrival_time']]

time_columns

Unnamed: 0,dep_time,arrival_time
0,21:10:00,00:05:00
1,17:45:00,20:05:00
2,17:30:00,12:35:00
3,12:00:00,18:30:00
4,15:50:00,17:20:00
...,...,...
7318,10:20:00,17:35:00
7319,10:35:00,21:00:00
7320,20:25:00,10:55:00
7321,07:00:00,21:00:00


In [20]:
def part_of_day(X, morning=4, noon=12, evening=16, night=20):
    X_temp = pd.DataFrame(index = X.index)
    for col in X.columns:
        hour = pd.to_datetime(X[col], errors='coerce', format = 'mixed').dt.hour
        X_temp[f"{col}_part_of_day"] = np.where(
            (hour >= morning) & (hour < noon), 'morning',
            np.where(
                (hour >= noon) & (hour < evening), 'afternoon',
                np.where(
                    (hour >= evening) & (hour < night), 'evening', 'night'
                )
            )
        )
    return X_temp


In [21]:
part_of_day(x[['dep_time', 'arrival_time']])

Unnamed: 0,dep_time_part_of_day,arrival_time_part_of_day
0,night,night
1,evening,night
2,evening,afternoon
3,afternoon,evening
4,afternoon,evening
...,...,...
7318,morning,evening
7319,morning,night
7320,night,morning
7321,morning,night


In [22]:
from sklearn.preprocessing import FunctionTransformer, OrdinalEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline

time_pipe_steps1 = [
    ('func_trans', FunctionTransformer(func=part_of_day, validate=False)),
    ('ordinal_encoder', OrdinalEncoder(categories=[['morning', 'afternoon', 'evening', 'night']] * 2)),
    ('scaler', MinMaxScaler())
]

time_pipeline1 = Pipeline(steps=time_pipe_steps1)

time_pipeline1.fit_transform(time_columns)

Unnamed: 0,dep_time_part_of_day,arrival_time_part_of_day
0,1.000000,1.000000
1,0.666667,1.000000
2,0.666667,0.333333
3,0.333333,0.666667
4,0.333333,0.666667
...,...,...
7318,0.000000,0.666667
7319,0.000000,1.000000
7320,1.000000,0.000000
7321,0.000000,1.000000


In [23]:
time_columns = x[['dep_time', 'arrival_time']]
features_to_extract = ['hour', 'minute']

time_pipe_steps2 = [
    ('datetime', DatetimeFeatures(features_to_extract = features_to_extract, format = 'mixed')),
    ('scaler', MinMaxScaler())
    ]


time_pipeline2 = Pipeline(steps = time_pipe_steps2)

time_pipeline2.fit_transform(time_columns)

Unnamed: 0,dep_time_hour,dep_time_minute,arrival_time_hour,arrival_time_minute
0,0.913043,0.181818,0.000000,0.090909
1,0.739130,0.818182,0.869565,0.090909
2,0.739130,0.545455,0.521739,0.636364
3,0.521739,0.000000,0.782609,0.545455
4,0.652174,0.909091,0.739130,0.363636
...,...,...,...,...
7318,0.434783,0.363636,0.739130,0.636364
7319,0.434783,0.636364,0.913043,0.000000
7320,0.869565,0.454545,0.434783,1.000000
7321,0.304348,0.000000,0.913043,0.000000


# 2.5 Duration

* Only applying PowerTransformer.
* PT has a parameter `standardize`, if set to `True`, it does scaling also.


In [24]:
duration_steps = [
    ('capping', Winsorizer(capping_method = 'iqr')),
    ('power_transformer', PowerTransformer(standardize = True))
]

duration_pipeline = Pipeline(steps = duration_steps)

duration_pipeline.fit_transform(x[['duration']])

Unnamed: 0,duration
0,-1.013801
1,-1.236654
2,1.088183
3,1.680771
4,-1.662039
...,...
7318,-0.046843
7319,0.364748
7320,0.754289
7321,0.712317


# 2.6 total_stops
* Custom Function: `is_direct_flight()` 
* MinMaxScaler

Directly apply `MinMaxScaler` bcz the data is already `ordinal-numeric`

In [25]:
x['total_stops']

0       0.0
1       0.0
2       1.0
3       2.0
4       0.0
       ... 
7318    1.0
7319    1.0
7320    1.0
7321    1.0
7322    1.0
Name: total_stops, Length: 7322, dtype: float64

In [26]:
def is_direct_flight(x):
    y = pd.DataFrame(index= x.index)

    y['is_direct_flight'] = np.where(x['total_stops'] == 0, 1, 0)
    return y

is_direct_flight(x)

Unnamed: 0,is_direct_flight
0,1
1,1
2,0
3,0
4,1
...,...
7318,0
7319,0
7320,0
7321,0


In [27]:
total_stops_pipeline1 = FunctionTransformer(func=is_direct_flight, validate=False)

total_stops_pipeline1.fit_transform(x[['total_stops']])

Unnamed: 0,is_direct_flight
0,1
1,1
2,0
3,0
4,1
...,...
7318,0
7319,0
7320,0
7321,0


In [28]:
steps_total_stops = [
    ('scaling', MinMaxScaler())
]

total_stops_pipeline2 = Pipeline(steps = steps_total_stops)
total_stops_pipeline2.fit_transform(x[['total_stops']])

Unnamed: 0,total_stops
0,0.00
1,0.00
2,0.25
3,0.50
4,0.00
...,...
7318,0.25
7319,0.25
7320,0.25
7321,0.25


# 2.7 additional_info
* Dropping this column as it is not much necessary

In [29]:
x.drop(columns = 'additional_info', inplace = True)

# 3. Column Transformer

In [30]:
transformers = [
    ('airline', airline_pipeline, ['airline']),
    ('doj', doj_pipeline, ['date_of_journey']),
    ('source_dest', source_dest_pipeline, ['source', 'destination']),
    ('time_pipe1', time_pipeline1, ['dep_time', 'arrival_time']),
    ('time_pipe2', time_pipeline2, ['dep_time', 'arrival_time']),
    ('duration', duration_pipeline, ['duration']),
    ('total_stops1', total_stops_pipeline1, ['total_stops']),
    ('total_stops2', total_stops_pipeline2, ['total_stops']),
]

ct_pipeline = ColumnTransformer(transformers = transformers, remainder = 'passthrough')

ct_pipeline.fit_transform(X = x, y = y)

Unnamed: 0,airline__airline_Air India,airline__airline_Indigo,airline__airline_Jet Airways,airline__airline_Multiple Carriers,airline__airline_other,doj__date_of_journey_month,doj__date_of_journey_week,doj__date_of_journey_day_of_week,doj__date_of_journey_day_of_month,doj__date_of_journey_weekend,doj__date_of_journey_month_start,doj__date_of_journey_month_end,source_dest__source,source_dest__destination,time_pipe1__dep_time_part_of_day,time_pipe1__arrival_time_part_of_day,time_pipe2__dep_time_hour,time_pipe2__dep_time_minute,time_pipe2__arrival_time_hour,time_pipe2__arrival_time_minute,duration__duration,total_stops1__is_direct_flight,total_stops2__total_stops
0,0.0,0.0,0.0,0.0,1.0,0.666667,0.529412,0.333333,0.000000,0.0,1.0,0.0,-0.922934,-1.825340,1.000000,1.000000,0.913043,0.181818,0.000000,0.090909,-1.013801,1,0.00
1,0.0,0.0,0.0,0.0,1.0,1.000000,0.764706,0.833333,0.000000,1.0,1.0,0.0,-1.864720,-0.844206,0.666667,1.000000,0.739130,0.818182,0.869565,0.090909,-1.236654,1,0.00
2,0.0,0.0,1.0,0.0,0.0,1.000000,0.823529,1.000000,0.307692,1.0,0.0,0.0,1.047774,1.046261,0.666667,0.333333,0.739130,0.545455,0.521739,0.636364,1.088183,0,0.25
3,1.0,0.0,0.0,0.0,0.0,0.666667,0.647059,0.833333,0.653846,1.0,0.0,0.0,-0.162824,-0.174735,0.333333,0.666667,0.521739,0.000000,0.782609,0.545455,1.680771,0,0.50
4,0.0,0.0,1.0,0.0,0.0,0.000000,0.176471,1.000000,0.884615,1.0,0.0,0.0,-1.864720,-0.844206,0.333333,0.666667,0.652174,0.909091,0.739130,0.363636,-1.662039,1,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7318,0.0,0.0,1.0,0.0,0.0,0.666667,0.647059,0.333333,0.538462,0.0,0.0,0.0,-0.162824,-0.174735,0.000000,0.666667,0.434783,0.363636,0.739130,0.636364,-0.046843,0,0.25
7319,0.0,1.0,0.0,0.0,0.0,0.666667,0.705882,0.166667,0.769231,0.0,0.0,0.0,1.047774,1.046261,0.000000,1.000000,0.434783,0.636364,0.913043,0.000000,0.364748,0,0.25
7320,0.0,0.0,1.0,0.0,0.0,0.666667,0.529412,0.333333,0.000000,0.0,1.0,0.0,-0.162824,-0.174735,1.000000,0.000000,0.869565,0.454545,0.434783,1.000000,0.754289,0,0.25
7321,0.0,0.0,0.0,1.0,0.0,1.000000,0.823529,0.000000,0.076923,0.0,0.0,0.0,1.047774,1.046261,0.000000,1.000000,0.304348,0.000000,0.913043,0.000000,0.712317,0,0.25


In [31]:
for name, pipe, cols in transformers:
    # if your transformer needs y during fit_transform (e.g. MeanEncoder), pass it
    try:
        out = pipe.fit_transform(x[cols], y)  # passes y to those that need it
    except TypeError:
        out = pipe.fit_transform(x[cols])
    print(name, "->", type(out), getattr(out, "shape", None))
    # for pandas outputs also print index info
    if hasattr(out, "index"):
        print("   index equals input index:", out.index.equals(x[cols].index))


airline -> <class 'pandas.core.frame.DataFrame'> (7322, 5)
   index equals input index: True
doj -> <class 'pandas.core.frame.DataFrame'> (7322, 7)
   index equals input index: True
source_dest -> <class 'pandas.core.frame.DataFrame'> (7322, 2)
   index equals input index: True
time_pipe1 -> <class 'pandas.core.frame.DataFrame'> (7322, 2)
   index equals input index: True
time_pipe2 -> <class 'pandas.core.frame.DataFrame'> (7322, 4)
   index equals input index: True
duration -> <class 'pandas.core.frame.DataFrame'> (7322, 1)
   index equals input index: True
total_stops1 -> <class 'pandas.core.frame.DataFrame'> (7322, 1)
   index equals input index: True
total_stops2 -> <class 'pandas.core.frame.DataFrame'> (7322, 1)
   index equals input index: True


# Part 2: Feature Selection

In [32]:
from feature_engine.selection import SelectBySingleFeaturePerformance
from sklearn.ensemble import RandomForestRegressor


In [33]:
estimator = RandomForestRegressor(n_estimators = 20, max_depth = 5, random_state = 1)
selector = SelectBySingleFeaturePerformance(
    estimator = estimator,
    scoring = 'r2',
    threshold = 0.1
)

# Creating the final pipeline
final_pipeline = Pipeline(steps = [
    ('column_transformer', ct_pipeline),
    ('feature_selection', selector)
]) 

final_pipeline.fit_transform(X = x, y = y)

Unnamed: 0,airline__airline_Indigo,airline__airline_Jet Airways,airline__airline_other,doj__date_of_journey_week,source_dest__source,source_dest__destination,time_pipe2__arrival_time_hour,duration__duration,total_stops1__is_direct_flight,total_stops2__total_stops
0,0.0,0.0,1.0,0.529412,-0.922934,-1.825340,0.000000,-1.013801,1,0.00
1,0.0,0.0,1.0,0.764706,-1.864720,-0.844206,0.869565,-1.236654,1,0.00
2,0.0,1.0,0.0,0.823529,1.047774,1.046261,0.521739,1.088183,0,0.25
3,0.0,0.0,0.0,0.647059,-0.162824,-0.174735,0.782609,1.680771,0,0.50
4,0.0,1.0,0.0,0.176471,-1.864720,-0.844206,0.739130,-1.662039,1,0.00
...,...,...,...,...,...,...,...,...,...,...
7318,0.0,1.0,0.0,0.647059,-0.162824,-0.174735,0.739130,-0.046843,0,0.25
7319,1.0,0.0,0.0,0.705882,1.047774,1.046261,0.913043,0.364748,0,0.25
7320,0.0,1.0,0.0,0.529412,-0.162824,-0.174735,0.434783,0.754289,0,0.25
7321,0.0,0.0,0.0,0.823529,1.047774,1.046261,0.913043,0.712317,0,0.25


In [34]:
feature_performance = final_pipeline.named_steps['feature_selection'].feature_performance_
feature_performance

{'airline__airline_Air India': 0.002164161719206007,
 'airline__airline_Indigo': 0.1304685854559775,
 'airline__airline_Jet Airways': 0.17747908206974086,
 'airline__airline_Multiple Carriers': 0.023670755844778586,
 'airline__airline_other': 0.11382330665356306,
 'doj__date_of_journey_month': 0.09257108730694212,
 'doj__date_of_journey_week': 0.19880224650655076,
 'doj__date_of_journey_day_of_week': 0.004037300003953526,
 'doj__date_of_journey_day_of_month': 0.03265359713452783,
 'doj__date_of_journey_weekend': -0.0002788474239527883,
 'doj__date_of_journey_month_start': 0.00919686178886451,
 'doj__date_of_journey_month_end': -0.0005203120844960255,
 'source_dest__source': 0.13345045789214835,
 'source_dest__destination': 0.13608356693474802,
 'time_pipe1__dep_time_part_of_day': -7.984605189144862e-05,
 'time_pipe1__arrival_time_part_of_day': 0.03193302206860971,
 'time_pipe2__dep_time_hour': 0.018386426318662052,
 'time_pipe2__dep_time_minute': 0.04745652918226225,
 'time_pipe2__arri

In [35]:
# Descending
sorted_dict_desc = dict(sorted(feature_performance.items(), key=lambda item: item[1], reverse=True))
sorted_dict_desc

{'duration__duration': 0.4521077726963904,
 'total_stops2__total_stops': 0.4142148505511029,
 'total_stops1__is_direct_flight': 0.38930682669073796,
 'doj__date_of_journey_week': 0.19880224650655076,
 'airline__airline_Jet Airways': 0.17747908206974086,
 'source_dest__destination': 0.13608356693474802,
 'source_dest__source': 0.13345045789214835,
 'airline__airline_Indigo': 0.1304685854559775,
 'airline__airline_other': 0.11382330665356306,
 'time_pipe2__arrival_time_hour': 0.11091793919084451,
 'doj__date_of_journey_month': 0.09257108730694212,
 'time_pipe2__dep_time_minute': 0.04745652918226225,
 'time_pipe2__arrival_time_minute': 0.047102223679684974,
 'doj__date_of_journey_day_of_month': 0.03265359713452783,
 'time_pipe1__arrival_time_part_of_day': 0.03193302206860971,
 'airline__airline_Multiple Carriers': 0.023670755844778586,
 'time_pipe2__dep_time_hour': 0.018386426318662052,
 'doj__date_of_journey_month_start': 0.00919686178886451,
 'doj__date_of_journey_day_of_week': 0.004037

Observation:

* The dataset had `23 columns after Feature Engineering`
* The `Feature Selection algorithm selected 10` features out of that

# Importing the Feature Engineering Pipeline

In [3]:
import cloudpickle

with open("flights_final_pipeline.pkl", "rb") as f:
    final_pipeline = cloudpickle.load(f)

# Sagemaker Starts

# 1. Reading `training`, `validation` and `test` datasets.

In [8]:
# function to read_dataset.

def read_data(file_name):

    file_name = rf"{file_name}.csv"

    df = pd.read_csv(file_name)
    return df


train_df = read_data(file_name = 'train_data')
val_df = read_data(file_name = 'validation_data')
test_df = read_data(file_name = 'test_data')

In [9]:
for data in [train_df, val_df, test_df]:
    print(data.shape)

(7323, 10)
(1569, 10)
(1570, 10)


# 2. Preprocessing the data and uploading to S3 bucket

In [10]:
# File name Function
def get_file_name(name):
    return f"{name}_pre.csv"

get_file_name('train')

'train_pre.csv'

In [11]:
# Function to transform and save the data using column_transformer object

def transform_and_export(data, name, pipeline):
    # Get file name to save the transformed data
    file_name = get_file_name(name)

    # Seperating target_variable and other independent variable bcz we never transform target variable.
    x = data.drop(columns = 'price')
    y = data['price'].copy()

    # Transforming x data
    x_trans = pipeline.transform(x)

    # Saving the data. 

    # Sagemaker accepts target_variable first and then the remaining columns
    final_data = y.to_frame().join(x_trans)

    # Saving
    final_data.to_csv(file_name, index = False, header = False)
    

In [12]:
# Function to upload to S3 bucket

import os
import boto3
import pickle

In [13]:
bucket_name = "flight-price-prediction-project-sagemaker"
data_prefix = 'data'

def upload_to_bucket(name):
    file_name = get_file_name(name)

    (
        boto3
        .Session()
        .resource('s3')
        .Bucket(bucket_name)
        .Object(os.path.join(data_prefix, f"{file_name}"))
        .upload_file(file_name)
)
    print(f"{file_name} uploaded to bucket.")

In [14]:
# Final workflow

def final_workflow(data, name, pipeline):
    transform_and_export(data, name, pipeline)
    upload_to_bucket(name)

In [15]:
final_workflow(data = train_df, name = 'train', pipeline = final_pipeline)
final_workflow(data = val_df, name = 'val', pipeline = final_pipeline)
final_workflow(data = test_df, name = 'test', pipeline = final_pipeline)

NoCredentialsError: Unable to locate credentials

# 3. Model training and Hyperparameter Tunning using `Sagemaker`

# 3.1 Model Setup

In [196]:
import sagemaker
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.tuner import (
    IntegerParameter,
    ContinuousParameter,
    HyperparameterTuner,
    )

In [197]:
session = sagemaker.Session()
region_name = session.boto_region_name

print(f'''
session: {session}
region_name: {region_name}
''')


session: <sagemaker.session.Session object at 0x7f7b0f902da0>
region_name: eu-north-1



In [198]:
image_uri = sagemaker.image_uris.retrieve(framework = 'xgboost', region = region_name, version = '1.2-1')
role = sagemaker.get_execution_role()
output_path = f"s3://{bucket_name}/model"



print(f'''
image_uri: {image_uri}
role: {role}
output_path: {output_path}
''')


image_uri: 662702820516.dkr.ecr.eu-north-1.amazonaws.com/sagemaker-xgboost:1.2-1
role: arn:aws:iam::730335554189:role/service-role/AmazonSageMaker-ExecutionRole-20251014T195528
output_path: s3://flight-price-prediction-project-sagemaker/model



In [199]:
model = Estimator(
    image_uri = image_uri,
    role = role,
    instance_count = 1,
    instance_type = 'ml.t3.large',
    volume_size = 5,
    output_path = output_path,
    use_spot_instances = False,
    max_run = 500,
    # max_wait = 100,    # This is only allowed if 'use_spot_instances = True'
    sagemaker_session = session
    
)

In [200]:
# Setting parameters of XGBoost
model.set_hyperparameters(
    objective = 'reg:squarederror',   # means 'MSE'
    num_round = 10,
    eta = 0.1,
    max_depth = 8,
    subsample = 0.8,
    colsample_bytree = 0.8,
    alpha = 0.1    # alpha means "l2 Regularization" 
)

model.hyperparameters()

{'objective': 'reg:squarederror',
 'num_round': 10,
 'eta': 0.1,
 'max_depth': 8,
 'subsample': 0.8,
 'colsample_bytree': 0.8,
 'alpha': 0.1}

In [201]:
# Hyperparameter setup

# Creating a dict for hyperparameter
hyp_ranges = {
    "num_round": IntegerParameter(10, 20),
    "eta": ContinuousParameter(0.1, 0.3),
    "alpha": ContinuousParameter(0.1, 1.0),
    "max_depth": IntegerParameter(1, 10),
    "subsample": ContinuousParameter(0.5, 1.0),
    "colsample_bytree": ContinuousParameter(0.5, 1.0)
}

# Creating HYP tuner object
hyp_tuner = HyperparameterTuner(
    estimator = model,
    objective_metric_name = 'validation:rmse',
    hyperparameter_ranges = hyp_ranges,
    max_jobs = 20,    # Total hyperparameter combinations to try
    max_parallel_jobs = 10,    # Run 10 jobs at a time
    strategy = "Bayesian",    # by-default = Bayesian
    objective_type = "Minimize"
)

# 3.2 Data Channel Setup:
This is to inform the model where our data is. Means the `path to S3 bucket`.

In [202]:
def get_data_channel(name):
    bucket_path = f"s3://{bucket_name}/data/{name}"
    return TrainingInput(s3_data = bucket_path, content_type = 'csv')

train_data_channel = get_data_channel(name = 'train_pre.csv')
val_data_channel = get_data_channel(name = 'val_pre.csv')

In [203]:
f"s3://{bucket_name}/data"

's3://flight-price-prediction-project-sagemaker/data'

In [204]:
print(f'''
train_data_channel: {train_data_channel}
val_data_channel: {val_data_channel}''')


train_data_channel: <sagemaker.inputs.TrainingInput object at 0x7f7b0f656e30>
val_data_channel: <sagemaker.inputs.TrainingInput object at 0x7f7b0f656a40>


In [205]:
data_channels = {
    "train": train_data_channel,
    "validation": val_data_channel
}

data_channels

{'train': <sagemaker.inputs.TrainingInput at 0x7f7b0f656e30>,
 'validation': <sagemaker.inputs.TrainingInput at 0x7f7b0f656a40>}

# 4. Train and Tune the model

In [208]:
hyp_tuner.fit(data_channels, wait = True, job_name = 'flight-price-hyp-final')

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


...................................................................!


# Getting the best model and hyperparameters

In [209]:
from sagemaker.analytics import HyperparameterTuningJobAnalytics as HPTJA

In [210]:
tuning_job_name = hyp_tuner.latest_tuning_job.name
tuner_analytics = HPTJA(tuning_job_name)
analytics_df = tuner_analytics.dataframe()

analytics_df.sort_values('FinalObjectiveValue', ascending = True)

# TrainingJobName shows the name of each compination of hyperparameters.
# We can get the name of best model from it

Unnamed: 0,alpha,colsample_bytree,eta,max_depth,num_round,subsample,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
3,1.0,0.5,0.3,7.0,20.0,0.969041,flight-price-hyp-final-017-ca71c61f,Completed,1950.072876,2025-10-15 12:44:35+00:00,2025-10-15 12:45:24+00:00,49.0
14,0.426505,0.578191,0.243415,7.0,17.0,0.647286,flight-price-hyp-final-006-a87ef0f4,Completed,1971.51709,2025-10-15 12:41:09+00:00,2025-10-15 12:43:59+00:00,170.0
2,1.0,0.964293,0.3,6.0,20.0,0.836976,flight-price-hyp-final-018-538fe55a,Completed,1992.966797,2025-10-15 12:44:49+00:00,2025-10-15 12:45:38+00:00,49.0
11,0.632131,0.852787,0.197246,7.0,16.0,0.740029,flight-price-hyp-final-009-b1a8f602,Completed,2005.565063,2025-10-15 12:41:11+00:00,2025-10-15 12:43:46+00:00,155.0
1,0.1,0.806492,0.226331,6.0,10.0,0.951792,flight-price-hyp-final-019-e8e37854,Completed,2190.054443,2025-10-15 12:44:45+00:00,2025-10-15 12:45:39+00:00,54.0
9,1.0,0.945436,0.3,3.0,17.0,0.858522,flight-price-hyp-final-011-2ac691ab,Completed,2199.920898,2025-10-15 12:43:58+00:00,2025-10-15 12:44:47+00:00,49.0
18,0.222732,0.608372,0.266632,3.0,16.0,0.533037,flight-price-hyp-final-002-3dd15217,Completed,2222.16626,2025-10-15 12:41:01+00:00,2025-10-15 12:43:41+00:00,160.0
16,0.381206,0.94142,0.200332,9.0,10.0,0.92367,flight-price-hyp-final-004-a7a496f8,Completed,2227.842773,2025-10-15 12:41:18+00:00,2025-10-15 12:44:01+00:00,163.0
0,0.246871,0.522602,0.234647,6.0,10.0,0.644773,flight-price-hyp-final-020-52eb1772,Completed,2230.67334,2025-10-15 12:44:48+00:00,2025-10-15 12:45:47+00:00,59.0
6,0.339819,0.964196,0.200811,7.0,10.0,0.769846,flight-price-hyp-final-014-a232947a,Completed,2255.034912,2025-10-15 12:44:14+00:00,2025-10-15 12:45:03+00:00,49.0


In [211]:
best_model_name = analytics_df.sort_values('FinalObjectiveValue', ascending = True).iloc[0]['TrainingJobName']
print(best_model_name)

# Get the S3 URL of this model by navigating to S3_bucket/ flight/ model

best_model_url = r"s3://flight-price-prediction-project-sagemaker/model/flight-price-hyp-final-017-ca71c61f/output/model.tar.gz"

flight-price-hyp-final-017-ca71c61f


# Model Evaluation

In [217]:
with open("xgboost-model-2", "rb") as f:
    best_model = pickle.load(f)
    
best_model

<xgboost.core.Booster at 0x7f7b11023c40>

In [218]:
from sklearn.metrics import r2_score
import xgboost

# Evaluation function
def evaluate(file_name, model):
    data = pd.read_csv(file_name)
    
    x = data.iloc[:, 1:]
    y = data.iloc[:, 0].copy()

    # Converting to DMatrix: XGBoost expects the data to be in DMatrox format for prediction
    x = xgboost.DMatrix(x)

    pred = model.predict(x)

    return r2_score(y, pred)

In [219]:
train_data_eval = evaluate(file_name = "train_pre.csv", model = best_model)
val_data_eval = evaluate(file_name = "val_pre.csv", model = best_model)
test_data_eval = evaluate(file_name = "test_pre.csv", model = best_model)

print(f'''
Train data evaluation score: {train_data_eval}
Validation data evaluation score: {val_data_eval}
Test data evaluation score: {test_data_eval}
''')


Train data evaluation score: 0.8429283499717712
Validation data evaluation score: 0.7914255261421204
Test data evaluation score: 0.6518923044204712



In [195]:
# Exporting the final_pipeline because we will need it while creating the Streamlit app

import joblib

joblib.dump(final_pipeline, filename = 'flights_final_pipeline.pkl')

['flights_final_pipeline.pkl']

# Completed