In [2]:
import numpy as np

import pandas as pd

import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
	OneHotEncoder,
	OrdinalEncoder,
	StandardScaler,
	MinMaxScaler,
	PowerTransformer,
	FunctionTransformer
)

from feature_engine.outliers import Winsorizer
from feature_engine.datetime import DatetimeFeatures
from feature_engine.selection import SelectBySingleFeaturePerformance
from feature_engine.encoding import (
	RareLabelEncoder,
	MeanEncoder,
	CountFrequencyEncoder
)

import matplotlib.pyplot as plt

import warnings

### Apply the Display settings

In [3]:
# Display option: do not truncate the column list when a DataFrame is rendered.
pd.set_option("display.max_columns", None)

In [4]:
# Ask scikit-learn transformers to return pandas DataFrames instead of NumPy arrays.
sklearn.set_config(transform_output="pandas")

In [5]:
# Suppress every warning for the remainder of the session.
# NOTE(review): this is a blanket filter, so deprecation/parsing warnings are hidden too —
# consider restricting it to specific categories or modules.
warnings.filterwarnings("ignore")

### Reading the data

In [6]:

# NOTE(review): absolute, machine-specific path — the notebook cannot be re-run on another
# machine as-is; consider a path relative to a configurable data directory.
file_path = r"C:\Users\shubh\OneDrive\Desktop\FLIGHT-PRICE-PREDICTOR\Data\train.csv"

# Load the training CSV and display it.
train = pd.read_csv(file_path)
train

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Indigo,2019-06-27,Chennai,Kolkata,13:15:00,15:35:00,140,0.0,No Info,3597
1,Jet Airways,2019-05-24,Kolkata,Banglore,20:25:00,23:35:00,1630,1.0,No Info,14151
2,Indigo,2019-05-27,Banglore,Delhi,10:10:00,13:00:00,170,0.0,No Info,3943
3,Jet Airways,2019-06-27,Delhi,Cochin,05:30:00,19:00:00,810,2.0,No Info,13014
4,Spicejet,2019-05-03,Kolkata,Banglore,06:55:00,09:30:00,155,0.0,No Info,3873
...,...,...,...,...,...,...,...,...,...,...
5879,Air India,2019-05-18,Kolkata,Banglore,05:50:00,20:25:00,875,2.0,No Info,12093
5880,Jet Airways,2019-05-24,Kolkata,Banglore,20:25:00,21:05:00,1480,1.0,No Info,14151
5881,Indigo,2019-05-18,Delhi,Cochin,17:10:00,01:30:00,500,1.0,No Info,7081
5882,Multiple Carriers,2019-06-06,Delhi,Cochin,13:00:00,21:00:00,480,1.0,No Info,13587


In [7]:

# Split the training frame: `price` becomes the target, every other column a predictor.
y_train = train["price"].copy()
X_train = train.drop(columns=["price"])

## Transformation operations

### Airline

In [8]:
# Airline pipeline: impute missing values with the most frequent airline, collapse
# infrequent airlines (tol=0.1) into "Other", then one-hot encode the result.
air_transformer = Pipeline(steps=[
	("imputer", SimpleImputer(strategy="most_frequent")),
	("grouper", RareLabelEncoder(tol=0.1, replace_with="Other", n_categories=2)),
	# handle_unknown="ignore": categories unseen during fit do not raise at transform time.
	("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore"))
])

# Preview the encoded airline columns on the training data.
air_transformer.fit_transform(X_train.loc[:, ["airline"]])

Unnamed: 0,airline_Air India,airline_Indigo,airline_Jet Airways,airline_Multiple Carriers,airline_Other
0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...
5879,1.0,0.0,0.0,0.0,0.0
5880,0.0,0.0,1.0,0.0,0.0
5881,0.0,1.0,0.0,0.0,0.0
5882,0.0,0.0,0.0,1.0,0.0


### Date of journey

In [9]:
# Calendar components to derive from the journey date.
feature_to_extract = ["month", "week", "day_of_week", "day_of_year"]

# Date-of-journey pipeline: extract the calendar components above, then rescale each
# of them with MinMaxScaler.
doj_transformer = Pipeline(steps=[
	("dt", DatetimeFeatures(features_to_extract=feature_to_extract, yearfirst=True, format="mixed")),
	("scaler", MinMaxScaler())
])

# Preview the scaled date features on the training data.
doj_transformer.fit_transform(X_train.loc[:, ["date_of_journey"]])

Unnamed: 0,date_of_journey_month,date_of_journey_week,date_of_journey_day_of_week,date_of_journey_day_of_year
0,1.000000,1.000000,0.500000,1.000000
1,0.666667,0.705882,0.666667,0.711864
2,0.666667,0.764706,0.000000,0.737288
3,1.000000,1.000000,0.500000,1.000000
4,0.666667,0.529412,0.666667,0.533898
...,...,...,...,...
5879,0.666667,0.647059,0.833333,0.661017
5880,0.666667,0.705882,0.666667,0.711864
5881,0.666667,0.647059,0.833333,0.661017
5882,1.000000,0.823529,0.500000,0.822034


### Source and Destination

In [10]:
# Working subset holding only the two location columns; reused by the cells that follow.
location_subset = X_train.loc[:, ["source", "destination"]]
location_subset

Unnamed: 0,source,destination
0,Chennai,Kolkata
1,Kolkata,Banglore
2,Banglore,Delhi
3,Delhi,Cochin
4,Kolkata,Banglore
...,...,...
5879,Kolkata,Banglore
5880,Kolkata,Banglore
5881,Delhi,Cochin
5882,Delhi,Cochin


In [11]:
# Location pipeline (part 1): collapse infrequent cities (tol=0.1) into "Other",
# mean-encode the categories (the target is supplied at fit time), then apply a
# PowerTransformer to the encoded values.
location_pipe1 = Pipeline(steps=[
	("grouper", RareLabelEncoder(tol=0.1, replace_with="Other", n_categories=2)),
	("encoder", MeanEncoder()),
	("scaler", PowerTransformer())
])

# y_train is passed because the mean encoder is fitted against the target.
location_pipe1.fit_transform(location_subset, y_train)

Unnamed: 0,source,destination
0,-1.884167,-0.777489
1,-0.238282,-0.244421
2,-0.839641,-1.834243
3,1.054867,1.052894
4,-0.238282,-0.244421
...,...,...
5879,-0.238282,-0.244421
5880,-0.238282,-0.244421
5881,1.054867,1.052894
5882,1.054867,1.052894


In [12]:
# Sorted set of every distinct city that appears as either a source or a destination.
np.union1d(
	X_train.source.unique(),
	X_train.destination.unique()
)

array(['Banglore', 'Chennai', 'Cochin', 'Delhi', 'Hyderabad', 'Kolkata',
       'Mumbai', 'New Delhi'], dtype=object)

In [13]:
def is_north(X):
	"""Replace every column of ``X`` with a 0/1 "north city" indicator.

	For each input column ``<col>`` a new integer column ``<col>_is_north`` is
	created that is 1 where the value is one of the listed north cities and 0
	otherwise. The original columns are removed from the result.

	Parameters
	----------
	X : pandas.DataFrame
		Frame whose columns all hold city names.

	Returns
	-------
	pandas.DataFrame
		One ``<col>_is_north`` column per input column, same index as ``X``.
	"""
	northern = ["Delhi", "Kolkata", "Mumbai", "New Delhi"]
	original_cols = X.columns.to_list()

	# Build the indicator columns first, then attach them in one go.
	flags = {}
	for name in original_cols:
		flags[f"{name}_is_north"] = X.loc[:, name].isin(northern).astype(int)

	with_flags = X.assign(**flags)
	return with_flags.drop(columns=original_cols)


# Preview: wrap is_north as a transformer and apply it to the location columns.
FunctionTransformer(func=is_north).fit_transform(location_subset)

Unnamed: 0,source_is_north,destination_is_north
0,0,1
1,1,0
2,0,1
3,1,0
4,1,0
...,...,...
5879,1,0
5880,1,0
5881,1,0
5882,1,0


In [14]:
# Full location transformer: place the encoded/transformed city columns (part1) and the
# north-city indicator columns (part2) side by side.
location_transformer = FeatureUnion(transformer_list=[
	("part1", location_pipe1),
	("part2", FunctionTransformer(func=is_north))
])

# y_train is forwarded because part1 contains a target-based encoder.
location_transformer.fit_transform(location_subset, y_train)

Unnamed: 0,source,destination,source_is_north,destination_is_north
0,-1.884167,-0.777489,0,1
1,-0.238282,-0.244421,1,0
2,-0.839641,-1.834243,0,1
3,1.054867,1.052894,1,0
4,-0.238282,-0.244421,1,0
...,...,...,...,...
5879,-0.238282,-0.244421,1,0
5880,-0.238282,-0.244421,1,0
5881,1.054867,1.052894,1,0
5882,1.054867,1.052894,1,0


### dep_time & arrival_time

In [15]:

# Working subset holding only the departure and arrival time columns.
time_subset = X_train.loc[:, ["dep_time", "arrival_time"]]
time_subset

Unnamed: 0,dep_time,arrival_time
0,13:15:00,15:35:00
1,20:25:00,23:35:00
2,10:10:00,13:00:00
3,05:30:00,19:00:00
4,06:55:00,09:30:00
...,...,...
5879,05:50:00,20:25:00
5880,20:25:00,21:05:00
5881,17:10:00,01:30:00
5882,13:00:00,21:00:00


In [16]:
# Time pipeline (part 1): extract the hour and minute of each time column, then
# rescale them with MinMaxScaler.
time_pipe1 = Pipeline(steps=[
	("dt", DatetimeFeatures(features_to_extract=["hour", "minute"])),
	("scaler", MinMaxScaler())
])

# Preview the scaled hour/minute features.
time_pipe1.fit_transform(time_subset)

Unnamed: 0,dep_time_hour,dep_time_minute,arrival_time_hour,arrival_time_minute
0,0.565217,0.272727,0.652174,0.636364
1,0.869565,0.454545,1.000000,0.636364
2,0.434783,0.181818,0.565217,0.000000
3,0.217391,0.545455,0.826087,0.000000
4,0.260870,1.000000,0.391304,0.545455
...,...,...,...,...
5879,0.217391,0.909091,0.869565,0.454545
5880,0.869565,0.454545,0.913043,0.090909
5881,0.739130,0.181818,0.043478,0.545455
5882,0.565217,0.000000,0.913043,0.000000


In [17]:
def part_of_day(X, morning=4, noon=12, eve=16, night=20):
	"""Label every time column of ``X`` with the part of the day it falls in.

	Each column is parsed with ``pd.to_datetime`` and reduced to its hour. The
	hour is then mapped to a label using half-open intervals:

	* ``[morning, noon)`` -> ``"morning"``
	* ``[noon, eve)``     -> ``"afternoon"``
	* ``[eve, night)``    -> ``"evening"``
	* anything else       -> ``"night"``

	Parameters
	----------
	X : pandas.DataFrame
		Frame whose columns are all parseable by ``pd.to_datetime``.
	morning, noon, eve, night : int
		Hour boundaries of the intervals described above.

	Returns
	-------
	pandas.DataFrame
		One ``<col>_part_of_day`` column per input column; the input columns
		themselves are dropped.
	"""
	time_cols = X.columns.to_list()

	# Overwrite each column with its hour-of-day so the comparisons below are numeric.
	hour_values = {}
	for name in time_cols:
		hour_values[name] = pd.to_datetime(X.loc[:, name]).dt.hour
	hours = X.assign(**hour_values)

	def _label(hour):
		# Lower bound inclusive, upper bound exclusive for every interval.
		conditions = [
			hour.ge(morning) & hour.lt(noon),
			hour.ge(noon) & hour.lt(eve),
			hour.ge(eve) & hour.lt(night),
		]
		return np.select(conditions, ["morning", "afternoon", "evening"], default="night")

	labels = {}
	for name in time_cols:
		labels[f"{name}_part_of_day"] = _label(hours.loc[:, name])

	return hours.assign(**labels).drop(columns=time_cols)

# Preview: wrap part_of_day as a transformer and apply it to the time columns.
FunctionTransformer(func=part_of_day).fit_transform(time_subset)

Unnamed: 0,dep_time_part_of_day,arrival_time_part_of_day
0,afternoon,afternoon
1,night,night
2,morning,afternoon
3,morning,evening
4,morning,morning
...,...,...
5879,morning,night
5880,night,night
5881,evening,night
5882,afternoon,night


In [18]:

# Time pipeline (part 2): derive the part-of-day label for each time column, encode the
# labels with CountFrequencyEncoder, then rescale with MinMaxScaler.
time_pipe2 = Pipeline(steps=[
	("part", FunctionTransformer(func=part_of_day)),
	("encoder", CountFrequencyEncoder()),
	("scaler", MinMaxScaler())
])

# Preview the encoded part-of-day features.
time_pipe2.fit_transform(time_subset)

Unnamed: 0,dep_time_part_of_day,arrival_time_part_of_day
0,0.000000,0.000000
1,0.119718,1.000000
2,1.000000,0.000000
3,1.000000,0.628707
4,1.000000,0.843416
...,...,...
5879,1.000000,1.000000
5880,0.119718,1.000000
5881,0.206237,1.000000
5882,0.000000,1.000000


In [19]:
# Full time transformer: hour/minute features (part1) next to part-of-day features (part2).
time_transformer = FeatureUnion(transformer_list=[
	("part1", time_pipe1),
	("part2", time_pipe2)
])

# Preview the combined time features.
time_transformer.fit_transform(time_subset)

Unnamed: 0,dep_time_hour,dep_time_minute,arrival_time_hour,arrival_time_minute,dep_time_part_of_day,arrival_time_part_of_day
0,0.565217,0.272727,0.652174,0.636364,0.000000,0.000000
1,0.869565,0.454545,1.000000,0.636364,0.119718,1.000000
2,0.434783,0.181818,0.565217,0.000000,1.000000,0.000000
3,0.217391,0.545455,0.826087,0.000000,1.000000,0.628707
4,0.260870,1.000000,0.391304,0.545455,1.000000,0.843416
...,...,...,...,...,...,...
5879,0.217391,0.909091,0.869565,0.454545,1.000000,1.000000
5880,0.869565,0.454545,0.913043,0.090909,0.119718,1.000000
5881,0.739130,0.181818,0.043478,0.545455,0.206237,1.000000
5882,0.565217,0.000000,0.913043,0.000000,0.000000,1.000000


### Duration

In [20]:
# Quartiles (25th/50th/75th percentile) of duration, reshaped into a single-column
# 2-D array — one row per percentile.
(
	X_train
	.duration
	.quantile([0.25, 0.5, 0.75])
	.values
	.reshape(-1, 1)
)

array([[170.],
       [505.],
       [910.]])

In [21]:

class RBFPercentileSimilarity(BaseEstimator, TransformerMixin):
	"""RBF-kernel similarity of each value to percentiles of its own column.

	During ``fit`` the requested percentiles of every selected column are stored
	as reference points. ``transform`` then returns, for each selected column and
	each percentile, ``rbf_kernel(value, reference, gamma)`` in a column named
	``<col>_rbf_<percentile as a whole number>``.

	Parameters
	----------
	variables : list of str, str or None, default=None
		Columns to transform. When empty/None, every numeric column of the frame
		passed to ``fit`` is used.
	percentiles : list of float, default=[0.25, 0.5, 0.75]
		Quantiles (fractions in [0, 1]) used as reference points.
	gamma : float, default=0.1
		``gamma`` argument forwarded to ``rbf_kernel``.

	Attributes
	----------
	variables_ : list of str
		Columns actually used, resolved during ``fit``. The constructor argument
		``variables`` is left untouched so the estimator can be cloned and refitted.
	reference_values_ : dict
		Maps each column name to a ``(len(percentiles), 1)`` array of quantiles.
	"""

	def __init__(self, variables=None, percentiles=[0.25, 0.5, 0.75], gamma=0.1):
		self.variables = variables
		self.percentiles = percentiles
		self.gamma = gamma


	def fit(self, X, y=None):
		"""Learn the reference percentiles of each selected column of ``X``."""
		# Resolve the column list into a fitted attribute instead of overwriting the
		# constructor parameter: a refit on another frame must re-detect the columns.
		if not self.variables:
			self.variables_ = X.select_dtypes(include="number").columns.to_list()
		elif isinstance(self.variables, str):
			# A bare string would otherwise be iterated character by character.
			self.variables_ = [self.variables]
		else:
			self.variables_ = list(self.variables)

		self.reference_values_ = {
			col: (
				X
				.loc[:, col]
				.quantile(self.percentiles)
				.values
				.reshape(-1, 1)
			)
			for col in self.variables_
		}

		return self


	def transform(self, X):
		"""Return the RBF similarities of ``X`` to the fitted reference percentiles."""
		# round() rather than int(): e.g. 0.29 * 100 == 28.999999999999996, which int()
		# would truncate to 28.
		suffixes = [round(percentile * 100) for percentile in self.percentiles]

		objects = []
		for col in self.variables_:
			columns = [f"{col}_rbf_{suffix}" for suffix in suffixes]
			obj = pd.DataFrame(
				data=rbf_kernel(X.loc[:, [col]], Y=self.reference_values_[col], gamma=self.gamma),
				columns=columns,
				# Keep the caller's index so the rows stay aligned when this output is
				# concatenated with other features computed from the same frame.
				index=X.index
			)
			objects.append(obj)
		return pd.concat(objects, axis=1)

In [22]:
# Demo on the full frame: no variables are given, so the numeric columns are used
# (duration and total_stops in the displayed output).
# NOTE(review): every displayed duration_rbf_* value is 0 — with gamma=0.1 on the raw
# duration scale the kernel appears to vanish except very near a percentile; confirm gamma.
RBFPercentileSimilarity(percentiles=[0.4, 0.8]).fit_transform(X_train)


Unnamed: 0,duration_rbf_40,duration_rbf_80,total_stops_rbf_40,total_stops_rbf_80
0,0.000000,0.0,0.904837,0.904837
1,0.000000,0.0,1.000000,1.000000
2,0.000000,0.0,0.904837,0.904837
3,0.000000,0.0,0.904837,0.904837
4,0.000000,0.0,0.904837,0.904837
...,...,...,...,...
5879,0.000000,0.0,0.904837,0.904837
5880,0.000000,0.0,1.000000,1.000000
5881,0.000000,0.0,1.000000,1.000000
5882,0.000000,0.0,1.000000,1.000000


In [23]:

def duration_category(X, short=180, med=400):
	"""Replace the ``duration`` column with a three-level category.

	Parameters
	----------
	X : pandas.DataFrame
		Frame containing a numeric ``duration`` column.
	short : number, default=180
		Durations strictly below this value are labelled ``"short"``.
	med : number, default=400
		Durations in ``[short, med)`` are labelled ``"medium"``; everything else
		is labelled ``"long"``.

	Returns
	-------
	pandas.DataFrame
		``X`` without ``duration`` and with a new ``duration_cat`` column.
	"""
	dur = X.duration
	below_short = dur.lt(short)
	short_to_med = dur.between(short, med, inclusive="left")

	# First matching condition wins; rows matching neither fall through to "long".
	labels = np.select([below_short, short_to_med], ["short", "medium"], default="long")

	return X.assign(duration_cat=labels).drop(columns="duration")

In [24]:
def is_over(X, value=1000):
	"""Replace ``duration`` with a 0/1 flag telling whether it reaches ``value``.

	Parameters
	----------
	X : pandas.DataFrame
		Frame containing a numeric ``duration`` column.
	value : number, default=1000
		Threshold; the flag is 1 when ``duration >= value``.

	Returns
	-------
	pandas.DataFrame
		``X`` without ``duration`` and with a new ``duration_over_<value>`` column.
	"""
	flag_name = f"duration_over_{value}"
	reaches_threshold = X.duration.ge(value).astype(int)
	flagged = X.assign(**{flag_name: reaches_threshold})
	return flagged.drop(columns="duration")

In [25]:
# Part 1: RBF similarity of duration to its percentiles (transformer defaults), followed
# by a PowerTransformer.
duration_pipe1 = Pipeline(steps=[
	("rbf", RBFPercentileSimilarity()),
	("scaler", PowerTransformer())
])

# Part 2: bucket duration into short/medium/long and ordinal-encode in that order
# (short=0, medium=1, long=2).
duration_pipe2 = Pipeline(steps=[
	("cat", FunctionTransformer(func=duration_category)),
	("encoder", OrdinalEncoder(categories=[["short", "medium", "long"]]))
])

# Combine all duration-derived features side by side.
duration_union = FeatureUnion(transformer_list=[
	("part1", duration_pipe1),
	("part2", duration_pipe2),
	# 0/1 flag from is_over with its default threshold.
	("part3", FunctionTransformer(func=is_over)),
	# Standardised copy of the duration column itself.
	("part4", StandardScaler())
])

# Full duration transformer: cap outliers (IQR rule, fold=1.5), impute missing values
# with the median, then build the feature union above.
duration_transformer = Pipeline(steps=[
	("outliers", Winsorizer(capping_method="iqr", fold=1.5)),
	("imputer", SimpleImputer(strategy="median")),
	("union", duration_union)
])

# Preview the duration features on the training data.
duration_transformer.fit_transform(X_train.loc[:, ["duration"]])

Unnamed: 0,duration_rbf_25,duration_rbf_50,duration_rbf_75,duration_cat,duration_over_1000,duration
0,-0.357292,-0.108617,-0.081637,0.0,0,-0.985940
1,-0.357292,-0.108617,-0.081637,2.0,1,2.015767
2,3.148349,-0.108617,-0.081637,0.0,0,-0.925503
3,-0.357292,-0.108617,-0.081637,2.0,0,0.363821
4,-0.357292,-0.108617,-0.081637,0.0,0,-0.955722
...,...,...,...,...,...,...
5879,-0.357292,-0.108617,-0.081637,2.0,0,0.494768
5880,-0.357292,-0.108617,-0.081637,2.0,1,1.713582
5881,-0.357292,9.247984,-0.081637,2.0,0,-0.260696
5882,-0.357292,-0.108617,-0.081637,2.0,0,-0.300987


### total_stops

In [26]:

def is_direct(X):
	"""Append an ``is_direct_flight`` 0/1 column to ``X``.

	Parameters
	----------
	X : pandas.DataFrame
		Frame containing a ``total_stops`` column.

	Returns
	-------
	pandas.DataFrame
		Copy of ``X`` (all original columns kept) with ``is_direct_flight`` set
		to 1 where ``total_stops`` equals 0 and to 0 elsewhere.
	"""
	no_stops = X.total_stops.eq(0)
	return X.assign(is_direct_flight=no_stops.astype(int))



In [27]:

# Total-stops pipeline: impute missing stop counts with the most frequent value, then
# append the direct-flight indicator produced by is_direct.
total_stops_transformer = Pipeline(steps=[
	("imputer", SimpleImputer(strategy="most_frequent")),
	# The step previously had an empty-string name; a real name makes it readable in the
	# pipeline repr and addressable through named_steps / "direct__<param>".
	("direct", FunctionTransformer(func=is_direct))
])

# Preview the stop count alongside the new indicator.
total_stops_transformer.fit_transform(X_train.loc[:, ["total_stops"]])

Unnamed: 0,total_stops,is_direct_flight
0,0.0,1
1,1.0,0
2,0.0,1
3,2.0,0
4,0.0,1
...,...,...
5879,2.0,0
5880,1.0,0
5881,1.0,0
5882,1.0,0


### additional_info

In [28]:
# Additional-info pipeline (part 1): collapse infrequent values (tol=0.1) into "Other",
# then one-hot encode; categories unseen during fit are ignored at transform time.
info_pipe1 = Pipeline(steps=[
	("group", RareLabelEncoder(tol=0.1, n_categories=2, replace_with="Other")),
	("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

# Preview the one-hot columns on the training data.
info_pipe1.fit_transform(X_train.loc[:, ["additional_info"]])

Unnamed: 0,additional_info_In-flight meal not included,additional_info_No Info,additional_info_Other
0,0.0,1.0,0.0
1,0.0,1.0,0.0
2,0.0,1.0,0.0
3,0.0,1.0,0.0
4,0.0,1.0,0.0
...,...,...,...
5879,0.0,1.0,0.0
5880,0.0,1.0,0.0
5881,0.0,1.0,0.0
5882,0.0,1.0,0.0


In [29]:
def have_info(X):
	"""Turn ``additional_info`` into a 0/1 flag: 1 unless the value is "No Info".

	Parameters
	----------
	X : pandas.DataFrame
		Frame containing an ``additional_info`` column.

	Returns
	-------
	pandas.DataFrame
		Copy of ``X`` in which ``additional_info`` is overwritten (same column
		name) by the integer flag.
	"""
	carries_info = X.additional_info.ne("No Info")
	return X.assign(additional_info=carries_info.astype(int))

In [30]:
# Combine the one-hot columns (part1) with the 0/1 "has info" flag (part2).
info_union = FeatureUnion(transformer_list=[
	("part1", info_pipe1),
	("part2", FunctionTransformer(func=have_info))
])

In [31]:
# Full additional-info transformer: replace missing values with the literal "unknown",
# then build the feature union.
# NOTE(review): "unknown" differs from "No Info", so have_info flags imputed rows as 1 —
# confirm this is intended.
info_transformer = Pipeline(steps=[
	("imputer", SimpleImputer(strategy="constant", fill_value="unknown")),
	("union", info_union)
])

# Preview the additional-info features on the training data.
info_transformer.fit_transform(X_train.loc[:, ["additional_info"]])

Unnamed: 0,additional_info_In-flight meal not included,additional_info_No Info,additional_info_Other,additional_info
0,0.0,1.0,0.0,0
1,0.0,1.0,0.0,0
2,0.0,1.0,0.0,0
3,0.0,1.0,0.0,0
4,0.0,1.0,0.0,0
...,...,...,...,...
5879,0.0,1.0,0.0,0
5880,0.0,1.0,0.0,0
5881,0.0,1.0,0.0,0
5882,0.0,1.0,0.0,0


## Column Transformer

In [32]:
# Route each raw column (or column pair) to its dedicated transformer. Output columns are
# prefixed with the transformer name, e.g. "air__", "doj__".
column_transformer = ColumnTransformer(transformers=[
	("air", air_transformer, ["airline"]),
	("doj", doj_transformer, ["date_of_journey"]),
	("location", location_transformer, ["source", 'destination']),
	("time", time_transformer, ["dep_time", "arrival_time"]),
	("dur", duration_transformer, ["duration"]),
	("stops", total_stops_transformer, ["total_stops"]),
	("info", info_transformer, ["additional_info"])
	# remainder="passthrough": any column not listed above is kept unchanged.
], remainder="passthrough")

# y_train is required because the location transformer contains a target-based encoder.
column_transformer.fit_transform(X_train, y_train)


Unnamed: 0,air__airline_Air India,air__airline_Indigo,air__airline_Jet Airways,air__airline_Multiple Carriers,air__airline_Other,doj__date_of_journey_month,doj__date_of_journey_week,doj__date_of_journey_day_of_week,doj__date_of_journey_day_of_year,location__source,location__destination,location__source_is_north,location__destination_is_north,time__dep_time_hour,time__dep_time_minute,time__arrival_time_hour,time__arrival_time_minute,time__dep_time_part_of_day,time__arrival_time_part_of_day,dur__duration_rbf_25,dur__duration_rbf_50,dur__duration_rbf_75,dur__duration_cat,dur__duration_over_1000,dur__duration,stops__total_stops,stops__is_direct_flight,info__additional_info_In-flight meal not included,info__additional_info_No Info,info__additional_info_Other,info__additional_info
0,0.0,1.0,0.0,0.0,0.0,1.000000,1.000000,0.500000,1.000000,-1.884167,-0.777489,0,1,0.565217,0.272727,0.652174,0.636364,0.000000,0.000000,-0.357292,-0.108617,-0.081637,0.0,0,-0.985940,0.0,1,0.0,1.0,0.0,0
1,0.0,0.0,1.0,0.0,0.0,0.666667,0.705882,0.666667,0.711864,-0.238282,-0.244421,1,0,0.869565,0.454545,1.000000,0.636364,0.119718,1.000000,-0.357292,-0.108617,-0.081637,2.0,1,2.015767,1.0,0,0.0,1.0,0.0,0
2,0.0,1.0,0.0,0.0,0.0,0.666667,0.764706,0.000000,0.737288,-0.839641,-1.834243,0,1,0.434783,0.181818,0.565217,0.000000,1.000000,0.000000,3.148349,-0.108617,-0.081637,0.0,0,-0.925503,0.0,1,0.0,1.0,0.0,0
3,0.0,0.0,1.0,0.0,0.0,1.000000,1.000000,0.500000,1.000000,1.054867,1.052894,1,0,0.217391,0.545455,0.826087,0.000000,1.000000,0.628707,-0.357292,-0.108617,-0.081637,2.0,0,0.363821,2.0,0,0.0,1.0,0.0,0
4,0.0,0.0,0.0,0.0,1.0,0.666667,0.529412,0.666667,0.533898,-0.238282,-0.244421,1,0,0.260870,1.000000,0.391304,0.545455,1.000000,0.843416,-0.357292,-0.108617,-0.081637,0.0,0,-0.955722,0.0,1,0.0,1.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5879,1.0,0.0,0.0,0.0,0.0,0.666667,0.647059,0.833333,0.661017,-0.238282,-0.244421,1,0,0.217391,0.909091,0.869565,0.454545,1.000000,1.000000,-0.357292,-0.108617,-0.081637,2.0,0,0.494768,2.0,0,0.0,1.0,0.0,0
5880,0.0,0.0,1.0,0.0,0.0,0.666667,0.705882,0.666667,0.711864,-0.238282,-0.244421,1,0,0.869565,0.454545,0.913043,0.090909,0.119718,1.000000,-0.357292,-0.108617,-0.081637,2.0,1,1.713582,1.0,0,0.0,1.0,0.0,0
5881,0.0,1.0,0.0,0.0,0.0,0.666667,0.647059,0.833333,0.661017,1.054867,1.052894,1,0,0.739130,0.181818,0.043478,0.545455,0.206237,1.000000,-0.357292,9.247984,-0.081637,2.0,0,-0.260696,1.0,0,0.0,1.0,0.0,0
5882,0.0,0.0,0.0,1.0,0.0,1.000000,0.823529,0.500000,0.822034,1.054867,1.052894,1,0,0.565217,0.000000,0.913043,0.000000,0.000000,1.000000,-0.357292,-0.108617,-0.081637,2.0,0,-0.300987,1.0,0,0.0,1.0,0.0,0


## Feature Selection

In [33]:
# Small, seeded random forest used only to score features during selection.
estimator = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42)

# Feature selector driven by single-feature performance: each feature is scored on its
# own with the estimator above using r2, and threshold=0.1 is the selection cut-off.
selector = SelectBySingleFeaturePerformance(
	estimator=estimator,
	scoring="r2",
	threshold=0.1
) 

In [34]:
# Final preprocessor: build all engineered features, then keep only those retained by
# the single-feature-performance selector.
preprocessor = Pipeline(steps=[
	("ct", column_transformer),
	("selector", selector)
])

# Fit on the training data and display the selected features.
preprocessor.fit_transform(X_train, y_train)

Unnamed: 0,air__airline_Indigo,air__airline_Jet Airways,air__airline_Other,doj__date_of_journey_week,doj__date_of_journey_day_of_year,location__source,location__destination,dur__duration_rbf_25,dur__duration_cat,dur__duration_over_1000,dur__duration,stops__total_stops,stops__is_direct_flight
0,1.0,0.0,0.0,1.000000,1.000000,-1.884167,-0.777489,-0.357292,0.0,0,-0.985940,0.0,1
1,0.0,1.0,0.0,0.705882,0.711864,-0.238282,-0.244421,-0.357292,2.0,1,2.015767,1.0,0
2,1.0,0.0,0.0,0.764706,0.737288,-0.839641,-1.834243,3.148349,0.0,0,-0.925503,0.0,1
3,0.0,1.0,0.0,1.000000,1.000000,1.054867,1.052894,-0.357292,2.0,0,0.363821,2.0,0
4,0.0,0.0,1.0,0.529412,0.533898,-0.238282,-0.244421,-0.357292,0.0,0,-0.955722,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5879,0.0,0.0,0.0,0.647059,0.661017,-0.238282,-0.244421,-0.357292,2.0,0,0.494768,2.0,0
5880,0.0,1.0,0.0,0.705882,0.711864,-0.238282,-0.244421,-0.357292,2.0,1,1.713582,1.0,0
5881,1.0,0.0,0.0,0.647059,0.661017,1.054867,1.052894,-0.357292,2.0,0,-0.260696,1.0,0
5882,0.0,0.0,0.0,0.823529,0.822034,1.054867,1.052894,-0.357292,2.0,0,-0.300987,1.0,0
