In [1]:
# !pip install feature_engine

In [84]:
import os

import numpy as np
import pandas as pd
import sklearn

from sklearn.metrics.pairwise import rbf_kernel
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import (
    Pipeline,
    FeatureUnion
)
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    OneHotEncoder,
    MinMaxScaler,
    StandardScaler,
    PowerTransformer,
    FunctionTransformer
)
from sklearn.compose import ColumnTransformer

from feature_engine.outliers import Winsorizer
from feature_engine.encoding import (
    RareLabelEncoder,
    MeanEncoder,
    CountFrequencyEncoder
)
from feature_engine.datetime import DatetimeFeatures

## 2. Display Settings

In [3]:
pd.set_option("display.max_columns", None)

In [4]:
sklearn.set_config(transform_output="pandas")

In [5]:
# Silence warnings for the rest of the session.
# NOTE(review): "ignore" with no category filters every warning class, so
# deprecation notices from the libraries used below are hidden as well.
import warnings
warnings.filterwarnings("ignore")

## 3. Read the Data

In [6]:
# Location of the training split. Set the FLIGHT_TRAIN_CSV environment variable to
# point at the file on another machine; when it is unset the original local path is
# used, so existing runs behave exactly as before.
path = os.environ.get(
    "FLIGHT_TRAIN_CSV",
    r"C:\Users\skaur\OneDrive\Desktop\Flight Price Prediction Dataset\data\train.csv",
)

In [7]:
train = pd.read_csv(path)
train

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Indigo,2019-06-24,Banglore,Delhi,16:55:00,19:55:00,180,0.0,No Info,4823
1,Spicejet,2019-06-09,Delhi,Cochin,06:05:00,11:20:00,315,1.0,No Info,5583
2,Indigo,2019-06-15,Delhi,Cochin,05:05:00,16:10:00,665,1.0,No Info,6818
3,Jet Airways,2019-05-01,Kolkata,Banglore,14:05:00,23:35:00,570,1.0,No Info,14781
4,Jet Airways,2019-03-24,Kolkata,Banglore,21:10:00,20:40:00,1410,1.0,No Info,13759
...,...,...,...,...,...,...,...,...,...,...
1275,Jet Airways,2019-05-06,Kolkata,Banglore,20:00:00,08:15:00,735,1.0,No Info,14388
1276,Jet Airways,2019-05-01,Mumbai,Hyderabad,19:35:00,21:05:00,90,0.0,In-flight meal not included,2228
1277,Jet Airways,2019-06-03,Delhi,Cochin,13:00:00,19:00:00,360,1.0,No Info,14714
1278,Multiple Carriers,2019-05-18,Delhi,Cochin,08:30:00,19:15:00,645,1.0,No Info,7887


In [8]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1280 entries, 0 to 1279
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   airline          1280 non-null   object 
 1   date_of_journey  1280 non-null   object 
 2   source           1280 non-null   object 
 3   destination      1280 non-null   object 
 4   dep_time         1280 non-null   object 
 5   arrival_time     1280 non-null   object 
 6   duration         1280 non-null   int64  
 7   total_stops      1280 non-null   float64
 8   additional_info  1280 non-null   object 
 9   price            1280 non-null   int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 100.1+ KB


In [9]:
X_train = train.drop(columns="price")
y_train = train.price.copy()

In [10]:
X_train

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info
0,Indigo,2019-06-24,Banglore,Delhi,16:55:00,19:55:00,180,0.0,No Info
1,Spicejet,2019-06-09,Delhi,Cochin,06:05:00,11:20:00,315,1.0,No Info
2,Indigo,2019-06-15,Delhi,Cochin,05:05:00,16:10:00,665,1.0,No Info
3,Jet Airways,2019-05-01,Kolkata,Banglore,14:05:00,23:35:00,570,1.0,No Info
4,Jet Airways,2019-03-24,Kolkata,Banglore,21:10:00,20:40:00,1410,1.0,No Info
...,...,...,...,...,...,...,...,...,...
1275,Jet Airways,2019-05-06,Kolkata,Banglore,20:00:00,08:15:00,735,1.0,No Info
1276,Jet Airways,2019-05-01,Mumbai,Hyderabad,19:35:00,21:05:00,90,0.0,In-flight meal not included
1277,Jet Airways,2019-06-03,Delhi,Cochin,13:00:00,19:00:00,360,1.0,No Info
1278,Multiple Carriers,2019-05-18,Delhi,Cochin,08:30:00,19:15:00,645,1.0,No Info


In [11]:
y_train

0        4823
1        5583
2        6818
3       14781
4       13759
        ...  
1275    14388
1276     2228
1277    14714
1278     7887
1279     6427
Name: price, Length: 1280, dtype: int64

## 4. Transformation Operations

### Airline

In [12]:
X_train.airline

0                  Indigo
1                Spicejet
2                  Indigo
3             Jet Airways
4             Jet Airways
              ...        
1275          Jet Airways
1276          Jet Airways
1277          Jet Airways
1278    Multiple Carriers
1279    Multiple Carriers
Name: airline, Length: 1280, dtype: object

In [13]:
# Airline: fill missing values with the most frequent airline, fold infrequent
# airlines into "Other" (RareLabelEncoder, tol=0.1), then one-hot encode.
# handle_unknown="ignore" keeps transform from failing on airlines unseen during fit.
air_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("grouper", RareLabelEncoder(tol=0.1, replace_with="Other", n_categories=2)),
    ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore"))
])

# Preview the encoded airline columns on the training data.
air_transformer.fit_transform(X_train.loc[:, ["airline"]])

Unnamed: 0,airline_Air India,airline_Indigo,airline_Jet Airways,airline_Multiple Carriers,airline_Other
0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0
2,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...
1275,0.0,0.0,1.0,0.0,0.0
1276,0.0,0.0,1.0,0.0,0.0
1277,0.0,0.0,1.0,0.0,0.0
1278,0.0,0.0,0.0,1.0,0.0


### Date of Journey

In [24]:
X_train.date_of_journey

0       2019-06-24
1       2019-06-09
2       2019-06-15
3       2019-05-01
4       2019-03-24
           ...    
1275    2019-05-06
1276    2019-05-01
1277    2019-06-03
1278    2019-05-18
1279    2019-03-27
Name: date_of_journey, Length: 1280, dtype: object

In [21]:
# Calendar components to derive from date_of_journey.
feature_to_extract = ["month", "week", "day_of_week", "day_of_year"]

# Extract the calendar components, then min-max scale each of them.
doj_transformer = Pipeline(steps=[
    ("dt", DatetimeFeatures(features_to_extract=feature_to_extract, yearfirst=True, format="mixed")),
    ("minMaxScaler", MinMaxScaler())
])

# Preview the scaled date features on the training data.
doj_transformer.fit_transform(X_train.loc[:, ['date_of_journey']])

Unnamed: 0,date_of_journey_month,date_of_journey_week,date_of_journey_day_of_week,date_of_journey_day_of_year
0,1.000000,1.000000,0.000000,0.974576
1,1.000000,0.823529,1.000000,0.847458
2,1.000000,0.882353,0.833333,0.898305
3,0.666667,0.529412,0.333333,0.516949
4,0.000000,0.176471,1.000000,0.194915
...,...,...,...,...
1275,0.666667,0.588235,0.000000,0.559322
1276,0.666667,0.529412,0.333333,0.516949
1277,1.000000,0.823529,0.000000,0.796610
1278,0.666667,0.647059,0.833333,0.661017


### Source & Destination

In [25]:
X_train.source

0       Banglore
1          Delhi
2          Delhi
3        Kolkata
4        Kolkata
          ...   
1275     Kolkata
1276      Mumbai
1277       Delhi
1278       Delhi
1279       Delhi
Name: source, Length: 1280, dtype: object

In [26]:
X_train.destination

0           Delhi
1          Cochin
2          Cochin
3        Banglore
4        Banglore
          ...    
1275     Banglore
1276    Hyderabad
1277       Cochin
1278       Cochin
1279       Cochin
Name: destination, Length: 1280, dtype: object

In [27]:
temp_subset = X_train.loc[:, ["source", "destination"]]

In [28]:
temp_subset

Unnamed: 0,source,destination
0,Banglore,Delhi
1,Delhi,Cochin
2,Delhi,Cochin
3,Kolkata,Banglore
4,Kolkata,Banglore
...,...,...
1275,Kolkata,Banglore
1276,Mumbai,Hyderabad
1277,Delhi,Cochin
1278,Delhi,Cochin


In [34]:
# Source/destination: group infrequent cities as "others", encode each city with
# MeanEncoder, then apply PowerTransformer with its default settings.
loc_pipe = Pipeline(steps=[
    ("grouper", RareLabelEncoder(tol=0.1, replace_with="others", n_categories=2)),
    ("encoder", MeanEncoder()),
    ("scaler", PowerTransformer())
])

# MeanEncoder is a target encoder, so the target is passed alongside the features.
loc_pipe.fit_transform(temp_subset, y_train)

Unnamed: 0,source,destination
0,-1.008705,-1.796318
1,1.044760,1.042077
2,1.044760,1.042077
3,-0.110600,-0.157115
4,-0.110600,-0.157115
...,...,...
1275,-0.110600,-0.157115
1276,-1.792716,-0.811326
1277,1.044760,1.042077
1278,1.044760,1.042077


In [37]:
def is_north(X, north_cities=("Delhi", "New Delhi", "Kolkata", "Mumbai")):
    """Flag, for every column of ``X``, whether the city is one of ``north_cities``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose columns hold city names (used here for ``source`` and
        ``destination``).
    north_cities : iterable of str, optional
        Cities that count as "north". The default reproduces the list that was
        previously hard-coded, so ``is_north(X)`` returns the same result as
        before. A different grouping can be supplied through
        ``FunctionTransformer(is_north, kw_args={"north_cities": [...]})``.
        NOTE(review): the default includes Kolkata and Mumbai - confirm that
        this grouping is intended.

    Returns
    -------
    pandas.DataFrame
        Same index as ``X``, with one integer 0/1 column named
        ``<column>_is_north`` per input column (the original columns are not
        returned).
    """
    return (
        X
        # Element-wise membership test over the whole frame at once.
        .isin(list(north_cities))
        .astype(int)
        .add_suffix("_is_north")
    )

FunctionTransformer(func=is_north).fit_transform(temp_subset)

Unnamed: 0,source_is_north,destination_is_north
0,0,1
1,1,0
2,1,0
3,1,0
4,1,0
...,...,...
1275,1,0
1276,1,0
1277,1,0
1278,1,0


In [38]:
# Place the encoded location columns (loc_pipe) and the binary is_north flags
# side by side in one output frame.
loction_transformer = FeatureUnion(transformer_list=[
    ("transform1", loc_pipe),
    ("transform2", FunctionTransformer(is_north))
])

In [39]:
loction_transformer.fit_transform(temp_subset, y_train)

Unnamed: 0,source,destination,source_is_north,destination_is_north
0,-1.008705,-1.796318,0,1
1,1.044760,1.042077,1,0
2,1.044760,1.042077,1,0
3,-0.110600,-0.157115,1,0
4,-0.110600,-0.157115,1,0
...,...,...,...,...
1275,-0.110600,-0.157115,1,0
1276,-1.792716,-0.811326,1,0
1277,1.044760,1.042077,1,0
1278,1.044760,1.042077,1,0


### Departure Time & Arrival Time

In [43]:
time_subset = X_train.loc[:, ["dep_time", "arrival_time"]]
time_subset

Unnamed: 0,dep_time,arrival_time
0,16:55:00,19:55:00
1,06:05:00,11:20:00
2,05:05:00,16:10:00
3,14:05:00,23:35:00
4,21:10:00,20:40:00
...,...,...
1275,20:00:00,08:15:00
1276,19:35:00,21:05:00
1277,13:00:00,19:00:00
1278,08:30:00,19:15:00


In [45]:
# Numeric view of the clock times: extract hour and minute from each time
# column, then min-max scale them.
time_pipeline = Pipeline(steps=[
    ("dt", DatetimeFeatures(features_to_extract=["hour", "minute"])),
    ("scaler", MinMaxScaler())
])
time_pipeline.fit_transform(time_subset)

Unnamed: 0,dep_time_hour,dep_time_minute,arrival_time_hour,arrival_time_minute
0,0.695652,1.000000,0.826087,1.000000
1,0.260870,0.090909,0.478261,0.363636
2,0.217391,0.090909,0.695652,0.181818
3,0.608696,0.090909,1.000000,0.636364
4,0.913043,0.181818,0.869565,0.727273
...,...,...,...,...
1275,0.869565,0.000000,0.347826,0.272727
1276,0.826087,0.636364,0.913043,0.090909
1277,0.565217,0.000000,0.826087,0.000000
1278,0.347826,0.545455,0.826087,0.272727


In [49]:
def part_of_day(X, morning=4, afternoon=12, evening=16, night=20):
    """Bucket every time column of ``X`` into a named part of the day.

    Each column is parsed with ``pd.to_datetime`` and reduced to its hour.
    Hours in ``[morning, afternoon)`` map to "morning", ``[afternoon, evening)``
    to "afternoon", ``[evening, night)`` to "evening", and every other hour
    to "night".

    Returns a frame with the same index as ``X`` holding one
    ``<column>_part_of_the_day`` column per input column; the input columns
    themselves are not returned.
    """
    labels = ["morning", "afternoon", "evening"]
    # Half-open [start, stop) hour windows, listed in the same order as `labels`.
    windows = [(morning, afternoon), (afternoon, evening), (evening, night)]

    buckets = {}
    for name in X.columns.to_list():
        hours = pd.to_datetime(X.loc[:, name]).dt.hour
        in_window = [
            hours.between(start, stop, inclusive="left")
            for start, stop in windows
        ]
        # Hours outside every window fall through to the default label.
        buckets[f"{name}_part_of_the_day"] = np.select(in_window, labels, default="night")

    return pd.DataFrame(buckets, index=X.index)

FunctionTransformer(func=part_of_day).fit_transform(time_subset)

Unnamed: 0,dep_time_part_of_the_day,arrival_time_part_of_the_day
0,evening,evening
1,morning,morning
2,morning,evening
3,afternoon,night
4,night,night
...,...,...
1275,night,morning
1276,evening,night
1277,afternoon,evening
1278,morning,evening


In [53]:
# Categorical view of the clock times: bucket each time into a part of the day,
# encode the buckets with CountFrequencyEncoder (default settings), then
# min-max scale the encoded values.
time_pipeline2 = Pipeline(steps=[
    ("part", FunctionTransformer(func=part_of_day)),
    ("encoder", CountFrequencyEncoder()),
    ("scaler", MinMaxScaler())
])

time_pipeline2.fit_transform(time_subset)

Unnamed: 0,dep_time_part_of_the_day,arrival_time_part_of_the_day
0,0.254587,0.519048
1,1.000000,1.000000
2,1.000000,0.519048
3,0.000000,0.919048
4,0.176606,0.919048
...,...,...
1275,0.176606,1.000000
1276,0.254587,0.919048
1277,0.000000,0.519048
1278,1.000000,0.519048


In [54]:
# Combine both views of the time columns: scaled hour/minute values (time_pipeline)
# and the encoded part-of-day buckets (time_pipeline2).
time_transformer = FeatureUnion(transformer_list=[
    ("part1", time_pipeline),
    ("part2", time_pipeline2)
])

time_transformer.fit_transform(time_subset)

Unnamed: 0,dep_time_hour,dep_time_minute,arrival_time_hour,arrival_time_minute,dep_time_part_of_the_day,arrival_time_part_of_the_day
0,0.695652,1.000000,0.826087,1.000000,0.254587,0.519048
1,0.260870,0.090909,0.478261,0.363636,1.000000,1.000000
2,0.217391,0.090909,0.695652,0.181818,1.000000,0.519048
3,0.608696,0.090909,1.000000,0.636364,0.000000,0.919048
4,0.913043,0.181818,0.869565,0.727273,0.176606,0.919048
...,...,...,...,...,...,...
1275,0.869565,0.000000,0.347826,0.272727,0.176606,1.000000
1276,0.826087,0.636364,0.913043,0.090909,0.254587,0.919048
1277,0.565217,0.000000,0.826087,0.000000,0.000000,0.519048
1278,0.347826,0.545455,0.826087,0.272727,1.000000,0.519048


### Duration

In [56]:
X_train.duration

0        180
1        315
2        665
3        570
4       1410
        ... 
1275     735
1276      90
1277     360
1278     645
1279     620
Name: duration, Length: 1280, dtype: int64

In [68]:
(
    X_train
    .duration
    .quantile([0.25, 0.5, 0.75])
    .values
    .reshape(-1,1)
)

array([[170.],
       [520.],
       [925.]])

In [79]:
class RBFPercentileSimilarity(BaseEstimator, TransformerMixin):
    """RBF-kernel similarity of each selected column to its own training percentiles.

    During ``fit`` the requested percentiles of every selected column are stored
    as reference values. ``transform`` then returns, per column and percentile,
    ``rbf_kernel(column, reference, gamma)``.

    Parameters
    ----------
    variables : str, list of str or None, default=None
        Columns to process. When empty/None, every numeric column of the frame
        passed to ``fit`` is used.
    percentiles : list of float, default=[0.25, 0.5, 0.75]
        Quantiles (as fractions) used as reference points.
    gamma : float, default=0.1
        Kernel coefficient forwarded to ``rbf_kernel``. It is applied to the
        columns in their raw units, so for wide-ranged columns (e.g. a duration
        in minutes) the similarities collapse to ~0 unless gamma is lowered or
        the column is scaled first.

    Attributes
    ----------
    variables_ : list of str
        Columns actually used, resolved during ``fit``.
    reference_values_ : dict
        Maps each column name to an array of shape (n_percentiles, 1) holding
        its reference values.
    """

    # The list default is only read, never mutated, so sharing it is harmless.
    def __init__(self, variables=None, percentiles=[0.25, 0.5, 0.75], gamma=0.1):
        self.variables = variables
        self.percentiles = percentiles
        self.gamma = gamma

    def fit(self, X, y=None):
        """Learn the reference percentile values of every selected column."""
        # Resolve the columns into a fitted attribute instead of overwriting the
        # constructor parameter: get_params()/clone() keep returning what the user
        # passed, and re-fitting on another frame re-selects its numeric columns.
        if not self.variables:
            self.variables_ = X.select_dtypes(include="number").columns.to_list()
        elif isinstance(self.variables, str):
            # A bare column name would otherwise be iterated character by character.
            self.variables_ = [self.variables]
        else:
            self.variables_ = list(self.variables)

        self.reference_values_ = {
            col: (
                X
                .loc[:, col]
                .quantile(self.percentiles)
                .values
                .reshape(-1, 1)
            )
            for col in self.variables_
        }
        return self

    def transform(self, X):
        """Return one ``<col>_rbf_<percentile>`` similarity column per reference value."""
        objects = []
        for col in self.variables_:
            # round() rather than int(): 100 * 0.29 is 28.999..., which int() would
            # truncate to 28 and mislabel the column.
            columns = [f"{col}_rbf_{round(100 * percentile)}" for percentile in self.percentiles]
            obj = pd.DataFrame(
                data=rbf_kernel(X.loc[:, [col]], Y=self.reference_values_[col], gamma=self.gamma),
                columns=columns,
                # Keep the caller's index; a fresh RangeIndex would misalign rows when
                # this output is concatenated with other pandas outputs.
                index=X.index,
            )
            objects.append(obj)

        if not objects:
            # Nothing to transform: return an empty frame aligned with X instead of
            # letting pd.concat raise on an empty list.
            return pd.DataFrame(index=X.index)
        return pd.concat(objects, axis=1)

In [81]:
RBFPercentileSimilarity().fit_transform(X_train)

Unnamed: 0,duration_rbf_25,duration_rbf_50,duration_rbf_75,total_stops_rbf_25,total_stops_rbf_50,total_stops_rbf_75
0,4.539993e-05,0.000000e+00,0.0,1.000000,0.904837,0.904837
1,0.000000e+00,0.000000e+00,0.0,0.904837,1.000000,1.000000
2,0.000000e+00,0.000000e+00,0.0,0.904837,1.000000,1.000000
3,0.000000e+00,2.669190e-109,0.0,0.904837,1.000000,1.000000
4,0.000000e+00,0.000000e+00,0.0,0.904837,1.000000,1.000000
...,...,...,...,...,...,...
1275,0.000000e+00,0.000000e+00,0.0,0.904837,1.000000,1.000000
1276,1.125982e-278,0.000000e+00,0.0,1.000000,0.904837,0.904837
1277,0.000000e+00,0.000000e+00,0.0,0.904837,1.000000,1.000000
1278,0.000000e+00,0.000000e+00,0.0,0.904837,1.000000,1.000000


In [83]:
# RBF similarity to the duration percentiles, followed by a power transform.
# NOTE(review): with the default gamma=0.1 the RBF values displayed above for
# duration are ~0 in almost every row, so these features carry little signal
# as configured - consider tuning gamma or scaling duration first.
duration_pipe1 = Pipeline(steps=[
    ("rbf", RBFPercentileSimilarity()),
    ("scaler", PowerTransformer())
])

# Side by side: the RBF similarity features plus the standardized duration itself.
duration_union = FeatureUnion(transformer_list=[
    ("part1", duration_pipe1),
    ("part2", StandardScaler())
])

In [86]:
# Duration: cap outliers with the IQR rule (fold=1.5, Winsorizer's default tail
# setting), fill missing values with the median, then build the duration features.
duration_transformer = Pipeline(steps=[
    ("outliers", Winsorizer(capping_method="iqr", fold=1.5)),
    ("imputer", SimpleImputer(strategy="median")),
    ("union", duration_union)
])

duration_transformer.fit_transform(X_train.loc[:, ["duration"]])

Unnamed: 0,duration_rbf_25,duration_rbf_50,duration_rbf_75,duration
0,-0.385534,-0.126228,-0.075415,-0.908830
1,-0.388253,-0.126228,-0.075415,-0.637656
2,-0.388253,-0.126228,-0.075415,0.065389
3,-0.388253,-0.126228,-0.075415,-0.125438
4,-0.388253,-0.126228,-0.075415,1.561869
...,...,...,...,...
1275,-0.388253,-0.126228,-0.075415,0.205997
1276,-0.388253,-0.126228,-0.075415,-1.089613
1277,-0.388253,-0.126228,-0.075415,-0.547264
1278,-0.388253,-0.126228,-0.075415,0.025215


### Total Stops

In [90]:
X_train.total_stops

0       0.0
1       1.0
2       1.0
3       1.0
4       1.0
       ... 
1275    1.0
1276    0.0
1277    1.0
1278    1.0
1279    1.0
Name: total_stops, Length: 1280, dtype: float64

### Additional Info

## 5. Column Transformation

In [87]:
# Route each raw column (or column group) to its dedicated transformer.
# Columns not listed here are forwarded unchanged by remainder="passthrough";
# in the output below those are total_stops and additional_info.
column_transform = ColumnTransformer(transformers=[
    ("air", air_transformer, ["airline"]),
    ("doj", doj_transformer, ["date_of_journey"]),
    ("location", loction_transformer, ["source", "destination"]),
    ("time", time_transformer, ["dep_time", "arrival_time"]),
    ("dur", duration_transformer, ["duration"])
], remainder="passthrough")

# y_train is passed because the location branch contains a target encoder (MeanEncoder).
column_transform.fit_transform(X_train, y_train)

Unnamed: 0,air__airline_Air India,air__airline_Indigo,air__airline_Jet Airways,air__airline_Multiple Carriers,air__airline_Other,doj__date_of_journey_month,doj__date_of_journey_week,doj__date_of_journey_day_of_week,doj__date_of_journey_day_of_year,location__source,location__destination,location__source_is_north,location__destination_is_north,time__dep_time_hour,time__dep_time_minute,time__arrival_time_hour,time__arrival_time_minute,time__dep_time_part_of_the_day,time__arrival_time_part_of_the_day,dur__duration_rbf_25,dur__duration_rbf_50,dur__duration_rbf_75,dur__duration,remainder__total_stops,remainder__additional_info
0,0.0,1.0,0.0,0.0,0.0,1.000000,1.000000,0.000000,0.974576,-1.008705,-1.796318,0,1,0.695652,1.000000,0.826087,1.000000,0.254587,0.519048,-0.385534,-0.126228,-0.075415,-0.908830,0.0,No Info
1,0.0,0.0,0.0,0.0,1.0,1.000000,0.823529,1.000000,0.847458,1.044760,1.042077,1,0,0.260870,0.090909,0.478261,0.363636,1.000000,1.000000,-0.388253,-0.126228,-0.075415,-0.637656,1.0,No Info
2,0.0,1.0,0.0,0.0,0.0,1.000000,0.882353,0.833333,0.898305,1.044760,1.042077,1,0,0.217391,0.090909,0.695652,0.181818,1.000000,0.519048,-0.388253,-0.126228,-0.075415,0.065389,1.0,No Info
3,0.0,0.0,1.0,0.0,0.0,0.666667,0.529412,0.333333,0.516949,-0.110600,-0.157115,1,0,0.608696,0.090909,1.000000,0.636364,0.000000,0.919048,-0.388253,-0.126228,-0.075415,-0.125438,1.0,No Info
4,0.0,0.0,1.0,0.0,0.0,0.000000,0.176471,1.000000,0.194915,-0.110600,-0.157115,1,0,0.913043,0.181818,0.869565,0.727273,0.176606,0.919048,-0.388253,-0.126228,-0.075415,1.561869,1.0,No Info
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1275,0.0,0.0,1.0,0.0,0.0,0.666667,0.588235,0.000000,0.559322,-0.110600,-0.157115,1,0,0.869565,0.000000,0.347826,0.272727,0.176606,1.000000,-0.388253,-0.126228,-0.075415,0.205997,1.0,No Info
1276,0.0,0.0,1.0,0.0,0.0,0.666667,0.529412,0.333333,0.516949,-1.792716,-0.811326,1,0,0.826087,0.636364,0.913043,0.090909,0.254587,0.919048,-0.388253,-0.126228,-0.075415,-1.089613,0.0,In-flight meal not included
1277,0.0,0.0,1.0,0.0,0.0,1.000000,0.823529,0.000000,0.796610,1.044760,1.042077,1,0,0.565217,0.000000,0.826087,0.000000,0.000000,0.519048,-0.388253,-0.126228,-0.075415,-0.547264,1.0,No Info
1278,0.0,0.0,0.0,1.0,0.0,0.666667,0.647059,0.833333,0.661017,1.044760,1.042077,1,0,0.347826,0.545455,0.826087,0.272727,1.000000,0.519048,-0.388253,-0.126228,-0.075415,0.025215,1.0,No Info
