## 1. Import Libraries

In [145]:
import os
import warnings
from pathlib import Path

import numpy as np
import pandas as pd

import sklearn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.pipeline import (
    Pipeline,
    FeatureUnion
)
from sklearn.preprocessing import (
    OneHotEncoder,
    MinMaxScaler,
    PowerTransformer,
    FunctionTransformer,
    StandardScaler,
    OrdinalEncoder
)

from feature_engine.datetime import DatetimeFeatures
from feature_engine.encoding import (
    RareLabelEncoder,
    MeanEncoder,
    CountFrequencyEncoder
)
from feature_engine.outliers import Winsorizer
from feature_engine.selection import SelectBySingleFeaturePerformance

## 2. Display Settings

In [3]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option("display.max_columns", None)

In [4]:
# Ask scikit-learn transformers to return pandas DataFrames; the custom
# functions below index their input with `.loc` / `.columns`, so they rely on this.
sklearn.set_config(transform_output="pandas")

In [5]:
# Silences ALL warnings for the rest of the session.
# NOTE(review): this also hides deprecation and date-parsing warnings —
# consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

## 3. Read the Data

In [6]:
# Folder holding the data splits. Set the FLIGHT_FARE_DATA_DIR environment
# variable to run the notebook on another machine; the fallback is the
# original author's local folder so existing behaviour is unchanged.
DATA_DIR = Path(
    os.environ.get(
        "FLIGHT_FARE_DATA_DIR",
        r"C:\Users\Aayush\Desktop\Flight Fare Prediction\data",
    )
)
path = DATA_DIR / "train.csv"

train = pd.read_csv(path)
train

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-06-09,Kolkata,Banglore,21:10:00,18:15:00,1265,1.0,No Info,14571
1,Jet Airways,2019-03-18,Banglore,New Delhi,18:40:00,07:40:00,780,1.0,In-flight meal not included,10972
2,Jet Airways,2019-05-24,Mumbai,Hyderabad,02:55:00,04:20:00,85,0.0,In-flight meal not included,4995
3,Indigo,2019-06-24,Banglore,Delhi,07:10:00,10:05:00,175,0.0,No Info,4823
4,Jet Airways,2019-05-09,Delhi,Cochin,20:55:00,19:00:00,1325,1.0,In-flight meal not included,12373
...,...,...,...,...,...,...,...,...,...,...
635,Indigo,2019-03-27,Delhi,Cochin,06:40:00,12:00:00,320,1.0,No Info,5298
636,Multiple Carriers,2019-05-15,Delhi,Cochin,08:45:00,21:00:00,735,1.0,No Info,7670
637,Multiple Carriers,2019-05-09,Delhi,Cochin,17:00:00,01:30:00,510,1.0,No Info,9424
638,Indigo,2019-05-09,Mumbai,Hyderabad,19:05:00,20:35:00,90,0.0,No Info,4392


In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 640 entries, 0 to 639
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   airline          640 non-null    object 
 1   date_of_journey  640 non-null    object 
 2   source           640 non-null    object 
 3   destination      640 non-null    object 
 4   dep_time         640 non-null    object 
 5   arrival_time     640 non-null    object 
 6   duration         640 non-null    int64  
 7   total_stops      640 non-null    float64
 8   additional_info  640 non-null    object 
 9   price            640 non-null    int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 50.1+ KB


In [8]:
# Separate the features from the target column (`price`).
X_train = train.drop(columns="price")
# `.copy()` so the target is an independent Series, not a view into `train`.
y_train = train.price.copy()

## 4. Transformation Operations

### 4.1 airline

In [9]:
X_train.airline

0            Jet Airways
1            Jet Airways
2            Jet Airways
3                 Indigo
4            Jet Airways
             ...        
635               Indigo
636    Multiple Carriers
637    Multiple Carriers
638               Indigo
639             Spicejet
Name: airline, Length: 640, dtype: object

In [10]:
# airline: fill missing values with the most frequent airline, relabel
# infrequent airlines as "Other" (RareLabelEncoder, tol=0.1), then one-hot
# encode. `handle_unknown="ignore"` keeps unseen airlines from raising at
# transform time.
air_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("grouper", RareLabelEncoder(tol=0.1, replace_with="Other", n_categories=2)),
        ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore"))
    ]
)

# Sanity check on the single column this transformer will receive.
air_transformer.fit_transform(X_train.loc[:, ["airline"]])

Unnamed: 0,airline_Air India,airline_Indigo,airline_Jet Airways,airline_Multiple Carriers,airline_Other
0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...
635,0.0,1.0,0.0,0.0,0.0
636,0.0,0.0,0.0,1.0,0.0
637,0.0,0.0,0.0,1.0,0.0
638,0.0,1.0,0.0,0.0,0.0


### 4.2 date_of_journey

In [11]:
X_train.date_of_journey

0      2019-06-09
1      2019-03-18
2      2019-05-24
3      2019-06-24
4      2019-05-09
          ...    
635    2019-03-27
636    2019-05-15
637    2019-05-09
638    2019-05-09
639    2019-06-01
Name: date_of_journey, Length: 640, dtype: object

In [12]:
# Calendar components to derive from the journey date.
feature_to_extract = ["month", "week", "day_of_week", "day_of_year"]

# date_of_journey: extract the components above from the date strings, then
# rescale each extracted column with MinMaxScaler.
doj_transformer = Pipeline(
    steps=[
        ("dt", DatetimeFeatures(features_to_extract=feature_to_extract, yearfirst=True, format="mixed")),
        ("scaler", MinMaxScaler())
    ]
)

# Sanity check on the single column this transformer will receive.
doj_transformer.fit_transform(X_train.loc[:, ["date_of_journey"]])

Unnamed: 0,date_of_journey_month,date_of_journey_week,date_of_journey_day_of_week,date_of_journey_day_of_year
0,1.000000,0.823529,1.000000,0.847458
1,0.000000,0.176471,0.000000,0.144068
2,0.666667,0.705882,0.666667,0.711864
3,1.000000,1.000000,0.000000,0.974576
4,0.666667,0.588235,0.500000,0.584746
...,...,...,...,...
635,0.000000,0.235294,0.333333,0.220339
636,0.666667,0.647059,0.333333,0.635593
637,0.666667,0.588235,0.500000,0.584746
638,0.666667,0.588235,0.500000,0.584746


### 4.3 source & destination

In [13]:
X_train.source

0       Kolkata
1      Banglore
2        Mumbai
3      Banglore
4         Delhi
         ...   
635       Delhi
636       Delhi
637       Delhi
638      Mumbai
639      Mumbai
Name: source, Length: 640, dtype: object

In [14]:
X_train.destination

0       Banglore
1      New Delhi
2      Hyderabad
3          Delhi
4         Cochin
         ...    
635       Cochin
636       Cochin
637       Cochin
638    Hyderabad
639    Hyderabad
Name: destination, Length: 640, dtype: object

In [15]:
location_subset = X_train.loc[:, ["source", "destination"]]
location_subset

Unnamed: 0,source,destination
0,Kolkata,Banglore
1,Banglore,New Delhi
2,Mumbai,Hyderabad
3,Banglore,Delhi
4,Delhi,Cochin
...,...,...
635,Delhi,Cochin
636,Delhi,Cochin
637,Delhi,Cochin
638,Mumbai,Hyderabad


In [16]:
# source/destination, part 1: relabel infrequent cities as "Other", replace
# each city by a target-based mean encoding, then apply a power transform.
location_pipe1 = Pipeline(steps=[
    ("grouper", RareLabelEncoder(tol=0.1, replace_with="Other", n_categories=2)),
    ("encoder", MeanEncoder()),
    ("scaler", PowerTransformer())
])

# y_train is passed because the MeanEncoder step encodes with the target.
location_pipe1.fit_transform(location_subset, y_train)

Unnamed: 0,source,destination
0,-0.271179,-0.253274
1,-0.903923,-0.982970
2,-1.796730,-0.982970
3,-0.903923,-1.789680
4,1.056385,1.056945
...,...,...
635,1.056385,1.056945
636,1.056385,1.056945
637,1.056385,1.056945
638,-1.796730,-0.982970


In [17]:
np.union1d(X_train.source, X_train.destination)

array(['Banglore', 'Chennai', 'Cochin', 'Delhi', 'Hyderabad', 'Kolkata',
       'Mumbai', 'New Delhi'], dtype=object)

In [18]:
def is_north(X):
    """Flag, for every column of ``X``, whether its city is in the north list.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose columns hold city names (here ``source`` and
        ``destination``).

    Returns
    -------
    pandas.DataFrame
        One integer ``<col>_is_north`` column (1 if the city is listed in
        ``north_cities``, else 0) per input column. The input columns
        themselves are dropped.
    """
    columns = X.columns.to_list()
    # Hard-coded list of cities treated as "north" by this feature.
    north_cities = ["Delhi", "Kolkata", "New Delhi", "Mumbai"]

    return (
        X
        .assign(**{
            f"{col}_is_north": X.loc[:, col].isin(north_cities).astype(int)
            for col in columns
        })
        # Drop exactly the columns that were passed in, rather than a
        # hard-coded ["source", "destination"] pair, so the function also
        # works on a subset or on differently named city columns.
        .drop(columns=columns)
    )

FunctionTransformer(func=is_north).fit_transform(location_subset)

Unnamed: 0,source_is_north,destination_is_north
0,1,0
1,0,1
2,1,0
3,0,1
4,1,0
...,...,...
635,1,0
636,1,0
637,1,0
638,1,0


In [19]:
# Final source/destination transformer: the encoded city columns (part1)
# side by side with the is_north flags (part2).
location_transformer = FeatureUnion(transformer_list=[
    ("part1", location_pipe1),
    ("part2", FunctionTransformer(func=is_north))
])
# y_train is forwarded to the parts; part1 contains the target-based encoder.
location_transformer.fit_transform(location_subset, y_train)

Unnamed: 0,source,destination,source_is_north,destination_is_north
0,-0.271179,-0.253274,1,0
1,-0.903923,-0.982970,0,1
2,-1.796730,-0.982970,1,0
3,-0.903923,-1.789680,0,1
4,1.056385,1.056945,1,0
...,...,...,...,...
635,1.056385,1.056945,1,0
636,1.056385,1.056945,1,0
637,1.056385,1.056945,1,0
638,-1.796730,-0.982970,1,0


### 4.4 dep_time & arrival_time

In [20]:
X_train.dep_time

0      21:10:00
1      18:40:00
2      02:55:00
3      07:10:00
4      20:55:00
         ...   
635    06:40:00
636    08:45:00
637    17:00:00
638    19:05:00
639    22:45:00
Name: dep_time, Length: 640, dtype: object

In [21]:
X_train.arrival_time

0      18:15:00
1      07:40:00
2      04:20:00
3      10:05:00
4      19:00:00
         ...   
635    12:00:00
636    21:00:00
637    01:30:00
638    20:35:00
639    00:15:00
Name: arrival_time, Length: 640, dtype: object

In [22]:
time_subset = X_train.loc[:, ["dep_time", "arrival_time"]]
time_subset

Unnamed: 0,dep_time,arrival_time
0,21:10:00,18:15:00
1,18:40:00,07:40:00
2,02:55:00,04:20:00
3,07:10:00,10:05:00
4,20:55:00,19:00:00
...,...,...
635,06:40:00,12:00:00
636,08:45:00,21:00:00
637,17:00:00,01:30:00
638,19:05:00,20:35:00


In [23]:
# dep_time/arrival_time, part 1: extract hour and minute from each time
# column, then rescale the extracted columns with MinMaxScaler.
time_pipe1 = Pipeline(steps=[
    ("dt", DatetimeFeatures(features_to_extract=["hour", "minute"])),
    ("scaler", MinMaxScaler())
])

# Sanity check on the two time columns.
time_pipe1.fit_transform(time_subset)

Unnamed: 0,dep_time_hour,dep_time_minute,arrival_time_hour,arrival_time_minute
0,0.913043,0.181818,0.782609,0.272727
1,0.782609,0.727273,0.304348,0.727273
2,0.086957,1.000000,0.173913,0.363636
3,0.304348,0.181818,0.434783,0.090909
4,0.869565,1.000000,0.826087,0.000000
...,...,...,...,...
635,0.260870,0.727273,0.521739,0.000000
636,0.347826,0.818182,0.913043,0.000000
637,0.739130,0.000000,0.043478,0.545455
638,0.826087,0.090909,0.869565,0.636364


In [24]:
def part_of_day(X, morning=4, noon=12, eve=16, night=20):
    """Bucket each time column of ``X`` into a part-of-day label.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose columns are all parseable by ``pd.to_datetime``.
    morning, noon, eve, night : int
        Hour boundaries. Hours in ``[morning, noon)`` are "morning",
        ``[noon, eve)`` "afternoon", ``[eve, night)`` "evening"; every other
        hour is "night".

    Returns
    -------
    pandas.DataFrame
        One ``<col>_part_of_day`` column per input column, on the same index
        as ``X``. The input columns are dropped.
    """
    columns = X.columns.to_list()
    # Replace every time column by its hour of day.
    X_temp = X.assign(**{
        col: pd.to_datetime(X.loc[:, col]).dt.hour
        for col in columns
    })

    return (
        X_temp
        .assign(**{
            f"{col}_part_of_day": pd.Series(
                np.select(
                    [X_temp.loc[:, col].between(morning, noon, inclusive="left"),
                     X_temp.loc[:, col].between(noon, eve, inclusive="left"),
                     X_temp.loc[:, col].between(eve, night, inclusive="left")],
                    ["morning",
                     "afternoon",
                     "evening"],
                    default="night"
                ),
                # np.select returns a bare array; without the frame's index
                # the Series gets a 0..n-1 index and `.assign` aligns it to
                # NaN whenever X is not RangeIndex-ed (e.g. after a split).
                index=X_temp.index,
            )
            for col in columns
        })
        .drop(columns=columns)
    )

FunctionTransformer(func=part_of_day).fit_transform(time_subset)

Unnamed: 0,dep_time_part_of_day,arrival_time_part_of_day
0,night,evening
1,evening,morning
2,night,morning
3,morning,morning
4,night,evening
...,...,...
635,morning,afternoon
636,morning,night
637,evening,night
638,evening,night


In [25]:
# dep_time/arrival_time, part 2: label each time with its part of day,
# encode the labels with CountFrequencyEncoder, then rescale with MinMaxScaler.
time_pipe2 = Pipeline(steps=[
    ("part", FunctionTransformer(func=part_of_day)),
    ("encoder", CountFrequencyEncoder()),
    ("scaler", MinMaxScaler())
    
])

# Sanity check on the two time columns.
time_pipe2.fit_transform(time_subset)

Unnamed: 0,dep_time_part_of_day,arrival_time_part_of_day
0,0.102439,0.765217
1,0.185366,1.000000
2,0.102439,1.000000
3,1.000000,1.000000
4,0.102439,0.765217
...,...,...
635,1.000000,0.000000
636,1.000000,0.773913
637,0.185366,0.773913
638,0.185366,0.773913


In [26]:
# Final time transformer: hour/minute features (part1) side by side with
# the encoded part-of-day features (part2).
time_transformer = FeatureUnion(transformer_list=[
    ("part1", time_pipe1),
    ("part2", time_pipe2)
])

time_transformer.fit_transform(time_subset)

Unnamed: 0,dep_time_hour,dep_time_minute,arrival_time_hour,arrival_time_minute,dep_time_part_of_day,arrival_time_part_of_day
0,0.913043,0.181818,0.782609,0.272727,0.102439,0.765217
1,0.782609,0.727273,0.304348,0.727273,0.185366,1.000000
2,0.086957,1.000000,0.173913,0.363636,0.102439,1.000000
3,0.304348,0.181818,0.434783,0.090909,1.000000,1.000000
4,0.869565,1.000000,0.826087,0.000000,0.102439,0.765217
...,...,...,...,...,...,...
635,0.260870,0.727273,0.521739,0.000000,1.000000,0.000000
636,0.347826,0.818182,0.913043,0.000000,1.000000,0.773913
637,0.739130,0.000000,0.043478,0.545455,0.185366,0.773913
638,0.826087,0.090909,0.869565,0.636364,0.185366,0.773913


### 4.5 duration

In [29]:
train.loc[:, "duration"]

0      1265
1       780
2        85
3       175
4      1325
       ... 
635     320
636     735
637     510
638      90
639      90
Name: duration, Length: 640, dtype: int64

In [32]:
def dur_cat(X, short=180, medium=480):
    """Replace ``duration`` in place by a "short" / "medium" / "long" label.

    This helper was referenced by the cell but defined nowhere in the
    notebook, so the cell failed with NameError on a fresh kernel.
    NOTE(review): the thresholds are reconstructed so that the labels match
    the output previously displayed for this cell — confirm they are the
    intended ones.
    """
    duration = X.loc[:, "duration"]
    return X.assign(
        # np.select returns a plain array, so assignment is positional and
        # independent of X's index.
        duration=np.select(
            [duration.lt(short),
             duration.between(short, medium, inclusive="left")],
            ["short", "medium"],
            default="long"
        )
    )


# Exploratory check: bucket the duration, then map the labels to 0/1/2 in
# the explicit short < medium < long order.
dur_pipe1 = Pipeline(steps=[
    ("cat_encoder", FunctionTransformer(func=dur_cat)),
    ("ord_encoder", OrdinalEncoder(categories=[["short", "medium", "long"]]))
])
dur_pipe1.fit_transform(train.loc[:, ["duration"]])

Unnamed: 0,duration
0,2.0
1,2.0
2,0.0
3,0.0
4,2.0
...,...
635,1.0
636,2.0
637,2.0
638,0.0


In [35]:
(
    train
    .duration
    .quantile([0.25, 0.5, .75])
    .values
    .reshape(-1, 1)
)

array([[170. ],
       [522.5],
       [935. ]])

In [56]:
class RBFPercentileSimilarity(BaseEstimator, TransformerMixin):
    """RBF-kernel similarity of numeric columns to their own percentiles.

    For each selected column, ``fit`` stores the column's values at the
    requested ``percentiles``; ``transform`` returns, for every row, the
    ``rbf_kernel`` similarity between the row's value and each stored
    reference value.

    Parameters
    ----------
    variables : list of str, str or None, default=None
        Columns to transform. When empty/None, every numeric column of the
        frame seen in ``fit`` is used.
    percentiles : list of float, default=[0.25, 0.5, 0.75]
        Quantiles (0-1) used as reference points. The default list is never
        mutated by this class.
    gamma : float, default=0.1
        ``gamma`` forwarded to ``rbf_kernel``.

    Attributes
    ----------
    variables_ : list of str
        Columns resolved during ``fit``.
    refrence_values_ : dict
        Maps each column to an ``(n_percentiles, 1)`` array of reference
        values. (The attribute name keeps its original spelling.)
    """

    def __init__(self, variables=None, percentiles=[0.25, 0.5, 0.75], gamma=0.1):
        # Constructor arguments are stored untouched so that get_params /
        # clone see exactly what the caller passed.
        self.variables = variables
        self.percentiles = percentiles
        self.gamma = gamma

    def fit(self, X, y=None):
        """Learn the reference (percentile) values of each selected column."""
        # Resolve the columns into a fitted attribute instead of overwriting
        # the `variables` parameter: overwriting it made a second fit on a
        # different frame silently reuse the columns of the first one.
        if not self.variables:
            self.variables_ = X.select_dtypes(include="number").columns.to_list()
        elif isinstance(self.variables, str):
            self.variables_ = [self.variables]
        else:
            self.variables_ = list(self.variables)

        self.refrence_values_ = {
            col: (
                X
                .loc[:, col]
                .quantile(self.percentiles)
                .values
                .reshape(-1, 1)
            )
            for col in self.variables_
        }
        return self

    def transform(self, X):
        """Return one ``<col>_rbf_<pct>`` column per column/percentile pair."""
        objects = []
        for col in self.variables_:
            # round() rather than int(): int(0.29 * 100) truncates to 28.
            columns = [
                f"{col}_rbf_{int(round(percentile * 100))}"
                for percentile in self.percentiles
            ]
            obj = pd.DataFrame(
                data=rbf_kernel(X.loc[:, [col]], self.refrence_values_[col], gamma=self.gamma),
                columns=columns,
                # Keep the caller's index so the output lines up with the
                # other branches when concatenated in a FeatureUnion.
                index=X.index,
            )
            objects.append(obj)
        return pd.concat(objects, axis=1)
            
            

In [57]:
RBFPercentileSimilarity(percentiles=[0.4, 0.8]).fit_transform(X_train)

Unnamed: 0,duration_rbf_40,duration_rbf_80,total_stops_rbf_40,total_stops_rbf_80
0,0.000000e+00,0.0,1.000000,1.000000
1,0.000000e+00,0.0,1.000000,1.000000
2,0.000000e+00,0.0,0.904837,0.904837
3,0.000000e+00,0.0,0.904837,0.904837
4,0.000000e+00,0.0,1.000000,1.000000
...,...,...,...,...
635,1.015500e-122,0.0,1.000000,1.000000
636,0.000000e+00,0.0,1.000000,1.000000
637,0.000000e+00,0.0,1.000000,1.000000
638,0.000000e+00,0.0,0.904837,0.904837


In [123]:
def duration_cat(X, short=180, medium=480):
    """Bucket ``duration`` into "short" / "medium" / "long".

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing a numeric ``duration`` column.
    short, medium : int
        Thresholds: ``duration < short`` is "short", ``[short, medium)`` is
        "medium", everything else is "long".

    Returns
    -------
    pandas.DataFrame
        ``X`` with a new ``duration_categories`` column and without the
        ``duration`` column, on the same index as ``X``.
    """
    duration = X.loc[:, "duration"]
    return (
        X
        .assign(
            duration_categories=pd.Series(
                np.select(
                    [duration.lt(short),
                     duration.between(short, medium, inclusive="left")],
                    ["short", "medium"],
                    default="long"
                ),
                # Without the index the Series is 0..n-1 and `.assign`
                # aligns it to NaN on any frame that is not RangeIndex-ed.
                index=X.index,
            )
        )
        .drop(columns="duration")
    )


In [124]:
def is_over(X, value=1000):
    """Replace ``duration`` by a 0/1 flag telling whether it is >= ``value``.

    The flag column is named ``duration_over_<value>``; the ``duration``
    column is removed from the returned frame.
    """
    flag_name = f"duration_over_{value}"
    reaches_value = X.duration.ge(value).astype(int)
    flagged = X.assign(**{flag_name: reaches_value})
    return flagged.drop(columns="duration")

In [125]:
# Branch 1: RBF similarity of duration to its 25th/50th/75th percentiles
# (the class defaults), followed by a power transform.
# NOTE(review): with the default gamma=0.1 on raw duration values, the RBF
# outputs displayed in this notebook are almost all 0 — confirm gamma.
duration_pipe1 = Pipeline(steps=[
    ("rbf", RBFPercentileSimilarity()),
    ("scaler", PowerTransformer())
])

# Branch 2: short/medium/long bucket, encoded as 0/1/2 in that explicit order.
duration_pipe2 = Pipeline(steps=[
    ("cat", FunctionTransformer(func=duration_cat)),
    ("encoder", OrdinalEncoder(categories=[["short", "medium", "long"]]))
])

# All duration features side by side: RBF similarities, ordinal bucket,
# the "over 1000" flag (is_over default) and the standardized raw duration.
duration_union = FeatureUnion(transformer_list=[
    ("part1", duration_pipe1),
    ("part2", duration_pipe2),
    ("part3", FunctionTransformer(func=is_over)),
    ("part4", StandardScaler())
])

# Cap outliers (IQR rule, fold=1.5) and impute with the median BEFORE the
# union, so every branch sees the same cleaned duration column.
duration_transformer = Pipeline(steps=[
    ("outliers", Winsorizer(capping_method="iqr", fold=1.5)),
    ("imputer", SimpleImputer(strategy="median")),
    ("union", duration_union)
])

# Sanity check on the single column this transformer will receive.
duration_transformer.fit_transform(X_train.loc[:, ["duration"]])

Unnamed: 0,duration_rbf_25,duration_rbf_50,duration_rbf_75,duration_categories,duration_over_1000,duration
0,-0.356024,-0.097284,-0.080778,2.0,1,1.231011
1,-0.356024,-0.097284,-0.080778,2.0,0,0.271615
2,-0.356024,-0.097284,-0.080778,0.0,0,-1.103190
3,2.406182,-0.097284,-0.080778,0.0,0,-0.925157
4,-0.356024,-0.097284,-0.080778,2.0,1,1.349699
...,...,...,...,...,...,...
635,-0.356024,-0.097284,-0.080778,1.0,0,-0.638328
636,-0.356024,-0.097284,-0.080778,2.0,0,0.182599
637,-0.356024,-0.096862,-0.080778,2.0,0,-0.262482
638,-0.356024,-0.097284,-0.080778,0.0,0,-1.093299


### 4.6 total_stops

In [130]:
X_train.total_stops

0      1.0
1      1.0
2      0.0
3      0.0
4      1.0
      ... 
635    1.0
636    1.0
637    1.0
638    0.0
639    0.0
Name: total_stops, Length: 640, dtype: float64

In [133]:
def is_direct(X):
    """Append ``is_direct_flight``: 1 where ``total_stops`` equals 0, else 0."""
    direct_flag = (X["total_stops"] == 0).astype(int)
    return X.assign(is_direct_flight=direct_flag)

# total_stops: fill missing values with the most frequent stop count, then
# add the is_direct_flight flag next to the (kept) total_stops column.
total_stops_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("is_dir", FunctionTransformer(func=is_direct))
])
# Sanity check on the single column this transformer will receive.
total_stops_transformer.fit_transform(X_train.loc[:, ["total_stops"]])

Unnamed: 0,total_stops,is_direct_flight
0,1.0,0
1,1.0,0
2,0.0,1
3,0.0,1
4,1.0,0
...,...,...
635,1.0,0
636,1.0,0
637,1.0,0
638,0.0,1


### 4.7 additional_info

In [134]:
X_train.additional_info

0                           No Info
1       In-flight meal not included
2       In-flight meal not included
3                           No Info
4       In-flight meal not included
                   ...             
635                         No Info
636                         No Info
637                         No Info
638                         No Info
639    No check-in baggage included
Name: additional_info, Length: 640, dtype: object

In [135]:
# additional_info, part 1: relabel infrequent values as "Other", then
# one-hot encode; unseen values at transform time are ignored, not errors.
info_pipe1 = Pipeline(steps=[
    ("add", RareLabelEncoder(tol=0.1, n_categories=2, replace_with="Other")),
    ("encode", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

# Sanity check on the single column this pipeline will receive.
info_pipe1.fit_transform(X_train.loc[:, ["additional_info"]])

Unnamed: 0,additional_info_In-flight meal not included,additional_info_No Info,additional_info_Other
0,0.0,1.0,0.0
1,1.0,0.0,0.0
2,1.0,0.0,0.0
3,0.0,1.0,0.0
4,1.0,0.0,0.0
...,...,...,...
635,0.0,1.0,0.0
636,0.0,1.0,0.0
637,0.0,1.0,0.0
638,0.0,1.0,0.0


In [137]:
def has_info(X):
    """Overwrite ``additional_info`` with 1 where it differs from "No Info", else 0."""
    info_flag = (X["additional_info"] != "No Info").astype(int)
    return X.assign(additional_info=info_flag)

In [139]:
# additional_info features side by side: one-hot columns (part1) and the
# binary "has any info" flag (part2), which keeps the name `additional_info`.
info_union = FeatureUnion(transformer_list=[
    ("part1", info_pipe1),
    ("part2", FunctionTransformer(func=has_info))
])

In [140]:
# additional_info: fill missing values with the literal "unknown" before
# building the union of features.
# NOTE(review): has_info flags everything that is not "No Info", so imputed
# "unknown" rows are counted as having info — confirm this is intended.
info_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="unknown")),
    ("union", info_union)
])
# Sanity check on the single column this transformer will receive.
info_transformer.fit_transform(X_train.loc[:, ["additional_info"]])

Unnamed: 0,additional_info_In-flight meal not included,additional_info_No Info,additional_info_Other,additional_info
0,0.0,1.0,0.0,0
1,1.0,0.0,0.0,1
2,1.0,0.0,0.0,1
3,0.0,1.0,0.0,0
4,1.0,0.0,0.0,1
...,...,...,...,...
635,0.0,1.0,0.0,0
636,0.0,1.0,0.0,0
637,0.0,1.0,0.0,0
638,0.0,1.0,0.0,0


## 5. Column Transformer

In [151]:
# Route each raw column (or column pair) to its dedicated transformer.
# Output columns are prefixed with the transformer name (e.g. `air__`).
# Columns not listed here are passed through unchanged.
column_transformer = ColumnTransformer(transformers=[
    ("air", air_transformer, ["airline"]),
    ("doj", doj_transformer, ["date_of_journey"]),
    ("location", location_transformer, ["source",  "destination"]),
    ("time", time_transformer, ["dep_time", "arrival_time"]),
    ("dur", duration_transformer, ["duration"]),
    ("stops", total_stops_transformer, ["total_stops"]),
    ("info", info_transformer, ["additional_info"])
], remainder="passthrough")

# y_train is needed by the target-based encoder inside the location branch.
column_transformer.fit_transform(X_train, y_train)

Unnamed: 0,air__airline_Air India,air__airline_Indigo,air__airline_Jet Airways,air__airline_Multiple Carriers,air__airline_Other,doj__date_of_journey_month,doj__date_of_journey_week,doj__date_of_journey_day_of_week,doj__date_of_journey_day_of_year,location__source,location__destination,location__source_is_north,location__destination_is_north,time__dep_time_hour,time__dep_time_minute,time__arrival_time_hour,time__arrival_time_minute,time__dep_time_part_of_day,time__arrival_time_part_of_day,dur__duration_rbf_25,dur__duration_rbf_50,dur__duration_rbf_75,dur__duration_categories,dur__duration_over_1000,dur__duration,stops__total_stops,stops__is_direct_flight,info__additional_info_In-flight meal not included,info__additional_info_No Info,info__additional_info_Other,info__additional_info
0,0.0,0.0,1.0,0.0,0.0,1.000000,0.823529,1.000000,0.847458,-0.271179,-0.253274,1,0,0.913043,0.181818,0.782609,0.272727,0.102439,0.765217,-0.356024,-0.097284,-0.080778,2.0,1,1.231011,1.0,0,0.0,1.0,0.0,0
1,0.0,0.0,1.0,0.0,0.0,0.000000,0.176471,0.000000,0.144068,-0.903923,-0.982970,0,1,0.782609,0.727273,0.304348,0.727273,0.185366,1.000000,-0.356024,-0.097284,-0.080778,2.0,0,0.271615,1.0,0,1.0,0.0,0.0,1
2,0.0,0.0,1.0,0.0,0.0,0.666667,0.705882,0.666667,0.711864,-1.796730,-0.982970,1,0,0.086957,1.000000,0.173913,0.363636,0.102439,1.000000,-0.356024,-0.097284,-0.080778,0.0,0,-1.103190,0.0,1,1.0,0.0,0.0,1
3,0.0,1.0,0.0,0.0,0.0,1.000000,1.000000,0.000000,0.974576,-0.903923,-1.789680,0,1,0.304348,0.181818,0.434783,0.090909,1.000000,1.000000,2.406182,-0.097284,-0.080778,0.0,0,-0.925157,0.0,1,0.0,1.0,0.0,0
4,0.0,0.0,1.0,0.0,0.0,0.666667,0.588235,0.500000,0.584746,1.056385,1.056945,1,0,0.869565,1.000000,0.826087,0.000000,0.102439,0.765217,-0.356024,-0.097284,-0.080778,2.0,1,1.349699,1.0,0,1.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
635,0.0,1.0,0.0,0.0,0.0,0.000000,0.235294,0.333333,0.220339,1.056385,1.056945,1,0,0.260870,0.727273,0.521739,0.000000,1.000000,0.000000,-0.356024,-0.097284,-0.080778,1.0,0,-0.638328,1.0,0,0.0,1.0,0.0,0
636,0.0,0.0,0.0,1.0,0.0,0.666667,0.647059,0.333333,0.635593,1.056385,1.056945,1,0,0.347826,0.818182,0.913043,0.000000,1.000000,0.773913,-0.356024,-0.097284,-0.080778,2.0,0,0.182599,1.0,0,0.0,1.0,0.0,0
637,0.0,0.0,0.0,1.0,0.0,0.666667,0.588235,0.500000,0.584746,1.056385,1.056945,1,0,0.739130,0.000000,0.043478,0.545455,0.185366,0.773913,-0.356024,-0.096862,-0.080778,2.0,0,-0.262482,1.0,0,0.0,1.0,0.0,0
638,0.0,1.0,0.0,0.0,0.0,0.666667,0.588235,0.500000,0.584746,-1.796730,-0.982970,1,0,0.826087,0.090909,0.869565,0.636364,0.185366,0.773913,-0.356024,-0.097284,-0.080778,0.0,0,-1.093299,0.0,1,0.0,1.0,0.0,0


## 5.1 Feature Selection

In [168]:
# Small, shallow forest with a fixed seed, used only to score features.
estimator = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=12)

# Score every feature on its own with the estimator above (R^2) and keep
# those that meet the 0.1 threshold.
selector = SelectBySingleFeaturePerformance(
    estimator=estimator,
    scoring="r2",
    threshold=0.1
)

## 6. Putting Everything Together

In [169]:
# Full preprocessing: per-column feature engineering followed by the
# single-feature-performance selection step.
preprocessor = Pipeline(steps=[
    ("ct", column_transformer),
    ("selector", selector)
])

# Fits every step on the training data and shows the selected features.
preprocessor.fit_transform(X_train, y_train)

Unnamed: 0,air__airline_Indigo,doj__date_of_journey_week,doj__date_of_journey_day_of_year,location__source,location__destination,dur__duration_categories,dur__duration,stops__total_stops,stops__is_direct_flight
0,0.0,0.823529,0.847458,-0.271179,-0.253274,2.0,1.231011,1.0,0
1,0.0,0.176471,0.144068,-0.903923,-0.982970,2.0,0.271615,1.0,0
2,0.0,0.705882,0.711864,-1.796730,-0.982970,0.0,-1.103190,0.0,1
3,1.0,1.000000,0.974576,-0.903923,-1.789680,0.0,-0.925157,0.0,1
4,0.0,0.588235,0.584746,1.056385,1.056945,2.0,1.349699,1.0,0
...,...,...,...,...,...,...,...,...,...
635,1.0,0.235294,0.220339,1.056385,1.056945,1.0,-0.638328,1.0,0
636,0.0,0.647059,0.635593,1.056385,1.056945,2.0,0.182599,1.0,0
637,0.0,0.588235,0.584746,1.056385,1.056945,2.0,-0.262482,1.0,0
638,1.0,0.588235,0.584746,-1.796730,-0.982970,0.0,-1.093299,0.0,1


## 7. Visualizations

In [170]:
# Per-feature scores recorded by the fitted selector step
# (feature name -> score); used for the visualizations in this section.
feature_performances = preprocessor.named_steps["selector"].feature_performance_
feature_performances

{'air__airline_Air India': -0.01131189859565048,
 'air__airline_Indigo': 0.10683204097535622,
 'air__airline_Jet Airways': 0.0916219398103002,
 'air__airline_Multiple Carriers': 0.060633751572740535,
 'air__airline_Other': 0.09102715187007682,
 'doj__date_of_journey_month': 0.03083361195961561,
 'doj__date_of_journey_week': 0.10616222052042106,
 'doj__date_of_journey_day_of_week': -0.02683340786536559,
 'doj__date_of_journey_day_of_year': 0.10501545587301904,
 'location__source': 0.12192967969333779,
 'location__destination': 0.11444276240007045,
 'location__source_is_north': 0.021782575267516813,
 'location__destination_is_north': 0.021782575267516813,
 'time__dep_time_hour': 0.01784987502679621,
 'time__dep_time_minute': 0.035877132781345,
 'time__arrival_time_hour': 0.040111582359424434,
 'time__arrival_time_minute': 0.009352293943437581,
 'time__dep_time_part_of_day': -0.0029410755073061936,
 'time__arrival_time_part_of_day': 0.009803341658818724,
 'dur__duration_rbf_25': 0.0839571