# Part 1: Feature Engineering

# What is Feature Engineering?
* `Feature Engineering` is the process of creating new features from the existing features.
* The primary aim is to create such features that can boost the performance of the model.
* This is totally experimental and depends on the creativity of the programmer.

# Data Preprocessing
`Data Preprocessing` is the process of converting the dataset into a form suitable for the model.

In [1]:
import sys

print(sys.executable)

c:\Users\jugal\AppData\Local\Programs\Python\Python311\python.exe


In [2]:
import os
from pathlib import Path

import cloudpickle
import numpy as np
import pandas as pd
import sklearn
from IPython.display import display

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import PowerTransformer

from feature_engine.datetime import DatetimeFeatures
from feature_engine.encoding import MeanEncoder
from feature_engine.encoding import RareLabelEncoder
from feature_engine.outliers import Winsorizer
from feature_engine.selection import SelectBySingleFeaturePerformance


# Display settings
pd.set_option("display.max_columns", None)
sklearn.set_config(transform_output = 'pandas') # To display sklearn outputs as pandas DataFrames

# 1. Reading the training data

In [3]:
# The dataset folder can be overridden with the FLIGHT_DATA_DIR environment variable so the
# notebook is not tied to one machine; the original local folder remains the default.
DATA_DIR = Path(os.environ.get("FLIGHT_DATA_DIR", r"D:\Nikss\Projects\Flight Price Prediction\Datasets"))
path = DATA_DIR / "train_data.csv"
df = pd.read_csv(path)

df.head()

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Spicejet,2019-05-01,Banglore,Delhi,21:10:00,00:05:00,175,0.0,No Info,4991
1,Vistara,2019-06-01,Chennai,Kolkata,17:45:00,20:05:00,140,0.0,No Info,11982
2,Jet Airways,2019-06-09,Delhi,Cochin,17:30:00,12:35:00,1145,1.0,In-flight meal not included,10262
3,Air India,2019-05-18,Kolkata,Banglore,12:00:00,18:30:00,1830,2.0,No Info,10361
4,Jet Airways,2019-03-24,Mumbai,Hyderabad,15:50:00,17:20:00,90,0.0,In-flight meal not included,2228


In [4]:
def check_missing(df, drop=None):
    """Display a per-column missing-value summary and optionally drop incomplete rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified in place.
    drop : bool or None, default None
        ``True`` drops every row that contains a missing value, ``False`` keeps all rows.
        ``None`` keeps the interactive behaviour and asks through ``input()``; pass an
        explicit value when the notebook has to run unattended.

    Returns
    -------
    pandas.DataFrame
        A new frame without the incomplete rows when dropping was requested,
        otherwise `df` itself.
    """
    # Count the nulls once and derive the percentage from the same Series so both
    # columns of the summary share one ordering.
    total = df.isnull().sum().sort_values(ascending = False)
    percent = total / len(df) * 100
    missing_data = pd.concat([total, percent], axis = 1, keys = ['Total', 'Percent'])

    display(missing_data)

    if drop is None:
        user_input = input("Do you want to drop rows with missing values? (y/n): ")
        drop = user_input.lower() == 'y'

    if drop:
        # Return a new frame instead of mutating the caller's object.
        df = df.dropna()
        print()

        print("Rows with missing values dropped.")
    return df


In [5]:
df = check_missing(df)

Unnamed: 0,Total,Percent
total_stops,1,0.013656
airline,0,0.0
date_of_journey,0,0.0
source,0,0.0
destination,0,0.0
dep_time,0,0.0
arrival_time,0,0.0
duration,0,0.0
additional_info,0,0.0
price,0,0.0



Rows with missing values dropped.


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7322 entries, 0 to 7322
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   airline          7322 non-null   object 
 1   date_of_journey  7322 non-null   object 
 2   source           7322 non-null   object 
 3   destination      7322 non-null   object 
 4   dep_time         7322 non-null   object 
 5   arrival_time     7322 non-null   object 
 6   duration         7322 non-null   int64  
 7   total_stops      7322 non-null   float64
 8   additional_info  7322 non-null   object 
 9   price            7322 non-null   int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 629.2+ KB


In [7]:
# Splitting into x (predictors) and y (target)
y = df['price'].copy()
x = df.drop(columns = ['price'])

# 2. Transformation Operations (Column wise)

In [8]:
x.columns.to_list()

['airline',
 'date_of_journey',
 'source',
 'destination',
 'dep_time',
 'arrival_time',
 'duration',
 'total_stops',
 'additional_info']

# 2.1 Airline

* Impute missing values  
* Group Rare Categories:
    * Some categories may have very few occurrences.
    * Group these into a single category called 'Other' to avoid overfitting.
* One-Hot Encoding

In [9]:
# Airline branch: impute with the most frequent value, group rare labels under "other",
# then one-hot encode.
airline_imputer = SimpleImputer(strategy = 'most_frequent')
airline_rare_grouper = RareLabelEncoder(tol = 0.1, replace_with = "other", n_categories = 2)
airline_ohe = OneHotEncoder(sparse_output = False, handle_unknown = 'ignore')

airline_pipeline = Pipeline(
    steps = [
        ('inputing', airline_imputer),
        ('rare_label_grouping', airline_rare_grouper),
        ('ohe', airline_ohe),
    ]
)

# Preview the encoded output on the training column
airline_pipeline.fit_transform(x[['airline']])

Unnamed: 0,airline_Air India,airline_Indigo,airline_Jet Airways,airline_Multiple Carriers,airline_other
0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,1.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...
7318,0.0,0.0,1.0,0.0,0.0
7319,0.0,1.0,0.0,0.0,0.0
7320,0.0,0.0,1.0,0.0,0.0
7321,0.0,0.0,0.0,1.0,0.0


# 2.2 date_of_journey
* Extract some date features
    


In [10]:
# Calendar features extracted from the journey date, followed by min-max scaling.
# The list has its own name because a later cell builds a different feature list for the
# time-of-day columns.
doj_features = ['month', 'week', 'day_of_week', 'day_of_month', 'weekend', 'month_start', 'month_end']

doj_steps = [
    ('extract_doj', DatetimeFeatures(features_to_extract = doj_features, format = "mixed", yearfirst = True)),
    ('scaling', MinMaxScaler())
]

doj_pipeline = Pipeline(steps = doj_steps)

doj_pipeline.fit_transform(x[['date_of_journey']])

Unnamed: 0,date_of_journey_month,date_of_journey_week,date_of_journey_day_of_week,date_of_journey_day_of_month,date_of_journey_weekend,date_of_journey_month_start,date_of_journey_month_end
0,0.666667,0.529412,0.333333,0.000000,0.0,1.0,0.0
1,1.000000,0.764706,0.833333,0.000000,1.0,1.0,0.0
2,1.000000,0.823529,1.000000,0.307692,1.0,0.0,0.0
3,0.666667,0.647059,0.833333,0.653846,1.0,0.0,0.0
4,0.000000,0.176471,1.000000,0.884615,1.0,0.0,0.0
...,...,...,...,...,...,...,...
7318,0.666667,0.647059,0.333333,0.538462,0.0,0.0,0.0
7319,0.666667,0.705882,0.166667,0.769231,0.0,0.0,0.0
7320,0.666667,0.529412,0.333333,0.000000,0.0,1.0,0.0
7321,1.000000,0.823529,0.000000,0.076923,0.0,0.0,0.0


# 2.3 `source` and `destination`: both columns hold the same kind of values.
* Group rare values
* Mean encoding
* Power transformer

In [11]:
from feature_engine.encoding import MeanEncoder
from sklearn.preprocessing import PowerTransformer

In [12]:
df[['source', 'destination']]

Unnamed: 0,source,destination
0,Banglore,Delhi
1,Chennai,Kolkata
2,Delhi,Cochin
3,Kolkata,Banglore
4,Mumbai,Hyderabad
...,...,...
7318,Kolkata,Banglore
7319,Delhi,Cochin
7320,Kolkata,Banglore
7321,Delhi,Cochin


In [13]:
# source/destination branch: group rare cities, mean-encode them, then apply a standardising
# power transform. MeanEncoder needs the target variable 'price', so y is passed when fitting.
source_dest_pipeline = Pipeline(steps = [
    ('rare_label_grouping', RareLabelEncoder(tol = 0.1, replace_with = "other", n_categories = 2)),
    ('mean_encoding', MeanEncoder()),
    ('power_transformer', PowerTransformer(standardize = True)),
])

source_dest_pipeline.fit_transform(x[['source', 'destination']], y)

Unnamed: 0,source,destination
0,-0.922934,-1.825340
1,-1.864720,-0.844206
2,1.047774,1.046261
3,-0.162824,-0.174735
4,-1.864720,-0.844206
...,...,...
7318,-0.162824,-0.174735
7319,1.047774,1.046261
7320,-0.162824,-0.174735
7321,1.047774,1.046261


# 2.4 `dep_time` and `arrival_time`: both columns hold the same kind of values.

In [14]:
time_columns = x[['dep_time', 'arrival_time']]

time_columns

Unnamed: 0,dep_time,arrival_time
0,21:10:00,00:05:00
1,17:45:00,20:05:00
2,17:30:00,12:35:00
3,12:00:00,18:30:00
4,15:50:00,17:20:00
...,...,...
7318,10:20:00,17:35:00
7319,10:35:00,21:00:00
7320,20:25:00,10:55:00
7321,07:00:00,21:00:00


In [15]:
def part_of_day(X, morning=4, noon=12, evening=16, night=20):
    """Label every column of `X` with the part of the day its hour falls in.

    Each column is parsed with ``pd.to_datetime`` and only the hour is used:

    * ``morning <= hour < noon``   -> 'morning'
    * ``noon <= hour < evening``   -> 'afternoon'
    * ``evening <= hour < night``  -> 'evening'
    * anything else                -> 'night'

    Values that cannot be parsed are coerced to NaT, match none of the ranges and
    therefore also end up labelled 'night'.

    Returns a DataFrame with the same index as `X` and one ``<col>_part_of_day``
    column per input column.
    """
    labelled = {}
    for col in X.columns:
        hour = pd.to_datetime(X[col], errors='coerce', format = 'mixed').dt.hour
        in_morning = ((hour >= morning) & (hour < noon)).to_numpy()
        in_afternoon = ((hour >= noon) & (hour < evening)).to_numpy()
        in_evening = ((hour >= evening) & (hour < night)).to_numpy()
        # The first matching condition wins; rows matching none fall back to 'night'.
        labelled[f"{col}_part_of_day"] = np.select(
            [in_morning, in_afternoon, in_evening],
            ['morning', 'afternoon', 'evening'],
            default='night',
        )
    return pd.DataFrame(labelled, index = X.index)


In [16]:
part_of_day(x[['dep_time', 'arrival_time']])

Unnamed: 0,dep_time_part_of_day,arrival_time_part_of_day
0,night,night
1,evening,night
2,evening,afternoon
3,afternoon,evening
4,afternoon,evening
...,...,...
7318,morning,evening
7319,morning,night
7320,night,morning
7321,morning,night


In [17]:
# Categorical view of the time columns: label each time with its part of the day, encode the
# labels in the order listed below, then min-max scale the codes.
day_part_order = ['morning', 'afternoon', 'evening', 'night']

time_pipe_steps1 = [
    ('func_trans', FunctionTransformer(func=part_of_day, validate=False)),
    # one category list per input column (dep_time, arrival_time)
    ('ordinal_encoder', OrdinalEncoder(categories=[day_part_order, day_part_order])),
    ('scaler', MinMaxScaler())
]

time_pipeline1 = Pipeline(steps=time_pipe_steps1)

time_pipeline1.fit_transform(time_columns)

Unnamed: 0,dep_time_part_of_day,arrival_time_part_of_day
0,1.000000,1.000000
1,0.666667,1.000000
2,0.666667,0.333333
3,0.333333,0.666667
4,0.333333,0.666667
...,...,...
7318,0.000000,0.666667
7319,0.000000,1.000000
7320,1.000000,0.000000
7321,0.000000,1.000000


In [18]:
# Numeric view of the time columns: extract hour and minute, then min-max scale them.
time_columns = x[['dep_time', 'arrival_time']]
features_to_extract = ['hour', 'minute']

hour_minute_extractor = DatetimeFeatures(features_to_extract = features_to_extract, format = 'mixed')
time_pipe_steps2 = [('datetime', hour_minute_extractor), ('scaler', MinMaxScaler())]
time_pipeline2 = Pipeline(steps = time_pipe_steps2)

time_pipeline2.fit_transform(time_columns)

Unnamed: 0,dep_time_hour,dep_time_minute,arrival_time_hour,arrival_time_minute
0,0.913043,0.181818,0.000000,0.090909
1,0.739130,0.818182,0.869565,0.090909
2,0.739130,0.545455,0.521739,0.636364
3,0.521739,0.000000,0.782609,0.545455
4,0.652174,0.909091,0.739130,0.363636
...,...,...,...,...
7318,0.434783,0.363636,0.739130,0.636364
7319,0.434783,0.636364,0.913043,0.000000
7320,0.869565,0.454545,0.434783,1.000000
7321,0.304348,0.000000,0.913043,0.000000


# 2.5 Duration

* Cap outliers with `Winsorizer` (IQR method), then apply `PowerTransformer`.
* `PowerTransformer` has a parameter `standardize`; if set to `True`, it also scales the output.


In [19]:
# Duration branch: cap outliers (Winsorizer, IQR method) before the standardising power transform.
duration_pipeline = Pipeline(steps = [
    ('capping', Winsorizer(capping_method = 'iqr')),
    ('power_transformer', PowerTransformer(standardize = True)),
])

duration_pipeline.fit_transform(x[['duration']])

Unnamed: 0,duration
0,-1.013801
1,-1.236654
2,1.088183
3,1.680771
4,-1.662039
...,...
7318,-0.046843
7319,0.364748
7320,0.754289
7321,0.712317


# 2.6 total_stops
* Custom Function: `is_direct_flight()` 
* MinMaxScaler

`MinMaxScaler` is applied directly because the data is already ordinal and numeric.

In [20]:
x['total_stops']

0       0.0
1       0.0
2       1.0
3       2.0
4       0.0
       ... 
7318    1.0
7319    1.0
7320    1.0
7321    1.0
7322    1.0
Name: total_stops, Length: 7322, dtype: float64

In [21]:
def is_direct_flight(x):
    """Flag non-stop flights.

    Reads the 'total_stops' column of `x` and returns a one-column DataFrame
    ('is_direct_flight') on the same index: 1 where total_stops equals 0, else 0.
    """
    no_stops = (x['total_stops'] == 0).to_numpy()
    return pd.DataFrame({'is_direct_flight': no_stops.astype(int)}, index = x.index)

is_direct_flight(x)

Unnamed: 0,is_direct_flight
0,1
1,1
2,0
3,0
4,1
...,...
7318,0
7319,0
7320,0
7321,0


In [22]:
# Wrap the helper in a FunctionTransformer so it can be used as a ColumnTransformer branch
total_stops_pipeline1 = FunctionTransformer(is_direct_flight, validate = False)

total_stops_pipeline1.fit_transform(x[['total_stops']])

Unnamed: 0,is_direct_flight
0,1
1,1
2,0
3,0
4,1
...,...
7318,0
7319,0
7320,0
7321,0


In [23]:
# total_stops is numeric already, so this branch only min-max scales it
steps_total_stops = [('scaling', MinMaxScaler())]
total_stops_pipeline2 = Pipeline(steps_total_stops)

total_stops_pipeline2.fit_transform(x[['total_stops']])

Unnamed: 0,total_stops
0,0.00
1,0.00
2,0.25
3,0.50
4,0.00
...,...
7318,0.25
7319,0.25
7320,0.25
7321,0.25


# 2.7 additional_info
* Dropping this column as it is not really needed

In [24]:
# Re-assign instead of dropping in place, and ignore an already-missing column, so this cell
# can be re-run without raising a KeyError.
x = x.drop(columns = 'additional_info', errors = 'ignore')

# 3. Column Transformer

In [25]:
# Route every column group to its branch. Each entry is (name, transformer, columns); the two
# time columns and total_stops are each fed to two different branches.
time_cols = ['dep_time', 'arrival_time']
stops_cols = ['total_stops']

transformers = [
    ('airline', airline_pipeline, ['airline']),
    ('doj', doj_pipeline, ['date_of_journey']),
    ('source_dest', source_dest_pipeline, ['source', 'destination']),
    ('time_pipe1', time_pipeline1, list(time_cols)),
    ('time_pipe2', time_pipeline2, list(time_cols)),
    ('duration', duration_pipeline, ['duration']),
    ('total_stops1', total_stops_pipeline1, list(stops_cols)),
    ('total_stops2', total_stops_pipeline2, list(stops_cols)),
]

ct_pipeline = ColumnTransformer(transformers, remainder = 'passthrough')

ct_pipeline.fit_transform(x, y)

Unnamed: 0,airline__airline_Air India,airline__airline_Indigo,airline__airline_Jet Airways,airline__airline_Multiple Carriers,airline__airline_other,doj__date_of_journey_month,doj__date_of_journey_week,doj__date_of_journey_day_of_week,doj__date_of_journey_day_of_month,doj__date_of_journey_weekend,doj__date_of_journey_month_start,doj__date_of_journey_month_end,source_dest__source,source_dest__destination,time_pipe1__dep_time_part_of_day,time_pipe1__arrival_time_part_of_day,time_pipe2__dep_time_hour,time_pipe2__dep_time_minute,time_pipe2__arrival_time_hour,time_pipe2__arrival_time_minute,duration__duration,total_stops1__is_direct_flight,total_stops2__total_stops
0,0.0,0.0,0.0,0.0,1.0,0.666667,0.529412,0.333333,0.000000,0.0,1.0,0.0,-0.922934,-1.825340,1.000000,1.000000,0.913043,0.181818,0.000000,0.090909,-1.013801,1,0.00
1,0.0,0.0,0.0,0.0,1.0,1.000000,0.764706,0.833333,0.000000,1.0,1.0,0.0,-1.864720,-0.844206,0.666667,1.000000,0.739130,0.818182,0.869565,0.090909,-1.236654,1,0.00
2,0.0,0.0,1.0,0.0,0.0,1.000000,0.823529,1.000000,0.307692,1.0,0.0,0.0,1.047774,1.046261,0.666667,0.333333,0.739130,0.545455,0.521739,0.636364,1.088183,0,0.25
3,1.0,0.0,0.0,0.0,0.0,0.666667,0.647059,0.833333,0.653846,1.0,0.0,0.0,-0.162824,-0.174735,0.333333,0.666667,0.521739,0.000000,0.782609,0.545455,1.680771,0,0.50
4,0.0,0.0,1.0,0.0,0.0,0.000000,0.176471,1.000000,0.884615,1.0,0.0,0.0,-1.864720,-0.844206,0.333333,0.666667,0.652174,0.909091,0.739130,0.363636,-1.662039,1,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7318,0.0,0.0,1.0,0.0,0.0,0.666667,0.647059,0.333333,0.538462,0.0,0.0,0.0,-0.162824,-0.174735,0.000000,0.666667,0.434783,0.363636,0.739130,0.636364,-0.046843,0,0.25
7319,0.0,1.0,0.0,0.0,0.0,0.666667,0.705882,0.166667,0.769231,0.0,0.0,0.0,1.047774,1.046261,0.000000,1.000000,0.434783,0.636364,0.913043,0.000000,0.364748,0,0.25
7320,0.0,0.0,1.0,0.0,0.0,0.666667,0.529412,0.333333,0.000000,0.0,1.0,0.0,-0.162824,-0.174735,1.000000,0.000000,0.869565,0.454545,0.434783,1.000000,0.754289,0,0.25
7321,0.0,0.0,0.0,1.0,0.0,1.000000,0.823529,0.000000,0.076923,0.0,0.0,0.0,1.047774,1.046261,0.000000,1.000000,0.304348,0.000000,0.913043,0.000000,0.712317,0,0.25


In [26]:
# Sanity check: fit every branch on its own columns and confirm it returns an object whose
# index matches the input rows.
# y is always passed: fit_transform of every transformer used here accepts it (MeanEncoder
# requires it, the others ignore it). The former `except TypeError` retry without y could
# hide a genuine TypeError raised while fitting.
for name, pipe, cols in transformers:
    subset = x[cols]
    out = pipe.fit_transform(subset, y)
    print(name, "->", type(out), getattr(out, "shape", None))
    # for pandas outputs also print index info
    if hasattr(out, "index"):
        print("   index equals input index:", out.index.equals(subset.index))


airline -> <class 'pandas.core.frame.DataFrame'> (7322, 5)
   index equals input index: True
doj -> <class 'pandas.core.frame.DataFrame'> (7322, 7)
   index equals input index: True
source_dest -> <class 'pandas.core.frame.DataFrame'> (7322, 2)
   index equals input index: True
time_pipe1 -> <class 'pandas.core.frame.DataFrame'> (7322, 2)
   index equals input index: True
time_pipe2 -> <class 'pandas.core.frame.DataFrame'> (7322, 4)
   index equals input index: True
duration -> <class 'pandas.core.frame.DataFrame'> (7322, 1)
   index equals input index: True
total_stops1 -> <class 'pandas.core.frame.DataFrame'> (7322, 1)
   index equals input index: True
total_stops2 -> <class 'pandas.core.frame.DataFrame'> (7322, 1)
   index equals input index: True


# Part 2: Feature Selection

In [27]:
from feature_engine.selection import SelectBySingleFeaturePerformance
from sklearn.ensemble import RandomForestRegressor


In [28]:
# Feature selection: score every engineered feature on its own with a small random forest
# (r2) and compare the score against the 0.1 threshold.
estimator = RandomForestRegressor(random_state = 1, max_depth = 5, n_estimators = 20)
selector = SelectBySingleFeaturePerformance(estimator = estimator, scoring = 'r2', threshold = 0.1)

# Creating the final pipeline: column-wise preprocessing followed by the selector
final_steps = [('column_transformer', ct_pipeline), ('feature_selection', selector)]
final_pipeline = Pipeline(steps = final_steps)

final_pipeline.fit_transform(x, y)

Unnamed: 0,airline__airline_Indigo,airline__airline_Jet Airways,airline__airline_other,doj__date_of_journey_week,source_dest__source,source_dest__destination,time_pipe2__arrival_time_hour,duration__duration,total_stops1__is_direct_flight,total_stops2__total_stops
0,0.0,0.0,1.0,0.529412,-0.922934,-1.825340,0.000000,-1.013801,1,0.00
1,0.0,0.0,1.0,0.764706,-1.864720,-0.844206,0.869565,-1.236654,1,0.00
2,0.0,1.0,0.0,0.823529,1.047774,1.046261,0.521739,1.088183,0,0.25
3,0.0,0.0,0.0,0.647059,-0.162824,-0.174735,0.782609,1.680771,0,0.50
4,0.0,1.0,0.0,0.176471,-1.864720,-0.844206,0.739130,-1.662039,1,0.00
...,...,...,...,...,...,...,...,...,...,...
7318,0.0,1.0,0.0,0.647059,-0.162824,-0.174735,0.739130,-0.046843,0,0.25
7319,1.0,0.0,0.0,0.705882,1.047774,1.046261,0.913043,0.364748,0,0.25
7320,0.0,1.0,0.0,0.529412,-0.162824,-0.174735,0.434783,0.754289,0,0.25
7321,0.0,0.0,0.0,0.823529,1.047774,1.046261,0.913043,0.712317,0,0.25


In [29]:
feature_performance = final_pipeline.named_steps['feature_selection'].feature_performance_
feature_performance

{'airline__airline_Air India': 0.002164161719206007,
 'airline__airline_Indigo': 0.1304685854559775,
 'airline__airline_Jet Airways': 0.17747908206974086,
 'airline__airline_Multiple Carriers': 0.023670755844778586,
 'airline__airline_other': 0.11382330665356306,
 'doj__date_of_journey_month': 0.09257108730694212,
 'doj__date_of_journey_week': 0.19880224650655076,
 'doj__date_of_journey_day_of_week': 0.004037300003953526,
 'doj__date_of_journey_day_of_month': 0.03265359713452783,
 'doj__date_of_journey_weekend': -0.0002788474239527883,
 'doj__date_of_journey_month_start': 0.00919686178886451,
 'doj__date_of_journey_month_end': -0.0005203120844960255,
 'source_dest__source': 0.13345045789214835,
 'source_dest__destination': 0.13608356693474802,
 'time_pipe1__dep_time_part_of_day': -7.984605189144862e-05,
 'time_pipe1__arrival_time_part_of_day': 0.03193302206860971,
 'time_pipe2__dep_time_hour': 0.018386426318662052,
 'time_pipe2__dep_time_minute': 0.04745652918226225,
 'time_pipe2__arri

In [30]:
# Rank the features from the highest to the lowest single-feature score
ranked_items = sorted(feature_performance.items(), key=lambda pair: pair[1], reverse=True)
sorted_dict_desc = {feature: score for feature, score in ranked_items}
sorted_dict_desc

{'duration__duration': 0.4521077726963904,
 'total_stops2__total_stops': 0.4142148505511029,
 'total_stops1__is_direct_flight': 0.38930682669073796,
 'doj__date_of_journey_week': 0.19880224650655076,
 'airline__airline_Jet Airways': 0.17747908206974086,
 'source_dest__destination': 0.13608356693474802,
 'source_dest__source': 0.13345045789214835,
 'airline__airline_Indigo': 0.1304685854559775,
 'airline__airline_other': 0.11382330665356306,
 'time_pipe2__arrival_time_hour': 0.11091793919084451,
 'doj__date_of_journey_month': 0.09257108730694212,
 'time_pipe2__dep_time_minute': 0.04745652918226225,
 'time_pipe2__arrival_time_minute': 0.047102223679684974,
 'doj__date_of_journey_day_of_month': 0.03265359713452783,
 'time_pipe1__arrival_time_part_of_day': 0.03193302206860971,
 'airline__airline_Multiple Carriers': 0.023670755844778586,
 'time_pipe2__dep_time_hour': 0.018386426318662052,
 'doj__date_of_journey_month_start': 0.00919686178886451,
 'doj__date_of_journey_day_of_week': 0.004037

Observation:

* The dataset had `23 columns after Feature Engineering`
* The `Feature Selection algorithm selected 10` features out of that

In [31]:
final_pipeline

0,1,2
,steps,"[('column_transformer', ...), ('feature_selection', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('airline', ...), ('doj', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,tol,0.1
,n_categories,2
,max_n_categories,
,replace_with,'other'
,variables,
,missing_values,'raise'
,ignore_format,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,variables,
,features_to_extract,"['month', 'week', ...]"
,drop_original,True
,missing_values,'raise'
,dayfirst,False
,yearfirst,True
,utc,
,format,'mixed'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,tol,0.1
,n_categories,2
,max_n_categories,
,replace_with,'other'
,variables,
,missing_values,'raise'
,ignore_format,False

0,1,2
,variables,
,missing_values,'raise'
,ignore_format,False
,unseen,'ignore'
,smoothing,0.0

0,1,2
,method,'yeo-johnson'
,standardize,True
,copy,True

0,1,2
,func,<function par...0028F96130720>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,categories,"[['morning', 'afternoon', ...], ['morning', 'afternoon', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,variables,
,features_to_extract,"['hour', 'minute']"
,drop_original,True
,missing_values,'raise'
,dayfirst,False
,yearfirst,False
,utc,
,format,'mixed'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,capping_method,'iqr'
,tail,'right'
,fold,'auto'
,add_indicators,False
,variables,
,missing_values,'raise'

0,1,2
,method,'yeo-johnson'
,standardize,True
,copy,True

0,1,2
,func,<function is_...0028F96131440>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,estimator,RandomForestR...andom_state=1)
,scoring,'r2'
,cv,3
,groups,
,threshold,0.1
,variables,
,confirm_variables,False

0,1,2
,n_estimators,20
,criterion,'squared_error'
,max_depth,5
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [32]:
# Exporting the final Pipeline
# NOTE(review): cloudpickle (rather than pickle) is presumably used because the pipeline wraps
# functions defined in this notebook — confirm.
# Only load the resulting file from a trusted source: unpickling can execute arbitrary code.
with open("flights_preprocessor.pkl", 'wb') as f:
    cloudpickle.dump(final_pipeline, file = f)