## Importing Libraries

In [1]:
import numpy as np 
import pandas as pd 
import sklearn 
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion 
from sklearn.compose import ColumnTransformer 
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import(
    OneHotEncoder,
    OrdinalEncoder,
    StandardScaler,
    MinMaxScaler,
    PowerTransformer,
    FunctionTransformer
)

from feature_engine.outliers import Winsorizer 
from feature_engine.datetime import DatetimeFeatures
from feature_engine.selection import SelectBySingleFeaturePerformance
from feature_engine.encoding import (
    RareLabelEncoder,
    MeanEncoder,
    CountFrequencyEncoder
)
import matplotlib.pyplot as plt 
import warnings

## Display Settings

In [2]:
pd.set_option("display.max_columns",None)

In [3]:
sklearn.set_config(transform_output="pandas")

In [4]:
# NOTE(review): with no category/module filter this silences every warning for the
# rest of the session, deprecation notices included; consider narrowing it.
warnings.filterwarnings("ignore")

## Read the data

In [5]:
# Load the training split; the path is relative, so the notebook must be run
# from the directory that contains the `data/` folder.
train=pd.read_csv("data/train.csv")
train.head()

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Multiple Carriers,2019-03-21,Delhi,Cochin,07:05:00,21:00:00,835,1.0,No Info,13062
1,Vistara,2019-06-03,Chennai,Kolkata,07:05:00,09:20:00,135,0.0,No Info,3687
2,Indigo,2019-06-03,Delhi,Cochin,11:25:00,01:30:00,845,1.0,No Info,5883
3,Jet Airways,2019-06-06,Delhi,Cochin,20:55:00,04:25:00,450,1.0,In-flight meal not included,10262
4,Jet Airways,2019-06-27,Delhi,Cochin,18:20:00,19:00:00,1480,2.0,No Info,14300


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 640 entries, 0 to 639
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   airline          640 non-null    object 
 1   date_of_journey  640 non-null    object 
 2   source           640 non-null    object 
 3   destination      640 non-null    object 
 4   dep_time         640 non-null    object 
 5   arrival_time     640 non-null    object 
 6   duration         640 non-null    int64  
 7   total_stops      640 non-null    float64
 8   additional_info  640 non-null    object 
 9   price            640 non-null    int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 50.1+ KB


In [10]:
# Separate the target (`price`) from the predictor columns.
y_train = train["price"].copy()
X_train = train.drop(columns=["price"])

## Transformation Operations

### Airline

In [14]:
X_train.airline

0      Multiple Carriers
1                Vistara
2                 Indigo
3            Jet Airways
4            Jet Airways
             ...        
635               Indigo
636          Jet Airways
637               Indigo
638            Air India
639               Indigo
Name: airline, Length: 640, dtype: object

In [31]:
# Airline: impute with the most frequent value, replace infrequent airlines
# with the label "Other", then one-hot encode the result.
air_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("grouper", RareLabelEncoder(tol=0.1, replace_with="Other", n_categories=2)),
    ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
])

air_transformer.fit_transform(X_train[["airline"]])

Unnamed: 0,airline_Air India,airline_Indigo,airline_Jet Airways,airline_Multiple Carriers,airline_Other
0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,1.0
2,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...
635,0.0,1.0,0.0,0.0,0.0
636,0.0,0.0,1.0,0.0,0.0
637,0.0,1.0,0.0,0.0,0.0
638,1.0,0.0,0.0,0.0,0.0


In [28]:
type(X_train.iloc[:,0])

pandas.core.series.Series

In [29]:
X_train.iloc[:,[0]]

Unnamed: 0,airline
0,Multiple Carriers
1,Vistara
2,Indigo
3,Jet Airways
4,Jet Airways
...,...
635,Indigo
636,Jet Airways
637,Indigo
638,Air India


In [26]:
type(X_train.iloc[:,[0]])

pandas.core.frame.DataFrame

In [25]:
type(X_train.loc[:,['airline']])

pandas.core.frame.DataFrame

### Date of journey

In [33]:
X_train.date_of_journey

0      2019-03-21
1      2019-06-03
2      2019-06-03
3      2019-06-06
4      2019-06-27
          ...    
635    2019-05-21
636    2019-05-18
637    2019-06-06
638    2019-06-12
639    2019-05-12
Name: date_of_journey, Length: 640, dtype: object

In [35]:
# Calendar features to derive from the journey date.
feature_to_extract = ["month", "week", "day_of_week", "day_of_year"]

# Date of journey: extract the calendar features listed above, then min-max
# scale each derived column.
doj_transformer = Pipeline([
    (
        "dt",
        DatetimeFeatures(
            features_to_extract=feature_to_extract,
            yearfirst=True,
            format="mixed",
        ),
    ),
    ("scaler", MinMaxScaler()),
])

doj_transformer.fit_transform(X_train[["date_of_journey"]])

Unnamed: 0,date_of_journey_month,date_of_journey_week,date_of_journey_day_of_week,date_of_journey_day_of_year
0,0.000000,0.176471,0.500000,0.169492
1,1.000000,0.823529,0.000000,0.796610
2,1.000000,0.823529,0.000000,0.796610
3,1.000000,0.823529,0.500000,0.822034
4,1.000000,1.000000,0.500000,1.000000
...,...,...,...,...
635,0.666667,0.705882,0.166667,0.686441
636,0.666667,0.647059,0.833333,0.661017
637,1.000000,0.823529,0.500000,0.822034
638,1.000000,0.882353,0.333333,0.872881


### Source and Destination

In [36]:
X_train.source

0        Delhi
1      Chennai
2        Delhi
3        Delhi
4        Delhi
        ...   
635      Delhi
636    Kolkata
637    Kolkata
638      Delhi
639      Delhi
Name: source, Length: 640, dtype: object

In [37]:
X_train.destination

0        Cochin
1       Kolkata
2        Cochin
3        Cochin
4        Cochin
         ...   
635      Cochin
636    Banglore
637    Banglore
638      Cochin
639      Cochin
Name: destination, Length: 640, dtype: object

In [38]:
# Work on the two city columns together; they share the same transformations.
location_subset = X_train[["source", "destination"]]
location_subset

Unnamed: 0,source,destination
0,Delhi,Cochin
1,Chennai,Kolkata
2,Delhi,Cochin
3,Delhi,Cochin
4,Delhi,Cochin
...,...,...
635,Delhi,Cochin
636,Kolkata,Banglore
637,Kolkata,Banglore
638,Delhi,Cochin


In [51]:
# Location, numeric view: replace infrequent cities with "Other", mean-encode
# each city, then apply a power transform to the encoded values.
location_pipe1 = Pipeline([
    ("grouper", RareLabelEncoder(tol=0.1, replace_with="Other", n_categories=2)),
    ("encoder", MeanEncoder()),
    ("scaler", PowerTransformer()),
])

# The target is passed along because the pipeline is fitted with supervision.
location_pipe1.fit_transform(location_subset, y_train)

Unnamed: 0,source,destination
0,1.028250,0.807027
1,-1.848876,-1.724971
2,1.028250,0.807027
3,1.028250,0.807027
4,1.028250,0.807027
...,...,...
635,1.028250,0.807027
636,-0.054619,-0.230447
637,-0.054619,-0.230447
638,1.028250,0.807027


In [45]:
np.union1d(X_train.source.unique(),X_train.destination.unique())

array(['Banglore', 'Chennai', 'Cochin', 'Delhi', 'Hyderabad', 'Kolkata',
       'Mumbai', 'New Delhi'], dtype=object)

In [49]:
def is_north(X, north_cities=None):
    """Replace every column of ``X`` with a 0/1 "is a northern city" flag.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose columns hold city names (e.g. ``source``, ``destination``).
    north_cities : list-like of str, optional
        Cities to flag as northern. When omitted, the notebook's original
        list is used, so existing calls behave exactly as before. It can be
        overridden through ``FunctionTransformer(kw_args={"north_cities": ...})``.

    Returns
    -------
    pandas.DataFrame
        One ``<column>_is_north`` integer column per input column; the
        original columns are dropped. ``X`` itself is not modified.
    """
    if north_cities is None:
        # NOTE(review): Mumbai and Kolkata are not usually classed as northern
        # Indian cities -- confirm this grouping is intentional.
        north_cities = ['Delhi', 'Mumbai', 'New Delhi', 'Kolkata']
    else:
        # Materialise once so any iterable (set, tuple, generator) is accepted.
        north_cities = list(north_cities)

    columns = X.columns.to_list()
    flags = {
        f'{col}_is_north': X.loc[:, col].isin(north_cities).astype(int)
        for col in columns
    }
    return X.assign(**flags).drop(columns=columns)

FunctionTransformer(func=is_north).fit_transform(location_subset)

Unnamed: 0,source_is_north,destination_is_north
0,1,0
1,0,1
2,1,0
3,1,0
4,1,0
...,...,...
635,1,0
636,1,0
637,1,0
638,1,0


In [52]:
# Full location block: the encoded/scaled city columns side by side with the
# binary flags produced by `is_north`.
location_transformer = FeatureUnion([
    ("part1", location_pipe1),
    ("part2", FunctionTransformer(func=is_north)),
])

location_transformer.fit_transform(location_subset, y_train)

Unnamed: 0,source,destination,source_is_north,destination_is_north
0,1.028250,0.807027,1,0
1,-1.848876,-1.724971,0,1
2,1.028250,0.807027,1,0
3,1.028250,0.807027,1,0
4,1.028250,0.807027,1,0
...,...,...,...,...
635,1.028250,0.807027,1,0
636,-0.054619,-0.230447,1,0
637,-0.054619,-0.230447,1,0
638,1.028250,0.807027,1,0


### Departure and Arrival time

In [54]:
X_train.dep_time

0      07:05:00
1      07:05:00
2      11:25:00
3      20:55:00
4      18:20:00
         ...   
635    06:50:00
636    08:25:00
637    15:15:00
638    05:10:00
639    14:25:00
Name: dep_time, Length: 640, dtype: object

In [55]:
X_train.arrival_time

0      21:00:00
1      09:20:00
2      01:30:00
3      04:25:00
4      19:00:00
         ...   
635    12:10:00
636    22:35:00
637    20:30:00
638    08:00:00
639    17:40:00
Name: arrival_time, Length: 640, dtype: object

In [56]:
# Departure and arrival times are transformed together.
time_subset = X_train[["dep_time", "arrival_time"]]
time_subset

Unnamed: 0,dep_time,arrival_time
0,07:05:00,21:00:00
1,07:05:00,09:20:00
2,11:25:00,01:30:00
3,20:55:00,04:25:00
4,18:20:00,19:00:00
...,...,...
635,06:50:00,12:10:00
636,08:25:00,22:35:00
637,15:15:00,20:30:00
638,05:10:00,08:00:00


In [57]:
# Time, numeric view: extract hour and minute from each time column, then
# min-max scale the extracted values.
time_pipe1 = Pipeline([
    ("dt", DatetimeFeatures(features_to_extract=["hour", "minute"])),
    ("scaler", MinMaxScaler()),
])

time_pipe1.fit_transform(time_subset)

Unnamed: 0,dep_time_hour,dep_time_minute,arrival_time_hour,arrival_time_minute
0,0.304348,0.090909,0.913043,0.000000
1,0.304348,0.090909,0.391304,0.363636
2,0.478261,0.454545,0.043478,0.545455
3,0.869565,1.000000,0.173913,0.454545
4,0.782609,0.363636,0.826087,0.000000
...,...,...,...,...
635,0.260870,0.909091,0.521739,0.181818
636,0.347826,0.454545,0.956522,0.636364
637,0.652174,0.272727,0.869565,0.545455
638,0.217391,0.181818,0.347826,0.000000


In [59]:
def part_of_day(X,morning=4,afternoon=12,evening=16,night=20):
    """Label every time column of ``X`` with a coarse part of the day.

    Each column is parsed with ``pd.to_datetime`` and reduced to its hour,
    which is then bucketed using half-open ranges (lower bound included):

    * ``[morning, afternoon)`` -> ``'morning'``
    * ``[afternoon, evening)`` -> ``'afternoon'``
    * ``[evening, night)``     -> ``'evening'``
    * anything not matched     -> ``'night'``

    Returns a new frame holding one ``<column>_part_of_day`` column per input
    column; the input columns themselves are dropped.
    """
    time_cols = X.columns.to_list()

    # Swap every time column for its hour-of-day component.
    hours = X.assign(**{
        name: pd.to_datetime(X.loc[:, name]).dt.hour
        for name in time_cols
    })

    def _label(hour_values):
        # Conditions are checked in order; unmatched hours get the default.
        conditions = [
            hour_values.between(morning, afternoon, inclusive='left'),
            hour_values.between(afternoon, evening, inclusive='left'),
            hour_values.between(evening, night, inclusive='left'),
        ]
        return np.select(
            conditions,
            ['morning', 'afternoon', 'evening'],
            default='night',
        )

    labelled = {
        f'{name}_part_of_day': _label(hours.loc[:, name])
        for name in time_cols
    }
    return hours.assign(**labelled).drop(columns=time_cols)


FunctionTransformer(func=part_of_day).fit_transform(time_subset)

Unnamed: 0,dep_time_part_of_day,arrival_time_part_of_day
0,morning,night
1,morning,morning
2,morning,night
3,night,morning
4,evening,evening
...,...,...
635,morning,afternoon
636,morning,night
637,afternoon,night
638,morning,morning


In [60]:
# Time, categorical view: bucket each time with `part_of_day`, encode the
# resulting labels with CountFrequencyEncoder, then min-max scale.
time_pipe2 = Pipeline([
    ("part", FunctionTransformer(func=part_of_day)),
    ("encoder", CountFrequencyEncoder()),
    ("scaler", MinMaxScaler()),
])

time_pipe2.fit_transform(time_subset)

Unnamed: 0,dep_time_part_of_day,arrival_time_part_of_day
0,1.000000,1.000000
1,1.000000,0.404040
2,1.000000,1.000000
3,0.191919,0.404040
4,0.161616,0.333333
...,...,...
635,1.000000,0.000000
636,1.000000,1.000000
637,0.000000,1.000000
638,1.000000,0.404040


In [61]:
# Full time block: hour/minute features side by side with the encoded
# part-of-day features.
time_transformer = FeatureUnion([
    ("part1", time_pipe1),
    ("part2", time_pipe2),
])

time_transformer.fit_transform(time_subset)

Unnamed: 0,dep_time_hour,dep_time_minute,arrival_time_hour,arrival_time_minute,dep_time_part_of_day,arrival_time_part_of_day
0,0.304348,0.090909,0.913043,0.000000,1.000000,1.000000
1,0.304348,0.090909,0.391304,0.363636,1.000000,0.404040
2,0.478261,0.454545,0.043478,0.545455,1.000000,1.000000
3,0.869565,1.000000,0.173913,0.454545,0.191919,0.404040
4,0.782609,0.363636,0.826087,0.000000,0.161616,0.333333
...,...,...,...,...,...,...
635,0.260870,0.909091,0.521739,0.181818,1.000000,0.000000
636,0.347826,0.454545,0.956522,0.636364,1.000000,1.000000
637,0.652174,0.272727,0.869565,0.545455,0.000000,1.000000
638,0.217391,0.181818,0.347826,0.000000,1.000000,0.404040
