# 1. Importing Module

In [183]:
import pandas as pd
import numpy as np
import sklearn
import warnings
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, PowerTransformer, FunctionTransformer, OrdinalEncoder, StandardScaler
from feature_engine.encoding import RareLabelEncoder, MeanEncoder, CountFrequencyEncoder
from feature_engine.datetime import DatetimeFeatures
from feature_engine.outliers import Winsorizer

# 2. Setting the Config 

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
sklearn.set_config(transform_output="pandas")

In [4]:
warnings.filterwarnings('ignore')

# 3. Getting X and y from training data

In [5]:
train_df = pd.read_csv('../data/train_data.csv')
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6695 entries, 0 to 6694
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   airline          6695 non-null   object 
 1   date_of_journey  6695 non-null   object 
 2   source           6695 non-null   object 
 3   destination      6695 non-null   object 
 4   dep_time         6695 non-null   object 
 5   arrival_time     6695 non-null   object 
 6   duration         6695 non-null   int64  
 7   total_stops      6694 non-null   float64
 8   additional_info  6695 non-null   object 
 9   price            6695 non-null   int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 523.2+ KB


In [6]:
train_df[['airline','date_of_journey','source','destination','dep_time','arrival_time','additional_info']] = train_df[['airline','date_of_journey','source','destination','dep_time','arrival_time','additional_info']].astype('object',)

In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6695 entries, 0 to 6694
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   airline          6695 non-null   object 
 1   date_of_journey  6695 non-null   object 
 2   source           6695 non-null   object 
 3   destination      6695 non-null   object 
 4   dep_time         6695 non-null   object 
 5   arrival_time     6695 non-null   object 
 6   duration         6695 non-null   int64  
 7   total_stops      6694 non-null   float64
 8   additional_info  6695 non-null   object 
 9   price            6695 non-null   int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 523.2+ KB


In [8]:
X = train_df.drop(columns=['price'])
y = train_df.price

# 4. Feature engineering

## 4.1. airline

In [9]:
X.airline

0       Jet Airways
1       Jet Airways
2             Goair
3         Air India
4       Jet Airways
           ...     
6690    Jet Airways
6691      Air India
6692    Jet Airways
6693       Air Asia
6694      Air India
Name: airline, Length: 6695, dtype: object

In [15]:
airline_transformer = Pipeline(
steps=[
    ('simple_imputer', SimpleImputer(strategy='most_frequent')),
    ('group_rare_labels', RareLabelEncoder(tol=0.01,n_categories=3,replace_with='Other',missing_values='ignore')),
    ('one_hot_encoder',OneHotEncoder(sparse_output=False,handle_unknown='ignore'))
]
)

In [16]:
airline_transformer.fit_transform(X[['airline']])

Unnamed: 0,airline_Air Asia,airline_Air India,airline_Goair,airline_Indigo,airline_Jet Airways,airline_Multiple Carriers,airline_Other,airline_Spicejet,airline_Vistara
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
6690,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6691,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6692,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6693,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 4.2. date_of_journey

In [25]:
X.date_of_journey

0       2019-03-21
1       2019-03-27
2       2019-03-09
3       2019-06-12
4       2019-03-12
           ...    
6690    2019-03-21
6691    2019-05-01
6692    2019-06-01
6693    2019-06-24
6694    2019-03-01
Name: date_of_journey, Length: 6695, dtype: object

In [18]:
features_to_extract = ["month","week","day_of_week","day_of_year"]
date_of_journey_transformer = Pipeline(
    steps=[
        ('dt',DatetimeFeatures(features_to_extract=features_to_extract,yearfirst=True,format='mixed')),
        ('scaler',MinMaxScaler())
    ]
)

In [19]:
date_of_journey_transformer.fit_transform(X.loc[:,['date_of_journey']])

Unnamed: 0,date_of_journey_month,date_of_journey_week,date_of_journey_day_of_week,date_of_journey_day_of_year
0,0.000000,0.176471,0.500000,0.169492
1,0.000000,0.235294,0.333333,0.220339
2,0.000000,0.058824,0.833333,0.067797
3,1.000000,0.882353,0.333333,0.872881
4,0.000000,0.117647,0.166667,0.093220
...,...,...,...,...
6690,0.000000,0.176471,0.500000,0.169492
6691,0.666667,0.529412,0.333333,0.516949
6692,1.000000,0.764706,0.833333,0.779661
6693,1.000000,1.000000,0.000000,0.974576


## 4.3. source & destination

In [27]:
X.source,X.destination

(0       Banglore
 1          Delhi
 2       Banglore
 3        Kolkata
 4       Banglore
           ...   
 6690       Delhi
 6691     Kolkata
 6692       Delhi
 6693       Delhi
 6694    Banglore
 Name: source, Length: 6695, dtype: object,
 0          Delhi
 1         Cochin
 2          Delhi
 3       Banglore
 4          Delhi
           ...   
 6690      Cochin
 6691    Banglore
 6692      Cochin
 6693      Cochin
 6694       Delhi
 Name: destination, Length: 6695, dtype: object)

In [65]:
city_subset_X = X[['source','destination']]
city_subset_X

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


Unnamed: 0,source,destination
0,Banglore,Delhi
1,Delhi,Cochin
2,Banglore,Delhi
3,Kolkata,Banglore
4,Banglore,Delhi
...,...,...
6690,Delhi,Cochin
6691,Kolkata,Banglore
6692,Delhi,Cochin
6693,Delhi,Cochin


In [66]:
city_transformer = Pipeline(
    steps=[
        ('group_rare', RareLabelEncoder(tol=0.1, n_categories=3, replace_with='Other')),
        ('encoder', MeanEncoder()),
        ('scaler', PowerTransformer(standardize=True))
    ]
)

city_transformer.fit_transform(city_subset_X, y)

Unnamed: 0,source,destination
0,-0.857930,-0.857930
1,1.065418,1.065418
2,-0.857930,-0.857930
3,-0.203927,-0.203927
4,-0.857930,-0.857930
...,...,...
6690,1.065418,1.065418
6691,-0.203927,-0.203927
6692,1.065418,1.065418
6693,1.065418,1.065418


In [34]:
np.union1d(X.source.unique(),X.destination.unique())

array(['Banglore', 'Chennai', 'Cochin', 'Delhi', 'Hyderabad', 'Kolkata',
       'Mumbai'], dtype=object)

In [67]:
def is_north(X):
    north_cities = ['Delhi','Kolkata','Mumbai']
    cols = X.columns.to_list()
    return (X
        .assign(
        **{
            f"{col}_is_north": X.loc[:,col].isin(north_cities).astype('int') for col in cols
        }
        )
        .drop(columns=cols)
    )

FunctionTransformer(func=is_north).fit_transform(city_subset_X)

Unnamed: 0,source_is_north,destination_is_north
0,0,1
1,1,0
2,0,1
3,1,0
4,0,1
...,...,...
6690,1,0
6691,1,0
6692,1,0
6693,1,0


In [68]:
location_transformer = FeatureUnion(transformer_list=[
    ("step_1",city_transformer),
    ("step_2",FunctionTransformer(func=is_north))
])
location_transformer.fit_transform(city_subset_X,y)

Unnamed: 0,source,destination,source_is_north,destination_is_north
0,-0.857930,-0.857930,0,1
1,1.065418,1.065418,1,0
2,-0.857930,-0.857930,0,1
3,-0.203927,-0.203927,1,0
4,-0.857930,-0.857930,0,1
...,...,...,...,...
6690,1.065418,1.065418,1,0
6691,-0.203927,-0.203927,1,0
6692,1.065418,1.065418,1,0
6693,1.065418,1.065418,1,0


## 4.4. dep_time, arrival_time 

In [70]:
time_subset_X = X[['dep_time','arrival_time']]
time_subset_X

Unnamed: 0,dep_time,arrival_time
0,08:55:00,19:10:00
1,17:30:00,04:25:00
2,11:40:00,14:35:00
3,09:25:00,18:30:00
4,22:55:00,07:40:00
...,...,...
6690,10:45:00,18:50:00
6691,09:25:00,18:30:00
6692,14:00:00,19:00:00
6693,07:55:00,13:25:00


In [72]:
time_pipe_1 = Pipeline(
    steps=[
        ('time_feat',DatetimeFeatures(features_to_extract=['hour','minute'])),
        ('scaler',MinMaxScaler())
    ]
)
time_pipe_1.fit_transform(time_subset_X)

Unnamed: 0,dep_time_hour,dep_time_minute,arrival_time_hour,arrival_time_minute
0,0.347826,1.000000,0.826087,0.181818
1,0.739130,0.545455,0.173913,0.454545
2,0.478261,0.727273,0.608696,0.636364
3,0.391304,0.454545,0.782609,0.545455
4,0.956522,1.000000,0.304348,0.727273
...,...,...,...,...
6690,0.434783,0.818182,0.782609,0.909091
6691,0.391304,0.454545,0.782609,0.545455
6692,0.608696,0.000000,0.826087,0.000000
6693,0.304348,1.000000,0.565217,0.454545


In [94]:
def get_part_of_day(X, early_morning=4, morning=8, noon=12, afternoon=16, evening=19, night=22, late_night=1):
    cols = X.columns.to_list()
    
    temp_time_df = X.assign(**{
        col: pd.to_datetime(X.loc[:,col]).dt.hour for col in cols
    })
    
    return temp_time_df.assign(**{
        col: np.select(
            [
                temp_time_df.loc[:,col].between(early_morning,morning,inclusive='left'),
                temp_time_df.loc[:,col].between(morning,noon,inclusive='left'),
                temp_time_df.loc[:,col].between(noon,afternoon,inclusive='left'),
                temp_time_df.loc[:,col].between(afternoon,evening,inclusive='left'),
                temp_time_df.loc[:,col].between(evening,night,inclusive='left'),
                temp_time_df.loc[:,col].between(night,late_night,inclusive='left'),
                
            ],
            ['early_morning','morning','noon','afternoon','evening','night'],
            default='late_night'
        ) for col in cols
    })

In [95]:
FunctionTransformer(func=get_part_of_day).fit_transform(time_subset_X)

Unnamed: 0,dep_time,arrival_time
0,morning,evening
1,afternoon,early_morning
2,morning,noon
3,morning,afternoon
4,late_night,early_morning
...,...,...
6690,morning,afternoon
6691,morning,afternoon
6692,noon,evening
6693,early_morning,noon


In [99]:
time_pipe_2 = Pipeline(steps=[
    ("part_of_the_day_trans", FunctionTransformer(func=get_part_of_day)),
    ("count_enco", CountFrequencyEncoder()),
    ("scaler", MinMaxScaler())
])

In [100]:
time_transformer = FeatureUnion([
    ('step_1', time_pipe_1,),
    ('step_2',time_pipe_2)
])

## 4.5. duration

In [113]:
X.loc[:,'duration'].between(1,120)

0       False
1       False
2       False
3       False
4       False
        ...  
6690    False
6691    False
6692    False
6693    False
6694    False
Name: duration, Length: 6695, dtype: bool

In [175]:
def duration_category(X, quick=0, medium=120, long=500):
    return (X.assign(duration_cat=np.select(
        [X.duration.between(quick,medium,inclusive='left'), 
         X.duration.between(medium, long, inclusive='left')],
        ['quick','medium'],
        default='long'
        )).drop(columns="duration"))
dur_cat_trans = FunctionTransformer(func=duration_category)

In [176]:
duration_pipe_1 = Pipeline(
    steps=[
        ("dur_cat",dur_cat_trans),
        ("ecoder", OrdinalEncoder(categories=[["quick", "medium", "long"]]))
    ]
)

In [177]:
duration_pipe_1.fit_transform(X.loc[:,['duration']])

Unnamed: 0,duration_cat
0,2.0
1,2.0
2,1.0
3,2.0
4,2.0
...,...
6690,2.0
6691,2.0
6692,1.0
6693,1.0


In [186]:
duration_pipe_2 = Pipeline(steps=[
    ('outliers',Winsorizer(capping_method='iqr',fold=1.5)),
    ('impute', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

In [187]:
duration_trans = FeatureUnion([
    ("duration_pipe1",duration_pipe_2),
    ("duration_pipe2",duration_pipe_1)
])

duration_trans.fit_transform(X.loc[:,['duration']])

Unnamed: 0,duration,duration_cat
0,-0.033916,2.0
1,0.046422,2.0
2,-0.917631,1.0
3,-0.174507,2.0
4,-0.214676,2.0
...,...,...
6690,2.597145,2.0
6691,-0.174507,2.0
6692,-0.666576,1.0
6693,-0.606322,1.0


# 5. Column Transformer

In [188]:
col_transformer = ColumnTransformer(
    [
        ('airline_trans',airline_transformer,['airline']),
        ('doj_trans',date_of_journey_transformer,['date_of_journey']),
        ('loc_trans',location_transformer,['source','destination']),
        ('time_trans',time_transformer,['dep_time','arrival_time']),
        ('dur_trans',duration_trans,['duration'])
    ],
    remainder='passthrough'
)

col_transformer.fit_transform(X,y)

Unnamed: 0,airline_trans__airline_Air Asia,airline_trans__airline_Air India,airline_trans__airline_Goair,airline_trans__airline_Indigo,airline_trans__airline_Jet Airways,airline_trans__airline_Multiple Carriers,airline_trans__airline_Other,airline_trans__airline_Spicejet,airline_trans__airline_Vistara,doj_trans__date_of_journey_month,doj_trans__date_of_journey_week,doj_trans__date_of_journey_day_of_week,doj_trans__date_of_journey_day_of_year,loc_trans__source,loc_trans__destination,loc_trans__source_is_north,loc_trans__destination_is_north,time_trans__dep_time_hour,time_trans__dep_time_minute,time_trans__arrival_time_hour,time_trans__arrival_time_minute,time_trans__dep_time,time_trans__arrival_time,dur_trans__duration,dur_trans__duration_cat,remainder__total_stops,remainder__additional_info
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.000000,0.176471,0.500000,0.169492,-0.857930,-0.857930,0,1,0.347826,1.000000,0.826087,0.181818,1.000000,1.000000,-0.033916,2.0,1.0,In-flight meal not included
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.000000,0.235294,0.333333,0.220339,1.065418,1.065418,1,0,0.739130,0.545455,0.173913,0.454545,0.403639,0.140187,0.046422,2.0,1.0,In-flight meal not included
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.058824,0.833333,0.067797,-0.857930,-0.857930,0,1,0.478261,0.727273,0.608696,0.636364,1.000000,0.336449,-0.917631,1.0,0.0,No info
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.000000,0.882353,0.333333,0.872881,-0.203927,-0.203927,1,0,0.391304,0.454545,0.782609,0.545455,1.000000,0.000000,-0.174507,2.0,1.0,No info
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.000000,0.117647,0.166667,0.093220,-0.857930,-0.857930,0,1,0.956522,1.000000,0.304348,0.727273,0.000000,0.140187,-0.214676,2.0,1.0,In-flight meal not included
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6690,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.000000,0.176471,0.500000,0.169492,1.065418,1.065418,1,0,0.434783,0.818182,0.782609,0.909091,1.000000,0.000000,2.597145,2.0,2.0,No info
6691,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.666667,0.529412,0.333333,0.516949,-0.203927,-0.203927,1,0,0.391304,0.454545,0.782609,0.545455,1.000000,0.000000,-0.174507,2.0,1.0,No info
6692,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.000000,0.764706,0.833333,0.779661,1.065418,1.065418,1,0,0.608696,0.000000,0.826087,0.000000,0.299421,1.000000,-0.666576,1.0,1.0,In-flight meal not included
6693,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.000000,1.000000,0.000000,0.974576,1.065418,1.065418,1,0,0.304348,1.000000,0.565217,0.454545,0.781638,0.336449,-0.606322,1.0,1.0,No info
