## 1. Import Dependencies

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import warnings
import os
from feature_engine.encoding import RareLabelEncoder
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler,PowerTransformer,FunctionTransformer,OrdinalEncoder,StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from feature_engine.datetime import DatetimeFeatures
from feature_engine.outliers import Winsorizer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from feature_engine.selection import SelectBySingleFeaturePerformance

## 2. Display Setting

In [2]:
pd.set_option("display.max_columns",None)
sklearn.set_config(transform_output="pandas")
warnings.filterwarnings("ignore")

## 3. Read Data

In [3]:
path = r"D:\FLIGHT_PRICE_PREDICTION\DATA"
def get_data(name):
    file_name = f"{name}.csv"
    file_path = os.path.join(path,file_name)
    return pd.read_csv(file_path)

In [4]:
train_data = get_data('train')
train_data

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Air India,2019-04-01,Kolkata,Banglore,16:45:00,23:15:00,1830,3,No Info,8607
1,Multiple Carriers,2019-06-01,Delhi,Cochin,13:00:00,21:00:00,480,1,No Info,13587
2,Jet Airways,2019-05-21,Kolkata,Banglore,14:05:00,23:35:00,570,1,In-flight meal not included,10844
3,Jet Airways,2019-03-09,Delhi,Cochin,05:25:00,04:25:00,1380,2,No Info,16914
4,Jet Airways,2019-05-18,Kolkata,Banglore,16:30:00,08:15:00,945,1,In-flight meal not included,8586
...,...,...,...,...,...,...,...,...,...,...
7318,Indigo,2019-06-03,Banglore,Delhi,08:30:00,11:20:00,170,0,No Info,4823
7319,Jet Airways,2019-03-21,Banglore,New Delhi,07:00:00,21:20:00,860,1,In-flight meal not included,7832
7320,Indigo,2019-04-03,Delhi,Cochin,10:35:00,15:35:00,300,1,No Info,5073
7321,Indigo,2019-04-27,Delhi,Cochin,05:35:00,08:50:00,195,0,No Info,6015


In [5]:
# Viewing the overall dataset characteristic
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7323 entries, 0 to 7322
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   airline          7323 non-null   object
 1   date_of_journey  7323 non-null   object
 2   source           7323 non-null   object
 3   destination      7323 non-null   object
 4   dep_time         7323 non-null   object
 5   arrival_time     7323 non-null   object
 6   duration         7323 non-null   int64 
 7   total_stops      7323 non-null   int64 
 8   additional_info  7323 non-null   object
 9   price            7323 non-null   int64 
dtypes: int64(3), object(7)
memory usage: 572.2+ KB


- There is no null values

In [6]:
# Checking for duplicates
train_data.duplicated().sum()

np.int64(0)

- There is no duplicate value

In [7]:
# Spliting our data for feature engineering
X = train_data.drop(['price'],axis=1)
y = train_data.price

## 4. Transform Operations

### 4.1 Airline

In [8]:
# Overview of the column
X.airline.value_counts(normalize=True)

airline
Jet Airways          0.352315
Indigo               0.193636
Air India            0.162638
Multiple Carriers    0.115253
Spicejet             0.077291
Vistara              0.048341
Air Asia             0.031818
Goair                0.018708
Name: proportion, dtype: float64

In [9]:
# Transforming the column
airline_transformer = Pipeline(steps = [
    ('group',RareLabelEncoder(tol = 0.1,replace_with='other',n_categories=2)),
    ('ohe',OneHotEncoder(sparse_output=False,handle_unknown='ignore',drop='first'))
])
airline_transformer.fit_transform(X.loc[:,['airline']])

Unnamed: 0,airline_Indigo,airline_Jet Airways,airline_Multiple Carriers,airline_other
0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,0.0
3,0.0,1.0,0.0,0.0
4,0.0,1.0,0.0,0.0
...,...,...,...,...
7318,1.0,0.0,0.0,0.0
7319,0.0,1.0,0.0,0.0
7320,1.0,0.0,0.0,0.0
7321,1.0,0.0,0.0,0.0


### 4.2 Date of Journey

In [10]:
# Overview of the column
X.date_of_journey

0       2019-04-01
1       2019-06-01
2       2019-05-21
3       2019-03-09
4       2019-05-18
           ...    
7318    2019-06-03
7319    2019-03-21
7320    2019-04-03
7321    2019-04-27
7322    2019-05-21
Name: date_of_journey, Length: 7323, dtype: object

In [11]:
# Transforming the column
date_transformer = Pipeline(steps=[
    ('extraction',DatetimeFeatures(features_to_extract=['month','week','day_of_week','day_of_year'],yearfirst=True,format='mixed')),
    ('scaler',MinMaxScaler())
])
date_transformer.fit_transform(X.loc[:,['date_of_journey']])

Unnamed: 0,date_of_journey_month,date_of_journey_week,date_of_journey_day_of_week,date_of_journey_day_of_year
0,0.333333,0.294118,0.000000,0.262712
1,1.000000,0.764706,0.833333,0.779661
2,0.666667,0.705882,0.166667,0.686441
3,0.000000,0.058824,0.833333,0.067797
4,0.666667,0.647059,0.833333,0.661017
...,...,...,...,...
7318,1.000000,0.823529,0.000000,0.796610
7319,0.000000,0.176471,0.500000,0.169492
7320,0.333333,0.294118,0.333333,0.279661
7321,0.333333,0.470588,0.833333,0.483051


### 4.3 Source & Destination

- Since both columns are of similar type so we will transform them together

In [12]:
# Viewing the columns
loc = X[['source','destination']]
loc

Unnamed: 0,source,destination
0,Kolkata,Banglore
1,Delhi,Cochin
2,Kolkata,Banglore
3,Delhi,Cochin
4,Kolkata,Banglore
...,...,...
7318,Banglore,Delhi
7319,Banglore,New Delhi
7320,Delhi,Cochin
7321,Delhi,Cochin


In [13]:
# First step -- Rare label grouping, encoding with mean of price and scaling
loc_pipe1 = Pipeline(steps=[
    ('grouping',RareLabelEncoder(tol=0.1,replace_with='other',n_categories=2)),
    ('encoding',OneHotEncoder(sparse_output=False,handle_unknown='ignore',drop='first'))
])
loc_pipe1.fit_transform(loc,y)

Unnamed: 0,source_Delhi,source_Kolkata,source_other,destination_Cochin,destination_Delhi,destination_other
0,0.0,1.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,1.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,1.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...
7318,0.0,0.0,0.0,0.0,1.0,0.0
7319,0.0,0.0,0.0,0.0,0.0,1.0
7320,1.0,0.0,0.0,1.0,0.0,0.0
7321,1.0,0.0,0.0,1.0,0.0,0.0


In [14]:
# Making new feature
def is_south(data):
    south_cities = ['Banglore','Chennai','Cochin','Hyderabad']
    columns = data.columns.to_list()
    return (
        data
        .assign(**{
            f'{col}_is_south' : data.loc[:,col].isin(south_cities).astype(int)
            for col in columns
        })
        .drop(columns = columns)
    )
FunctionTransformer(func=is_south).fit_transform(loc)

Unnamed: 0,source_is_south,destination_is_south
0,0,1
1,0,1
2,0,1
3,0,1
4,0,1
...,...,...
7318,1,0
7319,1,0
7320,0,1
7321,0,1


In [15]:
# Combining the above transformations
loc_transformer = FeatureUnion(transformer_list=[
    ('pipeline',loc_pipe1),
    ('function',FunctionTransformer(func=is_south))
])
loc_transformer.fit_transform(loc,y)

Unnamed: 0,source_Delhi,source_Kolkata,source_other,destination_Cochin,destination_Delhi,destination_other,source_is_south,destination_is_south
0,0.0,1.0,0.0,0.0,0.0,0.0,0,1
1,1.0,0.0,0.0,1.0,0.0,0.0,0,1
2,0.0,1.0,0.0,0.0,0.0,0.0,0,1
3,1.0,0.0,0.0,1.0,0.0,0.0,0,1
4,0.0,1.0,0.0,0.0,0.0,0.0,0,1
...,...,...,...,...,...,...,...,...
7318,0.0,0.0,0.0,0.0,1.0,0.0,1,0
7319,0.0,0.0,0.0,0.0,0.0,1.0,1,0
7320,1.0,0.0,0.0,1.0,0.0,0.0,0,1
7321,1.0,0.0,0.0,1.0,0.0,0.0,0,1


### 4.4 Departure & Arrival Time

- Since these two column are of similar type so we will transform them together

In [16]:
# Viewing the column
time = X[['dep_time','arrival_time']]
time

Unnamed: 0,dep_time,arrival_time
0,16:45:00,23:15:00
1,13:00:00,21:00:00
2,14:05:00,23:35:00
3,05:25:00,04:25:00
4,16:30:00,08:15:00
...,...,...
7318,08:30:00,11:20:00
7319,07:00:00,21:20:00
7320,10:35:00,15:35:00
7321,05:35:00,08:50:00


In [17]:
# First step -- Extracting hour and minute and scaling them
time_pipe1 = Pipeline(steps=[
    ('extraction',DatetimeFeatures(features_to_extract=['hour','minute'])),
    ('scaing',MinMaxScaler())
])
time_pipe1.fit_transform(time)

Unnamed: 0,dep_time_hour,dep_time_minute,arrival_time_hour,arrival_time_minute
0,0.695652,0.818182,1.000000,0.272727
1,0.565217,0.000000,0.913043,0.000000
2,0.608696,0.090909,1.000000,0.636364
3,0.217391,0.454545,0.173913,0.454545
4,0.695652,0.545455,0.347826,0.272727
...,...,...,...,...
7318,0.347826,0.545455,0.478261,0.363636
7319,0.304348,0.000000,0.913043,0.363636
7320,0.434783,0.636364,0.652174,0.636364
7321,0.217391,0.636364,0.347826,0.909091


In [18]:
# Creating new feature
def day_time(data):
    columns = data.columns.to_list()
    tem_data = data.assign(**{
        col : pd.to_datetime(data.loc[:,col]).dt.hour
        for col in columns
    })
    return(
        data
        .assign(**{
            f'{col}_day_time' : np.select(
                [
                    tem_data.loc[:,col].between(4,12,inclusive='left'),
                    tem_data.loc[:,col].between(12,21,inclusive='left')
                ],
                ['morning','afternoon'],
                default='night'
            )
            for col in columns
        })
        .drop(columns=columns)
    )
FunctionTransformer(func=day_time).fit_transform(time)

Unnamed: 0,dep_time_day_time,arrival_time_day_time
0,afternoon,night
1,afternoon,night
2,afternoon,night
3,morning,morning
4,afternoon,morning
...,...,...
7318,morning,morning
7319,morning,night
7320,morning,afternoon
7321,morning,morning


In [19]:
# Encoding the new feature
time_pipe2 = Pipeline(steps=[
    ('creation',FunctionTransformer(func=day_time)),
    ('encoding',OneHotEncoder(sparse_output=False,handle_unknown='ignore',drop='first'))
])
time_pipe2.fit_transform(time)

Unnamed: 0,dep_time_day_time_morning,dep_time_day_time_night,arrival_time_day_time_morning,arrival_time_day_time_night
0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,1.0
3,1.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0
...,...,...,...,...
7318,1.0,0.0,1.0,0.0
7319,1.0,0.0,0.0,1.0
7320,1.0,0.0,0.0,0.0
7321,1.0,0.0,1.0,0.0


In [20]:
# Combining the above transformations
time_transformer = FeatureUnion(transformer_list=[
    ('pipe1',time_pipe1),
    ('pipe2',time_pipe2)
])
time_transformer.fit_transform(time)

Unnamed: 0,dep_time_hour,dep_time_minute,arrival_time_hour,arrival_time_minute,dep_time_day_time_morning,dep_time_day_time_night,arrival_time_day_time_morning,arrival_time_day_time_night
0,0.695652,0.818182,1.000000,0.272727,0.0,0.0,0.0,1.0
1,0.565217,0.000000,0.913043,0.000000,0.0,0.0,0.0,1.0
2,0.608696,0.090909,1.000000,0.636364,0.0,0.0,0.0,1.0
3,0.217391,0.454545,0.173913,0.454545,1.0,0.0,1.0,0.0
4,0.695652,0.545455,0.347826,0.272727,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...
7318,0.347826,0.545455,0.478261,0.363636,1.0,0.0,1.0,0.0
7319,0.304348,0.000000,0.913043,0.363636,1.0,0.0,0.0,1.0
7320,0.434783,0.636364,0.652174,0.636364,1.0,0.0,0.0,0.0
7321,0.217391,0.636364,0.347826,0.909091,1.0,0.0,1.0,0.0


### 4.5 Duration

In [21]:
# Viewing the column
X.duration

0       1830
1        480
2        570
3       1380
4        945
        ... 
7318     170
7319     860
7320     300
7321     195
7322    1125
Name: duration, Length: 7323, dtype: int64

- Now we are going to make two new features

In [22]:
# Feature to show the duration category
def dur_cat(data,start = 0, short = 180, med = 540, high=5000):
    return (
        data
        .assign(duration_category = np.select(
            [
                data.duration.between(start,short,inclusive = 'left'),
                data.duration.between(short,med,inclusive = 'left')
            ],
            ['short','medium'],
        default='long'
        )
                  )
        .drop(columns = 'duration')
    )
FunctionTransformer(func=dur_cat).fit_transform(X.loc[:,['duration']])

Unnamed: 0,duration_category
0,long
1,medium
2,long
3,long
4,long
...,...
7318,short
7319,long
7320,medium
7321,medium


In [23]:
# Feature to show if duration is greater than 1000 or not
def is_over_1000(data):
    return (
        data
        .assign(over_1000 = data.duration.ge(1000).astype(int))
        .drop(columns = 'duration')
    )
FunctionTransformer(func=is_over_1000).fit_transform(X.loc[:,['duration']])

Unnamed: 0,over_1000
0,1
1,0
2,0
3,1
4,0
...,...
7318,0
7319,0
7320,0
7321,0


In [24]:
# Encoding duration category
dur_pipe1 = Pipeline(steps=[
    ('function',FunctionTransformer(func=dur_cat)),
    ('encoding',OrdinalEncoder(categories=[['short','medium','long']]))
])
dur_pipe1.fit_transform(X.loc[:,['duration']])

Unnamed: 0,duration_category
0,2.0
1,1.0
2,2.0
3,2.0
4,2.0
...,...
7318,0.0
7319,2.0
7320,1.0
7321,1.0


In [25]:
# Combining everything 
dur_union = FeatureUnion(transformer_list=[
    ('pipe1',dur_pipe1),
    ('func',FunctionTransformer(func=is_over_1000)),
    ('scaling',StandardScaler())
])

duration_transformer = Pipeline(steps=[
    ('outlier',Winsorizer(capping_method='iqr',fold=1.5)),                                # handling outliers by capping them using IQR
    ('union',dur_union)
])

duration_transformer.fit_transform(X.loc[:,['duration']])

Unnamed: 0,duration_category,over_1000,duration
0,2.0,1,2.441302
1,1.0,0,-0.301855
2,2.0,0,-0.118978
3,2.0,1,1.526916
4,2.0,0,0.643010
...,...,...,...
7318,0.0,0,-0.931765
7319,2.0,0,0.470293
7320,1.0,0,-0.667610
7321,1.0,0,-0.880966


### 4.6 Total Stops

In [26]:
# Viewing the column
X.total_stops.value_counts(normalize=True)

total_stops
1    0.537894
0    0.330466
2    0.127953
3    0.003687
Name: proportion, dtype: float64

- Since there is only one category which has less than 0.1 contribution so there is no meaning of grouping

In [27]:
# Making a new feature
def direct_flight(data):
    return (
        data
        .assign(is_direct_flight = data.total_stops.eq(0).astype(int))
    )

stop_transformer = FunctionTransformer(func=direct_flight)
stop_transformer.fit_transform(X.loc[:,['total_stops']])

Unnamed: 0,total_stops,is_direct_flight
0,3,0
1,1,0
2,1,0
3,2,0
4,1,0
...,...,...
7318,0,1
7319,1,0
7320,1,0
7321,0,1


### 4.7 Additional Info

In [28]:
# Viewing the column
X.additional_info.value_counts(normalize=True)

additional_info
No Info                         0.786699
In-flight meal not included     0.179844
No check-in baggage included    0.030042
1 Long layover                  0.001912
Change airports                 0.000683
Business class                  0.000410
2 Long layover                  0.000137
Red-eye flight                  0.000137
1 Short layover                 0.000137
Name: proportion, dtype: float64

In [29]:
# Transforming the column
info_pipe1 = Pipeline(steps=[
    ('grouping',RareLabelEncoder(tol=0.1,n_categories=2,replace_with='other')),
    ('encoding',OneHotEncoder(handle_unknown='ignore',sparse_output=False,drop='first'))
])
info_pipe1.fit_transform(X.loc[:,['additional_info']])

Unnamed: 0,additional_info_No Info,additional_info_other
0,1.0,0.0
1,1.0,0.0
2,0.0,0.0
3,1.0,0.0
4,0.0,0.0
...,...,...
7318,1.0,0.0
7319,0.0,0.0
7320,1.0,0.0
7321,1.0,0.0


In [30]:
# Making a new feature
def info(data):
    return (
        data
        .assign(additional_info = data.additional_info.ne('No Info').astype(int))
    )
FunctionTransformer(func=info).fit_transform(X.loc[:,['additional_info']])

Unnamed: 0,additional_info
0,0
1,0
2,1
3,0
4,1
...,...
7318,0
7319,1
7320,0
7321,0


In [31]:
# Combining the above two transformations
info_transformer = FeatureUnion(transformer_list=[
    ('pipe',info_pipe1),
    ('func',FunctionTransformer(func=info))
])

info_transformer.fit_transform(X.loc[:,['additional_info']])

Unnamed: 0,additional_info_No Info,additional_info_other,additional_info
0,1.0,0.0,0
1,1.0,0.0,0
2,0.0,0.0,1
3,1.0,0.0,0
4,0.0,0.0,1
...,...,...,...
7318,1.0,0.0,0
7319,0.0,0.0,1
7320,1.0,0.0,0
7321,1.0,0.0,0


### 4.8 Column Transformer

In [38]:
transformer = ColumnTransformer(transformers=[
    ('air',airline_transformer,['airline']),
    ('journey',date_transformer,['date_of_journey']),
    ('location',loc_transformer,['source','destination']),
    ('time',time_transformer,['dep_time','arrival_time']),
    ('flight',duration_transformer,['duration']),
    ('break',stop_transformer,['total_stops']),
    ('travel',info_transformer,['additional_info'])
],remainder='passthrough')

transformer.fit_transform(X,y)

Unnamed: 0,air__airline_Indigo,air__airline_Jet Airways,air__airline_Multiple Carriers,air__airline_other,journey__date_of_journey_month,journey__date_of_journey_week,journey__date_of_journey_day_of_week,journey__date_of_journey_day_of_year,location__source_Delhi,location__source_Kolkata,location__source_other,location__destination_Cochin,location__destination_Delhi,location__destination_other,location__source_is_south,location__destination_is_south,time__dep_time_hour,time__dep_time_minute,time__arrival_time_hour,time__arrival_time_minute,time__dep_time_day_time_morning,time__dep_time_day_time_night,time__arrival_time_day_time_morning,time__arrival_time_day_time_night,flight__duration_category,flight__over_1000,flight__duration,break__total_stops,break__is_direct_flight,travel__additional_info_No Info,travel__additional_info_other,travel__additional_info
0,0.0,0.0,0.0,0.0,0.333333,0.294118,0.000000,0.262712,0.0,1.0,0.0,0.0,0.0,0.0,0,1,0.695652,0.818182,1.000000,0.272727,0.0,0.0,0.0,1.0,2.0,1,2.441302,3,0,1.0,0.0,0
1,0.0,0.0,1.0,0.0,1.000000,0.764706,0.833333,0.779661,1.0,0.0,0.0,1.0,0.0,0.0,0,1,0.565217,0.000000,0.913043,0.000000,0.0,0.0,0.0,1.0,1.0,0,-0.301855,1,0,1.0,0.0,0
2,0.0,1.0,0.0,0.0,0.666667,0.705882,0.166667,0.686441,0.0,1.0,0.0,0.0,0.0,0.0,0,1,0.608696,0.090909,1.000000,0.636364,0.0,0.0,0.0,1.0,2.0,0,-0.118978,1,0,0.0,0.0,1
3,0.0,1.0,0.0,0.0,0.000000,0.058824,0.833333,0.067797,1.0,0.0,0.0,1.0,0.0,0.0,0,1,0.217391,0.454545,0.173913,0.454545,1.0,0.0,1.0,0.0,2.0,1,1.526916,2,0,1.0,0.0,0
4,0.0,1.0,0.0,0.0,0.666667,0.647059,0.833333,0.661017,0.0,1.0,0.0,0.0,0.0,0.0,0,1,0.695652,0.545455,0.347826,0.272727,0.0,0.0,1.0,0.0,2.0,0,0.643010,1,0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7318,1.0,0.0,0.0,0.0,1.000000,0.823529,0.000000,0.796610,0.0,0.0,0.0,0.0,1.0,0.0,1,0,0.347826,0.545455,0.478261,0.363636,1.0,0.0,1.0,0.0,0.0,0,-0.931765,0,1,1.0,0.0,0
7319,0.0,1.0,0.0,0.0,0.000000,0.176471,0.500000,0.169492,0.0,0.0,0.0,0.0,0.0,1.0,1,0,0.304348,0.000000,0.913043,0.363636,1.0,0.0,0.0,1.0,2.0,0,0.470293,1,0,0.0,0.0,1
7320,1.0,0.0,0.0,0.0,0.333333,0.294118,0.333333,0.279661,1.0,0.0,0.0,1.0,0.0,0.0,0,1,0.434783,0.636364,0.652174,0.636364,1.0,0.0,0.0,0.0,1.0,0,-0.667610,1,0,1.0,0.0,0
7321,1.0,0.0,0.0,0.0,0.333333,0.470588,0.833333,0.483051,1.0,0.0,0.0,1.0,0.0,0.0,0,1,0.217391,0.636364,0.347826,0.909091,1.0,0.0,1.0,0.0,1.0,0,-0.880966,0,1,1.0,0.0,0


- At this moment we have 32 features. Now we will perform feature selection to get the best features among them

## 5. Feature Selection

In [41]:
# Defining estimator and the selector for feature selection
estimator = RandomForestRegressor(n_estimators=40,max_depth=5,n_jobs=-1,random_state=42)

selector = SelectBySingleFeaturePerformance(estimator=estimator,scoring='r2',cv = 10,threshold=0.05)

In [42]:
# Selecting the best features
features = Pipeline(steps=[
    ('column transform',transformer),
    ('selection',selector)
])
features.fit_transform(X,y)

Unnamed: 0,air__airline_Indigo,air__airline_Jet Airways,air__airline_other,journey__date_of_journey_month,journey__date_of_journey_week,journey__date_of_journey_day_of_year,location__source_Delhi,location__source_other,location__destination_Cochin,location__destination_Delhi,time__arrival_time_hour,flight__duration_category,flight__over_1000,flight__duration,break__total_stops,break__is_direct_flight
0,0.0,0.0,0.0,0.333333,0.294118,0.262712,0.0,0.0,0.0,0.0,1.000000,2.0,1,2.441302,3,0
1,0.0,0.0,0.0,1.000000,0.764706,0.779661,1.0,0.0,1.0,0.0,0.913043,1.0,0,-0.301855,1,0
2,0.0,1.0,0.0,0.666667,0.705882,0.686441,0.0,0.0,0.0,0.0,1.000000,2.0,0,-0.118978,1,0
3,0.0,1.0,0.0,0.000000,0.058824,0.067797,1.0,0.0,1.0,0.0,0.173913,2.0,1,1.526916,2,0
4,0.0,1.0,0.0,0.666667,0.647059,0.661017,0.0,0.0,0.0,0.0,0.347826,2.0,0,0.643010,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7318,1.0,0.0,0.0,1.000000,0.823529,0.796610,0.0,0.0,0.0,1.0,0.478261,0.0,0,-0.931765,0,1
7319,0.0,1.0,0.0,0.000000,0.176471,0.169492,0.0,0.0,0.0,0.0,0.913043,2.0,0,0.470293,1,0
7320,1.0,0.0,0.0,0.333333,0.294118,0.279661,1.0,0.0,1.0,0.0,0.652174,1.0,0,-0.667610,1,0
7321,1.0,0.0,0.0,0.333333,0.470588,0.483051,1.0,0.0,1.0,0.0,0.347826,1.0,0,-0.880966,0,1


- So we have extracted best 16 features from 32 features

## 6. Transforming and Exporting

- Now we will be transforming train,validation and test datasets using our transformation and export them to csv for model training, evaluation and testing

In [43]:
# Function for exporting with transformation
def export(x,y,name):
    filename = f"{name}.csv"
    filepath = os.path.join(r"D:\FLIGHT_PRICE_PREDICTION\DATA",filename)
    features.fit_transform(x,y).join(y).to_csv(filepath,index=False)
    return pd.read_csv(filepath)

### 6.1 Training Data Transformation

In [44]:
X_train = train_data.drop(['price'],axis=1)
y_train = train_data.price

export(X_train,y_train,'transformed_train')

Unnamed: 0,air__airline_Indigo,air__airline_Jet Airways,air__airline_other,journey__date_of_journey_month,journey__date_of_journey_week,journey__date_of_journey_day_of_year,location__source_Delhi,location__source_other,location__destination_Cochin,location__destination_Delhi,time__arrival_time_hour,flight__duration_category,flight__over_1000,flight__duration,break__total_stops,break__is_direct_flight,price
0,0.0,0.0,0.0,0.333333,0.294118,0.262712,0.0,0.0,0.0,0.0,1.000000,2.0,1,2.441302,3,0,8607
1,0.0,0.0,0.0,1.000000,0.764706,0.779661,1.0,0.0,1.0,0.0,0.913043,1.0,0,-0.301855,1,0,13587
2,0.0,1.0,0.0,0.666667,0.705882,0.686441,0.0,0.0,0.0,0.0,1.000000,2.0,0,-0.118978,1,0,10844
3,0.0,1.0,0.0,0.000000,0.058824,0.067797,1.0,0.0,1.0,0.0,0.173913,2.0,1,1.526916,2,0,16914
4,0.0,1.0,0.0,0.666667,0.647059,0.661017,0.0,0.0,0.0,0.0,0.347826,2.0,0,0.643010,1,0,8586
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7318,1.0,0.0,0.0,1.000000,0.823529,0.796610,0.0,0.0,0.0,1.0,0.478261,0.0,0,-0.931765,0,1,4823
7319,0.0,1.0,0.0,0.000000,0.176471,0.169492,0.0,0.0,0.0,0.0,0.913043,2.0,0,0.470293,1,0,7832
7320,1.0,0.0,0.0,0.333333,0.294118,0.279661,1.0,0.0,1.0,0.0,0.652174,1.0,0,-0.667610,1,0,5073
7321,1.0,0.0,0.0,0.333333,0.470588,0.483051,1.0,0.0,1.0,0.0,0.347826,1.0,0,-0.880966,0,1,6015


### 6.2 Test Data Transformation

In [45]:
test_data = get_data('test')
X_test = test_data.drop(['price'],axis=1)
y_test = test_data.price
export(X_test,y_test,'transformed_test')

Unnamed: 0,air__airline_Indigo,air__airline_Jet Airways,air__airline_other,journey__date_of_journey_month,journey__date_of_journey_week,journey__date_of_journey_day_of_year,location__source_Delhi,location__source_other,location__destination_Cochin,location__destination_Delhi,time__arrival_time_hour,time__arrival_time_minute,flight__duration_category,flight__over_1000,flight__duration,break__total_stops,break__is_direct_flight,price
0,0.0,1.0,0.0,0.000000,0.058824,0.042373,0.0,0.0,0.0,0.0,0.347826,0.272727,2.0,1,1.645847,1,0,17996
1,0.0,0.0,1.0,1.000000,0.823529,0.822034,0.0,0.0,0.0,0.0,0.000000,0.727273,0.0,0,-0.969282,0,1,3873
2,1.0,0.0,0.0,0.000000,0.176471,0.144068,0.0,0.0,0.0,0.0,0.347826,0.363636,0.0,0,-0.909621,0,1,4462
3,1.0,0.0,0.0,1.000000,1.000000,1.000000,0.0,1.0,0.0,0.0,0.913043,1.000000,0.0,0,-0.969282,0,1,3597
4,1.0,0.0,0.0,0.666667,0.588235,0.559322,0.0,0.0,0.0,0.0,0.739130,0.818182,0.0,0,-0.949395,0,1,4804
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3134,0.0,1.0,0.0,1.000000,0.823529,0.822034,1.0,0.0,1.0,0.0,0.826087,0.000000,1.0,0,-0.651091,1,0,10262
3135,0.0,0.0,0.0,0.000000,0.235294,0.220339,1.0,0.0,1.0,0.0,0.304348,0.727273,2.0,1,0.969692,2,0,8892
3136,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.956522,1.000000,2.0,1,2.322002,2,0,14887
3137,1.0,0.0,0.0,1.000000,0.764706,0.779661,0.0,0.0,0.0,1.0,0.434783,0.090909,0.0,0,-0.899677,0,1,4823


- So we have to drop the column 'time__arrival_time_minute' from the testing data at the time of model evaluation as the column is absent in training data