In [1]:
import numpy as np

import pandas as pd

import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
	OneHotEncoder,
	OrdinalEncoder,
	StandardScaler,
	MinMaxScaler,
	PowerTransformer,
	FunctionTransformer
)

from feature_engine.outliers import Winsorizer
from feature_engine.datetime import DatetimeFeatures
from feature_engine.selection import SelectBySingleFeaturePerformance
from feature_engine.encoding import (
	RareLabelEncoder,
	MeanEncoder,
	CountFrequencyEncoder
)

import matplotlib.pyplot as plt

import warnings

In [2]:
# Notebook-wide settings.
pd.options.display.max_columns = None          # never truncate columns when displaying frames
sklearn.set_config(transform_output="pandas")  # transformers return DataFrames instead of arrays
warnings.filterwarnings(action="ignore")       # silence every warning for the rest of the session

In [3]:
# Load the training split.
# NOTE(review): this is a hard-coded absolute path on one machine; the notebook
# will not run elsewhere until it is pointed at the local copy of train.csv.
train = pd.read_csv(
    r"C:\Users\DELL\Desktop\Projects\Flight_Price_Predictor\data\train.csv"
)

In [4]:
# Discard every row that contains at least one missing value, then show the frame.
train = train.dropna(axis="index", how="any")
train

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-03-21,Banglore,New Delhi,08:55:00,19:10:00,615,1.0,In-flight meal not included,7832
1,Jet Airways,2019-03-27,Delhi,Cochin,17:30:00,04:25:00,655,1.0,In-flight meal not included,6540
2,Goair,2019-03-09,Banglore,New Delhi,11:40:00,14:35:00,175,0.0,No Info,7305
3,Air India,2019-06-12,Kolkata,Banglore,09:25:00,18:30:00,545,1.0,No Info,8366
4,Jet Airways,2019-03-12,Banglore,New Delhi,22:55:00,07:40:00,525,1.0,In-flight meal not included,11087
...,...,...,...,...,...,...,...,...,...,...
6690,Jet Airways,2019-03-21,Delhi,Cochin,10:45:00,18:50:00,1925,2.0,No Info,11093
6691,Air India,2019-05-01,Kolkata,Banglore,09:25:00,18:30:00,545,1.0,No Info,8891
6692,Jet Airways,2019-06-01,Delhi,Cochin,14:00:00,19:00:00,300,1.0,In-flight meal not included,10262
6693,Air Asia,2019-06-24,Delhi,Cochin,07:55:00,13:25:00,330,1.0,No Info,6152


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6694 entries, 0 to 6694
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   airline          6694 non-null   object 
 1   date_of_journey  6694 non-null   object 
 2   source           6694 non-null   object 
 3   destination      6694 non-null   object 
 4   dep_time         6694 non-null   object 
 5   arrival_time     6694 non-null   object 
 6   duration         6694 non-null   int64  
 7   total_stops      6694 non-null   float64
 8   additional_info  6694 non-null   object 
 9   price            6694 non-null   int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 575.3+ KB


In [6]:
# Separate the target (price) from the predictor columns.
y_train = train["price"].copy()
x_train = train.drop(columns=["price"])

In [7]:
x_train.columns.to_list()

['airline',
 'date_of_journey',
 'source',
 'destination',
 'dep_time',
 'arrival_time',
 'duration',
 'total_stops',
 'additional_info']

## Airline

In [8]:
x_train.airline

0       Jet Airways
1       Jet Airways
2             Goair
3         Air India
4       Jet Airways
           ...     
6690    Jet Airways
6691      Air India
6692    Jet Airways
6693       Air Asia
6694      Air India
Name: airline, Length: 6694, dtype: object

In [9]:
# Airline: fill missing values with the most frequent airline, fold infrequent
# airlines (tol=0.1) into a single "other" label, then one-hot encode.
air_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("grouper", RareLabelEncoder(n_categories=2, tol=0.1, replace_with="other")),
    ("onehot_encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
])
air_transformer.fit_transform(x_train[["airline"]])

Unnamed: 0,airline_Air India,airline_Indigo,airline_Jet Airways,airline_Multiple Carriers,airline_other
0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...
6690,0.0,0.0,1.0,0.0,0.0
6691,1.0,0.0,0.0,0.0,0.0
6692,0.0,0.0,1.0,0.0,0.0
6693,0.0,0.0,0.0,0.0,1.0


## date_of_journey

In [10]:
x_train.date_of_journey

0       2019-03-21
1       2019-03-27
2       2019-03-09
3       2019-06-12
4       2019-03-12
           ...    
6690    2019-03-21
6691    2019-05-01
6692    2019-06-01
6693    2019-06-24
6694    2019-03-01
Name: date_of_journey, Length: 6694, dtype: object

In [11]:
# Date of journey: derive calendar features from the date string and min-max scale them.
feature_to_extract = ["month", "day_of_week", "week", "day_of_year"]
doj_transformer = Pipeline([
    (
        "dt",
        DatetimeFeatures(
            features_to_extract=feature_to_extract,
            format="mixed",
            yearfirst=True,
        ),
    ),
    ("scaler", MinMaxScaler()),
])
doj_transformer.fit_transform(x_train[["date_of_journey"]])

Unnamed: 0,date_of_journey_month,date_of_journey_day_of_week,date_of_journey_week,date_of_journey_day_of_year
0,0.000000,0.500000,0.176471,0.169492
1,0.000000,0.333333,0.235294,0.220339
2,0.000000,0.833333,0.058824,0.067797
3,1.000000,0.333333,0.882353,0.872881
4,0.000000,0.166667,0.117647,0.093220
...,...,...,...,...
6690,0.000000,0.500000,0.176471,0.169492
6691,0.666667,0.333333,0.529412,0.516949
6692,1.000000,0.833333,0.764706,0.779661
6693,1.000000,0.000000,1.000000,0.974576


## Source and Destination

In [12]:
# Working copy of the two location columns used throughout this section.
location_subset = x_train.loc[:, ["source", "destination"]]

In [13]:
location_subset

Unnamed: 0,source,destination
0,Banglore,New Delhi
1,Delhi,Cochin
2,Banglore,New Delhi
3,Kolkata,Banglore
4,Banglore,New Delhi
...,...,...
6690,Delhi,Cochin
6691,Kolkata,Banglore
6692,Delhi,Cochin
6693,Delhi,Cochin


In [14]:
# Location, part 1: fold infrequent cities into "other", replace each city with a
# target-based encoding (MeanEncoder, fitted with y_train), then power-transform.
location_pipe1 = Pipeline([
    ("grouper", RareLabelEncoder(n_categories=2, tol=0.1, replace_with="other")),
    ("encoder", MeanEncoder()),
    ("scaler", PowerTransformer()),
])
location_pipe1.fit_transform(location_subset, y_train)

Unnamed: 0,source,destination
0,-0.857629,-0.736209
1,1.065619,1.061892
2,-0.857629,-0.736209
3,-0.203923,-0.224330
4,-0.857629,-0.736209
...,...,...
6690,1.065619,1.061892
6691,-0.203923,-0.224330
6692,1.065619,1.061892
6693,1.065619,1.061892


In [15]:
# Every distinct city that appears as a source or as a destination.
np.union1d(x_train["source"].unique(), x_train["destination"].unique())

array(['Banglore', 'Chennai', 'Cochin', 'Delhi', 'Hyderabad', 'Kolkata',
       'Mumbai', 'New Delhi'], dtype=object)

In [16]:
def is_north(X):
    """Replace every column of X with a 0/1 "<column>_is_north" flag.

    A value is flagged 1 when it is one of the cities in the hard-coded
    ``north_cities`` list and 0 otherwise. The input frame is not modified;
    the returned frame contains only the new flag columns.
    """
    original_cols = list(X.columns)
    north_cities = ["Delhi", "Kolkata", "Mumbai", "New Delhi"]

    flags = {}
    for name in original_cols:
        flags[f"{name}_is_north"] = X[name].isin(north_cities).astype(int)

    # Append the flags, then remove the raw city columns they were derived from.
    return X.assign(**flags).drop(columns=original_cols)

# Wrap is_north so it can be used as a scikit-learn transformer, and preview its output.
FunctionTransformer(is_north).fit_transform(location_subset)

Unnamed: 0,source_is_north,destination_is_north
0,0,1
1,1,0
2,0,1
3,1,0
4,0,1
...,...,...
6690,1,0
6691,1,0
6692,1,0
6693,1,0


In [17]:
# Location: concatenate the encoded city columns with the is_north flags.
location_transformer = FeatureUnion([
    ("part1", location_pipe1),
    ("part2", FunctionTransformer(is_north)),
])

location_transformer.fit_transform(location_subset, y_train)

Unnamed: 0,source,destination,source_is_north,destination_is_north
0,-0.857629,-0.736209,0,1
1,1.065619,1.061892,1,0
2,-0.857629,-0.736209,0,1
3,-0.203923,-0.224330,1,0
4,-0.857629,-0.736209,0,1
...,...,...,...,...
6690,1.065619,1.061892,1,0
6691,-0.203923,-0.224330,1,0
6692,1.065619,1.061892,1,0
6693,1.065619,1.061892,1,0


## departure_time and arrival_time

In [18]:
x_train.dep_time

0       08:55:00
1       17:30:00
2       11:40:00
3       09:25:00
4       22:55:00
          ...   
6690    10:45:00
6691    09:25:00
6692    14:00:00
6693    07:55:00
6694    11:50:00
Name: dep_time, Length: 6694, dtype: object

In [19]:
x_train.arrival_time

0       19:10:00
1       04:25:00
2       14:35:00
3       18:30:00
4       07:40:00
          ...   
6690    18:50:00
6691    18:30:00
6692    19:00:00
6693    13:25:00
6694    08:55:00
Name: arrival_time, Length: 6694, dtype: object

In [20]:
# Working copy of the two time-of-day columns used throughout this section.
time_subset = x_train.loc[:, ["dep_time", "arrival_time"]]
time_subset

Unnamed: 0,dep_time,arrival_time
0,08:55:00,19:10:00
1,17:30:00,04:25:00
2,11:40:00,14:35:00
3,09:25:00,18:30:00
4,22:55:00,07:40:00
...,...,...
6690,10:45:00,18:50:00
6691,09:25:00,18:30:00
6692,14:00:00,19:00:00
6693,07:55:00,13:25:00


In [21]:
# Time, part 1: extract hour and minute from each time column and min-max scale them.
time_pipe1 = Pipeline([
    ("dt", DatetimeFeatures(features_to_extract=["hour", "minute"])),
    ("scaler", MinMaxScaler()),
])
time_pipe1.fit_transform(time_subset)

Unnamed: 0,dep_time_hour,dep_time_minute,arrival_time_hour,arrival_time_minute
0,0.347826,1.000000,0.826087,0.181818
1,0.739130,0.545455,0.173913,0.454545
2,0.478261,0.727273,0.608696,0.636364
3,0.391304,0.454545,0.782609,0.545455
4,0.956522,1.000000,0.304348,0.727273
...,...,...,...,...
6690,0.434783,0.818182,0.782609,0.909091
6691,0.391304,0.454545,0.782609,0.545455
6692,0.608696,0.000000,0.826087,0.000000
6693,0.304348,1.000000,0.565217,0.454545


In [22]:
def part_of_day(X,morning=4,noon=12,eve=16,night=20):
    """Replace every time column of X with a "<column>_part_of_day" label.

    Each column is parsed with ``pd.to_datetime`` and reduced to its hour,
    which is then bucketed using half-open intervals:

    * ``[morning, noon)`` -> "morning"
    * ``[noon, eve)``     -> "afternoon"
    * ``[eve, night)``    -> "evening"
    * anything else       -> "night"

    The input frame is not modified; the returned frame contains only the
    new label columns.
    """
    time_cols = list(X.columns)

    # Same frame, but every column now holds the integer hour of day.
    hours = X.assign(**{
        name: pd.to_datetime(X[name]).dt.hour
        for name in time_cols
    })

    labels = {}
    for name in time_cols:
        hour = hours[name]
        conditions = [
            hour.between(morning, noon, inclusive="left"),
            hour.between(noon, eve, inclusive="left"),
            hour.between(eve, night, inclusive="left"),
        ]
        labels[f"{name}_part_of_day"] = np.select(
            conditions,
            ["morning", "afternoon", "evening"],
            default="night",
        )

    # Append the labels, then remove the hour columns they were derived from.
    return hours.assign(**labels).drop(columns=time_cols)
# Preview part_of_day when wrapped as a scikit-learn transformer.
FunctionTransformer(part_of_day).fit_transform(time_subset)

Unnamed: 0,dep_time_part_of_day,arrival_time_part_of_day
0,morning,evening
1,evening,morning
2,morning,afternoon
3,morning,evening
4,night,morning
...,...,...
6690,morning,evening
6691,morning,evening
6692,afternoon,evening
6693,morning,afternoon


In [23]:
# Time, part 2: label each time with its part of day, encode the labels with
# CountFrequencyEncoder (default settings), then min-max scale the result.
time_pipe2 = Pipeline([
    ("part", FunctionTransformer(part_of_day)),
    ("encoder", CountFrequencyEncoder()),
    ("scaler", MinMaxScaler()),
])
time_pipe2.fit_transform(time_subset)

Unnamed: 0,dep_time_part_of_day,arrival_time_part_of_day
0,1.000000,0.667335
1,0.202773,0.951904
2,1.000000,0.000000
3,1.000000,0.667335
4,0.174177,0.951904
...,...,...
6690,1.000000,0.667335
6691,1.000000,0.667335
6692,0.000000,0.667335
6693,1.000000,0.000000


In [24]:
# Time: concatenate the hour/minute features with the part-of-day features.
time_transformer = FeatureUnion([
    ("part1", time_pipe1),
    ("part2", time_pipe2),
])
time_transformer.fit_transform(time_subset)

Unnamed: 0,dep_time_hour,dep_time_minute,arrival_time_hour,arrival_time_minute,dep_time_part_of_day,arrival_time_part_of_day
0,0.347826,1.000000,0.826087,0.181818,1.000000,0.667335
1,0.739130,0.545455,0.173913,0.454545,0.202773,0.951904
2,0.478261,0.727273,0.608696,0.636364,1.000000,0.000000
3,0.391304,0.454545,0.782609,0.545455,1.000000,0.667335
4,0.956522,1.000000,0.304348,0.727273,0.174177,0.951904
...,...,...,...,...,...,...
6690,0.434783,0.818182,0.782609,0.909091,1.000000,0.667335
6691,0.391304,0.454545,0.782609,0.545455,1.000000,0.667335
6692,0.608696,0.000000,0.826087,0.000000,0.000000,0.667335
6693,0.304348,1.000000,0.565217,0.454545,1.000000,0.000000


## duration

In [25]:
x_train.duration

0        615
1        655
2        175
3        545
4        525
        ... 
6690    1925
6691     545
6692     300
6693     330
6694    1265
Name: duration, Length: 6694, dtype: int64

In [41]:
# Duration: impute -> cap outliers -> emit two differently scaled copies of the column.
duration_pipe1 = Pipeline(steps=[
    ("scaler", PowerTransformer())
])
duration_union = FeatureUnion(transformer_list=[
    ("part1", duration_pipe1),   # power-transformed duration
    ("part2", StandardScaler())  # standardised duration
])
duration_transformer = Pipeline(steps=[
    # The imputer must run first: Winsorizer defaults to missing_values="raise",
    # so with the previous order a NaN duration raised before it could ever be
    # imputed, leaving the imputer step unreachable for the case it exists for.
    # This relies on the notebook-level transform_output="pandas" setting so the
    # imputer hands Winsorizer a DataFrame with the original column name.
    ("imputer", SimpleImputer(strategy="median")),
    # Cap values beyond 1.5 * IQR.
    ("outliers", Winsorizer(capping_method="iqr", fold=1.5)),
    ("union", duration_union)
])
duration_transformer.fit_transform(x_train.loc[:,["duration"]])

Unnamed: 0,part1__duration,part2__duration
0,0.330340,-0.033600
1,0.403254,0.046768
2,-1.020908,-0.917646
3,0.191947,-0.174244
4,0.149506,-0.214428
...,...,...
6690,1.733333,2.598445
6691,0.191947,-0.174244
6692,-0.464892,-0.666496
6693,-0.362973,-0.606221


## total_stops

In [47]:
x_train.total_stops

0       1.0
1       1.0
2       0.0
3       1.0
4       1.0
       ... 
6690    2.0
6691    1.0
6692    1.0
6693    1.0
6694    1.0
Name: total_stops, Length: 6694, dtype: float64

In [48]:
def is_direct(X):
    """Return a copy of X with an added 0/1 ``is_direct_flight`` column.

    The flag is 1 where ``total_stops`` equals 0 and 0 everywhere else.
    """
    direct_flag = (X["total_stops"] == 0).astype(int)
    return X.assign(is_direct_flight=direct_flag)


# Total stops: fill missing values with the most frequent stop count, then add
# the is_direct_flight indicator next to the original column.
total_stops_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    # The step was previously named "" (empty string); give it a real name so it
    # can be addressed through named_steps / set_params and reads sensibly in
    # the pipeline repr. Output column names do not depend on the step name.
    ("direct_flag", FunctionTransformer(func=is_direct))
])

total_stops_transformer.fit_transform(x_train.loc[:, ["total_stops"]])

Unnamed: 0,total_stops,is_direct_flight
0,1.0,0
1,1.0,0
2,0.0,1
3,1.0,0
4,1.0,0
...,...,...
6690,2.0,0
6691,1.0,0
6692,1.0,0
6693,1.0,0


## additional_info

In [52]:
x_train.additional_info.value_counts()

additional_info
No Info                         5248
In-flight meal not included     1215
No check-in baggage included     207
1 Long layover                    16
Change airports                    5
Business class                     2
Red-eye flight                     1
Name: count, dtype: int64

In [51]:
# Additional info, part 1: fold infrequent labels (tol=0.1) into "Other", then one-hot encode.
info_pipe1 = Pipeline([
    ("group", RareLabelEncoder(n_categories=2, tol=0.1, replace_with="Other")),
    ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
])

info_pipe1.fit_transform(x_train[["additional_info"]])

Unnamed: 0,additional_info_In-flight meal not included,additional_info_No Info,additional_info_Other
0,1.0,0.0,0.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0
3,0.0,1.0,0.0
4,1.0,0.0,0.0
...,...,...,...
6690,0.0,1.0,0.0
6691,0.0,1.0,0.0
6692,1.0,0.0,0.0
6693,0.0,1.0,0.0


In [53]:
def have_info(X):
    """Return a copy of X whose ``additional_info`` column is a 0/1 flag.

    The flag is 0 where the value is exactly "No Info" and 1 for any other
    value. The column keeps its original name, so the text is overwritten
    in the returned frame (the input frame is left untouched).
    """
    has_info = (X["additional_info"] != "No Info").astype(int)
    return X.assign(additional_info=has_info)

In [54]:
# Additional info: concatenate the one-hot columns with the has-info flag.
info_union = FeatureUnion([
    ("part1", info_pipe1),
    ("part2", FunctionTransformer(have_info)),
])

In [56]:
# Additional info: replace missing values with the literal "unknown" before
# handing the column to the union of encoders.
info_transformer = Pipeline([
    ("imputer", SimpleImputer(fill_value="unknown", strategy="constant")),
    ("union", info_union),
])

info_transformer.fit_transform(x_train[["additional_info"]])

Unnamed: 0,additional_info_In-flight meal not included,additional_info_No Info,additional_info_Other,additional_info
0,1.0,0.0,0.0,1
1,1.0,0.0,0.0,1
2,0.0,1.0,0.0,0
3,0.0,1.0,0.0,0
4,1.0,0.0,0.0,1
...,...,...,...,...
6690,0.0,1.0,0.0,0
6691,0.0,1.0,0.0,0
6692,1.0,0.0,0.0,1
6693,0.0,1.0,0.0,0


In [57]:
# Route each raw column (or pair of columns) to the transformer built for it.
# Any column not listed here is passed through unchanged.
column_transformer = ColumnTransformer(
    [
        ("air", air_transformer, ["airline"]),
        ("doj", doj_transformer, ["date_of_journey"]),
        ("location", location_transformer, ["source", "destination"]),
        ("time", time_transformer, ["dep_time", "arrival_time"]),
        ("dur", duration_transformer, ["duration"]),
        ("stops", total_stops_transformer, ["total_stops"]),
        ("info", info_transformer, ["additional_info"]),
    ],
    remainder="passthrough",
)
column_transformer.fit_transform(x_train, y_train)

Unnamed: 0,air__airline_Air India,air__airline_Indigo,air__airline_Jet Airways,air__airline_Multiple Carriers,air__airline_other,doj__date_of_journey_month,doj__date_of_journey_day_of_week,doj__date_of_journey_week,doj__date_of_journey_day_of_year,location__source,location__destination,location__source_is_north,location__destination_is_north,time__dep_time_hour,time__dep_time_minute,time__arrival_time_hour,time__arrival_time_minute,time__dep_time_part_of_day,time__arrival_time_part_of_day,dur__part1__duration,dur__part2__duration,stops__total_stops,stops__is_direct_flight,info__additional_info_In-flight meal not included,info__additional_info_No Info,info__additional_info_Other,info__additional_info
0,0.0,0.0,1.0,0.0,0.0,0.000000,0.500000,0.176471,0.169492,-0.857629,-0.736209,0,1,0.347826,1.000000,0.826087,0.181818,1.000000,0.667335,0.330340,-0.033600,1.0,0,1.0,0.0,0.0,1
1,0.0,0.0,1.0,0.0,0.0,0.000000,0.333333,0.235294,0.220339,1.065619,1.061892,1,0,0.739130,0.545455,0.173913,0.454545,0.202773,0.951904,0.403254,0.046768,1.0,0,1.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,1.0,0.000000,0.833333,0.058824,0.067797,-0.857629,-0.736209,0,1,0.478261,0.727273,0.608696,0.636364,1.000000,0.000000,-1.020908,-0.917646,0.0,1,0.0,1.0,0.0,0
3,1.0,0.0,0.0,0.0,0.0,1.000000,0.333333,0.882353,0.872881,-0.203923,-0.224330,1,0,0.391304,0.454545,0.782609,0.545455,1.000000,0.667335,0.191947,-0.174244,1.0,0,0.0,1.0,0.0,0
4,0.0,0.0,1.0,0.0,0.0,0.000000,0.166667,0.117647,0.093220,-0.857629,-0.736209,0,1,0.956522,1.000000,0.304348,0.727273,0.174177,0.951904,0.149506,-0.214428,1.0,0,1.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6690,0.0,0.0,1.0,0.0,0.0,0.000000,0.500000,0.176471,0.169492,1.065619,1.061892,1,0,0.434783,0.818182,0.782609,0.909091,1.000000,0.667335,1.733333,2.598445,2.0,0,0.0,1.0,0.0,0
6691,1.0,0.0,0.0,0.0,0.0,0.666667,0.333333,0.529412,0.516949,-0.203923,-0.224330,1,0,0.391304,0.454545,0.782609,0.545455,1.000000,0.667335,0.191947,-0.174244,1.0,0,0.0,1.0,0.0,0
6692,0.0,0.0,1.0,0.0,0.0,1.000000,0.833333,0.764706,0.779661,1.065619,1.061892,1,0,0.608696,0.000000,0.826087,0.000000,0.000000,0.667335,-0.464892,-0.666496,1.0,0,1.0,0.0,0.0,1
6693,0.0,0.0,0.0,0.0,1.0,1.000000,0.000000,1.000000,0.974576,1.065619,1.061892,1,0,0.304348,1.000000,0.565217,0.454545,1.000000,0.000000,-0.362973,-0.606221,1.0,0,0.0,1.0,0.0,0


In [58]:
# Small, seeded random forest used only to score features during selection.
estimator = RandomForestRegressor(
    n_estimators=10,
    max_depth=3,
    random_state=42,
)

# Feature selector configured with that estimator, r2 scoring and a 0.1 threshold.
selector = SelectBySingleFeaturePerformance(
    estimator=estimator,
    threshold=0.1,
    scoring="r2",
)

In [59]:
# Full preprocessing: per-column feature engineering followed by feature selection.
preprocessor = Pipeline([
    ("ct", column_transformer),
    ("selector", selector),
])

preprocessor.fit_transform(x_train, y_train)

Unnamed: 0,air__airline_Indigo,air__airline_Jet Airways,air__airline_other,doj__date_of_journey_week,doj__date_of_journey_day_of_year,location__source,location__destination,dur__part1__duration,dur__part2__duration,stops__total_stops,stops__is_direct_flight
0,0.0,1.0,0.0,0.176471,0.169492,-0.857629,-0.736209,0.330340,-0.033600,1.0,0
1,0.0,1.0,0.0,0.235294,0.220339,1.065619,1.061892,0.403254,0.046768,1.0,0
2,0.0,0.0,1.0,0.058824,0.067797,-0.857629,-0.736209,-1.020908,-0.917646,0.0,1
3,0.0,0.0,0.0,0.882353,0.872881,-0.203923,-0.224330,0.191947,-0.174244,1.0,0
4,0.0,1.0,0.0,0.117647,0.093220,-0.857629,-0.736209,0.149506,-0.214428,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...
6690,0.0,1.0,0.0,0.176471,0.169492,1.065619,1.061892,1.733333,2.598445,2.0,0
6691,0.0,0.0,0.0,0.529412,0.516949,-0.203923,-0.224330,0.191947,-0.174244,1.0,0
6692,0.0,1.0,0.0,0.764706,0.779661,1.065619,1.061892,-0.464892,-0.666496,1.0,0
6693,0.0,0.0,1.0,1.000000,0.974576,1.065619,1.061892,-0.362973,-0.606221,1.0,0
