## 1. Import Libraries

In [95]:
import os
import warnings

import numpy as np
import pandas as pd

import sklearn
from sklearn.pipeline import (
    Pipeline,
    FeatureUnion
)
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
    OneHotEncoder,
    MinMaxScaler,
    PowerTransformer,
    FunctionTransformer
)

from feature_engine.encoding import (
    RareLabelEncoder,
    MeanEncoder
)
from feature_engine.datetime import DatetimeFeatures

## 2. Display Settings

In [8]:
# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None

In [9]:
# Ask scikit-learn transformers to return pandas DataFrames (with column
# names) from transform / fit_transform instead of NumPy arrays.
sklearn.set_config(transform_output="pandas")

In [10]:
warnings.filterwarnings("ignore")

## 3. Read the Data

In [11]:
# Location of the training split. The default is the original local path;
# set the FLIGHT_FARE_TRAIN_CSV environment variable to point at the file
# when running the notebook on another machine.
path = os.environ.get(
    "FLIGHT_FARE_TRAIN_CSV",
    r"C:\Users\Aayush\Desktop\Flight Fare Prediction\data\train.csv",
)

train = pd.read_csv(path)
train

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-06-09,Kolkata,Banglore,21:10:00,18:15:00,1265,1.0,No Info,14571
1,Jet Airways,2019-03-18,Banglore,New Delhi,18:40:00,07:40:00,780,1.0,In-flight meal not included,10972
2,Jet Airways,2019-05-24,Mumbai,Hyderabad,02:55:00,04:20:00,85,0.0,In-flight meal not included,4995
3,Indigo,2019-06-24,Banglore,Delhi,07:10:00,10:05:00,175,0.0,No Info,4823
4,Jet Airways,2019-05-09,Delhi,Cochin,20:55:00,19:00:00,1325,1.0,In-flight meal not included,12373
...,...,...,...,...,...,...,...,...,...,...
635,Indigo,2019-03-27,Delhi,Cochin,06:40:00,12:00:00,320,1.0,No Info,5298
636,Multiple Carriers,2019-05-15,Delhi,Cochin,08:45:00,21:00:00,735,1.0,No Info,7670
637,Multiple Carriers,2019-05-09,Delhi,Cochin,17:00:00,01:30:00,510,1.0,No Info,9424
638,Indigo,2019-05-09,Mumbai,Hyderabad,19:05:00,20:35:00,90,0.0,No Info,4392


In [12]:
# Print the column dtypes and non-null counts of the raw training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 640 entries, 0 to 639
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   airline          640 non-null    object 
 1   date_of_journey  640 non-null    object 
 2   source           640 non-null    object 
 3   destination      640 non-null    object 
 4   dep_time         640 non-null    object 
 5   arrival_time     640 non-null    object 
 6   duration         640 non-null    int64  
 7   total_stops      640 non-null    float64
 8   additional_info  640 non-null    object 
 9   price            640 non-null    int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 50.1+ KB


In [13]:
# Separate the target (price) from the predictors; the target is copied so
# it does not share data with `train`.
y_train = train["price"].copy()
X_train = train.drop(columns=["price"])

## 4. Transformation Operations

### 4.1 airline

In [18]:
# Display the raw airline column.
X_train["airline"]

0            Jet Airways
1            Jet Airways
2            Jet Airways
3                 Indigo
4            Jet Airways
             ...        
635               Indigo
636    Multiple Carriers
637    Multiple Carriers
638               Indigo
639             Spicejet
Name: airline, Length: 640, dtype: object

In [49]:
# airline: fill missing values with the most frequent airline, relabel
# infrequent airlines as "Other", then one-hot encode the result.
air_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    (
        "grouper",
        RareLabelEncoder(tol=0.1, n_categories=2, replace_with="Other"),
    ),
    # Dense output; categories unseen during fit are ignored at transform time.
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

air_transformer.fit_transform(X_train[["airline"]])

Unnamed: 0,airline_Air India,airline_Indigo,airline_Jet Airways,airline_Multiple Carriers,airline_Other
0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...
635,0.0,1.0,0.0,0.0,0.0
636,0.0,0.0,0.0,1.0,0.0
637,0.0,0.0,0.0,1.0,0.0
638,0.0,1.0,0.0,0.0,0.0


### 4.2 date_of_journey

In [41]:
# Display the raw journey-date column (stored as strings at this point).
X_train["date_of_journey"]

0      2019-06-09
1      2019-03-18
2      2019-05-24
3      2019-06-24
4      2019-05-09
          ...    
635    2019-03-27
636    2019-05-15
637    2019-05-09
638    2019-05-09
639    2019-06-01
Name: date_of_journey, Length: 640, dtype: object

In [57]:
# Calendar parts to derive from the journey date.
feature_to_extract = ["month", "week", "day_of_week", "day_of_year"]

# date_of_journey: extract the calendar parts above, then min-max scale them.
doj_transformer = Pipeline([
    (
        "dt",
        DatetimeFeatures(
            features_to_extract=feature_to_extract,
            yearfirst=True,
            format="mixed",
        ),
    ),
    ("scaler", MinMaxScaler()),
])

doj_transformer.fit_transform(X_train[["date_of_journey"]])

Unnamed: 0,date_of_journey_month,date_of_journey_week,date_of_journey_day_of_week,date_of_journey_day_of_year
0,1.000000,0.823529,1.000000,0.847458
1,0.000000,0.176471,0.000000,0.144068
2,0.666667,0.705882,0.666667,0.711864
3,1.000000,1.000000,0.000000,0.974576
4,0.666667,0.588235,0.500000,0.584746
...,...,...,...,...
635,0.000000,0.235294,0.333333,0.220339
636,0.666667,0.647059,0.333333,0.635593
637,0.666667,0.588235,0.500000,0.584746
638,0.666667,0.588235,0.500000,0.584746


### 4.3 source & destination

In [58]:
# Display the raw source-city column.
X_train["source"]

0       Kolkata
1      Banglore
2        Mumbai
3      Banglore
4         Delhi
         ...   
635       Delhi
636       Delhi
637       Delhi
638      Mumbai
639      Mumbai
Name: source, Length: 640, dtype: object

In [59]:
# Display the raw destination-city column.
X_train["destination"]

0       Banglore
1      New Delhi
2      Hyderabad
3          Delhi
4         Cochin
         ...    
635       Cochin
636       Cochin
637       Cochin
638    Hyderabad
639    Hyderabad
Name: destination, Length: 640, dtype: object

In [64]:
# The two city columns are transformed together, so keep them as one frame.
location_subset = X_train[["source", "destination"]]
location_subset

Unnamed: 0,source,destination
0,Kolkata,Banglore
1,Banglore,New Delhi
2,Mumbai,Hyderabad
3,Banglore,Delhi
4,Delhi,Cochin
...,...,...
635,Delhi,Cochin
636,Delhi,Cochin
637,Delhi,Cochin
638,Mumbai,Hyderabad


In [72]:
# Numeric view of the city columns: relabel infrequent cities as "Other",
# mean-encode each category against the target, then apply a power transform.
location_pipe1 = Pipeline([
    (
        "grouper",
        RareLabelEncoder(tol=0.1, n_categories=2, replace_with="Other"),
    ),
    ("encoder", MeanEncoder()),
    ("scaler", PowerTransformer()),
])

# The target is passed along because the mean encoder is fitted against it.
location_pipe1.fit_transform(location_subset, y_train)

Unnamed: 0,source,destination
0,-0.271179,-0.253274
1,-0.903923,-0.982970
2,-1.796730,-0.982970
3,-0.903923,-1.789680
4,1.056385,1.056945
...,...,...
635,1.056385,1.056945
636,1.056385,1.056945
637,1.056385,1.056945
638,-1.796730,-0.982970


In [61]:
# Sorted set of distinct cities that appear as either source or destination.
np.union1d(X_train["source"], X_train["destination"])

array(['Banglore', 'Chennai', 'Cochin', 'Delhi', 'Hyderabad', 'Kolkata',
       'Mumbai', 'New Delhi'], dtype=object)

In [94]:
def is_north(X):
    """Replace every city column of ``X`` with a 0/1 "is north" flag.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose columns all hold city names (in this notebook:
        ``source`` and ``destination``).

    Returns
    -------
    pandas.DataFrame
        One integer ``<column>_is_north`` column per input column, in the
        same order as the input; the original columns are removed.
    """
    columns = X.columns.to_list()
    # NOTE(review): Kolkata and Mumbai are grouped with the Delhi airports
    # here -- confirm this is the intended definition of "north".
    north_cities = ["Delhi", "Kolkata", "New Delhi", "Mumbai"]

    flags = {
        f"{col}_is_north": X.loc[:, col].isin(north_cities).astype(int)
        for col in columns
    }

    # Drop exactly the columns that were flagged (not a hard-coded pair), so
    # the function also works when the input columns have other names.
    return X.assign(**flags).drop(columns=columns)

# Try the function wrapped as a transformer on the city columns.
FunctionTransformer(is_north).fit_transform(location_subset)

Unnamed: 0,source_is_north,destination_is_north
0,1,0
1,0,1
2,1,0
3,0,1
4,1,0
...,...,...
635,1,0
636,1,0
637,1,0
638,1,0


In [96]:
# Combine both views of the city columns side by side: the encoded/scaled
# values from location_pipe1 and the is_north flags.
location_transformer = FeatureUnion([
    ("part1", location_pipe1),
    ("part2", FunctionTransformer(func=is_north)),
])

location_transformer.fit_transform(location_subset, y_train)

Unnamed: 0,source,destination,source_is_north,destination_is_north
0,-0.271179,-0.253274,1,0
1,-0.903923,-0.982970,0,1
2,-1.796730,-0.982970,1,0
3,-0.903923,-1.789680,0,1
4,1.056385,1.056945,1,0
...,...,...,...,...
635,1.056385,1.056945,1,0
636,1.056385,1.056945,1,0
637,1.056385,1.056945,1,0
638,-1.796730,-0.982970,1,0


### 4.4 dep_time & arrival_time

In [100]:
# Display the raw departure-time column.
X_train["dep_time"]

0      21:10:00
1      18:40:00
2      02:55:00
3      07:10:00
4      20:55:00
         ...   
635    06:40:00
636    08:45:00
637    17:00:00
638    19:05:00
639    22:45:00
Name: dep_time, Length: 640, dtype: object

In [99]:
# Display the raw arrival-time column.
X_train["arrival_time"]

0      18:15:00
1      07:40:00
2      04:20:00
3      10:05:00
4      19:00:00
         ...   
635    12:00:00
636    21:00:00
637    01:30:00
638    20:35:00
639    00:15:00
Name: arrival_time, Length: 640, dtype: object

In [103]:
# The two clock-time columns are transformed together, so keep them as one frame.
time_subset = X_train[["dep_time", "arrival_time"]]
time_subset

Unnamed: 0,dep_time,arrival_time
0,21:10:00,18:15:00
1,18:40:00,07:40:00
2,02:55:00,04:20:00
3,07:10:00,10:05:00
4,20:55:00,19:00:00
...,...,...
635,06:40:00,12:00:00
636,08:45:00,21:00:00
637,17:00:00,01:30:00
638,19:05:00,20:35:00


In [107]:
# dep_time / arrival_time: extract hour and minute, then min-max scale them.
time_pipe1 = Pipeline([
    ("dt", DatetimeFeatures(features_to_extract=["hour", "minute"])),
    ("scaler", MinMaxScaler()),
])

time_pipe1.fit_transform(time_subset)

Unnamed: 0,dep_time_hour,dep_time_minute,arrival_time_hour,arrival_time_minute
0,0.913043,0.181818,0.782609,0.272727
1,0.782609,0.727273,0.304348,0.727273
2,0.086957,1.000000,0.173913,0.363636
3,0.304348,0.181818,0.434783,0.090909
4,0.869565,1.000000,0.826087,0.000000
...,...,...,...,...
635,0.260870,0.727273,0.521739,0.000000
636,0.347826,0.818182,0.913043,0.000000
637,0.739130,0.000000,0.043478,0.545455
638,0.826087,0.090909,0.869565,0.636364


## 5. Column Transformer

In [98]:
# Route each column group to its transformer. Columns not listed here
# (dep_time, arrival_time, duration, total_stops, additional_info) are
# passed through unchanged.
column_transformer = ColumnTransformer(
    [
        ("air", air_transformer, ["airline"]),
        ("dt", doj_transformer, ["date_of_journey"]),
        ("location", location_transformer, ["source", "destination"]),
    ],
    remainder="passthrough",
)

column_transformer.fit_transform(X_train, y_train)

Unnamed: 0,air__airline_Air India,air__airline_Indigo,air__airline_Jet Airways,air__airline_Multiple Carriers,air__airline_Other,dt__date_of_journey_month,dt__date_of_journey_week,dt__date_of_journey_day_of_week,dt__date_of_journey_day_of_year,location__source,location__destination,location__source_is_north,location__destination_is_north,remainder__dep_time,remainder__arrival_time,remainder__duration,remainder__total_stops,remainder__additional_info
0,0.0,0.0,1.0,0.0,0.0,1.000000,0.823529,1.000000,0.847458,-0.271179,-0.253274,1,0,21:10:00,18:15:00,1265,1.0,No Info
1,0.0,0.0,1.0,0.0,0.0,0.000000,0.176471,0.000000,0.144068,-0.903923,-0.982970,0,1,18:40:00,07:40:00,780,1.0,In-flight meal not included
2,0.0,0.0,1.0,0.0,0.0,0.666667,0.705882,0.666667,0.711864,-1.796730,-0.982970,1,0,02:55:00,04:20:00,85,0.0,In-flight meal not included
3,0.0,1.0,0.0,0.0,0.0,1.000000,1.000000,0.000000,0.974576,-0.903923,-1.789680,0,1,07:10:00,10:05:00,175,0.0,No Info
4,0.0,0.0,1.0,0.0,0.0,0.666667,0.588235,0.500000,0.584746,1.056385,1.056945,1,0,20:55:00,19:00:00,1325,1.0,In-flight meal not included
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
635,0.0,1.0,0.0,0.0,0.0,0.000000,0.235294,0.333333,0.220339,1.056385,1.056945,1,0,06:40:00,12:00:00,320,1.0,No Info
636,0.0,0.0,0.0,1.0,0.0,0.666667,0.647059,0.333333,0.635593,1.056385,1.056945,1,0,08:45:00,21:00:00,735,1.0,No Info
637,0.0,0.0,0.0,1.0,0.0,0.666667,0.588235,0.500000,0.584746,1.056385,1.056945,1,0,17:00:00,01:30:00,510,1.0,No Info
638,0.0,1.0,0.0,0.0,0.0,0.666667,0.588235,0.500000,0.584746,-1.796730,-0.982970,1,0,19:05:00,20:35:00,90,0.0,No Info
