In [2]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
OneHotEncoder,
OrdinalEncoder,
StandardScaler,
MinMaxScaler,
PowerTransformer,
FunctionTransformer
)
from feature_engine.outliers import Winsorizer
from feature_engine.datetime import DatetimeFeatures
from feature_engine.selection import SelectBySingleFeaturePerformance
from feature_engine.encoding import (
RareLabelEncoder,
MeanEncoder,
CountFrequencyEncoder
)
import matplotlib.pyplot as plt
import warnings

In [3]:
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option("display.max_columns",None)
# Make scikit-learn transformers return pandas DataFrames rather than NumPy arrays.
sklearn.set_config(transform_output="pandas")
# Silence all Python warnings for the rest of the session.
# NOTE(review): this also hides deprecation/convergence warnings — confirm that is intended.
warnings.filterwarnings("ignore")

In [4]:
# Read train_data.csv (path is relative to the working directory) into a DataFrame.
train=pd.read_csv("train_data.csv")

In [5]:
train.head(3)

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Air India,2019-05-24,Kolkata,Banglore,16:50:00,07:55:00,905,1.0,No Info,8576
1,Indigo,2019-06-03,Delhi,Cochin,15:10:00,20:00:00,290,1.0,No Info,6493
2,Air India,2019-05-15,Delhi,Cochin,20:40:00,09:25:00,765,1.0,No Info,7480


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 640 entries, 0 to 639
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   airline          640 non-null    object 
 1   date_of_journey  640 non-null    object 
 2   source           640 non-null    object 
 3   destination      640 non-null    object 
 4   dep_time         640 non-null    object 
 5   arrival_time     640 non-null    object 
 6   duration         640 non-null    int64  
 7   total_stops      640 non-null    float64
 8   additional_info  640 non-null    object 
 9   price            640 non-null    int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 50.1+ KB


In [8]:
# Feature frame: every column of `train` except `price`.
X_train = train.drop(columns="price")

In [9]:
X_train.head(2)

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info
0,Air India,2019-05-24,Kolkata,Banglore,16:50:00,07:55:00,905,1.0,No Info
1,Indigo,2019-06-03,Delhi,Cochin,15:10:00,20:00:00,290,1.0,No Info


In [10]:
# Independent copy of the `price` column.
y = train["price"].copy()

In [11]:
y

0      8576
1      6493
2      7480
3      9486
4      1759
       ... 
635    7038
636    4423
637    6992
638    4667
639    3943
Name: price, Length: 640, dtype: int64

## Transformation Operations

### 1.Airline

In [12]:
train.airline

0              Air India
1                 Indigo
2              Air India
3      Multiple Carriers
4               Spicejet
             ...        
635    Multiple Carriers
636             Spicejet
637          Jet Airways
638            Air India
639               Indigo
Name: airline, Length: 640, dtype: object

In [14]:
# Airline: impute with the most frequent value, group infrequent carriers
# under "Other", then one-hot encode into dense columns.
air_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("grouper", RareLabelEncoder(tol=0.1, replace_with="Other", n_categories=2)),
    # Categories not seen during fit are encoded as all zeros instead of raising.
    ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
])
air_transformer.fit_transform(X_train[["airline"]])

Unnamed: 0,airline_Air India,airline_Indigo,airline_Jet Airways,airline_Multiple Carriers,airline_Other
0,1.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...
635,0.0,0.0,0.0,1.0,0.0
636,0.0,0.0,0.0,0.0,1.0
637,0.0,0.0,1.0,0.0,0.0
638,1.0,0.0,0.0,0.0,0.0


### 2.Date of Journey

In [15]:
X_train.date_of_journey

0      2019-05-24
1      2019-06-03
2      2019-05-15
3      2019-05-21
4      2019-03-21
          ...    
635    2019-05-18
636    2019-04-06
637    2019-03-21
638    2019-05-01
639    2019-05-06
Name: date_of_journey, Length: 640, dtype: object

In [20]:
# Calendar parts to derive from a date column; the extracted features are
# then min-max scaled.
feature_to_extract = ["month", "week", "day_of_week", "day_of_year"]
obj_transformer = Pipeline([
    ("dt", DatetimeFeatures(features_to_extract=feature_to_extract, yearfirst=True)),
    ("scaler", MinMaxScaler()),
])

In [21]:
obj_transformer.fit_transform(X_train.loc[:,["date_of_journey"]])

Unnamed: 0,date_of_journey_month,date_of_journey_week,date_of_journey_day_of_week,date_of_journey_day_of_year
0,0.666667,0.705882,0.666667,0.711864
1,1.000000,0.823529,0.000000,0.796610
2,0.666667,0.647059,0.333333,0.635593
3,0.666667,0.705882,0.166667,0.686441
4,0.000000,0.176471,0.500000,0.169492
...,...,...,...,...
635,0.666667,0.647059,0.833333,0.661017
636,0.333333,0.294118,0.833333,0.305085
637,0.000000,0.176471,0.500000,0.169492
638,0.666667,0.529412,0.333333,0.516949


### 3.Source & Destination

In [22]:
X_train.source

0       Kolkata
1         Delhi
2         Delhi
3         Delhi
4        Mumbai
         ...   
635       Delhi
636    Banglore
637    Banglore
638     Chennai
639    Banglore
Name: source, Length: 640, dtype: object

In [23]:
# The two city columns are transformed together; the bare name displays the frame.
location_subset = X_train[["source", "destination"]]
location_subset

Unnamed: 0,source,destination
0,Kolkata,Banglore
1,Delhi,Cochin
2,Delhi,Cochin
3,Delhi,Cochin
4,Mumbai,Hyderabad
...,...,...
635,Delhi,Cochin
636,Banglore,Delhi
637,Banglore,New Delhi
638,Chennai,Kolkata


In [26]:
# Cities: group infrequent labels under "Other", mean-encode against the
# target passed to fit, then apply a power transform.
location_pipeline1 = Pipeline([
    ("grouper", RareLabelEncoder(tol=0.1, replace_with="Other", n_categories=2)),
    ("encoder", MeanEncoder()),
    ("scaler", PowerTransformer()),
])
location_pipeline1.fit_transform(location_subset, y)

Unnamed: 0,source,destination
0,-0.381476,-0.395194
1,1.074822,0.716118
2,1.074822,0.716118
3,1.074822,0.716118
4,-1.886768,-1.595677
...,...,...
635,1.074822,0.716118
636,-0.758376,-1.570495
637,-0.758376,1.416486
638,-1.886768,-1.595677


In [27]:
# Sorted, de-duplicated list of every city appearing as a source or destination.
np.union1d(X_train["source"].unique(), X_train["destination"].unique())

array(['Banglore', 'Chennai', 'Cochin', 'Delhi', 'Hyderabad', 'Kolkata',
       'Mumbai', 'New Delhi'], dtype=object)

In [29]:
def is_north(X, north_cities=None):
    """Replace every city column with a 0/1 "is north" indicator column.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose columns hold city names; every column is processed.
    north_cities : list-like of str, optional
        Cities to flag with 1. When omitted, the original fixed list
        (Delhi, Kolkata, Mumbai, New Delhi) is used, so existing callers
        such as ``FunctionTransformer(func=is_north)`` behave as before.
        A custom list can be supplied through ``kw_args``.

    Returns
    -------
    pandas.DataFrame
        One integer ``<column>_is_north`` column per input column, in input
        order; the original columns are dropped. ``X`` itself is not modified.
    """
    if north_cities is None:
        north_cities = ["Delhi", "Kolkata", "Mumbai", "New Delhi"]
    columns = X.columns.to_list()
    return (
        X.assign(**{
            f"{col}_is_north": X[col].isin(north_cities).astype(int)
            for col in columns
        }).drop(columns=columns)
    )
FunctionTransformer(func=is_north).fit_transform(location_subset)

Unnamed: 0,source_is_north,destination_is_north
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0
...,...,...
635,1,0
636,0,1
637,0,1
638,0,1


In [30]:
# Place the encoded city values and the is_north flags side by side.
location_transformer = FeatureUnion([
    ("part1", location_pipeline1),
    ("part2", FunctionTransformer(func=is_north)),
])

In [31]:
location_transformer.fit_transform(location_subset,y)

Unnamed: 0,source,destination,source_is_north,destination_is_north
0,-0.381476,-0.395194,1,0
1,1.074822,0.716118,1,0
2,1.074822,0.716118,1,0
3,1.074822,0.716118,1,0
4,-1.886768,-1.595677,1,0
...,...,...,...,...
635,1.074822,0.716118,1,0
636,-0.758376,-1.570495,0,1
637,-0.758376,1.416486,0,1
638,-1.886768,-1.595677,0,1


### 4.Departure Time & Arrival Time

In [32]:
X_train.dep_time

0      16:50:00
1      15:10:00
2      20:40:00
3      13:05:00
4      22:45:00
         ...   
635    07:10:00
636    09:30:00
637    11:40:00
638    11:40:00
639    21:15:00
Name: dep_time, Length: 640, dtype: object

In [33]:
X_train.arrival_time

0      07:55:00
1      20:00:00
2      09:25:00
3      00:55:00
4      00:10:00
         ...   
635    16:10:00
636    12:20:00
637    23:00:00
638    13:55:00
639    00:15:00
Name: arrival_time, Length: 640, dtype: object

In [35]:
# The two time-of-day columns are transformed together; the bare name displays the frame.
time_subset = X_train[["dep_time", "arrival_time"]]
time_subset

Unnamed: 0,dep_time,arrival_time
0,16:50:00,07:55:00
1,15:10:00,20:00:00
2,20:40:00,09:25:00
3,13:05:00,00:55:00
4,22:45:00,00:10:00
...,...,...
635,07:10:00,16:10:00
636,09:30:00,12:20:00
637,11:40:00,23:00:00
638,11:40:00,13:55:00


In [36]:
# Extract hour and minute from each time column, then min-max scale them.
time_pipe1 = Pipeline([
    ("dt", DatetimeFeatures(features_to_extract=["hour", "minute"])),
    ("scaler", MinMaxScaler()),
])

In [37]:
time_pipe1.fit_transform(time_subset)

Unnamed: 0,dep_time_hour,dep_time_minute,arrival_time_hour,arrival_time_minute
0,0.695652,0.909091,0.304348,1.000000
1,0.652174,0.181818,0.869565,0.000000
2,0.869565,0.727273,0.391304,0.454545
3,0.565217,0.090909,0.000000,1.000000
4,0.956522,0.818182,0.000000,0.181818
...,...,...,...,...
635,0.304348,0.181818,0.695652,0.181818
636,0.391304,0.545455,0.521739,0.363636
637,0.478261,0.727273,1.000000,0.000000
638,0.478261,0.727273,0.565217,1.000000


In [38]:
def part_of_day(X, morning=4, noon=12, eve=16, night=20):
    """Label each time column with the part of day its hour falls in.

    Every column of ``X`` is parsed with ``pd.to_datetime`` and reduced to
    its hour. The hour is then matched against three half-open windows:

    * ``[morning, noon)`` -> ``"morning"``
    * ``[noon, eve)``     -> ``"afternoon"``
    * ``[eve, night)``    -> ``"evening"``

    Any hour matching none of the windows is labelled ``"night"``.

    Returns a new frame holding one ``<column>_part_of_day`` column per
    input column; the input columns are dropped and ``X`` is left unchanged.
    """
    source_cols = X.columns.to_list()
    labels = ["morning", "afternoon", "evening"]
    windows = [(morning, noon), (noon, eve), (eve, night)]

    # Step 1: overwrite each time column with its hour of day.
    hours = X.assign(**{
        name: pd.to_datetime(X[name]).dt.hour
        for name in source_cols
    })

    # Step 2: build one label array per column from the window memberships.
    derived = {}
    for name in source_cols:
        hour = hours[name]
        in_window = [
            hour.between(start, stop, inclusive="left")
            for start, stop in windows
        ]
        derived[f"{name}_part_of_day"] = np.select(in_window, labels, default="night")

    # Step 3: attach the labels and discard the intermediate hour columns.
    return hours.assign(**derived).drop(columns=source_cols)
FunctionTransformer(func=part_of_day).fit_transform(time_subset)

Unnamed: 0,dep_time_part_of_day,arrival_time_part_of_day
0,evening,morning
1,afternoon,night
2,night,morning
3,afternoon,night
4,night,night
...,...,...
635,morning,evening
636,morning,afternoon
637,morning,night
638,morning,afternoon


In [39]:
# Bucket each time into a part of day, encode the buckets with
# CountFrequencyEncoder, then min-max scale the encoded values.
time_pipe2 = Pipeline([
    ("part", FunctionTransformer(func=part_of_day)),
    ("encoder", CountFrequencyEncoder()),
    ("scaler", MinMaxScaler()),
])
time_pipe2.fit_transform(time_subset)

Unnamed: 0,dep_time_part_of_day,arrival_time_part_of_day
0,0.186047,0.949367
1,0.000000,1.000000
2,0.023256,0.949367
3,0.000000,1.000000
4,0.023256,1.000000
...,...,...
635,1.000000,0.531646
636,1.000000,0.000000
637,1.000000,1.000000
638,1.000000,0.000000


In [40]:
# Concatenate the hour/minute features with the part-of-day features.
time_transformer = FeatureUnion([
    ("part1", time_pipe1),
    ("part2", time_pipe2),
])
time_transformer.fit_transform(time_subset)

Unnamed: 0,dep_time_hour,dep_time_minute,arrival_time_hour,arrival_time_minute,dep_time_part_of_day,arrival_time_part_of_day
0,0.695652,0.909091,0.304348,1.000000,0.186047,0.949367
1,0.652174,0.181818,0.869565,0.000000,0.000000,1.000000
2,0.869565,0.727273,0.391304,0.454545,0.023256,0.949367
3,0.565217,0.090909,0.000000,1.000000,0.000000,1.000000
4,0.956522,0.818182,0.000000,0.181818,0.023256,1.000000
...,...,...,...,...,...,...
635,0.304348,0.181818,0.695652,0.181818,1.000000,0.531646
636,0.391304,0.545455,0.521739,0.363636,1.000000,0.000000
637,0.478261,0.727273,1.000000,0.000000,1.000000,1.000000
638,0.478261,0.727273,0.565217,1.000000,1.000000,0.000000


### 5.Duration

In [42]:
X_train.duration

0      905
1      290
2      765
3      710
4       85
      ... 
635    540
636    170
637    680
638    135
639    180
Name: duration, Length: 640, dtype: int64