In [84]:
import pandas as pd
import numpy as np

import sklearn
from sklearn.pipeline  import Pipeline,FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
                                    OneHotEncoder,
                                    MinMaxScaler,
                                PowerTransformer,
                                FunctionTransformer )


from feature_engine.encoding import (
                                    RareLabelEncoder,
                                    MeanEncoder
)

from feature_engine.datetime import DatetimeFeatures

import warnings

## Display Settings

In [2]:
pd.set_option("display.max_columns",None)

In [3]:
sklearn.set_config(transform_output = "pandas" )

In [6]:
warnings.filterwarnings("ignore")

## READ DAta 

In [8]:
path =r"C:\Users\USER\Desktop\sagemaker2\data\train.csv"
train = pd.read_csv(path)
train

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Air India,2019-03-24,Chennai,Kolkata,11:40:00,13:55:00,135,0.0,No Info,4405
1,Indigo,2019-06-27,Banglore,Delhi,22:10:00,01:00:00,170,0.0,No Info,3419
2,Indigo,2019-05-18,Kolkata,Banglore,08:10:00,13:00:00,290,1.0,No Info,5279
3,Indigo,2019-06-06,Delhi,Cochin,06:50:00,22:30:00,940,1.0,No Info,6674
4,Jet Airways,2019-05-27,Delhi,Cochin,11:30:00,12:35:00,1505,1.0,In-flight meal not included,12898
...,...,...,...,...,...,...,...,...,...,...
635,Jet Airways,2019-04-06,Delhi,Cochin,14:00:00,19:00:00,300,1.0,In-flight meal not included,5177
636,Air India,2019-03-27,Mumbai,Hyderabad,06:20:00,07:40:00,80,0.0,No Info,2575
637,Jet Airways,2019-03-21,Delhi,Cochin,19:10:00,18:50:00,1420,2.0,No Info,10929
638,Vistara,2019-04-09,Delhi,Cochin,06:00:00,09:10:00,190,0.0,No Info,5586


In [9]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 640 entries, 0 to 639
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   airline          640 non-null    object 
 1   date_of_journey  640 non-null    object 
 2   source           640 non-null    object 
 3   destination      640 non-null    object 
 4   dep_time         640 non-null    object 
 5   arrival_time     640 non-null    object 
 6   duration         640 non-null    int64  
 7   total_stops      640 non-null    float64
 8   additional_info  640 non-null    object 
 9   price            640 non-null    int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 50.1+ KB


In [10]:
x_train =train.drop(columns = "price")
y_train =train.price.copy()

###  Transformation operation

#### Airline

In [14]:
train.airline

0              Air India
1                 Indigo
2                 Indigo
3                 Indigo
4            Jet Airways
             ...        
635          Jet Airways
636            Air India
637          Jet Airways
638              Vistara
639    Multiple Carriers
Name: airline, Length: 640, dtype: object

In [27]:
air_transformer =Pipeline(steps= [
    ("imputer" , SimpleImputer(strategy="most_frequent")),
   ("grouper" , RareLabelEncoder (tol=0.1 , replace_with="Other", n_categories=2)),
    ("encoder", OneHotEncoder(sparse_output=False, handle_unknown= "ignore"))
])

air_transformer.fit_transform(x_train.loc[:,["airline"]])

Unnamed: 0,airline_Air India,airline_Indigo,airline_Jet Airways,airline_Multiple Carriers,airline_Other
0,1.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...
635,0.0,0.0,1.0,0.0,0.0
636,1.0,0.0,0.0,0.0,0.0
637,0.0,0.0,1.0,0.0,0.0
638,0.0,0.0,0.0,0.0,1.0


In [51]:
feature_to_extract = ["month", "week", "day_of_week", "day_of_year"]

doj_transformer = Pipeline( steps = [
                            ("dt", DatetimeFeatures( features_to_extract = feature_to_extract,
                                                        yearfirst = True,
                                                        format = "mixed")),
                            ("scalar",MinMaxScaler())
])

doj_transformer.fit_transform(x_train.loc[:,["date_of_journey"]])

Unnamed: 0,date_of_journey_month,date_of_journey_week,date_of_journey_day_of_week,date_of_journey_day_of_year
0,0.000000,0.176471,1.000000,0.194915
1,1.000000,1.000000,0.500000,1.000000
2,0.666667,0.647059,0.833333,0.661017
3,1.000000,0.823529,0.500000,0.822034
4,0.666667,0.764706,0.000000,0.737288
...,...,...,...,...
635,0.333333,0.294118,0.833333,0.305085
636,0.000000,0.235294,0.333333,0.220339
637,0.000000,0.176471,0.500000,0.169492
638,0.333333,0.352941,0.166667,0.330508


Unnamed: 0,air__airline_Air India,air__airline_Indigo,air__airline_Jet Airways,air__airline_Multiple Carriers,air__airline_Other,doj__date_of_journey_month,doj__date_of_journey_week,doj__date_of_journey_day_of_week,doj__date_of_journey_day_of_year,remainder__source,remainder__destination,remainder__dep_time,remainder__arrival_time,remainder__duration,remainder__total_stops,remainder__additional_info
0,1.0,0.0,0.0,0.0,0.0,0.000000,0.176471,1.000000,0.194915,Chennai,Kolkata,11:40:00,13:55:00,135,0.0,No Info
1,0.0,1.0,0.0,0.0,0.0,1.000000,1.000000,0.500000,1.000000,Banglore,Delhi,22:10:00,01:00:00,170,0.0,No Info
2,0.0,1.0,0.0,0.0,0.0,0.666667,0.647059,0.833333,0.661017,Kolkata,Banglore,08:10:00,13:00:00,290,1.0,No Info
3,0.0,1.0,0.0,0.0,0.0,1.000000,0.823529,0.500000,0.822034,Delhi,Cochin,06:50:00,22:30:00,940,1.0,No Info
4,0.0,0.0,1.0,0.0,0.0,0.666667,0.764706,0.000000,0.737288,Delhi,Cochin,11:30:00,12:35:00,1505,1.0,In-flight meal not included
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
635,0.0,0.0,1.0,0.0,0.0,0.333333,0.294118,0.833333,0.305085,Delhi,Cochin,14:00:00,19:00:00,300,1.0,In-flight meal not included
636,1.0,0.0,0.0,0.0,0.0,0.000000,0.235294,0.333333,0.220339,Mumbai,Hyderabad,06:20:00,07:40:00,80,0.0,No Info
637,0.0,0.0,1.0,0.0,0.0,0.000000,0.176471,0.500000,0.169492,Delhi,Cochin,19:10:00,18:50:00,1420,2.0,No Info
638,0.0,0.0,0.0,0.0,1.0,0.333333,0.352941,0.166667,0.330508,Delhi,Cochin,06:00:00,09:10:00,190,0.0,No Info


In [35]:
x_train.date_of_journey

0      2019-03-24
1      2019-06-27
2      2019-05-18
3      2019-06-06
4      2019-05-27
          ...    
635    2019-04-06
636    2019-03-27
637    2019-03-21
638    2019-04-09
639    2019-05-18
Name: date_of_journey, Length: 640, dtype: object

Unnamed: 0,date_of_journey_month,date_of_journey_week,date_of_journey_day_of_week,date_of_journey_day_of_year
0,0.000000,0.176471,1.000000,0.194915
1,1.000000,1.000000,0.500000,1.000000
2,0.666667,0.647059,0.833333,0.661017
3,1.000000,0.823529,0.500000,0.822034
4,0.666667,0.764706,0.000000,0.737288
...,...,...,...,...
635,0.333333,0.294118,0.833333,0.305085
636,0.000000,0.235294,0.333333,0.220339
637,0.000000,0.176471,0.500000,0.169492
638,0.333333,0.352941,0.166667,0.330508


Unnamed: 0,date_of_journey_month,date_of_journey_week,date_of_journey_day_of_week,date_of_journey_day_of_year
0,3,12,6,83
1,6,26,3,178
2,5,20,5,138
3,6,23,3,157
4,5,22,0,147
...,...,...,...,...
635,4,14,5,96
636,3,13,2,86
637,3,12,3,80
638,4,15,1,99


### source &  Destination

In [54]:
x_train.source

0       Chennai
1      Banglore
2       Kolkata
3         Delhi
4         Delhi
         ...   
635       Delhi
636      Mumbai
637       Delhi
638       Delhi
639       Delhi
Name: source, Length: 640, dtype: object

In [55]:
x_train.destination

0        Kolkata
1          Delhi
2       Banglore
3         Cochin
4         Cochin
         ...    
635       Cochin
636    Hyderabad
637       Cochin
638       Cochin
639       Cochin
Name: destination, Length: 640, dtype: object

In [57]:
location_subset= x_train.loc[:, ["source","destination"]]
location_subset

Unnamed: 0,source,destination
0,Chennai,Kolkata
1,Banglore,Delhi
2,Kolkata,Banglore
3,Delhi,Cochin
4,Delhi,Cochin
...,...,...
635,Delhi,Cochin
636,Mumbai,Hyderabad
637,Delhi,Cochin
638,Delhi,Cochin


In [80]:
location_pipe1=Pipeline(steps=[
    ("grouper" ,RareLabelEncoder(tol=0.1,replace_with="Other",n_categories=2)),
    ("encoder",MeanEncoder()),
    ("scaler_power",PowerTransformer())
])

location_pipe1.fit_transform(location_subset,y_train)

Unnamed: 0,source,destination
0,-1.959028,-1.169443
1,-1.132143,-1.836194
2,0.189169,0.194751
3,0.863446,0.868528
4,0.863446,0.868528
...,...,...
635,0.863446,0.868528
636,-1.959028,-1.169443
637,0.863446,0.868528
638,0.863446,0.868528


In [69]:
np.union1d(
    x_train.source.unique(),
    x_train.destination.unique()
)

array(['Banglore', 'Chennai', 'Cochin', 'Delhi', 'Hyderabad', 'Kolkata',
       'Mumbai', 'New Delhi'], dtype=object)

In [83]:
def is_north(x):
    columns = x.columns.to_list()
    north_cities=[ 'Delhi', 'Kolkata', 'Mumbai', 'New Delhi']
    return(
            x
            .assign(**{
                f"{col}_is_north": x.loc[:, col].isin(north_cities).astype(int)
                for col in columns
        })
        .drop(columns=columns)
    )

FunctionTransformer(func=is_north).fit_transform(location_subset)




Unnamed: 0,source_is_north,destination_is_north
0,0,1
1,0,1
2,1,0
3,1,0
4,1,0
...,...,...
635,1,0
636,1,0
637,1,0
638,1,0


In [86]:
location_transformer =FeatureUnion(transformer_list=[
    ("part1" ,location_pipe1),
    ("part2",FunctionTransformer(func=is_north))
])

location_transformer.fit_transform(location_subset,y_train)

Unnamed: 0,source,destination,source_is_north,destination_is_north
0,-1.959028,-1.169443,0,1
1,-1.132143,-1.836194,0,1
2,0.189169,0.194751,1,0
3,0.863446,0.868528,1,0
4,0.863446,0.868528,1,0
...,...,...,...,...
635,0.863446,0.868528,1,0
636,-1.959028,-1.169443,1,0
637,0.863446,0.868528,1,0
638,0.863446,0.868528,1,0


##  Departure Time and arrival Time

In [90]:
x_train.dep_time

0      11:40:00
1      22:10:00
2      08:10:00
3      06:50:00
4      11:30:00
         ...   
635    14:00:00
636    06:20:00
637    19:10:00
638    06:00:00
639    08:00:00
Name: dep_time, Length: 640, dtype: object

In [91]:
x_train.arrival_time

0      13:55:00
1      01:00:00
2      13:00:00
3      22:30:00
4      12:35:00
         ...   
635    19:00:00
636    07:40:00
637    18:50:00
638    09:10:00
639    19:00:00
Name: arrival_time, Length: 640, dtype: object

In [95]:
time_subset=x_train.loc[:,["dep_time","arrival_time"]]
time_subset

Unnamed: 0,dep_time,arrival_time
0,11:40:00,13:55:00
1,22:10:00,01:00:00
2,08:10:00,13:00:00
3,06:50:00,22:30:00
4,11:30:00,12:35:00
...,...,...
635,14:00:00,19:00:00
636,06:20:00,07:40:00
637,19:10:00,18:50:00
638,06:00:00,09:10:00


In [107]:
time_pipe1= Pipeline(steps=[
        ("dt",DatetimeFeatures(features_to_extract=["hour","minute"])),
        ("scaler",MinMaxScaler())
])

time_pipe1.fit_transform(time_subset)

Unnamed: 0,dep_time_hour,dep_time_minute,arrival_time_hour,arrival_time_minute
0,0.478261,0.727273,0.565217,1.000000
1,0.956522,0.181818,0.043478,0.000000
2,0.347826,0.181818,0.565217,0.000000
3,0.260870,0.909091,0.956522,0.545455
4,0.478261,0.545455,0.521739,0.636364
...,...,...,...,...
635,0.608696,0.000000,0.826087,0.000000
636,0.260870,0.363636,0.304348,0.727273
637,0.826087,0.181818,0.782609,0.909091
638,0.260870,0.000000,0.391304,0.181818


In [109]:
def  part_of_day(x,morning=4,noon=12 ,eve=16,night=20):
    columns =x.columns.tolist()
    x_temp =x.assign(**{
        col:pd.to_datetime(x.loc[:,col]).dt.hour
        for col in columns
    })
    return (
        x_temp
        .assign(**{
            f"{col}_part_of_day": np.select(
                [x_temp.loc[:col].betweeen(morning , noon, inclusive="left"),
                x_temp.loc[:col].betweeen(noon, eve , inclusive="left"),
                x_temp.loc[:col].betweeen(eve,night,inclusive="left")],
                
                ["morning","noon","evening"],
                 
                default="night"
            )
            for col in columns
        })
    )

FunctionTransformer(func=part_of_day).fit_transform(time_subset)

AttributeError: 'DataFrame' object has no attribute 'betweeen'

## Column Encoder

In [89]:
column_transformer =ColumnTransformer(transformers=[
    ("air" , air_transformer, ["airline"]),
    ("doj" , doj_transformer, ["date_of_journey"]),
    ("location",location_transformer,["source","destination"])
] ,remainder = "passthrough")


column_transformer.fit_transform(x_train,y_train)



Unnamed: 0,air__airline_Air India,air__airline_Indigo,air__airline_Jet Airways,air__airline_Multiple Carriers,air__airline_Other,doj__date_of_journey_month,doj__date_of_journey_week,doj__date_of_journey_day_of_week,doj__date_of_journey_day_of_year,location__source,location__destination,location__source_is_north,location__destination_is_north,remainder__dep_time,remainder__arrival_time,remainder__duration,remainder__total_stops,remainder__additional_info
0,1.0,0.0,0.0,0.0,0.0,0.000000,0.176471,1.000000,0.194915,-1.959028,-1.169443,0,1,11:40:00,13:55:00,135,0.0,No Info
1,0.0,1.0,0.0,0.0,0.0,1.000000,1.000000,0.500000,1.000000,-1.132143,-1.836194,0,1,22:10:00,01:00:00,170,0.0,No Info
2,0.0,1.0,0.0,0.0,0.0,0.666667,0.647059,0.833333,0.661017,0.189169,0.194751,1,0,08:10:00,13:00:00,290,1.0,No Info
3,0.0,1.0,0.0,0.0,0.0,1.000000,0.823529,0.500000,0.822034,0.863446,0.868528,1,0,06:50:00,22:30:00,940,1.0,No Info
4,0.0,0.0,1.0,0.0,0.0,0.666667,0.764706,0.000000,0.737288,0.863446,0.868528,1,0,11:30:00,12:35:00,1505,1.0,In-flight meal not included
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
635,0.0,0.0,1.0,0.0,0.0,0.333333,0.294118,0.833333,0.305085,0.863446,0.868528,1,0,14:00:00,19:00:00,300,1.0,In-flight meal not included
636,1.0,0.0,0.0,0.0,0.0,0.000000,0.235294,0.333333,0.220339,-1.959028,-1.169443,1,0,06:20:00,07:40:00,80,0.0,No Info
637,0.0,0.0,1.0,0.0,0.0,0.000000,0.176471,0.500000,0.169492,0.863446,0.868528,1,0,19:10:00,18:50:00,1420,2.0,No Info
638,0.0,0.0,0.0,0.0,1.0,0.333333,0.352941,0.166667,0.330508,0.863446,0.868528,1,0,06:00:00,09:10:00,190,0.0,No Info
