# 1. Import Libraries

In [1]:
import os
import warnings

import numpy as np

import pandas as pd

import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
	OneHotEncoder,
	OrdinalEncoder,
	StandardScaler,
	MinMaxScaler,
	PowerTransformer,
	FunctionTransformer
)

from feature_engine.outliers import Winsorizer
from feature_engine.datetime import DatetimeFeatures
from feature_engine.selection import SelectBySingleFeaturePerformance
from feature_engine.encoding import (
	RareLabelEncoder,
	MeanEncoder,
	CountFrequencyEncoder
)

import matplotlib.pyplot as plt

  from pandas.core import (


## 2. Display Settings

In [2]:
# Notebook-wide configuration.
# Silence every warning category for the rest of the session.
warnings.filterwarnings(action="ignore")
# Ask scikit-learn transformers to return pandas DataFrames.
sklearn.set_config(transform_output="pandas")
# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

## 3. Read the Data

In [3]:
# Location of the training CSV. Set the FLIGHT_TRAIN_CSV environment variable to
# point at the file on another machine; when it is unset, the original local
# path is used, so existing runs behave exactly as before.
path = os.environ.get(
    "FLIGHT_TRAIN_CSV",
    r"C:\Users\lenovo i3\OneDrive\Documents\Project Data\Compus x project\filght-price-prediction\dataset\train.csv",
)
train = pd.read_csv(path)

In [4]:
# Show the first five rows as a quick sanity check of the load.
train.head(n=5)

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-03-06,Delhi,Cochin,05:30:00,19:45:00,2295,2.0,In-flight meal not included,15864
1,Indigo,2019-05-06,Banglore,Delhi,13:00:00,15:50:00,170,0.0,No Info,3943
2,Multiple Carriers,2019-04-27,Delhi,Cochin,07:10:00,16:10:00,540,1.0,In-flight meal not included,6093
3,Jet Airways,2019-03-21,Delhi,Cochin,18:20:00,18:50:00,1470,2.0,No Info,12604
4,Air India,2019-03-09,Delhi,Cochin,08:00:00,19:15:00,675,1.0,No Info,16754


In [5]:
# Print the column names, non-null counts, dtypes and memory usage of `train`.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 640 entries, 0 to 639
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   airline          640 non-null    object 
 1   date_of_journey  640 non-null    object 
 2   source           640 non-null    object 
 3   destination      640 non-null    object 
 4   dep_time         640 non-null    object 
 5   arrival_time     640 non-null    object 
 6   duration         640 non-null    int64  
 7   total_stops      640 non-null    float64
 8   additional_info  640 non-null    object 
 9   price            640 non-null    int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 50.1+ KB


In [6]:
# Split the frame into the regression target and the predictor columns.
# The target is copied so y_train is a separate object from `train`.
y_train = train["price"].copy()
X_train = train.drop(columns=["price"])

## 4. Transformation Operations

### 4.1 airline

In [7]:
# Look at the raw airline column before building its transformer.
X_train["airline"]

0            Jet Airways
1                 Indigo
2      Multiple Carriers
3            Jet Airways
4              Air India
             ...        
635            Air India
636          Jet Airways
637          Jet Airways
638             Spicejet
639                Goair
Name: airline, Length: 640, dtype: object

In [8]:
# Preprocessing for the `airline` column: impute, group rare labels, one-hot encode.
air_transformer = Pipeline([
    # Fill any missing airline with the most frequent value seen during fit.
    ("imputer", SimpleImputer(strategy="most_frequent")),
    # Labels with a relative frequency below tol (10%) are replaced by "Other";
    # n_categories=2 is the minimum number of distinct labels the column needs
    # before any grouping is attempted.
    (
        "grouper",
        RareLabelEncoder(tol=0.1, replace_with="Other", n_categories=2),
    ),
    # Dense one-hot columns; labels not seen during fit encode as all zeros
    # instead of raising.
    (
        "encoder",
        OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
    ),
])

# Fit on the single-column frame and display the encoded result.
air_transformer.fit_transform(X_train[["airline"]])

Unnamed: 0,airline_Air India,airline_Indigo,airline_Jet Airways,airline_Multiple Carriers,airline_Other
0,0.0,0.0,1.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...
635,1.0,0.0,0.0,0.0,0.0
636,0.0,0.0,1.0,0.0,0.0
637,0.0,0.0,1.0,0.0,0.0
638,0.0,0.0,0.0,0.0,1.0


### 4.2 date_of_journey

In [9]:
# Look at the raw journey dates before building the transformer. Read from
# X_train (not `train`) so the preview uses the same frame that the
# transformers in this section are fitted on.
X_train["date_of_journey"]

0      2019-03-06
1      2019-05-06
2      2019-04-27
3      2019-03-21
4      2019-03-09
          ...    
635    2019-03-01
636    2019-06-24
637    2019-03-06
638    2019-04-15
639    2019-06-12
Name: date_of_journey, Length: 640, dtype: object

In [10]:
# Calendar components to derive from the journey date.
feature_to_extract = ["month", "week", "day_of_week", "day_of_year"]

# Preprocessing for `date_of_journey`: derive calendar features, then rescale.
doj_transformer = Pipeline([
    # Parse the date strings (year-first) and extract the components listed above.
    (
        "dt",
        DatetimeFeatures(
            features_to_extract=feature_to_extract,
            yearfirst=True,
            format="mixed",
        ),
    ),
    # Rescale each extracted feature to the [0, 1] range observed during fit.
    ("scaler", MinMaxScaler()),
])

# Fit on the single-column frame and display the scaled date features.
doj_transformer.fit_transform(X_train[["date_of_journey"]])

Unnamed: 0,date_of_journey_month,date_of_journey_week,date_of_journey_day_of_week,date_of_journey_day_of_year
0,0.000000,0.058824,0.333333,0.042373
1,0.666667,0.588235,0.000000,0.559322
2,0.333333,0.470588,0.833333,0.483051
3,0.000000,0.176471,0.500000,0.169492
4,0.000000,0.058824,0.833333,0.067797
...,...,...,...,...
635,0.000000,0.000000,0.666667,0.000000
636,1.000000,1.000000,0.000000,0.974576
637,0.000000,0.058824,0.333333,0.042373
638,0.333333,0.411765,0.000000,0.381356
