In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns

import os
import xgboost as xgb

import sklearn




In [2]:

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics.pairwise import rbf_kernel

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.compose import ColumnTransformer

from feature_engine.selection import SelectBySingleFeaturePerformance
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import (
    OneHotEncoder,
    OrdinalEncoder,
    StandardScaler,
    MinMaxScaler,
    PowerTransformer,
    FunctionTransformer
)

from feature_engine.outliers import Winsorizer

from feature_engine.datetime import DatetimeFeatures
from feature_engine.encoding import (
    RareLabelEncoder,
    MeanEncoder,
    CountFrequencyEncoder
)

# import matplotlib.pyplot as plt
import warnings

# Display Settings

In [3]:
pd.set_option("display.max_columns", None)

In [5]:
sklearn.set_config(transform_output = "pandas")

In [6]:
warnings.filterwarnings("ignore")

# Read Datsets

In [7]:
train = pd.read_csv("train.csv")


In [8]:
train.head()

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Vistara,2019-04-21,Banglore,Delhi,11:30:00,14:20:00,170,0.0,No Info,5403
1,GoAir,2019-04-01,Kolkata,Banglore,16:40:00,00:15:00,455,1.0,No Info,5171
2,IndiGo,2019-04-15,Kolkata,Banglore,15:30:00,18:05:00,155,0.0,No Info,4174
3,Jet Airways,2019-03-27,Delhi,Cochin,19:10:00,19:45:00,1475,2.0,In-flight meal not included,8834
4,Air India,2019-03-03,Banglore,New Delhi,06:10:00,08:55:00,165,0.0,No Info,7591


In [9]:
val = pd.read_csv("val.csv")
val.head()

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Air India,2019-06-06,Kolkata,Banglore,09:25:00,01:20:00,955,2.0,No Info,11458
1,Jet Airways,2019-06-21,Banglore,Delhi,19:50:00,22:50:00,180,0.0,No Info,8541
2,IndiGo,2019-05-06,Mumbai,Hyderabad,02:35:00,04:05:00,90,0.0,No Info,3342
3,IndiGo,2019-05-09,Delhi,Cochin,16:10:00,22:30:00,380,1.0,No Info,6704
4,IndiGo,2019-05-06,Kolkata,Banglore,14:25:00,16:55:00,150,0.0,No Info,5224


In [11]:
test = pd.read_csv("test.csv")
test.head()

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Air India,2019-06-06,Kolkata,Banglore,20:45:00,23:20:00,155,0.0,No Info,4145
1,Air India,2019-05-21,Delhi,Cochin,05:00:00,07:40:00,1600,2.0,No Info,11806
2,SpiceJet,2019-04-24,Kolkata,Banglore,06:55:00,09:30:00,155,0.0,No Info,3873
3,Vistara,2019-06-24,Banglore,Delhi,07:00:00,09:40:00,160,0.0,No Info,4668
4,Vistara,2019-05-06,Kolkata,Banglore,07:10:00,22:40:00,930,1.0,No Info,8452


# Prprocessing operations

In [12]:
# airline
air_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("grouper", RareLabelEncoder(tol=0.1, replace_with="Other", n_categories=2)),
    ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore"))
])

#doj
feature_to_extract = ["month", "week", "day_of_week", "day_of_year"]

doj_transformer = Pipeline(steps=[
    ("dt", DatetimeFeatures(features_to_extract=feature_to_extract, yearfirst=True, format="mixed")),
    ("scaler", MinMaxScaler())
])

# source & destination
location_pipe1 = Pipeline(steps=[
    ("grouper", RareLabelEncoder(tol=0.1, replace_with="Other", n_categories=2)),
    ("encoder", MeanEncoder()),
    ("scaler", PowerTransformer())
])

def is_north(X):
    columns = X.columns.to_list()
    north_cities = ["Delhi", "Kolkata", "Mumbai", "New Delhi"]
    return (
        X
        .assign(**{
            f"{col}_is_north": X.loc[:, col].isin(north_cities).astype(int)
            for col in columns
        })
        .drop(columns=columns)
    )

location_transformer = FeatureUnion(transformer_list=[
    ("part1", location_pipe1),
    ("part2", FunctionTransformer(func=is_north))
])

# dep_time & arrival_time
time_pipe1 = Pipeline(steps=[
    ("dt", DatetimeFeatures(features_to_extract=["hour", "minute"])),
    ("scaler", MinMaxScaler())
])

def part_of_day(X, morning=4, noon=12, eve=16, night=20):
    columns = X.columns.to_list()
    X_temp = X.assign(**{
        col: pd.to_datetime(X.loc[:, col]).dt.hour
        for col in columns
    })

    return (
        X_temp
        .assign(**{
            f"{col}_part_of_day": np.select(
                [X_temp.loc[:, col].between(morning, noon, inclusive="left"),
                 X_temp.loc[:, col].between(noon, eve, inclusive="left"),
                 X_temp.loc[:, col].between(eve, night, inclusive="left")],
                ["morning", "afternoon", "evening"],
                default="night"
            )
            for col in columns
        })
        .drop(columns=columns)
    )

time_pipe2 = Pipeline(steps=[
    ("part", FunctionTransformer(func=part_of_day)),
    ("encoder", CountFrequencyEncoder()),
    ("scaler", MinMaxScaler())
])

time_transformer = FeatureUnion(transformer_list=[
    ("part1", time_pipe1),
    ("part2", time_pipe2)
])

# duration
class RBFPercentileSimilarity(BaseEstimator, TransformerMixin):
    def __init__(self, variables=None, percentiles=[0.25, 0.5, 0.75], gamma=0.1):
        self.variables = variables
        self.percentiles = percentiles
        self.gamma = gamma


    def fit(self, X, y=None):
        if not self.variables:
            self.variables = X.select_dtypes(include="number").columns.to_list()

        self.reference_values_ = {
            col: (
                X
                .loc[:, col]
                .quantile(self.percentiles)
                .values
                .reshape(-1, 1)
            )
            for col in self.variables
        }

        return self


    def transform(self, X):
        objects = []
        for col in self.variables:
            columns = [f"{col}_rbf_{int(percentile * 100)}" for percentile in self.percentiles]
            obj = pd.DataFrame(
                data=rbf_kernel(X.loc[:, [col]], Y=self.reference_values_[col], gamma=self.gamma),
                columns=columns
            )
            objects.append(obj)
        return pd.concat(objects, axis=1)
    

def duration_category(X, short=180, med=400):
    return (
        X
        .assign(duration_cat=np.select([X.duration.lt(short),
                                    X.duration.between(short, med, inclusive="left")],
                                    ["short", "medium"],
                                    default="long"))
        .drop(columns="duration")
    )

def is_over(X, value=1000):
    return (
        X
        .assign(**{
            f"duration_over_{value}": X.duration.ge(value).astype(int)
        })
        .drop(columns="duration")
    )

duration_pipe1 = Pipeline(steps=[
    ("rbf", RBFPercentileSimilarity()),
    ("scaler", PowerTransformer())
])

duration_pipe2 = Pipeline(steps=[
    ("cat", FunctionTransformer(func=duration_category)),
    ("encoder", OrdinalEncoder(categories=[["short", "medium", "long"]]))
])

duration_union = FeatureUnion(transformer_list=[
    ("part1", duration_pipe1),
    ("part2", duration_pipe2),
    ("part3", FunctionTransformer(func=is_over)),
    ("part4", StandardScaler())
])

duration_transformer = Pipeline(steps=[
    ("outliers", Winsorizer(capping_method="iqr", fold=1.5)),
    ("imputer", SimpleImputer(strategy="median")),
    ("union", duration_union)
])

# total_stops
def is_direct(X):
    return X.assign(is_direct_flight=X.total_stops.eq(0).astype(int))


total_stops_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("", FunctionTransformer(func=is_direct))
])

# additional_info
info_pipe1 = Pipeline(steps=[
    ("group", RareLabelEncoder(tol=0.1, n_categories=2, replace_with="Other")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

def have_info(X):
    return X.assign(additional_info=X.additional_info.ne("No Info").astype(int))

info_union = FeatureUnion(transformer_list=[
("part1", info_pipe1),
("part2", FunctionTransformer(func=have_info))
])

info_transformer = Pipeline(steps=[
("imputer", SimpleImputer(strategy="constant", fill_value="unknown")),
("union", info_union)
])

# column transformer
column_transformer = ColumnTransformer(transformers=[
("air", air_transformer, ["airline"]),
("doj", doj_transformer, ["date_of_journey"]),
("location", location_transformer, ["source", 'destination']),
("time", time_transformer, ["dep_time", "arrival_time"]),
("dur", duration_transformer, ["duration"]),
("stops", total_stops_transformer, ["total_stops"]),
("info", info_transformer, ["additional_info"])
], remainder="passthrough")

# feature selector
estimator = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42)

selector = SelectBySingleFeaturePerformance(
estimator=estimator,
scoring="r2",
threshold=0.1
) 

# preprocessor
preprocessor = Pipeline(steps=[
("ct", column_transformer),
("selector", selector)
])

In [13]:
preprocessor.fit(
    train.drop(columns = "price"),
    train.price.copy()
)

In [14]:
preprocessor.transform(train.drop(columns = "price"))

Unnamed: 0,air__airline_IndiGo,air__airline_Jet Airways,doj__date_of_journey_week,doj__date_of_journey_day_of_year,location__source,location__destination,dur__duration_rbf_25,dur__duration_cat,dur__duration,stops__total_stops,stops__is_direct_flight
0,0.0,0.0,0.411765,0.432203,-1.194179,-1.696872,3.122263,0.0,-0.909348,0.0,1
1,0.0,0.0,0.294118,0.262712,-0.091122,-0.088366,-0.361050,2.0,-0.347894,1.0,0
2,1.0,0.0,0.411765,0.381356,-0.091122,-0.088366,-0.361050,0.0,-0.938898,0.0,1
3,0.0,1.0,0.235294,0.220339,0.986831,0.986759,-0.361050,2.0,1.661520,2.0,0
4,0.0,0.0,0.000000,0.016949,-1.194179,-1.211369,2.326592,0.0,-0.919198,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...
635,0.0,0.0,0.000000,0.016949,-1.704171,-1.211369,-0.361050,0.0,-1.086649,0.0,1
636,0.0,1.0,0.823529,0.822034,-0.091122,-0.088366,-0.361050,2.0,0.115059,1.0,0
637,0.0,0.0,0.764706,0.779661,0.986831,0.986759,-0.361050,2.0,-0.003142,1.0,0
638,0.0,0.0,0.235294,0.220339,0.986831,0.986759,-0.361050,2.0,0.144609,1.0,0


In [17]:
export_data(train, "train", preprocessor)

In [18]:
export_data(val, "val", preprocessor)

In [19]:
export_data(test, "test", preprocessor)

In [20]:
xgb_r = xgb.XGBRegressor(
    objective = "reg:linear",
    num_round = 10,
    eta = 0.1,
    max_depth = 5,
    subsample = 0.8,
    colsample_bytree = 0.8,
    alpha = 0.1
)

In [42]:
data = pd.read_csv("train-pre.csv")

x = data.iloc[:, 1:].copy()
y = data.iloc[:, 0].copy()

xgb_r.fit(x, y)


Parameters: { "num_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [45]:
## Save the model

import pickle
filename = "filght_price.sav"
pickle.dump(xgb_r, open(filename, 'wb'))

In [48]:
load_model = pickle.load(open('filght_price.sav', 'rb'))

arg = x
load_model.predict(arg)

array([ 6225.589 ,  4384.028 ,  9133.897 ,  7780.8066,  5545.532 ,
        5101.479 , 12421.441 ,  7040.468 , 17182.002 ,  4466.1963,
        5198.005 , 10920.265 ,  4325.7383, 12649.899 ,  7128.0776,
       13427.64  , 12274.762 , 11888.19  ,  6485.54  ,  4384.028 ,
        7963.3047,  4772.7954, 11311.401 , 10748.757 , 13458.343 ,
       10882.817 ,  6102.835 , 10108.514 ,  4785.315 ,  3845.406 ,
       11354.674 ,  6463.1885, 11970.329 , 12295.616 , 12250.939 ,
        5139.9824, 11014.887 , 10799.282 ,  4974.449 , 11500.916 ,
        4872.947 ,  4463.49  , 11977.5   , 13035.33  , 12371.188 ,
        6929.013 ,  6892.3345, 13220.768 , 11786.137 ,  4339.522 ,
       11139.38  ,  7936.2075, 11769.411 , 12695.511 , 12038.602 ,
        4384.028 ,  4602.957 , 11333.978 ,  6481.4673,  8509.659 ,
        7716.7373, 17333.775 , 11802.522 ,  9907.452 , 11127.502 ,
       12942.185 , 15607.331 , 11042.973 ,  3365.1323,  4870.497 ,
       12118.306 ,  4402.6787,  4782.8477, 12575.337 ,  4708.1