<a href="https://www.kaggle.com/code/owentamunogilbert/airline-price-prediction-by-awa?scriptVersionId=286222571" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

<font size="+3"><b>Airline Price Prediction</b></font>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import math
import warnings

# Silence every warning category for the rest of the session; note that this
# also hides deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

# Problem Case and MetaData

## Problem Case

> Most travel search engines do not let a traveller explore how different constraints affect the price of their flight. However, a traveller's major constraint is often their budget.
>
> Travellers might want to weigh their flight options against their budget: what time of day to fly, how early they should book, or whether they can afford Business class for the distance they are travelling.
>
> They want to know what they need to change to save on cost, or what is affordable with the budget they have.
>

## Solution

> So basically, I'm building a machine learning model to perform prescriptive analysis on flight data, using counterfactual reasoning to simulate alternative booking scenarios and recommend optimal trade-offs between convenience and cost.

## Metadata


1) Airline: The name of the airline company is stored in the airline column. It is a categorical feature having 6 different airlines.
2) Flight: Flight stores information regarding the plane's flight code. It is a categorical feature.
3) Source City: City from which the flight takes off. It is a categorical feature having 6 unique cities.
4) Departure Time: This is a derived categorical feature created by grouping time periods into bins. It stores information about the departure time and has 6 unique time labels.
5) Stops: A categorical feature with 3 distinct values that stores the number of stops between the source and destination cities.
6) Arrival Time: This is a derived categorical feature created by grouping time intervals into bins. It has six distinct time labels and keeps information about the arrival time.
7) Destination City: City where the flight will land. It is a categorical feature having 6 unique cities.
8) Class: A categorical feature that contains information on seat class; it has two distinct values: Business and Economy.
9) Duration: A continuous feature that displays the overall amount of time it takes to travel between cities in hours.
10) Days Left: This is a derived feature calculated by subtracting the booking date from the trip date.
11) Price: The target variable; it stores the ticket price.

# Previewing the Data

In [None]:
# Load the flight-price dataset from the Kaggle input directory into `df`.
df = pd.read_csv('/kaggle/input/flight-price-prediction/Clean_Dataset.csv')

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# For each text (object-dtype) column, report how many distinct values it
# holds and list them, with a '***' separator between columns.
text_columns = df.select_dtypes('object').columns
for col in text_columns:
    series = df[col]
    print(f'{col}: {series.nunique()} unique items')
    print(f'{series.unique()}')
    print('***')

# Data Cleaning and Preparation

In [None]:
# Combine origin and destination into a single "source - destination" label.
df['route'] = df['source_city']+" - "+df['destination_city']

In [None]:
# Combine the departure and arrival time bins into a single
# "departure - arrival" label.
df['trip_time'] = df['departure_time']+" - "+df['arrival_time']

In [None]:
# Drop, in place, the columns that are not used as model inputs. The four raw
# city/time columns were already combined into `route` and `trip_time`;
# 'Unnamed: 0' and 'flight' are removed along with them.
redundant_columns = [
    'Unnamed: 0',
    'source_city',
    'destination_city',
    'departure_time',
    'arrival_time',
    'flight',
]
df.drop(columns=redundant_columns, inplace=True)

In [None]:
# Count of missing values per column.
df.isna().sum()

In [None]:
# Print each remaining text column's name followed by its distinct values.
for col in df.select_dtypes('object'):
    print(col, df[col].unique(), sep='\n')

# Visualization

In [None]:
# Matplotlib theme applied to every figure created after this cell:
# blue labels/ticks/titles/edges, dashed grey grid, light-grey figure background.
plot_theme = {
    'axes.labelweight': 'bold',
    'axes.labelcolor': 'blue',
    'legend.edgecolor': 'brown',
    'xtick.color': 'blue',
    'ytick.color': 'blue',
    'axes.grid': True,
    'grid.color': '#BFBFBF',
    'grid.linestyle': '--',
    'axes.edgecolor': 'blue',
    'axes.titlecolor': 'blue',
    'axes.titleweight': 'bold',
    'figure.facecolor': 'lightgray',
}
plt.rcParams.update(plot_theme)


In [None]:
# Histogram of the raw ticket prices with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(5, 3))
sns.histplot(df['price'], kde=True, color='blue', ax=ax)
ax.set_title('Price Distribution')
plt.show()

In [None]:
# Same histogram on a log scale; np.log1p computes log(1 + price).
fig, ax = plt.subplots(figsize=(5, 3))
log_price = np.log1p(df['price'])
sns.histplot(log_price, kde=True, color='blue', ax=ax)
ax.set_title('Log(Price) Distribution')
ax.set_xlabel('Log of Price')
plt.show()

In [None]:
# Box plot of ticket price for each airline; x labels rotated for legibility.
fig, ax = plt.subplots(figsize=(6, 3))
sns.boxplot(data=df, x='airline', y='price', color='blue', ax=ax)
plt.xticks(rotation=45)
ax.set_title('Price by Airline')
plt.show()

In [None]:
# Grid of horizontal bar charts: for every text column, the mean ticket price
# of its categories (only the 10 categories with the highest mean are shown).
plot_cols = [col for col in df.select_dtypes('object').columns if col != 'flight']
avg_per_col = {
    col: df.groupby(col)['price'].mean().sort_values(ascending=False).head(10)
    for col in plot_cols
}

# Two charts per row; add rows as needed to fit every column.
ncols = 2
nrows = math.ceil(len(plot_cols) / ncols)

fig, axes = plt.subplots(nrows, ncols, figsize=(20, 6 * nrows))
axes = axes.flatten()

for ax, col in zip(axes, plot_cols):
    avgcol = avg_per_col[col]

    ax.barh(avgcol.index, avgcol.values, color='blue')
    ax.set_title(col.title(), fontsize=25)
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.tick_params(labelsize=20)

# The grid may have more cells than there are columns to plot; remove the
# leftover empty axes so they are not drawn.
for unused_ax in axes[len(plot_cols):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
# y > 1 places the overall title above the top row of subplots.
fig.suptitle('Average Prices per Category', size=35, weight='bold', y=1.02, color='blue')
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric columns.
fig, ax = plt.subplots(figsize=(5, 3))
numeric_corr = df.corr(numeric_only=True)
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

# Feature Engineering

In [None]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from category_encoders import TargetEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score
from xgboost import XGBRegressor
from sklearn.compose import ColumnTransformer
from ipywidgets import Dropdown, FloatSlider, IntSlider, interact

import plotly.express as px
import shap

import plotly.graph_objs as go
import plotly.io as pio
from ipywidgets import widgets, VBox, HBox, Output

In [None]:
# Separate the predictors (every column except 'price') from the target.
x = df.drop(columns=['price'])
y = df['price']

# Machine Learning

In [None]:
# Use 70% of the rows for training and keep the rest for testing;
# random_state fixes the shuffle so the split is repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(x,y,train_size=0.7,random_state=1)

In [None]:
# Categorical columns grouped by the encoder each group is routed to in the
# ColumnTransformer built below.
ord_cols = ['stops', 'class']  # ordinal encoding
targ_cols = ["route", "trip_time"]  # target encoding
ohe_cols = ["airline",]  # one-hot encoding

In [None]:
# Explicit category orders for the ordinal columns, listed in the same order
# as `ord_cols` (stops first, then class).
stops_order = ["zero", "one", "two_or_more"]
class_order = ["Economy", "Business"]

# Column-wise preprocessing: ordinal, target and one-hot encoders for their
# respective column groups; every other column is passed through unchanged.
preprocess = ColumnTransformer(
    transformers=[
        ('ordinal', OrdinalEncoder(categories=[stops_order, class_order]), ord_cols),
        ('target', TargetEncoder(smoothing=1.0), targ_cols),
        ('ohe', OneHotEncoder(), ohe_cols),
    ],
    remainder='passthrough',
)

In [None]:
# End-to-end estimator: encode the columns, then fit an XGBoost regressor
# (default hyperparameters here; they are tuned by the search below).
pipeline_steps = [
    ('preprocess', preprocess),
    ('model', XGBRegressor()),
]
pipe = Pipeline(steps=pipeline_steps)

In [None]:
# Candidate hyperparameter values for the search. The 'model__' prefix routes
# each setting to the pipeline step named 'model' (the XGBRegressor).
param_grid = {
    'model__n_estimators': [100, 500, 1000],
    'model__learning_rate': [0.01, 0.05, 0.1],
    'model__max_depth': [3, 5, 7, 10],
    'model__subsample': [0.7, 0.9, 1.0]
}

In [None]:
# Randomised hyperparameter search over `param_grid`, scored by R² with 3-fold
# cross-validation on all available cores. The number of sampled combinations
# is left at the library default; random_state makes the sampling repeatable.
search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_grid,
    scoring="r2",
    cv=3,
    n_jobs=-1,
    return_train_score=True,
    verbose=1,
    random_state=42,
)

In [None]:
# Run the hyperparameter search on the training split only.
search.fit(xtrain,ytrain)

In [None]:
# Report the winning hyperparameters and their mean cross-validated score
# (R², per the `scoring` setting of the search).
print(f"Best Parameters: {search.best_params_}")
print(f"\nBest Score: {search.best_score_}")

In [None]:
# The full pipeline (preprocessing + regressor) with the best hyperparameters.
model = search.best_estimator_

In [None]:
# Fit the selected pipeline on the whole training split.
# NOTE(review): `search` is constructed without a `refit` argument; if the
# scikit-learn default (refit=True) applies, best_estimator_ has already been
# fitted on xtrain and this call repeats that work — confirm and consider
# removing it.
model.fit(xtrain,ytrain)

In [None]:
# Predicted prices for the held-out test rows.
ypred = model.predict(xtest)

In [None]:
# Score on the training rows, for comparison with the test score below.
# (presumably R², the default `score` of scikit-learn-style regressors — confirm)
model.score(xtrain,ytrain)

In [None]:
# R² of the predictions on the held-out test rows.
r2_score(ytest, ypred)

# Interactive Dashboard

In [None]:
# Split the predictor names by dtype: text columns get dropdowns and numeric
# columns get sliders in the dashboards below.
x_cat = x.select_dtypes("object").columns.tolist()
x_num = x.select_dtypes("number").columns.tolist()

In [None]:
def price_predictor(**kwargs):
    """Print the model's predicted price for one set of feature values.

    Each keyword argument is one feature (column name -> value). Together they
    form a single-row DataFrame that is passed to ``model.predict``; the
    prediction is printed with two decimal places.
    """
    single_row = pd.DataFrame([kwargs])
    predicted_price = model.predict(single_row)[0]
    print(f'The Predicted Price: {predicted_price:.2f}')

# One control per model feature, keyed by column name so `interact` passes
# each value to `price_predictor` as the matching keyword argument.
params = {}

# Categorical features: choose from the values present in the data.
for col in x_cat:
    params[col] = Dropdown(options=sorted(df[col].unique()))

# Numeric features: a slider spanning the observed range, starting at the mean.
# The slider's range keywords are `min` and `max`; other spellings are not
# applied, which leaves the slider on its default range instead of the data's.
for col in x_num:
    params[col] = FloatSlider(
        min=float(df[col].min()),
        max=float(df[col].max()),
        step=1,
        value=float(df[col].mean()),
    )

interact(price_predictor, **params)
plt.show()

In [None]:
# Encode the full predictor table with the fitted preprocessing step, giving
# the matrix layout that the regressor itself was trained on.
x_transformed = model.named_steps['preprocess'].transform(x)

In [None]:
# Explain a random subset of rows to keep the SHAP computation fast.
# The sample is capped at the number of available rows (sampling without
# replacement cannot draw more than that) and drawn from a seeded generator
# so the resulting plot is repeatable between runs.
sample_size = min(10000, x_transformed.shape[0])
rng = np.random.default_rng(42)
idx = rng.choice(x_transformed.shape[0], size=sample_size, replace=False)
x_small = x_transformed[idx]

In [None]:
# SHAP values for the sampled rows, computed against the fitted regressor
# (the pipeline's 'model' step), which expects already-encoded inputs.
fitted_regressor = model.named_steps['model']
explainer = shap.TreeExplainer(
    fitted_regressor,
    feature_perturbation='tree_path_dependent',
)
shap_values = explainer.shap_values(x_small)

In [None]:
# Raw input column names (categorical first, then numeric). `on_change` in the
# dashboard cell uses this list as the keys when collecting widget values.
# NOTE(review): these are the pre-encoding columns, not the columns of the
# matrix produced by the 'preprocess' step — confirm before using them as
# labels for the encoded data.
feature_names = x_cat + x_num

In [None]:
# Label the bars with the column names of the encoded matrix that the SHAP
# values were computed on. The preprocessing step reorders the columns and
# expands `airline` into one column per category, so the raw input column
# names do not line up with the SHAP value columns.
encoded_feature_names = list(model.named_steps['preprocess'].get_feature_names_out())
shap.summary_plot(shap_values, feature_names=encoded_feature_names, plot_type='bar')

In [None]:
# Build one labelled input control per model feature, keyed by column name.
# Categorical features get a dropdown of the values present in the data.
input_widgets = {
    col: widgets.Dropdown(
        options=sorted(df[col].unique()),
        description=col.replace('_', ' ').title(),
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='300px'),
    )
    for col in x_cat
}

# Numeric features get a slider spanning the observed range, starting at the mean.
input_widgets.update({
    col: widgets.FloatSlider(
        min=float(df[col].min()),
        max=float(df[col].max()),
        step=1,
        value=float(df[col].mean()),
        description=col.replace('_', ' ').title(),
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='400px'),
    )
    for col in x_num
})

# Output area that the gauge figure is rendered into.
output = Output()

def plotly_price_predictor(**kwargs):
    """Render the predicted price for one feature combination as a gauge.

    Each keyword argument is one feature (column name -> value). Together they
    form a single-row DataFrame that is passed to ``model.predict``. The
    prediction is drawn inside the ``output`` widget as a Plotly indicator
    whose gauge spans the observed price range, is shaded at the 33% and 66%
    price quantiles, and shows the difference from the mean price.
    """
    df_input = pd.DataFrame([kwargs])
    result = model.predict(df_input)[0]

    # Compute each price statistic once per call instead of once per use.
    prices = df['price']
    price_min, price_max = prices.min(), prices.max()
    low_cut, high_cut = prices.quantile(0.33), prices.quantile(0.66)

    with output:
        # wait=True defers the clear until the new figure is ready, which
        # avoids the output area flashing empty while a slider is dragged.
        output.clear_output(wait=True)
        fig = go.Figure(go.Indicator(
            mode="number+gauge+delta",
            value=result,
            # Rupee sign; the previous literal was the mis-decoded
            # (mojibake) form of this character.
            number={'prefix': "₹", 'font': {'size': 48}},
            title={'text': "<b>Predicted Flight Price</b>", 'font': {'size': 24}},
            gauge={
                'axis': {'range': [price_min, price_max]},
                'bar': {'color': "royalblue"},
                'steps': [
                    {'range': [price_min, low_cut], 'color': "lightgreen"},
                    {'range': [low_cut, high_cut], 'color': "khaki"},
                    {'range': [high_cut, price_max], 'color': "salmon"},
                ],
            },
            # Above-average predictions are shown in red, below-average in green.
            delta={
                'reference': prices.mean(),
                'increasing': {'color': 'red'},
                'decreasing': {'color': 'green'},
            },
        ))
        fig.update_layout(height=400, width=600, margin=dict(t=60, b=0, l=0, r=0))
        fig.show()

# Lay the controls out in two rows: categorical dropdowns on top,
# numeric sliders underneath.
categorical_row = HBox([input_widgets[col] for col in x_cat])
numeric_row = HBox([input_widgets[col] for col in x_num])
ui = VBox([categorical_row, numeric_row])

def on_change(change):
    """Widget observer: re-read every control and redraw the gauge.

    ``change`` is accepted because observers are called with an event
    payload, but it is not used; the current value of every widget listed in
    ``feature_names`` is read directly and forwarded to
    ``plotly_price_predictor``.
    """
    current_values = {}
    for col in feature_names:
        current_values[col] = input_widgets[col].value
    plotly_price_predictor(**current_values)

# Re-run the prediction whenever any control's value changes.
for w in input_widgets.values():
    w.observe(on_change, names='value')

# Show the controls followed by the output area holding the gauge.
display(ui, output)
# Initial plot: draw the gauge once for the widgets' starting values
# (on_change ignores its argument, so None is fine here).
on_change(None)


