In [2]:
# Imports
import json
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from pygeocoder import Geocoder
from sklearn.preprocessing import StandardScaler
from geopy.geocoders import GeocodeFarm, Nominatim
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from datetime import datetime

In [3]:
clean_df = pd.read_csv("../data/cleaned_government_data.csv")
clean_df.head(5)

Unnamed: 0,Major Class,Month of Travel Date,From,To,Sum of Net Tickets,Sum of Total $
0,Business Class,Jan,Calgary,Ottawa,2,6045.62
1,Business Class,Jan,Calgary,Victoria,1,740.6
2,Business Class,Jan,Campbell River,Ottawa,1,3482.85
3,Business Class,Jan,Charlottetown,Calgary,1,2807.24
4,Business Class,Jan,Charlottetown,Ottawa,1,737.35


## Origin, Destination encoding

In [8]:
geo = Nominatim()
geo_farm = GeocodeFarm()

cities=list(clean_df["From"].unique())
cities.extend(list(clean_df["To"].unique()))
cities = list(set(cities))

  """Entry point for launching an IPython kernel.


In [10]:
len(cities)

264

In [11]:
location_details = {}

In [12]:
for each_city in cities:
    try:
        location = geo.geocode(each_city + ", Canada")
    except:
        try:
            location = geo_farm.geocode(each_city + ", Canada")
        except:
            continue
    try:
        location_details[each_city] = {
            "latitude": location[1][0],
            "longitude": location[1][1]
        }
    except:
        continue

In [19]:
location_details["St Theris Point"] = {
    "latitude": 53.8328,
    "longitude": -94.8497
}

In [20]:
location_details["Ganges Harbor"] = {
    "latitude": 48.8556,
    "longitude": -123.4965
}

In [26]:
location_details["Pender Harbor"] = {
    "latitude": 49.6271,
    "longitude": -124.0359
}

In [27]:
location_details["Points North Landing"] = {
    "latitude": 58.2701,
    "longitude": -104.0814
}

In [24]:
location_details["PITTS MEADOW  BC"]

{'latitude': 42.2579, 'longitude': -83.05942}

In [28]:
with open("../data/city_coordinates.json", "w") as f:
    json.dump(location_details, f)

In [4]:
def load_coordinates():
    # Loading coords from JSON
    with open("../data/city_coordinates.json") as f:
        coords = json.load(f)
    return coords

In [5]:
def transform_coordinates(clean_df):
    # Loading coordinates
    location_details = load_coordinates()
    # Origin lat, lon and destination lat, long figured
    clean_df[["From_lat","From_lon"]] = clean_df["From"].apply(lambda x: pd.Series([location_details[x]["latitude"],location_details[x]["longitude"]]))
    clean_df[["To_lat","To_lon"]] = clean_df["To"].apply(lambda x: pd.Series([location_details[x]["latitude"],location_details[x]["longitude"]]))
    clean_df.drop(["From", "To"], axis=1, inplace=True)
    return clean_df

In [6]:
def convert_to_radians(clean_df, cols=None):
    # Converting degree to rads
    clean_df[cols] = np.radians(clean_df[cols])
    return clean_df

In [7]:
clean_df = transform_coordinates(clean_df)
clean_df = convert_to_radians(clean_df, cols=["From_lat", "From_lon", "To_lat", "To_lon"])

In [8]:
clean_df.head(5)

Unnamed: 0,Major Class,Month of Travel Date,Sum of Net Tickets,Sum of Total $,From_lat,From_lon,To_lat,To_lon
0,Business Class,Jan,2,6045.62,0.89105,-1.990768,0.792748,-1.321045
1,Business Class,Jan,1,740.6,0.89105,-1.990768,0.845234,-2.153125
2,Business Class,Jan,1,3482.85,0.873067,-2.185923,0.792748,-1.321045
3,Business Class,Jan,1,2807.24,0.806952,-1.101878,0.89105,-1.990768
4,Business Class,Jan,1,737.35,0.806952,-1.101878,0.792748,-1.321045


## Feature Engineering: Distance between origin and destination

In [9]:
def haversine(clean_df):
    # Distance metric to calculate between two coordinate points
    from_lat, from_long, to_lat, to_long = clean_df["From_lat"], clean_df["From_lon"], clean_df["To_lat"], clean_df["To_lon"]
    radius_of_earth = 6378.1  # In km
    lat_delta = to_lat - from_lat
    lon_delta = to_long - from_long
    d = np.sin(lat_delta*0.5)**2 + np.cos(from_lat) * np.cos(to_lat) * np.sin(lon_delta*0.5)**2
    haversine_values = 2 * radius_of_earth * np.arcsin(np.sqrt(d))
    return haversine_values

In [10]:
def calculate_distance(clean_df):
    # Add a distance column to denote distance between origin and destination
    clean_df["distance"] = clean_df.apply(haversine, axis=1)
    return clean_df

In [11]:
clean_df = calculate_distance(clean_df)
clean_df.head(5)

Unnamed: 0,Major Class,Month of Travel Date,Sum of Net Tickets,Sum of Total $,From_lat,From_lon,To_lat,To_lon,distance
0,Business Class,Jan,2,6045.62,0.89105,-1.990768,0.792748,-1.321045,2878.608298
1,Business Class,Jan,1,740.6,0.89105,-1.990768,0.845234,-2.153125,729.555861
2,Business Class,Jan,1,3482.85,0.873067,-2.185923,0.792748,-1.321045,3677.013367
3,Business Class,Jan,1,2807.24,0.806952,-1.101878,0.89105,-1.990768,3708.065355
4,Business Class,Jan,1,737.35,0.806952,-1.101878,0.792748,-1.321045,977.226751


## Major Class

In [12]:
clean_df.groupby(["Major Class"])[["Sum of Net Tickets", "Sum of Total $"]].sum()['Sum of Total $']/clean_df.groupby(["Major Class"])[["Sum of Net Tickets", "Sum of Total $"]].sum()['Sum of Net Tickets']

Major Class
Business Class     1845.703896
Economy             486.790989
First Class         235.342727
Premium Economy    1511.283000
dtype: float64

### Inference:
- Although First Class gives a very low mean price, it is logically the most expensive way to travel via air.
- Upon closer examination, it was seen that the first class records observed in the data was for a very short distance.
- Thus, as per logic, it is being label encoded with First class ranking the highest and Economy ranking the lowest.
- This might give a slightly lower accuracy for our dataset but is more interpretable.

In [13]:
def dump_fe_pkl(model, col_name):
    with open(f"../fe_models/fe_{col_name}.pkl", "wb") as f:
        pickle.dump(model, f)

In [14]:
def load_fe_pkl(col_name):
    with open(f"../fe_models/fe_{col_name}.pkl", "rb") as f:
        model=pickle.load(f)
    return model

In [15]:
def label_encode(clean_df, col_name=None, use_pre_trained=False):
    if not use_pre_trained:
        model = LabelEncoder()
        model.fit(clean_df[col_name])
        dump_fe_pkl(model, col_name)
    model=load_fe_pkl(col_name)
    clean_df[col_name] = model.transform(clean_df[col_name])
    return clean_df

In [16]:
def custom_label_encode(clean_df, mappings):
    clean_df = clean_df.replace(mappings)
    return clean_df

In [17]:
mappings = {
    "Major Class":{
        "Economy": 1,
        "Premium Economy": 2,
        "Business Class": 3,
        "First Class": 4
    }
}
clean_df = custom_label_encode(clean_df, mappings)

In [18]:
clean_df.head(5)

Unnamed: 0,Major Class,Month of Travel Date,Sum of Net Tickets,Sum of Total $,From_lat,From_lon,To_lat,To_lon,distance
0,3,Jan,2,6045.62,0.89105,-1.990768,0.792748,-1.321045,2878.608298
1,3,Jan,1,740.6,0.89105,-1.990768,0.845234,-2.153125,729.555861
2,3,Jan,1,3482.85,0.873067,-2.185923,0.792748,-1.321045,3677.013367
3,3,Jan,1,2807.24,0.806952,-1.101878,0.89105,-1.990768,3708.065355
4,3,Jan,1,737.35,0.806952,-1.101878,0.792748,-1.321045,977.226751


In [19]:
def reorder_cols(clean_df, col_order=[]):
    return clean_df[col_order]

In [20]:
clean_df = reorder_cols(clean_df, col_order=['Major Class', 'Month of Travel Date', 'Sum of Net Tickets', 'From_lat', 'From_lon', 'To_lat', 'To_lon',
       'distance', 'Sum of Total $'])
clean_df.head(5)

Unnamed: 0,Major Class,Month of Travel Date,Sum of Net Tickets,From_lat,From_lon,To_lat,To_lon,distance,Sum of Total $
0,3,Jan,2,0.89105,-1.990768,0.792748,-1.321045,2878.608298,6045.62
1,3,Jan,1,0.89105,-1.990768,0.845234,-2.153125,729.555861,740.6
2,3,Jan,1,0.873067,-2.185923,0.792748,-1.321045,3677.013367,3482.85
3,3,Jan,1,0.806952,-1.101878,0.89105,-1.990768,3708.065355,2807.24
4,3,Jan,1,0.806952,-1.101878,0.792748,-1.321045,977.226751,737.35


## Month of Travel Date

In [21]:
clean_df.groupby(["Month of Travel Date"])[["Sum of Net Tickets", "Sum of Total $"]].describe()

Unnamed: 0_level_0,Sum of Net Tickets,Sum of Net Tickets,Sum of Net Tickets,Sum of Net Tickets,Sum of Net Tickets,Sum of Net Tickets,Sum of Net Tickets,Sum of Net Tickets,Sum of Total $,Sum of Total $,Sum of Total $,Sum of Total $,Sum of Total $,Sum of Total $,Sum of Total $,Sum of Total $
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Month of Travel Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Apr,1322.0,14.5,51.035518,1.0,1.0,3.0,8.0,1059.0,1322.0,7564.120416,23791.91364,4.52,885.76,1946.735,4970.7425,439706.32
Aug,1434.0,11.359833,30.533593,1.0,1.0,3.0,8.0,548.0,1434.0,5818.012629,14419.9577,22.6,840.5125,1871.98,4427.9,229749.52
Dec,1096.0,12.875912,41.477783,1.0,1.0,3.0,8.0,776.0,1096.0,6074.260319,17666.768233,11.5,762.02,1650.09,4350.16,271952.43
Feb,1390.0,18.152518,66.668278,1.0,1.0,3.0,9.0,1426.0,1390.0,9000.705237,29700.126779,16.95,867.65,2060.06,5857.305,530559.69
Jan,1303.0,16.86109,57.872933,1.0,1.0,3.0,9.0,1071.0,1303.0,7889.833753,24820.491955,26.25,835.87,1949.78,5296.005,461784.07
Jul,1459.0,13.037012,35.379328,1.0,1.0,3.0,9.0,554.0,1459.0,6237.850206,16738.505276,19.0,844.745,1818.22,5077.68,358749.2
Jun,1492.0,17.484584,63.959791,1.0,1.0,3.0,10.0,1368.0,1492.0,8535.565449,27687.36008,18.9,921.9725,2039.485,5455.435,448261.71
Mar,1464.0,18.368852,66.263919,1.0,1.0,3.0,9.0,1292.0,1464.0,9460.459713,32308.715377,17.25,872.91,2022.59,5819.78,639415.03
May,1489.0,18.243788,65.942885,1.0,1.0,3.0,10.0,1331.0,1489.0,8965.658784,28897.219857,10.5,903.15,1953.92,6013.55,534445.88
Nov,1427.0,20.147162,77.934706,1.0,1.0,3.0,10.0,1703.0,1427.0,9725.045452,35147.448876,33.9,834.595,1957.82,6273.765,671987.13


### Inference:
- One-hot encode months because there is no significant rise in price between the months.
- This was determined by diving the mean of Sum of Total with the Sum of Net Tickets for every month.

In [22]:
def compute_avg_price(clean_df):
    clean_df["price"] = clean_df["Sum of Total $"]/clean_df["Sum of Net Tickets"]
    clean_df.drop(["Sum of Total $", "Sum of Net Tickets"], axis=1, inplace=True)
    return clean_df

In [23]:
clean_df = compute_avg_price(clean_df)

In [24]:
def ohe_encode(clean_df, categories=None, use_pre_trained=False):
    if not use_pre_trained:
        model = make_column_transformer((categories, OneHotEncoder()), remainder="passthrough")
        clean_df_data, clean_df_labels = model.fit_transform(clean_df.iloc[:, :-1]), np.array(clean_df.iloc[:, -1])
        dump_fe_pkl(model, "ohe_model")
        return clean_df_data, clean_df_labels
    model=load_fe_pkl("ohe_model")
    clean_df = model.transform(clean_df)
    return clean_df

In [25]:
clean_df.head(5)

Unnamed: 0,Major Class,Month of Travel Date,From_lat,From_lon,To_lat,To_lon,distance,price
0,3,Jan,0.89105,-1.990768,0.792748,-1.321045,2878.608298,3022.81
1,3,Jan,0.89105,-1.990768,0.845234,-2.153125,729.555861,740.6
2,3,Jan,0.873067,-2.185923,0.792748,-1.321045,3677.013367,3482.85
3,3,Jan,0.806952,-1.101878,0.89105,-1.990768,3708.065355,2807.24
4,3,Jan,0.806952,-1.101878,0.792748,-1.321045,977.226751,737.35


### Also, splitting the encoded data into features and targets

In [62]:
X, y = ohe_encode(clean_df, categories=["Month of Travel Date"])



## Cross Validation for model selection

In [65]:
# Use cross validation to find a model that gives the least error (using mean absolute error)

# Linear Regression on the model
lin_reg_cv_score = cross_val_score(LinearRegression(), X, y, scoring="neg_mean_absolute_error", cv=10, n_jobs=-1)
print(f"Mean absolute error with Linear Regression is: {lin_reg_cv_score}\n")

# Random Forest Regressor
forest_reg_cv_score = cross_val_score(RandomForestRegressor(), X, y, scoring="neg_mean_absolute_error", cv=10, n_jobs=-1)
print(f"Mean absolute error with Random Forest Regressor is: {forest_reg_cv_score}\n")

# XGBoost Regressor
xgb_reg_cv_score = cross_val_score(XGBRegressor(), X, y, scoring="neg_mean_absolute_error", cv=10, n_jobs=-1)
print(f"Mean absolute error with XGBoost Regressor is: {xgb_reg_cv_score}")

Mean absolute error with Linear Regression is: [-525.44973477 -380.38618915 -333.4080112  -329.64843716 -331.48375537
 -304.30249959 -333.78092374 -351.60640486 -330.50286717 -301.9693914 ]

Mean absolute error with Random Forest Regressor is: [-491.85513025 -262.07968083 -260.71133562 -241.24452713 -259.60450712
 -242.69164273 -250.25857782 -248.5783139  -248.31922422 -232.48985057]

Mean absolute error with XGBoost Regressor is: [-545.88526887 -306.41334643 -290.2479453  -286.53016615 -282.98878098
 -283.28770994 -286.74360486 -284.80685806 -287.16324272 -277.83623674]


## GRIDSEARCH for Hyperparameter Tuning

In [None]:
# Since Random Forest Regressor gives the lowest error among all models, we will use
# GridSearchCV to tune the hyper-parameters for RF Regressor and minimize the error

print(datetime.now())

rf_parameters = {'n_estimators':[120, 150, 200],
                 'max_depth':[20, 30, 50], 'min_samples_leaf':[1, 3, 5]}
rf_gsc = GridSearchCV(RandomForestRegressor(), param_grid=rf_parameters, scoring="neg_mean_squared_error", cv=5, n_jobs=-1, verbose=True)
grid_search_result = rf_gsc.fit(X, y)
print(f"The best set of hyper-parameters are: {grid_search_result.best_params_}")

print(datetime.now())

In [None]:
with open("../trained_models/model.pkl", "wb") as f:
    pickle.dump(grid_search_result.best_estimator_, f)

## Function to preprocess data from front-end

In [29]:
def preprocessing(clean_df):
    # Transforming "From" and "To" destinations to coordinates
    clean_df = transform_coordinates(clean_df)
    # Transforming coordinates to radians
    clean_df = convert_to_radians(clean_df, cols=["From_lat", "From_lon", "To_lat", "To_lon"])
    # Computing haversine between origin and destination
    clean_df = calculate_distance(clean_df)
    # Transforming "Major Class" using Label Encoder
    clean_df = custom_label_encode(clean_df, mappings)
    # Transforming "Month of Travel Date" using one-hot encoder
    clean_arr = ohe_encode(clean_df, categories=["Month of Travel Date"], use_pre_trained=True)
    return clean_arr

In [30]:
sample = pd.DataFrame([["Vancouver","Halifax","Economy","Dec"]], columns=["From", "To", "Major Class", "Month of Travel Date"])
sample=preprocessing(sample)

In [31]:
sample

array([[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1,
        0.8597644157848011, -2.1487438332502213, 0.7792653905652056,
        -1.1097841628192082, 4434.102538202029]], dtype=object)