In [144]:
# Imports
import json
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from pygeocoder import Geocoder
from sklearn.preprocessing import StandardScaler
from geopy.geocoders import GeocodeFarm, Nominatim

In [7]:
clean_df = pd.read_csv("../data/cleaned_government_data.csv")

In [8]:
clean_df.head(5)

Unnamed: 0,Major Class,Month of Travel Date,From,To,Sum of Net Tickets,Sum of Total $
0,Business Class,Jan,Calgary,Ottawa,2,6045.62
1,Business Class,Jan,Calgary,Victoria,1,740.6
2,Business Class,Jan,Campbell River,Ottawa,1,3482.85
3,Business Class,Jan,Charlottetown,Calgary,1,2807.24
4,Business Class,Jan,Charlottetown,Ottawa,1,737.35


## Month of Travel Date

In [23]:
clean_df.groupby(["Month of Travel Date"])[["Sum of Net Tickets", "Sum of Total $"]].describe()

Unnamed: 0_level_0,Sum of Net Tickets,Sum of Net Tickets,Sum of Net Tickets,Sum of Net Tickets,Sum of Net Tickets,Sum of Net Tickets,Sum of Net Tickets,Sum of Net Tickets,Sum of Total $,Sum of Total $,Sum of Total $,Sum of Total $,Sum of Total $,Sum of Total $,Sum of Total $,Sum of Total $
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Month of Travel Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Apr,1322.0,14.5,51.035518,1.0,1.0,3.0,8.0,1059.0,1322.0,7564.120416,23791.91364,4.52,885.76,1946.735,4970.7425,439706.32
Aug,1434.0,11.359833,30.533593,1.0,1.0,3.0,8.0,548.0,1434.0,5818.012629,14419.9577,22.6,840.5125,1871.98,4427.9,229749.52
Dec,1096.0,12.875912,41.477783,1.0,1.0,3.0,8.0,776.0,1096.0,6074.260319,17666.768233,11.5,762.02,1650.09,4350.16,271952.43
Feb,1390.0,18.152518,66.668278,1.0,1.0,3.0,9.0,1426.0,1390.0,9000.705237,29700.126779,16.95,867.65,2060.06,5857.305,530559.69
Jan,1303.0,16.86109,57.872933,1.0,1.0,3.0,9.0,1071.0,1303.0,7889.833753,24820.491955,26.25,835.87,1949.78,5296.005,461784.07
Jul,1459.0,13.037012,35.379328,1.0,1.0,3.0,9.0,554.0,1459.0,6237.850206,16738.505276,19.0,844.745,1818.22,5077.68,358749.2
Jun,1492.0,17.484584,63.959791,1.0,1.0,3.0,10.0,1368.0,1492.0,8535.565449,27687.36008,18.9,921.9725,2039.485,5455.435,448261.71
Mar,1464.0,18.368852,66.263919,1.0,1.0,3.0,9.0,1292.0,1464.0,9460.459713,32308.715377,17.25,872.91,2022.59,5819.78,639415.03
May,1489.0,18.243788,65.942885,1.0,1.0,3.0,10.0,1331.0,1489.0,8965.658784,28897.219857,10.5,903.15,1953.92,6013.55,534445.88
Nov,1427.0,20.147162,77.934706,1.0,1.0,3.0,10.0,1703.0,1427.0,9725.045452,35147.448876,33.9,834.595,1957.82,6273.765,671987.13


### Inference:
- One-hot encode months because there is no significant rise in price between the months.
- This was determined by diving the mean of Sum of Total with the Sum of Net Tickets for every month.

## Origin, Destination encoding

In [62]:
geo = Nominatim()

  """Entry point for launching an IPython kernel.


In [63]:
geo_farm = GeocodeFarm()

In [174]:
cities=list(clean_df["From"].unique())
cities.extend(list(clean_df["To"].unique()))

In [None]:
location_details = {}

In [170]:
for each_city in cities:
    try:
        location = geo.geocode(each_city + ", Canada")
    except:
        try:
            location = geo_farm.geocode(each_city + ", Canada")
        except:
            continue
    try:
        location_details[each_city] = {
            "latitude": location[1][0],
            "longitude": location[1][1]
        }
    except:
        continue

In [172]:
with open("../data/coordinates.json", "w") as f:
    json.dump(location_details, f)

In [171]:
clean_df[["From_lat","From_lon"]] = clean_df["From"].apply(lambda x: pd.Series([location_details[x]["latitude"],location_details[x]["longitude"]]))
clean_df[["To_lat","To_lon"]] = clean_df["To"].apply(lambda x: pd.Series([location_details[x]["latitude"],location_details[x]["longitude"]]))

In [178]:
clean_df.head(5)

Unnamed: 0,Major Class,Month of Travel Date,From,To,Sum of Net Tickets,Sum of Total $,From_lat,From_lon,To_lat,To_lon
0,Business Class,Jan,Calgary,Ottawa,2,6045.62,51.053423,-114.062589,45.421106,-75.690308
1,Business Class,Jan,Calgary,Victoria,1,740.6,51.053423,-114.062589,-36.59861,144.678005
2,Business Class,Jan,Campbell River,Ottawa,1,3482.85,50.023071,-125.244154,45.421106,-75.690308
3,Business Class,Jan,Charlottetown,Calgary,1,2807.24,46.234953,-63.132935,51.053423,-114.062589
4,Business Class,Jan,Charlottetown,Ottawa,1,737.35,46.234953,-63.132935,45.421106,-75.690308


Unnamed: 0,Major Class,Month of Travel Date,From,To,Sum of Net Tickets,Sum of Total $
0,Business Class,Jan,Calgary,Ottawa,2,6045.62
1,Business Class,Jan,Calgary,Victoria,1,740.60
2,Business Class,Jan,Campbell River,Ottawa,1,3482.85
3,Business Class,Jan,Charlottetown,Calgary,1,2807.24
4,Business Class,Jan,Charlottetown,Ottawa,1,737.35
...,...,...,...,...,...,...
16827,Premium Economy,Aug,Whitehorse,Ottawa,1,771.15
16828,Premium Economy,Aug,Williams Lake,Kingston,1,2786.89
16829,Premium Economy,Sep,Regina,Vancouver,1,1945.11
16830,Premium Economy,Oct,Penticton,Montreal,1,1513.46


## Major Class

In [31]:
clean_df.groupby(["Major Class"])[["Sum of Net Tickets", "Sum of Total $"]].sum()['Sum of Total $']/clean_df.groupby(["Major Class"])[["Sum of Net Tickets", "Sum of Total $"]].sum()['Sum of Net Tickets']

Major Class
Business Class     1845.703896
Economy             486.790989
First Class         235.342727
Premium Economy    1511.283000
dtype: float64

In [6]:
if __name__ == "__main__":
    main()

      Major Class Month of Travel Date            From        To  \
0  Business Class                  Jan         Calgary    Ottawa   
1  Business Class                  Jan         Calgary  Victoria   
2  Business Class                  Jan  Campbell River    Ottawa   
3  Business Class                  Jan   Charlottetown   Calgary   
4  Business Class                  Jan   Charlottetown    Ottawa   

   Sum of Net Tickets  Sum of Total $  
0                   2         6045.62  
1                   1          740.60  
2                   1         3482.85  
3                   1         2807.24  
4                   1          737.35  


In [33]:
clean_df[clean_df["Major Class"]=="First Class"]

Unnamed: 0,Major Class,Month of Travel Date,From,To,Sum of Net Tickets,Sum of Total $
16807,First Class,Jan,Vancouver,Toronto,2,303.58
16808,First Class,Jan,Victoria,Abbotsford,1,270.47
16809,First Class,May,Montreal,Quebec,1,231.74
16810,First Class,Aug,Lac Brochet,Thompson,2,695.1
16811,First Class,Sep,Vancouver,Victoria,1,207.0
16812,First Class,Sep,Victoria,Vancouver,1,159.0
16813,First Class,Nov,Vancouver,Nanaimo,1,108.0
16814,First Class,Nov,Winnipeg,St Theris Point,2,613.88
