In [1]:
import pandas as pd
import numpy as np
import pickle

In [6]:
# Input: the cleaned government travel CSV.
# Output directory: where precomputed prediction-page query results are written.
dataset_path = "./../data/cleaned_government_data.csv"
query_extra_details_base_path = "./../data/prediction_page_queries/"

# The first row of the file holds the column names. The CSV also carries a
# saved index column, which pandas surfaces as "Unnamed: 0".
df = pd.read_csv(filepath_or_buffer=dataset_path, header=0)
df.head(n=5)

Unnamed: 0,Major Class,Month of Travel Date,From,To,Sum of Net Tickets,Sum of Total $
0,Business Class,Jan,Calgary,Ottawa,2,6045.62
1,Business Class,Jan,Calgary,Victoria,1,740.6
2,Business Class,Jan,Campbell River,Ottawa,1,3482.85
3,Business Class,Jan,Charlottetown,Calgary,1,2807.24
4,Business Class,Jan,Charlottetown,Ottawa,1,737.35


In [3]:
# Distinct month labels present in the data, as a sorted list.
# NOTE(review): the sort is alphabetical ('Apr', 'Aug', 'Dec', ...), not
# calendar order -- confirm that downstream consumers do not rely on
# chronological ordering.
all_months = sorted(df["Month of Travel Date"].unique())

In [4]:
# Sorted, de-duplicated city names for each end of a trip.
unique_sources = np.unique(df['From'].to_numpy()).tolist()
unique_destinations = np.unique(df['To'].to_numpy()).tolist()

for kind, names in (("sources", unique_sources), ("destinations", unique_destinations)):
    print(f"There are {len(names)} {kind}")

# Every city that appears as either an origin or a destination.
all_cities = set(unique_sources).union(unique_destinations)
len(all_cities)

There are 254 sources
There are 254 destinations


267

In [5]:
# How many of the busiest cities to keep for each direction of travel.
TOP_N_CITIES = 15


def city_frequency_table(trips, city_column, cities, label):
    """Count how many rows of ``trips`` mention each city.

    Parameters
    ----------
    trips : pandas.DataFrame
        Frame containing ``city_column``.
    city_column : str
        Column of ``trips`` holding the city names to count.
    cities : list
        Cities to report, in the order the rows should appear.
    label : str
        Name of the city column in the returned frame.

    Returns
    -------
    pandas.DataFrame
        Columns ``[label, 'count']`` with one row per entry of ``cities``
        on a default 0..n-1 index. Cities absent from ``trips`` get 0.
    """
    # A single pass over the column instead of one boolean scan per city.
    occurrences = trips[city_column].value_counts()
    cities = list(cities)
    return pd.DataFrame({
        label: cities,
        'count': occurrences.reindex(cities, fill_value=0).to_numpy(),
    })


def top_cities(frequency_table, n=TOP_N_CITIES):
    """Return the ``n`` rows of ``frequency_table`` with the highest ``count``.

    The sort is stable, so cities tied on ``count`` keep their input order
    and the cutoff at ``n`` is deterministic. As before, ``reset_index()``
    keeps the pre-sort row position in an ``index`` column.
    """
    ranked = frequency_table.sort_values(by=['count'], ascending=False, kind='stable')
    return ranked.head(n).reset_index()


# Frequent destination cities
destination_cities_df = city_frequency_table(df, 'To', unique_destinations, 'dest_city')
frequent_dest_cities = top_cities(destination_cities_df)
destination_cities = frequent_dest_cities['dest_city'].tolist()

# Frequent source cities
source_cities_df = city_frequency_table(df, 'From', unique_sources, 'source_city')
frequent_source_cities = top_cities(source_cities_df)
source_cities = frequent_source_cities['source_city'].tolist()

# A city is "important" if it ranks in the top N as an origin or a destination.
important_cities = source_cities + destination_cities
unique_imp_cities = set(important_cities)
unique_imp_cities

{'Calgary',
 'Edmonton',
 'Fredericton',
 'Halifax',
 'Montreal',
 'Ottawa',
 'Quebec',
 'Regina',
 'Saskatoon',
 "St John's",
 'Thunder Bay',
 'Toronto',
 'Vancouver',
 'Victoria',
 'Winnipeg',
 'Yellowknife'}

In [7]:
# Total tickets for trips arriving in each important city, broken down by month.
# One groupby pass replaces a full-frame boolean scan per (city, month) pair.
arrivals_to_imp_cities = df[df['To'].isin(unique_imp_cities)]
tickets_by_city_month = (
    arrivals_to_imp_cities
    .groupby(['To', 'Month of Travel Date'])['Sum of Net Tickets']
    .sum()
    .to_dict()  # keys are (city, month) tuples
)

# Resulting shape: {city: {month: total_tickets}}.
# (city, month) pairs with no matching rows are stored as 0.
# int(...) makes every stored total a plain Python int, whatever numeric
# type the aggregation and round() produce.
data = {
    imp_city: {
        month: int(round(tickets_by_city_month.get((imp_city, month), 0)))
        for month in all_months
    }
    for imp_city in unique_imp_cities
}

# Persist the lookup table next to the other precomputed query results.
with open(query_extra_details_base_path + "passengers_count_trend.sav", 'wb') as f:
    pickle.dump(data, f)

In [8]:
# Round-trip check: reload the passenger-trend pickle and display its contents.
passengers_trend_file = query_extra_details_base_path + "passengers_count_trend.sav"
with open(passengers_trend_file, mode='rb') as f:
    a = pickle.load(f)
a

{'Edmonton': {'Apr': 1181,
  'Aug': 926,
  'Dec': 736,
  'Feb': 1324,
  'Jan': 1138,
  'Jul': 1122,
  'Jun': 1544,
  'Mar': 1542,
  'May': 1853,
  'Nov': 1523,
  'Oct': 1343,
  'Sep': 1414},
 "St John's": {'Apr': 394,
  'Aug': 528,
  'Dec': 435,
  'Feb': 549,
  'Jan': 484,
  'Jul': 550,
  'Jun': 642,
  'Mar': 665,
  'May': 705,
  'Nov': 937,
  'Oct': 928,
  'Sep': 789},
 'Montreal': {'Apr': 934,
  'Aug': 778,
  'Dec': 542,
  'Feb': 1221,
  'Jan': 1361,
  'Jul': 1066,
  'Jun': 935,
  'Mar': 1231,
  'May': 961,
  'Nov': 1399,
  'Oct': 1242,
  'Sep': 1256},
 'Calgary': {'Apr': 711,
  'Aug': 668,
  'Dec': 583,
  'Feb': 1128,
  'Jan': 870,
  'Jul': 845,
  'Jun': 982,
  'Mar': 1347,
  'May': 1031,
  'Nov': 1255,
  'Oct': 1228,
  'Sep': 1033},
 'Saskatoon': {'Apr': 215,
  'Aug': 202,
  'Dec': 180,
  'Feb': 308,
  'Jan': 228,
  'Jul': 193,
  'Jun': 357,
  'Mar': 343,
  'May': 301,
  'Nov': 351,
  'Oct': 345,
  'Sep': 353},
 'Regina': {'Apr': 457,
  'Aug': 386,
  'Dec': 319,
  'Feb': 557,
  'Ja