In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
# Input CSV with the cleaned travel records, and the folder that the
# per-query pickle files in the cells below are written into.
dataset_path = "./../data/cleaned_government_data.csv"
query_extra_details_base_path = "./../data/visualisation_query_data/extra_details/"

# The first CSV row is the header. The file also yields an "Unnamed: 0"
# column (presumably a row index saved by an earlier export -- confirm);
# nothing in the visible cells reads it.
df = pd.read_csv(filepath_or_buffer=dataset_path, header=0)
df.head(n=5)

Unnamed: 0,Major Class,Month of Travel Date,From,To,Sum of Net Tickets,Sum of Total $
0,Business Class,Jan,Calgary,Ottawa,2,6045.62
1,Business Class,Jan,Calgary,Victoria,1,740.6
2,Business Class,Jan,Campbell River,Ottawa,1,3482.85
3,Business Class,Jan,Charlottetown,Calgary,1,2807.24
4,Business Class,Jan,Charlottetown,Ottawa,1,737.35


In [3]:
# Distinct month labels, de-duplicated and sorted as plain strings, so the
# order is alphabetical rather than calendar order.
all_months = sorted(df["Month of Travel Date"].unique())

In [4]:
# Distinct departure ('From') and arrival ('To') cities, each as a sorted list.
unique_sources = sorted(df['From'].unique())
unique_destinations = sorted(df['To'].unique())
print(f"There are {len(unique_sources)} sources")
print(f"There are {len(unique_destinations)} destinations")

# Every city that shows up on either end of a trip.
all_cities = set(unique_sources).union(unique_destinations)
len(all_cities)

There are 254 sources
There are 254 destinations


267

In [5]:
def _rows_per_city(frame, column, label):
    """Count how many rows of `frame` carry each distinct value of `column`.

    Parameters
    ----------
    frame : pd.DataFrame
        Table whose rows are counted.
    column : str
        Column holding the city names.
    label : str
        Name given to the city column of the result.

    Returns
    -------
    pd.DataFrame
        Columns `label` and 'count', one row per distinct city, ordered by
        city name with a fresh 0..n-1 index. Missing values in `column` are
        not counted.
    """
    # One vectorised pass over the column instead of one boolean scan of the
    # whole frame per city followed by a row-by-row append.
    counts = frame[column].value_counts(sort=False).sort_index()
    return pd.DataFrame({label: counts.index.tolist(), 'count': counts.tolist()})


# Frequent destination cities: the 15 cities with the most rows as 'To'.
# mergesort is a stable sort, so cities with equal counts keep their name
# order and the top-15 cut is reproducible from run to run.
destination_cities_df = _rows_per_city(df, 'To', 'dest_city')
frequent_dest_cities = (
    destination_cities_df
    .sort_values(by=['count'], ascending=False, kind='mergesort')
    .head(15)
    .reset_index()  # keeps the pre-sort row position in an 'index' column
)
destination_cities = frequent_dest_cities['dest_city'].tolist()

# Frequent source cities: the 15 cities with the most rows as 'From'.
source_cities_df = _rows_per_city(df, 'From', 'source_city')
frequent_source_cities = (
    source_cities_df
    .sort_values(by=['count'], ascending=False, kind='mergesort')
    .head(15)
    .reset_index()
)
source_cities = frequent_source_cities['source_city'].tolist()

# A city is "important" if it is in either top-15 list; the set removes
# cities that appear in both.
important_cities = source_cities + destination_cities
unique_imp_cities = set(important_cities)
unique_imp_cities

{'Calgary',
 'Edmonton',
 'Fredericton',
 'Halifax',
 'Montreal',
 'Ottawa',
 'Quebec',
 'Regina',
 'Saskatoon',
 "St John's",
 'Thunder Bay',
 'Toronto',
 'Vancouver',
 'Victoria',
 'Winnipeg',
 'Yellowknife'}

## Query 1

In [6]:
%%time
data = {}
for imp_city in unique_imp_cities:
    data[imp_city] = {}
    values = []
    for month in all_months:
        query_result = df[(df['To']==imp_city) & (df['Month of Travel Date']==month)]
        val = np.sum(query_result['Sum of Net Tickets'])
        data[imp_city][month] = val
        values.append(val)
    data[imp_city]['min'] = min(values)
    data[imp_city]['max'] = max(values)
    data[imp_city]['avg'] = round(sum(values)/len(values))

filename = query_extra_details_base_path + "query1.sav"
with open(filename, 'wb') as f:
    pickle.dump(data, f)

CPU times: user 549 ms, sys: 327 µs, total: 549 ms
Wall time: 549 ms


## Query 2

In [7]:
%%time
data = {}
for imp_city in unique_imp_cities:
    data[imp_city] = {}
    values = []
    for month in all_months:
        query_result = df[(df['From']==imp_city) & (df['Month of Travel Date']==month)]
        val = np.sum(query_result['Sum of Net Tickets'])
        data[imp_city][month] = val
        values.append(val)
    data[imp_city]['min'] = min(values)
    data[imp_city]['max'] = max(values)
    data[imp_city]['avg'] = round(sum(values)/len(values))

filename = query_extra_details_base_path + "query2.sav"
with open(filename, 'wb') as f:
    pickle.dump(data, f)

CPU times: user 551 ms, sys: 0 ns, total: 551 ms
Wall time: 549 ms


## Query 3

In [8]:
%%time
data = {}
for imp_city in unique_imp_cities:
    data[imp_city] = {}
    values = []
    for month in all_months:
        query_result = df[(df['From']==imp_city) & (df['Month of Travel Date']==month)]
        passenger_count = np.sum(query_result['Sum of Net Tickets'])
        total_cost = np.sum(query_result["Sum of Total $"])
        avg_cost = round((total_cost/passenger_count),2)
        data[imp_city][month] = avg_cost
        values.append(avg_cost)
    data[imp_city]['min'] = min(values)
    data[imp_city]['max'] = max(values)
    data[imp_city]['avg'] = round(sum(values)/len(values))

filename = query_extra_details_base_path + "query3.sav"
with open(filename, 'wb') as f:
    pickle.dump(data, f)

CPU times: user 585 ms, sys: 11 µs, total: 585 ms
Wall time: 584 ms


## Query 4

In [9]:
%%time
data = {}
for imp_city in unique_imp_cities:
    data[imp_city] = {}
    values = []
    for month in all_months:
        query_result = df[(df['To']==imp_city) & (df['Month of Travel Date']==month)]
        passenger_count = np.sum(query_result['Sum of Net Tickets'])
        total_cost = np.sum(query_result["Sum of Total $"])
        avg_cost = round((total_cost/passenger_count),2)
        data[imp_city][month] = avg_cost
        values.append(avg_cost)
    data[imp_city]['min'] = min(values)
    data[imp_city]['max'] = max(values)
    data[imp_city]['avg'] = round(sum(values)/len(values))

filename = query_extra_details_base_path + "query4.sav"
with open(filename, 'wb') as f:
    pickle.dump(data, f)

CPU times: user 585 ms, sys: 0 ns, total: 585 ms
Wall time: 587 ms


## Query 5

In [10]:
%%time
data = {}
for imp_city in unique_imp_cities:
    data[imp_city] = {}
    values = []
    for month in all_months:
        query_result = df[(df['To']==imp_city) & (df['Month of Travel Date']==month)]
        val = round(np.sum(query_result['Sum of Total $']),2)
        data[imp_city][month] = val
        values.append(val)
    data[imp_city]['min'] = min(values)
    data[imp_city]['max'] = max(values)
    data[imp_city]['avg'] = round(sum(values)/len(values))

filename = query_extra_details_base_path + "query5.sav"
with open(filename, 'wb') as f:
    pickle.dump(data, f)

CPU times: user 582 ms, sys: 0 ns, total: 582 ms
Wall time: 582 ms


## Query 6

In [11]:
%%time
data = {}
for imp_city in unique_imp_cities:
    data[imp_city] = {}
    values = []
    for month in all_months:
        query_result = df[(df['From']==imp_city) & (df['Month of Travel Date']==month)]
        val = round(np.sum(query_result['Sum of Total $']),2)
        data[imp_city][month] = val
        values.append(val)
    data[imp_city]['min'] = min(values)
    data[imp_city]['max'] = max(values)
    data[imp_city]['avg'] = round(sum(values)/len(values))

filename = query_extra_details_base_path + "query6.sav"
with open(filename, 'wb') as f:
    pickle.dump(data, f)

CPU times: user 567 ms, sys: 0 ns, total: 567 ms
Wall time: 565 ms


## Query 7

In [12]:
%%time
data = {}
for imp_city in unique_imp_cities:
    data[imp_city] = {}
    values = []
    for month in all_months:
        query_result = df[(df['To']==imp_city) & (df['Month of Travel Date']==month)]
        val = round(np.sum(query_result['Sum of Net Tickets']))
        data[imp_city][month] = val
        values.append(val)
    data[imp_city]['min'] = min(values)
    data[imp_city]['max'] = max(values)
    data[imp_city]['avg'] = round(sum(values)/len(values))

filename = query_extra_details_base_path + "query7.sav"
with open(filename, 'wb') as f:
    pickle.dump(data, f)

CPU times: user 555 ms, sys: 0 ns, total: 555 ms
Wall time: 555 ms


## Query 8

In [13]:
%%time
data = {}
for imp_city in unique_imp_cities:
    data[imp_city] = {}
    values = []
    for month in all_months:
        query_result = df[(df['From']==imp_city) & (df['Month of Travel Date']==month)]
        val = round(np.sum(query_result['Sum of Net Tickets']))
        data[imp_city][month] = val
        values.append(val)
    data[imp_city]['min'] = min(values)
    data[imp_city]['max'] = max(values)
    data[imp_city]['avg'] = round(sum(values)/len(values))

filename = query_extra_details_base_path + "query8.sav"
with open(filename, 'wb') as f:
    pickle.dump(data, f)

CPU times: user 550 ms, sys: 0 ns, total: 550 ms
Wall time: 550 ms
