Please note that the Telia data used as input to this script is private and is therefore not provided with the library. Nevertheless, the script is provided in case the reader wants to reproduce this study on a similar dataset of their own.

In [8]:
import scipy.stats as stat
import numpy as np
import datetime as dt
import pandas as pd
import pickle
import os

Find the directory we are working in, so that reading and writing files is easier later on.

In [None]:
# os.path.abspath('') resolves to the current working directory.
dir_path = os.path.abspath('')
# Input files are looked up in the "data" sub-directory of that directory.
data_path = os.path.join(dir_path, 'data')
print(data_path)

In [3]:
def date_to_weekday(day, month, year):
    """Return the English weekday name ("Monday" ... "Sunday") of a calendar date.

    day, month and year may be ints or numeric strings; each one is passed
    through int() before the date is built.
    """
    weekday_names = ("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
    # date.weekday() counts from 0 (Monday) to 6 (Sunday).
    weekday_index = dt.date(int(year), int(month), int(day)).weekday()
    return weekday_names[weekday_index]

This function reads the file kunta_utf-8_trimmed_include_both_swedish_and_Finnish.csv and generates two dictionaries. The first one maps each hospital care district (HCD) to its municipalities. The second one is the inverse: it maps each municipality to its HCD.

In [6]:
def map_municiplaities_to_hospital_district_areas(data_path, encoding=None):
    """Read the municipality table and build the HCD <-> municipality lookups.

    The file is ';'-separated. The first five lines are skipped; on every
    following non-blank line the second field is the municipality and the
    fourth field is the hospital care district (HCD). The first and last
    character of both fields are dropped (the enclosing quotes in the file).

    Parameters
    ----------
    data_path : str
        Path of the table to read.
    encoding : str or None
        Passed to open(). None (the default) keeps the platform default
        encoding, exactly as before this parameter existed.

    Returns
    -------
    (h_to_m, m_to_h)
        h_to_m maps each HCD to the *set* of its municipalities;
        m_to_h maps each municipality to the HCD it belongs to.
    """
    m_to_h = {}
    h_to_m = {}
    with open(data_path, 'r', encoding=encoding) as data:
        for line_number, line in enumerate(data, start=1):
            ### the entries start from the 6th line
            if line_number <= 5:
                continue
            stripped = line.strip()
            # A blank line (e.g. at the end of the file) carries no entry;
            # without this guard it would raise IndexError on fields[1].
            if not stripped:
                continue
            fields = stripped.split(';')
            municipality = fields[1][1:-1]
            hospital_district = fields[3][1:-1]
            m_to_h[municipality] = hospital_district
            h_to_m.setdefault(hospital_district, set()).add(municipality)
    return h_to_m, m_to_h

In [7]:
# Full path of the municipality / hospital-care-district table inside data_path.
m_to_h_data_converted_format = os.path.join(data_path, 'kunta_utf-8_trimmed_include_both_swedish_and_Finnish.csv')
# h_to_m: HCD -> set of municipalities; m_to_h: municipality -> HCD.
h_to_m, m_to_h = map_municiplaities_to_hospital_district_areas(m_to_h_data_converted_format)
#print(m_to_h)

# Telia

In [84]:
def _write_degree_book(save_path, label, degree_book):
    """Write one degree dictionary to save_path, one CSV row per key, each row prefixed with label."""
    def chronological(key):
        hcd, hour, day, month, year = key
        # The date parts are stored as strings; compare them as numbers so
        # that e.g. day "2" sorts before day "10".
        return (hcd, int(year), int(month), int(day), int(hour))

    with open(save_path, "w") as save_file:
        save_file.write("in-or-out, hcd, hour, day, month, year, flow" + "\n")
        for key in sorted(degree_book, key=chronological):
            row = [label] + [str(part) for part in key] + [str(degree_book[key])]
            save_file.write(",".join(row) + "\n")


def calculate_total_hcd_out_in_telia(telia_datapath, municipality_to_hcd_map, out_degree_save_path = None, in_degree_save_path = None):
    """Sum the Telia flows that cross HCD borders into hourly in- and out-degrees.

    The input is a ','-separated file whose first line is a header. On each
    data line, field 0 is the flow, field 1 the date (YYYY-MM-DD), field 3 the
    origin municipality, field 7 the destination municipality, and the last
    field the hour bin, of which the part before the first "-" is used.
    Lines whose origin and destination lie in the same HCD are ignored.

    Parameters
    ----------
    telia_datapath : str
        Path of the Telia CSV file.
    municipality_to_hcd_map : dict
        Municipality name -> HCD name. A municipality missing from the map
        raises KeyError.
    out_degree_save_path, in_degree_save_path : str or None
        When given, the corresponding dictionary is also written to that path,
        sorted by HCD and then chronologically.

    Returns
    -------
    (hcd_in_degree, hcd_out_degree)
        Dictionaries keyed by (hcd, hour, day, month, year), all parts being
        strings without leading zeros, with the summed flow as value.
    """
    hcd_in_degree = {}   # (destination, hour, day, month, year) -> total in-degree
    hcd_out_degree = {}  # (origin, hour, day, month, year) -> total out-degree
    with open(telia_datapath, 'r') as data:
        next(data, None)  # skip the header; tolerate a completely empty file
        for line in data:
            fields = line.strip().split(",")
            origin_m = fields[3]
            destination_m = fields[7]
            origin_hcd = municipality_to_hcd_map[origin_m]
            destination_hcd = municipality_to_hcd_map[destination_m]
            if origin_hcd == destination_hcd:
                continue
            date_parts = fields[1].split("-")
            # str(int(...)) strips leading zeros ("02" -> "2").
            year = str(int(date_parts[0]))
            month = str(int(date_parts[1]))
            day = str(int(date_parts[2]))
            hour = str(int(fields[-1].split("-")[0]))
            flow = int(fields[0])
            in_key = (destination_hcd, hour, day, month, year)
            out_key = (origin_hcd, hour, day, month, year)
            hcd_in_degree[in_key] = hcd_in_degree.get(in_key, 0) + flow
            hcd_out_degree[out_key] = hcd_out_degree.get(out_key, 0) + flow

    ### print the in degree and out degree to files
    if out_degree_save_path is not None:
        _write_degree_book(out_degree_save_path, "outdegree", hcd_out_degree)
    if in_degree_save_path is not None:
        _write_degree_book(in_degree_save_path, "indegree", hcd_in_degree)
    return(hcd_in_degree, hcd_out_degree)

In [85]:
# Placeholder path of the private Telia origin-destination CSV; point it at your own copy.
telia_datapath = '/.../GSE_od_muni_concatenated_aland_and_days_missing_more_than_10_municipalities_removed.csv'

# Hourly in- and out-degree per HCD for the whole file; both save paths are None, so nothing is written to disk.
hcd_telia_indegree_book, hcd_telia_outdegree_book = calculate_total_hcd_out_in_telia(telia_datapath, m_to_h, None, None)
# Keep only the 2019 entries: the year is element 4 of each (hcd, hour, day, month, year) key.
hcd_telia_indegree_book_2019 = {key:value for (key, value) in hcd_telia_indegree_book.items() if key[4] == "2019"}
hcd_telia_outdegree_book_2019 = {key:value for (key, value) in hcd_telia_outdegree_book.items() if key[4] == "2019"}

In [86]:
def telia_od_book_hcd_level(telia_data_path, municipality_to_hcd_map):
    """Aggregate the municipality-level Telia file into HCD-to-HCD flows.

    Expected columns of the ','-separated input (first line is a header):
    trips_sum,date,origin_muni_code,origin_muni,origin_region_code,origin_region,
    dest_muni_code,dest_muni,dest_region_code,dest_region,hour_bin

    Returns a dictionary keyed by
    (origin_hcd, destination_hcd, hour, day, month, year), every part a string
    without leading zeros, whose value is the summed flow. Flows that stay
    inside one HCD are kept (origin_hcd == destination_hcd).
    """
    od_book = {}
    with open(telia_data_path, 'r') as data:
        next(data)  # header line
        for row in data:
            columns = row.strip().split(",")
            origin_municipality, destination_municipality = columns[3], columns[7]
            origin_hcd = municipality_to_hcd_map[origin_municipality]
            destination_hcd = municipality_to_hcd_map[destination_municipality]
            date_parts = columns[1].split("-")
            year = str(int(date_parts[0]))
            month = str(int(date_parts[1]))
            day = str(int(date_parts[2]))
            # The hour is the number before the first "-" of the hour bin.
            hour = str(int(columns[-1].split("-")[0]))
            flow = int(columns[0])
            od_key = (origin_hcd, destination_hcd, hour, day, month, year)
            od_book[od_key] = od_book.get(od_key, 0) + flow
    return od_book

In [87]:
def od_book_to_od_dataframe(telia_od_book):
    """Flatten an HCD-level OD dictionary into a DataFrame with one row per key.

    Parameters
    ----------
    telia_od_book : dict
        Keys are (origin, destination, hour, day, month, year) tuples of
        strings, e.g.
        ("Åland", "Helsinki and Uusimaa Hospital District", "6", "10", "2", "2019");
        values are flows.

    Returns
    -------
    pandas.DataFrame
        Columns: origin, destination, hour, day, month, year, day_of_the_week,
        time_string, flow. time_string is "<hour> <year>-<month>-<day>".
    """
    header = ['origin', 'destination', 'hour', 'day', 'month', 'year', 'day_of_the_week', 'time_string', 'flow']
    rows = []
    for key, flow in telia_od_book.items():
        origin, destination, hour, day, month, year = key[:6]
        weekday = date_to_weekday(int(day), int(month), int(year))
        time_string = hour + " " + "-".join([year, month, day])
        rows.append([origin, destination, hour, day, month, year, weekday, time_string, flow])
    return pd.DataFrame(rows, columns=header)

In [88]:
# Aggregate the Telia file into HCD-to-HCD flows keyed by (origin, destination, hour, day, month, year).
telia_hcd_od_book = telia_od_book_hcd_level(telia_datapath, m_to_h)

In [91]:
# Split the OD dictionary by year: the year is element 5 of each key.
telia_hcd_od_book_2019 = {key:value for (key, value) in telia_hcd_od_book.items() if key[5] == "2019"}
telia_hcd_od_book_2020 = {key:value for (key, value) in telia_hcd_od_book.items() if key[5] == "2020"}
#pickle the ground truth
# Placeholder output path; point it at a writable location. Only the 2020 flows are stored.
pickle_path = "/.../telia_ground_truth.pkl"
with open(pickle_path, 'wb') as handle:
    pickle.dump(telia_hcd_od_book_2020, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Flatten the OD dictionary into a DataFrame, then split its rows on the 'year' column (years are stored as strings).
telia_hcd_od_dataframe = od_book_to_od_dataframe(telia_hcd_od_book)
telia_hcd_od_dataframe_2019 = telia_hcd_od_dataframe.loc[telia_hcd_od_dataframe['year'] == "2019"]
telia_hcd_od_dataframe_2020 = telia_hcd_od_dataframe.loc[telia_hcd_od_dataframe['year'] == "2020"]

In [None]:
def breaking_outgoing_telia_flow_to_fractions_and_make_dataframe(telia_od_df):
    """Express every off-diagonal flow as a fraction of its origin's outgoing total.

    Rows with origin == destination are dropped. For each origin and each
    time_string, 'fraction' is the row's flow divided by the sum of the flows
    leaving that origin at that time_string. The flow is read from the last
    column of telia_od_df, so 'flow' must be its last column.

    Returns a new DataFrame with the nine input columns plus 'fraction',
    ordered by origin (first appearance), then time_string (first appearance).
    Each origin is printed as it is processed.
    """
    columns = ['origin', 'destination', 'hour', 'day', 'month', 'year', 'day_of_the_week', 'time_string', 'flow', 'fraction' ]
    time_strings = telia_od_df.time_string.unique().tolist()
    off_diagonal = telia_od_df.loc[telia_od_df['destination'] != telia_od_df['origin']]
    rows_with_fraction = []
    for origin in off_diagonal.origin.unique().tolist():
        print(origin)
        from_origin = off_diagonal.loc[off_diagonal['origin'] == origin]
        for time_string in time_strings:
            at_time = from_origin.loc[from_origin['time_string'] == time_string]
            total_flow = at_time['flow'].sum()
            rows_with_fraction.extend(
                row + [row[-1] / total_flow] for row in at_time.values.tolist()
            )
    return pd.DataFrame(rows_with_fraction, columns=columns)

In [None]:
# 2019 flows expressed as fractions of each origin's outgoing total per time_string.
out_frac_df_2019 = breaking_outgoing_telia_flow_to_fractions_and_make_dataframe(telia_hcd_od_dataframe_2019)

In [None]:
def breaking_incomming_telia_flow_to_fractions_and_make_dataframe(telia_od_df):
    """Express every off-diagonal flow as a fraction of its destination's incoming total.

    Rows with origin == destination are dropped. For each destination and each
    time_string, 'fraction' is the row's flow divided by the sum of the flows
    arriving at that destination at that time_string. The flow is read from
    the last column of telia_od_df, so 'flow' must be its last column.

    Returns a new DataFrame with the nine input columns plus 'fraction'.
    Prints the number of distinct time strings and destinations, then each
    destination as it is processed.
    """
    columns = ['origin', 'destination', 'hour', 'day', 'month', 'year', 'day_of_the_week', 'time_string', 'flow', 'fraction']
    off_diagonal = telia_od_df.loc[telia_od_df['destination'] != telia_od_df['origin']]
    time_strings = off_diagonal.time_string.unique().tolist()
    print(len(time_strings))
    destinations = off_diagonal.destination.unique().tolist()
    print(len(destinations))
    rows_with_fraction = []
    for destination in destinations:
        print(destination)
        to_destination = off_diagonal.loc[off_diagonal['destination'] == destination]
        for time_string in time_strings:
            at_time = to_destination.loc[to_destination['time_string'] == time_string]
            total_flow = at_time['flow'].sum()
            rows_with_fraction.extend(
                row + [row[-1] / total_flow] for row in at_time.values.tolist()
            )
    return pd.DataFrame(rows_with_fraction, columns=columns)

In [None]:
# 2019 flows expressed as fractions of each destination's incoming total per time_string.
in_frac_df_2019 = breaking_incomming_telia_flow_to_fractions_and_make_dataframe(telia_hcd_od_dataframe_2019)

In [None]:
def make_regional_in_frac_df(in_frac_df):
    """Split in_frac_df by destination.

    Returns a dictionary mapping each distinct value of the 'destination'
    column (in order of first appearance) to the rows having that destination.
    Used by the functions that compute the median of the incoming fractions.
    """
    return {
        destination: in_frac_df.loc[in_frac_df['destination'] == destination]
        for destination in in_frac_df.destination.unique().tolist()
    }

In [None]:
def make_regional_out_frac_df(out_frac_df):
    """Split out_frac_df by origin.

    Returns a dictionary mapping each distinct value of the 'origin' column
    (in order of first appearance) to the rows having that origin.
    Used by the functions that compute the median of the outgoing fractions.
    """
    return {
        origin: out_frac_df.loc[out_frac_df['origin'] == origin]
        for origin in out_frac_df.origin.unique().tolist()
    }

In [None]:
def make_weekday_frac_df(df):
    """Split df by weekday.

    Returns a dictionary mapping each distinct value of the 'day_of_the_week'
    column (in order of first appearance) to the rows having that value.
    """
    return {
        weekday: df.loc[df['day_of_the_week'] == weekday]
        for weekday in df.day_of_the_week.unique().tolist()
    }

In [None]:
def make_month_frac_df(df):
    """Split df by month.

    Returns a dictionary mapping each distinct value of the 'month' column
    (in order of first appearance) to the rows having that value.
    """
    return {
        month: df.loc[df['month'] == month]
        for month in df.month.unique().tolist()
    }

In [None]:
def make_hour_frac_df(df):
    """Split df by hour.

    Returns a dictionary mapping each distinct value of the 'hour' column
    (in order of first appearance) to the rows having that value.
    """
    return {
        hour: df.loc[df['hour'] == hour]
        for hour in df.hour.unique().tolist()
    }

In [None]:
def divide_different_years(fraction_df):
    """Split fraction_df into its 2019 and 2020 rows.

    Returns a dictionary with exactly the keys '2019' and '2020'; each value
    holds the rows whose 'year' column equals that string (possibly none).
    """
    return {
        year: fraction_df.loc[fraction_df['year'] == year]
        for year in ('2019', '2020')
    }

In [None]:
def median_out_fraction_hour_and_weekday(out_fraction_df):
    """Median outgoing fraction per (origin, weekday, hour), renormalised over destinations.

    For every origin, weekday and hour present in the data, the median of
    'fraction' is taken per destination and the medians are divided by their
    sum. Returns a dictionary keyed by (origin, destination, hour, weekday).
    The length of the input is printed.
    """
    print("dataframe length:")
    print(len(out_fraction_df))
    median_book = {}
    for origin in out_fraction_df.origin.unique().tolist():
        from_origin = out_fraction_df.loc[out_fraction_df['origin'] == origin]
        for weekday in from_origin.day_of_the_week.unique().tolist():
            on_weekday = from_origin.loc[from_origin['day_of_the_week'] == weekday]
            for hour in on_weekday.hour.unique().tolist():
                at_hour = on_weekday.loc[on_weekday['hour'] == hour]
                medians = {
                    destination: at_hour.loc[at_hour['destination'] == destination, 'fraction'].median()
                    for destination in at_hour.destination.unique().tolist()
                }
                # renormalise so the shares of one (origin, weekday, hour) sum to 1
                total = sum(medians.values())
                for destination, median in medians.items():
                    median_book[(origin, destination, hour, weekday)] = median / total
    return median_book

In [None]:
def median_in_fraction_hour_and_weekday(in_fraction_df):
    """Median incoming fraction per (destination, weekday, hour), renormalised over origins.

    For every destination, weekday and hour present in the data, the median of
    'fraction' is taken per origin and the medians are divided by their sum.
    Returns a dictionary keyed by (origin, destination, hour, weekday).
    The length of the input is printed.
    """
    print("dataframe length:")
    print(len(in_fraction_df))
    median_book = {}
    for destination in in_fraction_df.destination.unique().tolist():
        to_destination = in_fraction_df.loc[in_fraction_df['destination'] == destination]
        for weekday in to_destination.day_of_the_week.unique().tolist():
            on_weekday = to_destination.loc[to_destination['day_of_the_week'] == weekday]
            for hour in on_weekday.hour.unique().tolist():
                at_hour = on_weekday.loc[on_weekday['hour'] == hour]
                medians = {
                    origin: at_hour.loc[at_hour['origin'] == origin, 'fraction'].median()
                    for origin in at_hour.origin.unique().tolist()
                }
                # renormalise so the shares of one (destination, weekday, hour) sum to 1
                total = sum(medians.values())
                for origin, median in medians.items():
                    median_book[(origin, destination, hour, weekday)] = median / total
    return median_book

In [None]:
def median_out_fraction_basic(out_fraction_df):
    """Median outgoing fraction per origin, ignoring hour and weekday.

    For every origin the median of 'fraction' is taken per destination over
    all rows, and the medians are divided by their sum. The same share is
    then stored for every (hour, weekday) combination found in the input, so
    the result has the same key layout as the hour/weekday-specific variants:
    (origin, destination, hour, weekday). The length of the input is printed.
    """
    weekdays = out_fraction_df.day_of_the_week.unique().tolist()
    hours = out_fraction_df.hour.unique().tolist()
    print("dataframe length:")
    print(len(out_fraction_df))
    median_book = {}
    for origin in out_fraction_df.origin.unique().tolist():
        from_origin = out_fraction_df.loc[out_fraction_df['origin'] == origin]
        medians = {
            destination: from_origin.loc[from_origin['destination'] == destination, 'fraction'].median()
            for destination in from_origin.destination.unique().tolist()
        }
        # renormalise so the shares of one origin sum to 1
        total = sum(medians.values())
        for destination, median in medians.items():
            share = median / total
            for weekday in weekdays:
                for hour in hours:
                    median_book[(origin, destination, hour, weekday)] = share
    return median_book

In [None]:
def median_in_fraction_basic(in_fraction_df):
    """Median incoming fraction per destination, ignoring hour and weekday.

    For every destination the median of 'fraction' is taken per origin over
    all rows, and the medians are divided by their sum. The same share is
    then stored for every (hour, weekday) combination found in the input.
    Returns a dictionary keyed by (origin, destination, hour, weekday).
    The length of the input is printed.
    """
    weekdays = in_fraction_df.day_of_the_week.unique().tolist()
    hours = in_fraction_df.hour.unique().tolist()
    median_book = {}
    print("dataframe length:")
    print(len(in_fraction_df))
    for destination in in_fraction_df.destination.unique().tolist():
        to_destination = in_fraction_df.loc[in_fraction_df['destination'] == destination]
        medians = {
            origin: to_destination.loc[to_destination['origin'] == origin, 'fraction'].median()
            for origin in to_destination.origin.unique().tolist()
        }
        # renormalise so the shares of one destination sum to 1
        total = sum(medians.values())
        for origin, median in medians.items():
            share = median / total
            for weekday in weekdays:
                for hour in hours:
                    median_book[(origin, destination, hour, weekday)] = share
    return median_book

In [None]:
def median_in_fraction_hour(in_fraction_df):
    """Median incoming fraction per (destination, hour), ignoring the weekday.

    For every destination and hour the median of 'fraction' is taken per
    origin and the medians are divided by their sum. The same share is stored
    for every weekday found in the input. Returns a dictionary keyed by
    (origin, destination, hour, weekday). The length of the input is printed.
    """
    median_book = {}
    weekdays = in_fraction_df.day_of_the_week.unique().tolist()
    print("dataframe length:")
    print(len(in_fraction_df))
    for destination in in_fraction_df.destination.unique().tolist():
        to_destination = in_fraction_df.loc[in_fraction_df['destination'] == destination]
        for hour in to_destination.hour.unique().tolist():
            at_hour = to_destination.loc[to_destination['hour'] == hour]
            medians = {
                origin: at_hour.loc[at_hour['origin'] == origin, 'fraction'].median()
                for origin in at_hour.origin.unique().tolist()
            }
            # renormalise so the shares of one (destination, hour) sum to 1
            total = sum(medians.values())
            for origin, median in medians.items():
                share = median / total
                for weekday in weekdays:
                    median_book[(origin, destination, hour, weekday)] = share
    return median_book

In [None]:
def median_out_fraction_hour(out_fraction_df):
    """Median outgoing fraction per (origin, hour), ignoring the weekday.

    For every origin and hour the median of 'fraction' is taken per
    destination and the medians are divided by their sum. The same share is
    stored for every weekday found in the input. Returns a dictionary keyed by
    (origin, destination, hour, weekday). The length of the input is printed.
    """
    weekdays = out_fraction_df.day_of_the_week.unique().tolist()
    print("dataframe length:")
    print(len(out_fraction_df))
    median_book = {}
    for origin in out_fraction_df.origin.unique().tolist():
        from_origin = out_fraction_df.loc[out_fraction_df['origin'] == origin]
        for hour in from_origin.hour.unique().tolist():
            at_hour = from_origin.loc[from_origin['hour'] == hour]
            medians = {
                destination: at_hour.loc[at_hour['destination'] == destination, 'fraction'].median()
                for destination in at_hour.destination.unique().tolist()
            }
            # renormalise so the shares of one (origin, hour) sum to 1
            total = sum(medians.values())
            for destination, median in medians.items():
                share = median / total
                for weekday in weekdays:
                    median_book[(origin, destination, hour, weekday)] = share
    return median_book

In [None]:
def median_in_fraction_weekday(in_fraction_df):
    """Median incoming fraction per (destination, weekday), ignoring the hour.

    For every destination and weekday the median of 'fraction' is taken per
    origin and the medians are divided by their sum. The same share is stored
    for every hour found in the input. Returns a dictionary keyed by
    (origin, destination, hour, weekday). The length of the input is printed.
    """
    hours = in_fraction_df.hour.unique().tolist()
    median_book = {}
    print("dataframe length:")
    print(len(in_fraction_df))
    for destination in in_fraction_df.destination.unique().tolist():
        to_destination = in_fraction_df.loc[in_fraction_df['destination'] == destination]
        for weekday in to_destination.day_of_the_week.unique().tolist():
            on_weekday = to_destination.loc[to_destination['day_of_the_week'] == weekday]
            medians = {
                origin: on_weekday.loc[on_weekday['origin'] == origin, 'fraction'].median()
                for origin in on_weekday.origin.unique().tolist()
            }
            # renormalise so the shares of one (destination, weekday) sum to 1
            total = sum(medians.values())
            for origin, median in medians.items():
                share = median / total
                for hour in hours:
                    median_book[(origin, destination, hour, weekday)] = share
    return median_book

In [None]:
def median_out_fraction_weekday(out_fraction_df):
    """Median outgoing fraction per (origin, weekday), ignoring the hour.

    For every origin and weekday the median of 'fraction' is taken per
    destination and the medians are divided by their sum. The same share is
    stored for every hour found in the input. Returns a dictionary keyed by
    (origin, destination, hour, weekday). The length of the input is printed.
    """
    hours = out_fraction_df.hour.unique().tolist()
    print("dataframe length:")
    print(len(out_fraction_df))
    median_book = {}
    for origin in out_fraction_df.origin.unique().tolist():
        from_origin = out_fraction_df.loc[out_fraction_df['origin'] == origin]
        for weekday in from_origin.day_of_the_week.unique().tolist():
            on_weekday = from_origin.loc[from_origin['day_of_the_week'] == weekday]
            medians = {
                destination: on_weekday.loc[on_weekday['destination'] == destination, 'fraction'].median()
                for destination in on_weekday.destination.unique().tolist()
            }
            # renormalise so the shares of one (origin, weekday) sum to 1
            total = sum(medians.values())
            for destination, median in medians.items():
                share = median / total
                for hour in hours:
                    median_book[(origin, destination, hour, weekday)] = share
    return median_book

# Road Traffic

In [10]:
# Pickled road-traffic degree dictionaries, read from the road_traffic_data folder.
# NOTE(review): both paths used to point at road_hcd_outdegree.pkl, so the
# in-degree book was silently a copy of the out-degree book. The in-degree file
# name below is assumed by analogy with the out-degree one - confirm it matches
# the file written by the road-traffic notebook.
road_traffic_dir = os.path.join(dir_path, 'road_traffic_data')
road_hcd_indegree_path = os.path.join(road_traffic_dir, 'road_hcd_indegree.pkl')
road_hcd_outdegree_path = os.path.join(road_traffic_dir, 'road_hcd_outdegree.pkl')
# pickle.load can execute arbitrary code: only load files you produced yourself.
with open(road_hcd_indegree_path, 'rb') as f:
    road_indegree_book = pickle.load(f)
with open(road_hcd_outdegree_path, 'rb') as f:
    road_outdegree_book = pickle.load(f)

In [12]:
# Split the road degree dictionaries by year: element 4 of each key is compared with the year string.
# Presumably the keys are (hcd, hour, day, month, year) like the Telia degree dictionaries - confirm against the road-traffic notebook.
road_indegree_book_2020 = {key:value for (key, value) in road_indegree_book.items() if key[4] == "2020"}
road_indegree_book_2019 = {key:value for (key, value) in road_indegree_book.items() if key[4] == "2019"}
road_outdegree_book_2020 = {key:value for (key, value) in road_outdegree_book.items() if key[4] == "2020"}
road_outdegree_book_2019 = {key:value for (key, value) in road_outdegree_book.items() if key[4] == "2019"}

# Comparing Telia and Road Traffic

In [13]:
# NOTE: this cell needs telia_hcd_od_book from the Telia section; it raises NameError if that cell has not been run first.
# Names found at element 0 of the 2020 road in-degree keys.
road_hcd_set = set([key[0] for key in list(road_indegree_book_2020.keys())])
# Element 0 of each Telia OD key is the origin HCD.
telia_hcd_set = set([key[0] for key in list(telia_hcd_od_book.keys())])   
# Despite its name, common_hcd_list is a set: the names present in both sources.
common_hcd_list = road_hcd_set.intersection(telia_hcd_set)
print(common_hcd_list)

NameError: name 'telia_hcd_od_book' is not defined

In [None]:
def get_two_degree_dictionary_and_make_a_dataframe_of_their_mutual_rows(road_degree_dict, telia_degree_dict):
    """Join a road and a Telia degree dictionary on the keys they have in common.

    Both dictionaries are keyed by (region, hour, day, month, year) tuples of
    numeric strings (region being a name). Keys present in only one of the two
    are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per common key with the columns region, hour, day, month,
        year, day_of_the_week, timestamp ("<year>-<month>-<day> <hour>:00"),
        road_degree and telia_degree. Rows are ordered by region and then
        chronologically, so the result no longer depends on set iteration
        order and is identical from one kernel session to the next.
    """
    df_header = ['region', 'hour', 'day', 'month', 'year', "day_of_the_week", 'timestamp', 'road_degree', 'telia_degree']
    mutual_keys = set(telia_degree_dict.keys()).intersection(road_degree_dict.keys())

    def chronological(key):
        region, hour, day, month, year = key
        # Date parts are strings; compare them as numbers ("2" before "10").
        return (region, int(year), int(month), int(day), int(hour))

    rows = []
    for key in sorted(mutual_keys, key=chronological):
        (region, hour, day, month, year) = key
        datetime_string = year+"-"+month+"-"+day+" "+hour+":00"
        weekday = date_to_weekday(int(day), int(month), int(year))
        rows.append([region, hour, day, month, year, weekday, datetime_string, road_degree_dict[key], telia_degree_dict[key]])
    return pd.DataFrame(rows, columns=df_header)

In [None]:
# Join road and Telia degrees on their common (region, hour, day, month, year) keys.
# The road dictionaries are already restricted to one year, so intersecting them with the
# all-years Telia dictionaries keeps that year only.
in_degree_road_and_telia_df_2020 = get_two_degree_dictionary_and_make_a_dataframe_of_their_mutual_rows(road_indegree_book_2020, hcd_telia_indegree_book)
in_degree_road_and_telia_df_2019 = get_two_degree_dictionary_and_make_a_dataframe_of_their_mutual_rows(road_indegree_book_2019, hcd_telia_indegree_book)
out_degree_road_and_telia_df_2020 = get_two_degree_dictionary_and_make_a_dataframe_of_their_mutual_rows(road_outdegree_book_2020, hcd_telia_outdegree_book)
out_degree_road_and_telia_df_2019 = get_two_degree_dictionary_and_make_a_dataframe_of_their_mutual_rows(road_outdegree_book_2019, hcd_telia_outdegree_book)
# Display the 2020 in-degree table as the cell output.
in_degree_road_and_telia_df_2020

In [None]:
def make_regional_degree_df(degree_df):
    """Split degree_df by region.

    Returns a dictionary mapping each distinct value of the 'region' column
    (in order of first appearance) to the rows having that region.
    """
    return {
        region: degree_df.loc[degree_df['region'] == region]
        for region in degree_df.region.unique().tolist()
    }

In [None]:
def divide_different_hours(road_and_telia_degree_dataframe):
    """Split the DataFrame into the four hour bins '0', '6', '12' and '18'.

    Returns a dictionary with exactly those four keys; each value holds the
    rows whose 'hour' column equals that string (possibly none).
    """
    degree_df = road_and_telia_degree_dataframe
    return {
        hour: degree_df.loc[degree_df['hour'] == hour]
        for hour in ('0', '6', '12', '18')
    }

In [None]:
def divide_different_weekdays(road_and_telia_degree_dataframe):
    """Split the DataFrame by weekday name.

    Returns a dictionary with the seven keys "Monday" ... "Sunday"; each value
    holds the rows whose 'day_of_the_week' column equals that name (possibly
    none).
    """
    degree_df = road_and_telia_degree_dataframe
    weekday_names = ("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
    return {
        name: degree_df.loc[degree_df['day_of_the_week'] == name]
        for name in weekday_names
    }

In [None]:
def divide_different_hours_and_different_weekdays(road_and_telia_degree_dataframe):
    """Split the DataFrame by hour bin and weekday at the same time.

    Returns a dictionary keyed by (hour, weekday) for every combination of the
    hours '0', '6', '12', '18' and the seven weekday names; each value holds
    the rows matching both (possibly none).
    """
    degree_df = road_and_telia_degree_dataframe
    hours = ('0', '6', '12', '18')
    weekday_names = ("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
    return {
        (hour, name): degree_df.loc[(degree_df['hour'] == hour) & (degree_df['day_of_the_week'] == name)]
        for hour in hours
        for name in weekday_names
    }

In [None]:
def weekend_and_weekdays(road_and_telia_degree_dataframe):
    """Split the DataFrame into weekend rows and all other rows.

    Returns a dictionary with the keys "weekend" (rows whose 'day_of_the_week'
    is "Saturday" or "Sunday") and "weekday" (every other row).
    """
    degree_df = road_and_telia_degree_dataframe
    is_weekend = degree_df['day_of_the_week'].isin(["Saturday", "Sunday"])
    return {
        "weekend": degree_df.loc[is_weekend],
        "weekday": degree_df.loc[~is_weekend],
    }

In [None]:
def linear_transform_from_road_degree_to_telia_degree(filter_function, road_and_telia_degree_dataframe):
    """Fit telia_degree ~ m * road_degree + b per region, within each filtered subset.

    Parameters
    ----------
    filter_function : callable or None
        Takes the DataFrame and returns a dictionary of sub-DataFrames (for
        example divide_different_hours). With None the whole DataFrame is
        used as a single subset.
    road_and_telia_degree_dataframe : pandas.DataFrame
        Needs the columns region, hour, day, month, year, road_degree and
        telia_degree.

    Returns
    -------
    dict
        (region, hour, weekday) -> (correlation_coefficient, p_value, m, b),
        where the first two come from scipy's pearsonr and m, b are the slope
        and intercept of a degree-1 numpy polyfit. Every row of a fitted
        subset registers that subset's fit under its own (region, hour,
        weekday); a later subset producing the same key overwrites it.
    """
    if filter_function is None:
        subsets = {"all": road_and_telia_degree_dataframe}
    else:
        subsets = filter_function(road_and_telia_degree_dataframe)

    transformation_values_book = {}
    for subset_df in subsets.values():
        for region_df in make_regional_degree_df(subset_df).values():
            road_degrees = region_df['road_degree'].tolist()
            telia_degrees = region_df['telia_degree'].tolist()
            correlation_coefficient, p_value = stat.pearsonr(telia_degrees, road_degrees)
            m, b = np.polyfit(road_degrees, telia_degrees, 1)
            row_slots = zip(
                region_df['region'].tolist(),
                region_df['hour'].tolist(),
                region_df['day'].tolist(),
                region_df['month'].tolist(),
                region_df['year'].tolist(),
            )
            for reg, hour, day, month, year in row_slots:
                # TODO: the weekday is recomputed for every row, which is expensive.
                weekday = date_to_weekday(day, month, year)
                transformation_values_book[(reg, hour, weekday)] = (correlation_coefficient, p_value, m, b)
    return transformation_values_book

In [None]:
# Fit the road -> Telia linear transformation on the 2019 tables, separately for each hour bin
# (divide_different_hours) and, inside the function, for each region.
linear_transformation_book_indegree_2019 = linear_transform_from_road_degree_to_telia_degree(divide_different_hours, in_degree_road_and_telia_df_2019)
linear_transformation_book_outdegree_2019 = linear_transform_from_road_degree_to_telia_degree(divide_different_hours, out_degree_road_and_telia_df_2019)
# Display the in-degree fits as the cell output.
linear_transformation_book_indegree_2019

In [None]:
# Add a datetime column parsed from the 'timestamp' strings; this modifies the DataFrame in place.
in_degree_road_and_telia_df_2020['parsed_time'] = pd.to_datetime(in_degree_road_and_telia_df_2020['timestamp'])
#in_degree_road_and_telia_df_2020

In [None]:
# Display the 2020 in-degree comparison DataFrame as the cell output.
in_degree_road_and_telia_df_2020

In [None]:
def predict_flow(keys_to_predict_for, in_road_degree_book, indegree_transformation_book, median_in_frac_book, out_road_degree_book, outdegree_transformation_book, median_out_frac_book, path_to_save_results = None):
    """Estimate origin-destination flows from road degrees.

    For every requested key that has all the inputs it needs, two estimates
    are computed and then averaged:

    * out-based: (out_road_degree[origin] * m_out + b_out) * median_out_fraction
    * in-based:  (in_road_degree[destination] * m_in + b_in) * median_in_fraction

    Parameters
    ----------
    keys_to_predict_for : iterable of (origin, destination, hour, day, month, year)
        Keys to produce estimates for. Keys lacking any required input are
        silently left out.
    in_road_degree_book, out_road_degree_book : dict
        Road degrees keyed by (region, hour, day, month, year).
    indegree_transformation_book, outdegree_transformation_book : dict
        Keyed by (region, hour, weekday); elements 2 and 3 of each value are
        used as slope and intercept.
    median_in_frac_book, median_out_frac_book : dict
        Fractions keyed by (origin, destination, hour, weekday).
    path_to_save_results : str, optional
        When truthy, the combined estimates are written to this path as CSV.

    Returns
    -------
    tuple of dict
        (in_pred_book, out_pred_book, comb_pred_book), each keyed by
        (origin, destination, hour, day, month, year).
    """
    out_transform_keys = set(outdegree_transformation_book.keys())
    in_transform_keys = set(indegree_transformation_book.keys())
    out_degree_keys = set(out_road_degree_book.keys())
    in_degree_keys = set(in_road_degree_book.keys())

    # Keep only the keys for which every lookup further down will succeed.
    usable_keys = []
    for (origin, destination, hour, day, month, year) in keys_to_predict_for:
        if (origin, hour, day, month, year) not in out_degree_keys:
            continue
        if (destination, hour, day, month, year) not in in_degree_keys:
            continue
        # The weekday is only worked out once both road degrees are known to exist.
        weekday = date_to_weekday(day, month, year)
        fraction_key = (origin, destination, hour, weekday)
        if ((origin, hour, weekday) in out_transform_keys
                and (destination, hour, weekday) in in_transform_keys
                and fraction_key in median_out_frac_book
                and fraction_key in median_in_frac_book):
            usable_keys.append((origin, destination, hour, day, month, year))

    out_pred_book = {}
    in_pred_book = {}
    for full_key in usable_keys:
        origin, destination, hour, day, month, year = full_key
        weekday = date_to_weekday(day, month, year)
        fraction_key = (origin, destination, hour, weekday)

        # Out-based estimate: transform the origin's road out-degree, then scale by the OD fraction.
        out_transform = outdegree_transformation_book[(origin, hour, weekday)]
        slope_out, intercept_out = out_transform[2], out_transform[3]
        out_fraction = median_out_frac_book[fraction_key]
        out_degree = out_road_degree_book[(origin, hour, day, month, year)]
        out_pred_book[full_key] = (out_degree * slope_out + intercept_out) * out_fraction

        # In-based estimate: same recipe on the destination's road in-degree.
        in_transform = indegree_transformation_book[(destination, hour, weekday)]
        slope_in, intercept_in = in_transform[2], in_transform[3]
        in_fraction = median_in_frac_book[fraction_key]
        in_degree = in_road_degree_book[(destination, hour, day, month, year)]
        in_pred_book[full_key] = (in_degree * slope_in + intercept_in) * in_fraction

    # Combined estimate: mean of the in-based and out-based estimates.
    comb_pred_book = {
        full_key: (in_pred_book[full_key] + out_pred_book[full_key])/2
        for full_key in in_pred_book
        if full_key in out_pred_book
    }

    if path_to_save_results:
        print("printing")
        column_names = ["flow_estimate", "timestamp", "date", "day_of_the_week", "hour", "origin_hcd", "destination_hcd"]
        with open(path_to_save_results, 'w') as results_file:
            results_file.write(",".join(column_names)+"\n")
            for full_key, estimate in comb_pred_book.items():
                hour, day, month, year = (str(part) for part in full_key[2:6])
                date_string = year+"-"+month+"-"+day
                row = [
                    str(estimate),
                    date_string+" "+hour+":00",
                    date_string,
                    date_to_weekday(day, month, year),
                    hour,
                    full_key[0],
                    full_key[1],
                ]
                results_file.write(",".join(row)+"\n")

    return(in_pred_book, out_pred_book, comb_pred_book)

## Telia Baseline 

In [None]:
def median_flow_2019(telia_od_df_2019):
    """Build a lookup of median flows from the 2019 Telia OD frame.

    The frame is split successively by `make_regional_in_frac_df`,
    `make_weekday_frac_df` and `make_hour_frac_df` (defined elsewhere in this
    notebook; each is used here as a mapping from a label to a sub-frame).
    Within every resulting sub-frame the median of the `flow` column is taken
    per distinct `origin`.

    Parameters
    ----------
    telia_od_df_2019 : pandas.DataFrame
        Needs `origin` and `flow` columns, plus whatever the splitting
        helpers require.

    Returns
    -------
    dict
        Maps (o, d, h, w) to the median flow, where `o` is the origin and
        `d`, `w`, `h` are the labels produced by the three helpers
        (presumably destination, weekday and hour; confirm against the helpers).

    Notes
    -----
    Origins that select no rows are skipped. This happens for a NaN origin,
    which `unique()` reports but which never compares equal to itself.
    Previously the guard for this case was defeated by a misspelled flag
    name, so such origins were stored with a stale median from the preceding
    iteration.
    """
    print("dataframe length:")
    print(len(telia_od_df_2019))
    median_book = {}
    counter = 0
    encountered_keys = 0
    separated_by_d = make_regional_in_frac_df(telia_od_df_2019)
    for d in separated_by_d.keys():
        separated_by_d_w = make_weekday_frac_df(separated_by_d[d])
        for w in separated_by_d_w.keys():
            separated_by_d_w_h = make_hour_frac_df(separated_by_d_w[w])
            for h in separated_by_d_w_h.keys():
                small_df = separated_by_d_w_h[h]
                unique_origins = small_df.origin.unique().tolist()
                for o in unique_origins:
                    selected_lines = small_df.loc[small_df['origin'] == o]
                    data_points = len(selected_lines)
                    if data_points == 0:
                        # Nothing to take a median of; do not store a value for this key.
                        continue
                    if data_points == 1:
                        # Keep the single observation as-is rather than going through median().
                        median = selected_lines['flow'].values[0]
                    else:
                        median = selected_lines['flow'].median()
                    if (o, d, h, w) in median_book:
                        encountered_keys += 1
                    median_book[(o, d, h, w)] = median
                    counter += 1
    print("how many times the innest part of the loop is reached")
    print(counter)
    print("repeated keys")
    print(encountered_keys)
    return(median_book)

In [None]:
def baseline_prediction(telia_hcd_od_book, median_od_book_2019, path_to_save_results = None):
    """Predict 2020 flows by looking up the matching 2019 median.

    Parameters
    ----------
    telia_hcd_od_book : dict
        Keyed by (origin, destination, hour, day, month, year). Only keys
        whose year equals the string "2020" are predicted for.
    median_od_book_2019 : dict
        Medians keyed by (origin, destination, hour, weekday).
    path_to_save_results : str, optional
        When truthy, the predictions are written to this path as CSV.

    Returns
    -------
    dict
        Maps each predicted (origin, destination, hour, day, month, year)
        key to its 2019 median. Keys without a 2019 median are omitted.
    """
    pred_book = {}
    missing_medians = 0  # counted for ad-hoc inspection only; not returned or printed
    for od_key in telia_hcd_od_book.keys():
        origin, destination, hour, day, month, year = od_key
        if year != "2020":
            continue
        median_key = (origin, destination, hour, date_to_weekday(day, month, year))
        if median_key in median_od_book_2019.keys():
            pred_book[(origin, destination, hour, day, month, year)] = median_od_book_2019[median_key]
        else:
            missing_medians += 1

    if path_to_save_results:
        print("printing")
        column_names = ["flow_estimate", "timestamp", "date", "day_of_the_week", "hour", "origin_hcd", "destination_hcd"]
        with open(path_to_save_results, 'w') as results_file:
            results_file.write(",".join(column_names)+"\n")
            for od_key, estimate in pred_book.items():
                hour, day, month, year = (str(part) for part in od_key[2:6])
                date_string = year+"-"+month+"-"+day
                row = [
                    str(estimate),
                    date_string+" "+hour+":00",
                    date_string,
                    date_to_weekday(day, month, year),
                    hour,
                    od_key[0],
                    od_key[1],
                ]
                results_file.write(",".join(row)+"\n")
    return(pred_book)

# make baseline predictions

In [None]:
# Keep only the 2019 rows. `year` is compared against the string "2019",
# so the column is assumed to hold strings — TODO confirm.
telia_hcd_od_dataframe_2019 = telia_hcd_od_dataframe.loc[telia_hcd_od_dataframe['year'] == "2019"]


In [None]:
# Build the 2019 median-flow lookup; median_flow_2019 also prints a few counters while it runs.
median_od_book_2019 = median_flow_2019(telia_hcd_od_dataframe_2019)

In [None]:
# Output CSV for the baseline. NOTE(review): the '/.../' prefix looks like a redacted
# placeholder — point this at a real directory before running.
path_to_save_estimation = '/.../median_of_2019_as_a_baseline.csv'
# Predict each 2020 key in `telia_hcd_od_book_2020` (defined in an earlier cell) from the 2019 medians
# and write the result to the path above.
baseline_pediction_book_2020 = baseline_prediction(telia_hcd_od_book_2020, median_od_book_2019, path_to_save_estimation)

## Different versions of combining road data and Telia data from 2019 to predict OD flows in 2020

In [None]:
# Join the 2019 road and Telia degree books on their shared rows, once for in-degree and once for out-degree.
# NOTE(review): the visible cells below use the `_2019`-suffixed frames
# (`in_degree_road_and_telia_df_2019` / `out_degree_road_and_telia_df_2019`), not these unsuffixed
# names — confirm whether these two assignments are still needed.
in_degree_road_and_telia_df = get_two_degree_dictionary_and_make_a_dataframe_of_their_mutual_rows(road_indegree_book_2019 , hcd_telia_indegree_book_2019)
out_degree_road_and_telia_df = get_two_degree_dictionary_and_make_a_dataframe_of_their_mutual_rows(road_outdegree_book_2019 , hcd_telia_outdegree_book_2019)

## Basic:
* Use 2019 data for training
* Linear regression: only one line fitted
* Median of fractions: median taken in the most basic way

In [None]:
# Calculate the median fraction books for the "basic" variant from the 2019 fraction frames
# (helpers and frames are defined in earlier cells).
median_in_fraction_book_x_0 = median_in_fraction_basic(in_frac_df_2019)
median_out_fraction_book_x_0 = median_out_fraction_basic(out_frac_df_2019)

In [None]:
# Linear regression for the "basic" variant: None is passed as the first (splitting) argument.
# Presumably this means no hour/weekday split — verify against the function definition.
linear_transform_book_indegree_0_x = linear_transform_from_road_degree_to_telia_degree(None, in_degree_road_and_telia_df_2019)
linear_transform_book_outdegree_0_x = linear_transform_from_road_degree_to_telia_degree(None, out_degree_road_and_telia_df_2019)

In [None]:
# Select the "basic" median books for the next predict_flow call.
# These shared names are rebound again in later sections, so cell execution order matters.
median_in_book = median_in_fraction_book_x_0
median_out_book = median_out_fraction_book_x_0

In [None]:
# Select the "basic" transformation books for the next predict_flow call.
# These shared names are rebound again in later sections, so cell execution order matters.
linear_transform_book_in = linear_transform_book_indegree_0_x
linear_transform_book_out = linear_transform_book_outdegree_0_x

In [None]:
# Predict for every key of the 2020 Telia OD book; predict_flow unpacks each key as
# (origin, destination, hour, day, month, year). This name is reused by all variants below.
keys_to_predict = telia_hcd_od_book_2020.keys()

In [None]:
# Run the "basic" variant and write the combined estimates to CSV.
# NOTE(review): the '/.../' prefix looks like a redacted placeholder — set a real directory.
path_to_save_estimation = '/.../telia_road_estimation_basic.csv'
# The three returned books are discarded; only the CSV written by predict_flow is kept.
_, _, _ = predict_flow(keys_to_predict , road_indegree_book, linear_transform_book_in, median_in_book, road_outdegree_book, \
                       linear_transform_book_out, median_out_book, path_to_save_estimation)

## Daily rhythms:
* Use 2019 data for training
* Linear regression: one line fitted for each quarter of the day
* Median of fractions: median taken over the data points that fall in the same hour of the day

In [None]:
# Linear regression for the "daily rhythms" variant: the 2019 frames are split with
# `divide_different_hours` (defined in an earlier cell) before fitting.
linear_transform_book_indegree_2_x = linear_transform_from_road_degree_to_telia_degree(divide_different_hours, in_degree_road_and_telia_df_2019)
linear_transform_book_outdegree_2_x = linear_transform_from_road_degree_to_telia_degree(divide_different_hours, out_degree_road_and_telia_df_2019)

In [None]:
# Rebind the shared transformation-book names to the "daily rhythms" fits for the next predict_flow call.
linear_transform_book_in = linear_transform_book_indegree_2_x
linear_transform_book_out = linear_transform_book_outdegree_2_x

In [None]:
# Calculate the median fraction books for the "daily rhythms" variant using the per-hour helpers
# (defined in earlier cells).
median_in_fraction_book_x_1 = median_in_fraction_hour(in_frac_df_2019)
median_out_fraction_book_x_1 = median_out_fraction_hour(out_frac_df_2019)

In [None]:
# Rebind the shared median-book names to the per-hour medians for the next predict_flow call.
median_in_book = median_in_fraction_book_x_1
median_out_book = median_out_fraction_book_x_1

In [None]:
# Run the "daily rhythms" variant and write the combined estimates to CSV.
# NOTE(review): the '/.../' prefix looks like a redacted placeholder — set a real directory.
path_to_save_estimation = '/.../telia_road_estimation_daily_rhythms.csv'
# Relies on `keys_to_predict` from the "basic" section; the returned books are discarded.
_, _, _ = predict_flow(keys_to_predict, road_indegree_book, linear_transform_book_in, median_in_book, road_outdegree_book, \
                       linear_transform_book_out, median_out_book, path_to_save_estimation)

## Weekly:
* Use 2019 data for training
* Linear regression: one line fitted for each day of the week
* Median of fractions: median taken over the data points that fall on the same day of the week

In [None]:
# Calculate the median fraction books for the "weekly" variant using the per-weekday helpers
# (defined in earlier cells).
median_out_fraction_book_x_2 = median_out_fraction_weekday(out_frac_df_2019)
median_in_fraction_book_x_2 = median_in_fraction_weekday(in_frac_df_2019)

In [None]:
# Rebind the shared median-book names to the per-weekday medians for the next predict_flow call.
median_in_book = median_in_fraction_book_x_2
median_out_book = median_out_fraction_book_x_2

In [None]:
# Linear regression for the "weekly" variant: the 2019 frames are split with
# `divide_different_weekdays` (defined in an earlier cell); the results are bound
# directly to the shared names used by predict_flow.
linear_transform_book_in = linear_transform_from_road_degree_to_telia_degree(divide_different_weekdays, in_degree_road_and_telia_df_2019)
linear_transform_book_out = linear_transform_from_road_degree_to_telia_degree(divide_different_weekdays, out_degree_road_and_telia_df_2019)

In [None]:
# Run the "weekly" variant and write the combined estimates to CSV.
# NOTE(review): the '/.../' prefix looks like a redacted placeholder — set a real directory.
path_to_save_estimation = '/.../telia_road_estimation_weekly_rhythms.csv'
# Relies on `keys_to_predict` from the "basic" section; the returned books are discarded.
_, _, _ = predict_flow(keys_to_predict, road_indegree_book, linear_transform_book_in, median_in_book, road_outdegree_book, \
                       linear_transform_book_out, median_out_book, path_to_save_estimation)

## Daily and weekly:
* Use 2019 data for training
* Linear regression: one line fitted for each (hour, weekday)
* Median of fractions: median taken over the data points that fall on the same day of the week and in the same hour

In [None]:
# Calculate the median fraction books for the "daily and weekly" variant using the
# per-hour-and-weekday helpers (defined in earlier cells).
median_out_fraction_book_x_3 = median_out_fraction_hour_and_weekday(out_frac_df_2019)
median_in_fraction_book_x_3 = median_in_fraction_hour_and_weekday(in_frac_df_2019)

In [None]:
# Linear regression for the "daily and weekly" variant: the 2019 frames are split with
# `divide_different_hours_and_different_weekdays` (defined in an earlier cell); the results
# are bound directly to the shared names used by predict_flow.
linear_transform_book_in = linear_transform_from_road_degree_to_telia_degree(divide_different_hours_and_different_weekdays, in_degree_road_and_telia_df_2019)
linear_transform_book_out = linear_transform_from_road_degree_to_telia_degree(divide_different_hours_and_different_weekdays, out_degree_road_and_telia_df_2019)

In [None]:
# Rebind the shared median-book names to the per-hour-and-weekday medians for the next predict_flow call.
median_in_book = median_in_fraction_book_x_3
median_out_book = median_out_fraction_book_x_3

In [None]:
# Run the "daily and weekly" variant and write the combined estimates to CSV.
# NOTE(review): the '/.../' prefix looks like a redacted placeholder — set a real directory.
path_to_save_estimation = '/.../telia_road_estimation_daily_and_weekly_rhythms.csv'
# Relies on `keys_to_predict` from the "basic" section; the returned books are discarded.
_, _, _ = predict_flow(keys_to_predict, road_indegree_book, linear_transform_book_in, median_in_book, road_outdegree_book, \
                       linear_transform_book_out, median_out_book, path_to_save_estimation)