In [1]:
import ast
import os
import re

import geopandas as gpd
import pandas as pd

# Show every column when a wide DataFrame is displayed.
pd.set_option('display.max_columns', None)
# NOTE(review): machine-specific absolute path. The later `from script.trips_format import *`
# presumably relies on this working directory — consider deriving it from a configurable
# project root so the notebook runs on other machines.
os.chdir(r"C:\Users\LMENENDEZ\GitHub\MultiModX")
print(os.getcwd())

C:\Users\LMENENDEZ\GitHub\MultiModX


# Calculate coefficients to assign ground paths for international trips

Steps to follow:
1. Download MND
2. Eliminate international trips that travel via ground and trips that use deprecated stations
3. Use the coefficients calculated with aena data to re-scale the trips
4. Assign a "MultiModX path" (i.e., a path composed of only MultiModX stations) to the remaining trips that reach airports via train
Question: what do we do with trips that use rail but for which no MultiModX path can be assigned?
5. Calculate the coefficient per first/last airport and path
6. Analyse results
7. Export results

## 1. Download MND

In [2]:
%load_ext autoreload

In [3]:
%autoreload
from script.trips_format import *

In [4]:
all_trips = pd.read_csv(
    r"G:\Unidades compartidas\04_PROYECTOS I+D+i\2023 MultiModX\iii) Project\WP4 Performance Assessment Solution\Demand data\Matrices MITMA\with_archetypes\20220922_28_trip_matrix_arc_pt_processed.csv.gz",
    compression="gzip",
    sep="|"
)

In [5]:
trips = all_trips[all_trips["date"] == 20220923].reset_index(drop=True).rename(columns={"origin_nut": "origin", "destination_nut": "destination"})

In [6]:
# Maps each island airport node ("airport_<IATA>") to a (NUTS-3 code, region name) pair.
# The mapping is passed to format_trips together with the trips table.
# NOTE(review): airport_GMZ is La Gomera's airport, whose own NUTS-3 code is ES706;
# it is mapped to Tenerife (ES709) here — confirm this grouping is intentional.
airports_to_NUTS={"airport_LPA":("ES705","Gran Canaria"),
                 "airport_FUE":("ES704","Fuerteventura"),
                 "airport_ACE":("ES708","Lanzarote"),
                 "airport_TFS":("ES709","Tenerife"),
                 "airport_TFN":("ES709","Tenerife"),
                 "airport_GMZ":("ES709","Tenerife"),
                 "airport_SPC":("ES707","La Palma"),
                 "airport_VDE":("ES703","El Hierro"),
                 "airport_PMI":("ES532","Mallorca"),
                 "airport_IBZ":("ES531","Eivissa i Formentera"),
                 "airport_MAH":("ES533","Menorca")}

In [7]:
trips=format_trips(trips, airports_to_NUTS)

17 columns were removed


## 2. Eliminate international trips that travel via ground and trips that use deprecated stations

In [8]:
# Remove trips between ES424 and ES300 (in either direction), which the author
# treats as cercanías (commuter-rail) trips.
trips=trips[~(((trips["origin"]=="ES424")&(trips["destination"]=="ES300"))|((trips["origin"]=="ES300")&(trips["destination"]=="ES424")))]

In [9]:
# location of "ALL" train stops given by UiC
# However this list is still incomplete
stops_loc=pd.read_csv(r"G:\Unidades compartidas\04_PROYECTOS I+D+i\2023 MultiModX\iii) Project\WP3 Scenario definition\Case study input data\Spain+abroad\v=0.1\infrastructure\rail_info\stops.txt").astype(str) # everything is a string here to match other formatting
# Prefix every stop id with "00" (the column holds strings after the astype(str) above).
stops_loc["stop_id"] = "00" + stops_loc["stop_id"]

In [10]:
trips.loc[:,"weird_stations"] = trips["node_sequence_reduced"].apply(
    lambda x: find_weird_stations(x, stops_loc))

In [11]:
unique_weird_stations=get_weird_stations(trips["weird_stations"])

In [12]:
MobA_stations_coord=gpd.read_file(r"G:\Unidades compartidas\04_PROYECTOS I+D+i\2023 MultiModX\iii) Project\WP3 Scenario definition\Case study input data\Spain+abroad\v=0.1\datos moba\train_stations\train_stations.shp")

In [13]:
# identifies all the un-localisable stations
nowhere_stations=set(unique_weird_stations)-set(MobA_stations_coord["ID"])
print(f"there are {len(nowhere_stations)} stations that are not in the data provided by MobA but appear in the trips dataframe")

there are 205 stations that are not in the data provided by MobA but appear in the trips dataframe


In [14]:
trips = trips[~trips["node_sequence_reduced"].apply(lambda x: any(station in x for station in nowhere_stations))]

In [15]:
trips_abroad=trips[(trips["origin"]=="abroad")|(trips["destination"]=="abroad")]

In [16]:
# creates a dictionary of mcc code to country acronym
international_codes=pd.read_csv(r"G:\Unidades compartidas\04_PROYECTOS I+D+i\2023 MultiModX\iii) Project\WP3 Scenario definition\Case study input data\Spain+abroad\v=0.1\infrastructure\countries mcc\mcc_to_nationality.txt", sep="|")
mcc_to_country=international_codes.set_index("mcc")["country"].to_dict()

In [17]:
trips_abroad=format_trips_abroad(trips_abroad,mcc_to_country)

## 3. Use the coefficients calculated with aena data to re-scale the trips

In [18]:
coeffs_incoming=pd.read_csv(r"G:\Unidades compartidas\04_PROYECTOS I+D+i\2023 MultiModX\iii) Project\WP3 Scenario definition\Case study input data\Spain+abroad\CS11\coefficients_to_rescale_MND\incoming_trips_coefficients_all.csv")

In [19]:
coeffs_outgoing=pd.read_csv(r"G:\Unidades compartidas\04_PROYECTOS I+D+i\2023 MultiModX\iii) Project\WP3 Scenario definition\Case study input data\Spain+abroad\CS11\coefficients_to_rescale_MND\outgoing_trips_coefficients_all.csv")

In [20]:
# The incoming and outgoing coefficient tables cover slightly different countries;
# keep only the countries that appear in both.
countries_incoming=set(coeffs_incoming["origin"])
countries_outgoing=set(coeffs_outgoing["destination"])

# (A | B) - (A - B) - (B - A) is exactly the intersection A & B.
countries_selected=countries_outgoing & countries_incoming

In [21]:
coeffs_incoming=coeffs_incoming[coeffs_incoming["origin"].isin(countries_selected)]
coeffs_outgoing=coeffs_outgoing[coeffs_outgoing["destination"].isin(countries_selected)]

In [22]:
trips_abroad_selected=trips_abroad[(trips_abroad["origin"].isin(countries_selected))|(trips_abroad["destination"].isin(countries_selected))]

In [23]:
trips_abroad_rescaled=rescale_trips(trips_abroad_selected,coeffs_incoming,coeffs_outgoing)

In [24]:
airport_codes=pd.read_csv(r"G:\Unidades compartidas\04_PROYECTOS I+D+i\2023 MultiModX\iii) Project\WP3 Scenario definition\Case study input data\Spain+abroad\v=0.1\infrastructure\airports_info\IATA_ICAO_Airport_codes_v1.3.csv")
iata_to_icao=airport_codes.set_index("IATA")["ICAO"].to_dict()

In [25]:
#change the entry_point or exit_point to icao code
trips_abroad_rescaled["entry_point"]=trips_abroad_rescaled["entry_point"].apply(lambda x: format_airports(x,iata_to_icao))
trips_abroad_rescaled["exit_point"]=trips_abroad_rescaled["exit_point"].apply(lambda x: format_airports(x,iata_to_icao))

In [26]:
trips_going_to_spain=trips_abroad_rescaled[trips_abroad_rescaled["exit_point"].isna()]
trips_leaving_spain=trips_abroad_rescaled[trips_abroad_rescaled["entry_point"].isna()]

In [27]:
trips_going_to_spain["trips"].sum()

np.float64(307718.2463604814)

In [28]:
trips_leaving_spain["trips"].sum()

np.float64(281570.2128004248)

## 4. Assign a "MultiModX path" (i.e., a path composed of only MultiModX stations) to the remaining trips that reach airports via train

In [29]:
# coordinates, geometry and other properties of all NUTS (in Europe?)
NUTS_coord=gpd.read_file(r"G:\Unidades compartidas\04_PROYECTOS I+D+i\2023 MultiModX\iii) Project\WP4 Performance Assessment Solution\Demand data\nuts3_2003_geom_10.gpkg")

In [30]:
# list of stations considered in MMX
train_stations_considered=pd.read_csv(r"G:\Unidades compartidas\04_PROYECTOS I+D+i\2023 MultiModX\iii) Project\WP3 Scenario definition\Case study input data\Spain+abroad\CS11\v=0.25\infrastructure\rail_info\rail_stations_considered_GTFS_2022v0.1.csv").astype(str)
# Prefix every stop id with "00" (the column holds strings after the astype(str) above).
train_stations_considered["stop_id"] = "00" + train_stations_considered["stop_id"]

In [31]:
# align coordinate reference system (crs)
NUTS_coord = NUTS_coord.to_crs(MobA_stations_coord.crs)

# Perform a spatial join to find which NUTS region each station belongs to
spatial_join = gpd.sjoin(MobA_stations_coord, NUTS_coord, how="left", predicate="within")

# Construct the dictionary. This dictionary tells us in which nuts is each station
station_to_nuts = dict(zip(spatial_join["ID"], spatial_join["geocode"]))

In [32]:
# adds a column for the MobA acronym and another one with the NUTS of the station
train_stations_considered=format_train_stations_considered(train_stations_considered,station_to_nuts)

In [33]:
# list of all MMX stations
train_station_MMX=train_stations_considered["stop_id"].tolist()

In [34]:
trips_going_to_spain=process_node_sequence_MMX(trips_going_to_spain,train_station_MMX,iata_to_icao)

In [35]:
trips_leaving_spain=process_node_sequence_MMX(trips_leaving_spain,train_station_MMX,iata_to_icao)

In [36]:
trips_going_to_spain.sample(5)

Unnamed: 0,date,origin,origin_name,destination,destination_name,entry_point,exit_point,origin_purpose,destination_purpose,legs,nationality,archetype_0,archetype_1,archetype_2,archetype_3,archetype_4,archetype_5,n_legs,mode_sequence,node_sequence,start_node,end_node,type,road_legs,train_legs,plane_legs,node_sequence_reduced,start_node_reduced,end_node_reduced,mode_tp,trips,node_sequence_MMX
61330,20220923,MA,MA,ES617,Málaga,LEMG,,NF,NF,P12*abroad_604*2906711*None*airport_AGP*00-01*...,MA,7.219072,6.663759,3.331879,0.0,0.277657,0.277657,2,plane-road,airport_AGP,airport_AGP,airport_AGP,international_O,1,0,1,airport_AGP,airport_AGP,airport_AGP,['air'],17.770023,['LEMG']
67119,20220923,PT,PT,ES511,Barcelona,LEBL,,NF,H,P08*PT170*0816904*None*airport_BCN*00-01*plane...,ES,0.387275,0.379838,0.392853,0.011341,0.042204,0.027331,2,plane-road,airport_BCN,airport_BCN,airport_BCN,international_O,1,0,1,airport_BCN,airport_BCN,airport_BCN,['air'],1.240843,['LEBL']
67436,20220923,PT,PT,ES511,Barcelona,LEBL,,NF,H,P21*PT11A*0816904*None*airport_BCN*00-01*plane...,ES,0.229409,0.257517,0.184354,0.014054,0.031001,0.018601,2,plane-road,airport_BCN,airport_BCN,airport_BCN,international_O,1,0,1,airport_BCN,airport_BCN,airport_BCN,['air'],0.734936,['LEBL']
48494,20220923,FR,FR,ES300,Madrid,LEMD,,NF,NF,P15*abroad_208*2807921*None*airport_MAD*00-01*...,FR,1.738876,0.4387,0.5223,0.021497,0.097931,0.024682,2,plane-road,airport_MAD,airport_MAD,airport_MAD,international_O,1,0,1,airport_MAD,airport_MAD,airport_MAD,['air'],2.843986,['LEMD']
57214,20220923,FR,FR,ES618,Sevilla,LEJR,,NF,H,P08*FR107*1102009*None*airport_XRY*00-01*plane...,ES,0.128173,0.09144,0.049237,0.011723,0.013286,0.012505,2,plane-road,airport_XRY,airport_XRY,airport_XRY,international_O,1,0,1,airport_XRY,airport_XRY,airport_XRY,['air'],0.306364,['LEJR']


In [37]:
trips_going_to_spain[trips_going_to_spain["node_sequence_MMX"].isna()]["trips"].sum()

np.float64(4411.286095643872)

In [38]:
trips_leaving_spain[trips_leaving_spain["node_sequence_MMX"].isna()]["trips"].sum()

np.float64(2181.437955926911)

In [39]:
# Drop every trip for which no MultiModX path could be assigned (node_sequence_MMX is NaN).
# .copy() makes the filtered frames independent of their parents, so adding the
# node_sequence_ground column to them later does not raise SettingWithCopyWarning.
trips_going_to_spain=trips_going_to_spain[trips_going_to_spain["node_sequence_MMX"].notna()].copy()
trips_leaving_spain=trips_leaving_spain[trips_leaving_spain["node_sequence_MMX"].notna()].copy()

## 5. Calculate the coefficient per first/last airport and path

In [40]:
def modify_node_sequence_MMX_incoming_trips(node_sequence):
    """Extract the ground part of an incoming trip's MultiModX node sequence.

    ``node_sequence`` is the string representation of a list of nodes. Nodes made
    of four capital letters are treated as airports, all-digit nodes as train
    stations. Airports found before the first train station are discarded; every
    train station, and any airport located after a train station, is kept in
    order. Any other node is reported with a printed message and skipped.

    Returns the string ``"egress"`` when nothing is kept, otherwise the string
    representation of the kept nodes.
    """
    airport_pattern=re.compile(r'^[A-Z]{4}$')
    ground_path=[]
    station_seen=False

    for stop in ast.literal_eval(node_sequence):
        if airport_pattern.fullmatch(stop):
            # airports only matter once the traveller has already been on a train
            if station_seen:
                ground_path.append(stop)
        elif stop.isdigit():
            ground_path.append(stop)
            station_seen=True
        else:
            print(f"problem with node {stop}")

    return str(ground_path) if ground_path else "egress"



In [41]:
def modify_node_sequence_MMX_outgoing_trips(node_sequence):
    """Extract the ground part of an outgoing trip's MultiModX node sequence.

    ``node_sequence`` is the string representation of a list of nodes. Nodes made
    of four capital letters are treated as airports, all-digit nodes as train
    stations. The sequence is scanned from its end: airports found after the last
    train station are discarded; every train station, and any airport located
    before a train station, is kept. Any other node is reported with a printed
    message and skipped.

    Returns the string ``"access"`` when nothing is kept, otherwise the string
    representation of the kept nodes in their original order.
    """
    airport_pattern=re.compile(r'^[A-Z]{4}$')
    kept_backwards=[]
    station_seen=False

    # walk the path backwards so that the trailing airports are the ones dropped
    for stop in reversed(ast.literal_eval(node_sequence)):
        if airport_pattern.fullmatch(stop):
            if station_seen:
                kept_backwards.append(stop)
        elif stop.isdigit():
            kept_backwards.append(stop)
            station_seen=True
        else:
            print(f"problem with node {stop}")

    if not kept_backwards:
        return "access"
    # restore the original travel order before serialising
    return str(kept_backwards[::-1])

In [42]:
# Reduce each MultiModX path to its ground part ("egress"/"access" when there is none).
trips_going_to_spain["node_sequence_ground"]=trips_going_to_spain["node_sequence_MMX"].apply(modify_node_sequence_MMX_incoming_trips)
trips_leaving_spain["node_sequence_ground"]=trips_leaving_spain["node_sequence_MMX"].apply(modify_node_sequence_MMX_outgoing_trips)

In [43]:
trips_going_to_spain[trips_going_to_spain["node_sequence_ground"]!="egress"].sample(5)

Unnamed: 0,date,origin,origin_name,destination,destination_name,entry_point,exit_point,origin_purpose,destination_purpose,legs,nationality,archetype_0,archetype_1,archetype_2,archetype_3,archetype_4,archetype_5,n_legs,mode_sequence,node_sequence,start_node,end_node,type,road_legs,train_legs,plane_legs,node_sequence_reduced,start_node_reduced,end_node_reduced,mode_tp,trips,node_sequence_MMX,node_sequence_ground
49575,20220923,FR,FR,ES300,Madrid,LEVC,,NF,O,P20*FRL04*4615903*None*airport_VLC*00-01*plane...,ES,0.289106,0.116556,0.093702,0.006285,0.018855,0.010856,4,plane-road-train-road,airport_VLC-train_03216-train_60000,airport_VLC,train_60000,international_O,2,1,1,airport_VLC-train_03216-train_60000,airport_VLC,train_60000,"['air', 'rail']",0.535359,"['LEVC', '007103216', '007160000']","['007103216', '007160000']"
53666,20220923,FR,FR,ES523,Valencia / València,LEGE,,NF,NF,P10*abroad_208*17233_AM*None*airport_GRO*00-01...,FR,0.308875,0.247683,0.136954,0.0,0.008742,0.008742,4,plane-road-train-road,airport_GRO-train_79300-train_03216,airport_GRO,train_03216,international_O,2,1,1,airport_GRO-train_79300-train_03216,airport_GRO,train_3213,"['air', 'rail']",0.710996,"['LEGE', '007179300', '007103216']","['007179300', '007103216']"
64854,20220923,PT,PT,ES300,Madrid,LEBL,,NF,NF,P11*PT170*0816904*None*airport_BCN*00-01*plane...,ES,0.518312,0.22738,0.323186,0.009261,0.053332,0.009581,4,plane-road-train-road,airport_BCN-train_71801-train_60000,airport_BCN,train_60000,international_O,2,1,1,airport_BCN-train_71801-train_60000,airport_BCN,train_60000,"['air', 'rail']",1.141052,"['LEBL', '007171801', '007160000']","['007171801', '007160000']"
43737,20220923,FR,FR,ES113,Ourense,LEST,,NF,O,P15*FR102*1507806*None*airport_SCQ*00-01*plane...,ES,0.03409,0.048912,0.023715,0.002964,0.007411,0.002964,4,plane-road-train-road,airport_SCQ-train_31400-train_22100,airport_SCQ,train_22100,international_O,2,1,1,airport_SCQ-train_31400-train_22100,airport_SCQ,train_22100,"['air', 'rail']",0.120056,"['LEST', '007131400', '007122100']","['007131400', '007122100']"
856,20220923,BE,BE,ES511,Barcelona,LEGE,,NF,NF,P17*abroad_206*17233_AM*None*airport_GRO*00-01...,BE,3.030797,2.270208,2.154617,0.106344,0.231182,0.15258,4,plane-road-train-road,airport_GRO-train_79300-train_71801,airport_GRO,train_71801,international_O,2,1,1,airport_GRO-train_79300-train_71801,airport_GRO,train_71801,"['air', 'rail']",7.945727,"['LEGE', '007179300', '007171801']","['007179300', '007171801']"


In [44]:
trips_leaving_spain[trips_leaving_spain["node_sequence_ground"]=="access"].sample(5)

Unnamed: 0,date,origin,origin_name,destination,destination_name,entry_point,exit_point,origin_purpose,destination_purpose,legs,nationality,archetype_0,archetype_1,archetype_2,archetype_3,archetype_4,archetype_5,n_legs,mode_sequence,node_sequence,start_node,end_node,type,road_legs,train_legs,plane_legs,node_sequence_reduced,start_node_reduced,end_node_reduced,mode_tp,trips,node_sequence_MMX,node_sequence_ground
36910,20220923,ES523,Valencia / València,RO,RO,,LEVC,NF,NF,P07*4625008*4615903*None*airport_VLC*00-01*roa...,RO,3.028086,2.836839,0.478119,0.0,0.063749,0.127498,2,road-plane,airport_VLC,airport_VLC,airport_VLC,international_D,1,0,1,airport_VLC,airport_VLC,airport_VLC,['air'],6.534292,['LEVC'],access
15124,20220923,ES300,Madrid,FR,FR,,LEMD,NF,NF,P16*2807921*2807921*None*airport_MAD*None*road...,ES,0.773696,0.210609,0.166883,0.004774,0.016994,0.013939,2,road-plane,airport_MAD,airport_MAD,airport_MAD,international_D,1,0,1,airport_MAD,airport_MAD,airport_MAD,['air'],1.186894,['LEMD'],access
27112,20220923,ES511,Barcelona,PT,PT,,LEBL,H,NF,P05*0820003*0816904*None*airport_BCN*00-01*roa...,ES,0.45491,0.600824,0.230756,0.018157,0.051499,0.029381,2,road-plane,airport_BCN,airport_BCN,airport_BCN,international_D,1,0,1,airport_BCN,airport_BCN,airport_BCN,['air'],1.385527,['LEBL'],access
33846,20220923,ES523,Valencia / València,CA,CA,,LEVC,NF,NF,P04*4625012*4615903*None*airport_VLC*00-01*roa...,AR,5.451436,5.107135,0.860753,0.0,0.114767,0.229534,2,road-plane,airport_VLC,airport_VLC,airport_VLC,international_D,1,0,1,airport_VLC,airport_VLC,airport_VLC,['air'],11.763626,['LEVC'],access
20409,20220923,ES421,Albacete,FR,FR,,LEVC,W,NF,P13*0200308*4615903*None*airport_VLC*02-03*roa...,ES,0.597646,0.105467,0.035156,0.0,0.017578,0.0,2,road-plane,airport_VLC,airport_VLC,airport_VLC,international_D,1,0,1,airport_VLC,airport_VLC,airport_VLC,['air'],0.755847,['LEVC'],access


In [45]:
def count_stations(row):
    """Tell whether a ``node_sequence_ground`` value holds an even number of nodes.

    Returns True for the markers ``"access"`` and ``"egress"``. Any other value
    is parsed with ``ast.literal_eval`` and the result is True when its length is
    even. Every node in the parsed sequence is counted, airports included.
    Values that cannot be parsed, or whose parsed form has no length, give False.
    """
    if row in ["access","egress"]:
        return True
    try:
        nodes=ast.literal_eval(row)
        return len(nodes)% 2==0
    # Only the errors literal_eval/len can raise on malformed input are treated as
    # "not valid"; KeyboardInterrupt and SystemExit are no longer swallowed.
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return False

In [46]:
trips_going_to_spain[trips_going_to_spain["node_sequence_ground"].apply(count_stations)].head(3)

Unnamed: 0,date,origin,origin_name,destination,destination_name,entry_point,exit_point,origin_purpose,destination_purpose,legs,nationality,archetype_0,archetype_1,archetype_2,archetype_3,archetype_4,archetype_5,n_legs,mode_sequence,node_sequence,start_node,end_node,type,road_legs,train_legs,plane_legs,node_sequence_reduced,start_node_reduced,end_node_reduced,mode_tp,trips,node_sequence_MMX,node_sequence_ground
6,20220923,AE,AE,ES300,Madrid,LEMD,,NF,NF,P08*abroad_424*2807921*None*airport_MAD*00-01*...,AR,86.225008,21.753654,25.899087,1.065969,4.856079,1.22389,4,plane-road-train-road,airport_MAD-train_70101-train_18002,airport_MAD,train_18002,international_O,2,1,1,airport_MAD,airport_MAD,airport_MAD,"['air', 'rail']",141.023686,['LEMD'],egress
7,20220923,AE,AE,ES300,Madrid,LEMD,,NF,NF,P08*abroad_424*2807921*None*airport_MAD*01-02*...,AR,86.225008,21.753654,25.899087,1.065969,4.856079,1.22389,2,plane-road,airport_MAD,airport_MAD,airport_MAD,international_O,1,0,1,airport_MAD,airport_MAD,airport_MAD,['air'],141.023686,['LEMD'],egress
8,20220923,AE,AE,ES300,Madrid,LEMD,,NF,NF,P11*abroad_424*2807921*None*airport_MAD*00-01*...,AR,86.225008,21.753654,25.899087,1.065969,4.856079,1.22389,2,plane-road,airport_MAD,airport_MAD,airport_MAD,international_O,1,0,1,airport_MAD,airport_MAD,airport_MAD,['air'],141.023686,['LEMD'],egress


In [47]:
#remove all trips that contain an odd number of stations (these are the minority)
trips_going_to_spain=trips_going_to_spain[trips_going_to_spain["node_sequence_ground"].apply(count_stations)]
trips_leaving_spain=trips_leaving_spain[trips_leaving_spain["node_sequence_ground"].apply(count_stations)]

In [48]:
def coefficient_calculator(trips,iata_to_icao):
    """Share of incoming trips per last airport, split by destination and ground path.

    For every trip whose ``mode_tp`` is not ``"['air']"`` the entry point is
    re-derived from the trip itself: bus and road legs are removed from
    ``mode_sequence``, and the node of ``node_sequence`` located one position
    before the first non-plane mode replaces the entry point. The ``airport_``
    prefix is then stripped and the result mapped through ``iata_to_icao``.
    Trips are summed per (destination, destination_name, entry point,
    node_sequence_ground) and divided by the total of their entry point.

    Parameters
    ----------
    trips : pandas.DataFrame
        Needs the columns ``destination``, ``destination_name``, ``entry_point``,
        ``node_sequence_ground``, ``trips``, ``mode_tp``, ``mode_sequence`` and
        ``node_sequence``. Any other column is ignored. The frame is not modified.
    iata_to_icao : dict
        Replacement mapping applied to the entry point values.

    Returns
    -------
    pandas.DataFrame
        Columns ``last_airport``, ``destination``, ``destination_name``,
        ``node_sequence_ground``, ``trips``, ``total_trips`` and ``coeff``,
        sorted by ``last_airport`` and ``destination``.

    Raises
    ------
    ValueError
        If any ``entry_point`` value is missing.
    """
    if trips["entry_point"].isna().any():
        raise ValueError("missing values for entry point, check dataframe")

    group_keys=["destination","destination_name","entry_point","node_sequence_ground"]
    # Select what is needed instead of dropping a hard-coded list of everything else:
    # the result no longer depends on which unrelated columns the input carries.
    trips=trips[group_keys+["trips","mode_tp","mode_sequence","node_sequence"]].copy()

    # Compute the corrected entry points in a plain list and write the column once;
    # per-row .loc assignment inside iterrows is far slower on large frames.
    entry_points=trips["entry_point"].tolist()
    trip_fields=zip(trips["mode_tp"],trips["mode_sequence"],trips["node_sequence"])
    for position,(mode_tp,mode_sequence,node_sequence) in enumerate(trip_fields):
        if mode_tp == "['air']":
            continue
        modes_filtered=[mode for mode in mode_sequence.split("-") if mode not in ("bus","road")]
        for mode_position,mode in enumerate(modes_filtered):
            if mode != "plane":
                # NOTE(review): when the very first remaining mode is not a plane,
                # mode_position-1 is -1 and the LAST node is taken. Kept as in the
                # original; confirm this is the intended fallback.
                entry_points[position]=node_sequence.split("-")[mode_position-1]
                break
    trips["entry_point"]=pd.Series(entry_points,index=trips.index,dtype=object)

    trips["entry_point"]=(
        trips["entry_point"]
        .str.replace("airport_","",regex=False)
        .replace(iata_to_icao)
    )

    # aggregate the trips and express each group as a share of its airport total
    coefficients=trips.groupby(group_keys)["trips"].sum().reset_index()
    coefficients["total_trips"]=coefficients.groupby("entry_point")["trips"].transform("sum")
    coefficients["coeff"]=coefficients["trips"]/coefficients["total_trips"]

    # airport first, then the remaining columns in their existing order
    coefficients=coefficients.rename(columns={"entry_point":"last_airport"})
    column_order=["last_airport"]+[col for col in coefficients.columns if col!="last_airport"]
    return coefficients[column_order].sort_values(["last_airport","destination"])
    

In [49]:
def coefficient_calculator_outgoing(trips,iata_to_icao):
    """Share of outgoing trips per first airport, split by origin and ground path.

    For every trip whose ``mode_tp`` is not ``"['air']"`` the exit point is
    re-derived from ``node_sequence``: the sequence is scanned from its end and
    the earliest node of the trailing run of ``airport...`` nodes replaces the
    exit point. When the sequence does not end in an airport the existing exit
    point is kept (it used to be overwritten with ``None``, which made
    ``groupby`` silently drop the trip). The ``airport_`` prefix is then stripped
    and the result mapped through ``iata_to_icao``. Trips are summed per
    (origin, origin_name, exit point, node_sequence_ground) and divided by the
    total of their exit point.

    Parameters
    ----------
    trips : pandas.DataFrame
        Needs the columns ``origin``, ``origin_name``, ``exit_point``,
        ``node_sequence_ground``, ``trips``, ``mode_tp`` and ``node_sequence``.
        Any other column is ignored. The frame is not modified.
    iata_to_icao : dict
        Replacement mapping applied to the exit point values.

    Returns
    -------
    pandas.DataFrame
        Columns ``first_airport``, ``origin``, ``origin_name``,
        ``node_sequence_ground``, ``trips``, ``total_trips`` and ``coeff``,
        sorted by ``first_airport`` and ``origin``.

    Raises
    ------
    ValueError
        If any ``exit_point`` value is missing.
    """
    if trips["exit_point"].isna().any():
        raise ValueError("missing values for exit point, check dataframe")

    group_keys=["origin","origin_name","exit_point","node_sequence_ground"]
    # Select what is needed instead of dropping a hard-coded list of everything else:
    # the result no longer depends on which unrelated columns the input carries.
    trips=trips[group_keys+["trips","mode_tp","node_sequence"]].copy()

    # Compute the corrected exit points in a plain list and write the column once;
    # per-row .loc assignment inside iterrows is far slower on large frames.
    exit_points=trips["exit_point"].tolist()
    for position,(mode_tp,node_sequence) in enumerate(zip(trips["mode_tp"],trips["node_sequence"])):
        if mode_tp == "['air']":
            continue
        first_airport=None
        # walk back through the trailing airports; the last one visited is the
        # first airport the traveller reaches
        for node in reversed(node_sequence.split("-")):
            if not node.startswith("airport"):
                break
            first_airport=node
        # keep the validated exit point when the path does not end in an airport
        if first_airport is not None:
            exit_points[position]=first_airport
    trips["exit_point"]=pd.Series(exit_points,index=trips.index,dtype=object)

    trips["exit_point"]=(
        trips["exit_point"]
        .str.replace("airport_","",regex=False)
        .replace(iata_to_icao)
    )

    # aggregate the trips and express each group as a share of its airport total
    coefficients=trips.groupby(group_keys)["trips"].sum().reset_index()
    coefficients["total_trips"]=coefficients.groupby("exit_point")["trips"].transform("sum")
    coefficients["coeff"]=coefficients["trips"]/coefficients["total_trips"]

    # airport first, then the remaining columns in their existing order
    coefficients=coefficients.rename(columns={"exit_point":"first_airport"})
    column_order=["first_airport"]+[col for col in coefficients.columns if col!="first_airport"]
    return coefficients[column_order].sort_values(["first_airport","origin"])

In [50]:
coeff_incoming=coefficient_calculator(trips_going_to_spain,iata_to_icao)

In [51]:
coeff_leaving=coefficient_calculator_outgoing(trips_leaving_spain,iata_to_icao)

## 6. Analyse results

In [52]:
coeff_incoming["trips"].sum()

np.float64(303145.1859504331)

In [53]:
trips_going_to_spain["trips"].sum()

np.float64(303145.185950433)

In [54]:
trips_leaving_spain["trips"].sum()

np.float64(278932.1853063629)

In [55]:
coeff_leaving["trips"].sum()

np.float64(278932.1853063629)

I am not losing trips in the process

In [56]:
coeff_incoming

Unnamed: 0,last_airport,destination,destination_name,node_sequence_ground,trips,total_trips,coeff
24,LEAL,ES114,Pontevedra,"['007160911', '007122100']",8.034224,36988.543274,0.000217
32,LEAL,ES120,Asturias,egress,1.766597,36988.543274,0.000048
110,LEAL,ES241,Huesca,egress,2.742340,36988.543274,0.000074
142,LEAL,ES300,Madrid,"['007160911', '007117000']",5.019661,36988.543274,0.000136
143,LEAL,ES300,Madrid,egress,9.195157,36988.543274,0.000249
...,...,...,...,...,...,...,...
441,LEZL,ES616,Jaén,egress,3.583077,5663.341747,0.000633
458,LEZL,ES617,Málaga,"['007151003', '007154413']",1.668329,5663.341747,0.000295
459,LEZL,ES617,Málaga,egress,5.258203,5663.341747,0.000928
471,LEZL,ES618,Sevilla,egress,4987.149433,5663.341747,0.880602


In [57]:
coeff_leaving.head(3)

Unnamed: 0,first_airport,origin,origin_name,node_sequence_ground,trips,total_trips,coeff
426,GCFV,ES704,Fuerteventura,access,84.718567,84.718567,1.0
427,GCLA,ES707,La Palma,access,161.01434,161.01434,1.0
428,GCRR,ES708,Lanzarote,access,169.817518,169.817518,1.0


In [58]:
node_sequence_ground_incoming=coeff_incoming.drop(["trips","total_trips","coeff"],axis=1)
node_sequence_ground_incoming["number_of_ground_options"]=node_sequence_ground_incoming.groupby(
    ['last_airport', 'destination']
)['node_sequence_ground'].transform('nunique')

In [59]:
node_sequence_ground_incoming["number_of_ground_options"].describe()

count    496.000000
mean       1.620968
std        0.883910
min        1.000000
25%        1.000000
50%        1.000000
75%        2.000000
max        5.000000
Name: number_of_ground_options, dtype: float64

In [60]:
node_sequence_ground_incoming.max()

last_airport                  LEZL
destination                  ES709
destination_name             Ávila
node_sequence_ground        egress
number_of_ground_options         5
dtype: object

In [61]:
node_sequence_ground_incoming[node_sequence_ground_incoming["number_of_ground_options"]>=3]

Unnamed: 0,last_airport,destination,destination_name,node_sequence_ground,number_of_ground_options
86,LEBB,ES220,Navarra,"['007111200', '007181108']",3
87,LEBB,ES220,Navarra,"['007113200', '007111200']",3
88,LEBB,ES220,Navarra,egress,3
100,LEBB,ES230,La Rioja,"['007113200', '007111200']",3
101,LEBB,ES230,La Rioja,"['007113200', '007181108']",3
...,...,...,...,...,...
281,LEVC,ES511,Barcelona,"['007165402', '007171801']",3
282,LEVC,ES511,Barcelona,egress,3
439,LEZL,ES616,Jaén,"['007151003', '007103100']",3
440,LEZL,ES616,Jaén,"['007151003', '007150300']",3


In [62]:
node_sequence_ground_outgoing=coeff_leaving.drop(["trips","total_trips","coeff"],axis=1)
node_sequence_ground_outgoing["number_of_ground_options"]=node_sequence_ground_outgoing.groupby(
    ['first_airport', 'origin']
)['node_sequence_ground'].transform('nunique')

In [63]:
node_sequence_ground_outgoing["number_of_ground_options"].describe()

count    431.000000
mean       1.617169
std        0.843350
min        1.000000
25%        1.000000
50%        1.000000
75%        2.000000
max        4.000000
Name: number_of_ground_options, dtype: float64

In [64]:
node_sequence_ground_outgoing.max()

first_airport                 LEZL
origin                       ES709
origin_name                  Ávila
node_sequence_ground        access
number_of_ground_options         4
dtype: object

Conclusion: most airports and origin/destination have only a handful of possible ground paths to reach them. This is good for us. 

## 7. Export results

In [None]:
#coeff_leaving.to_csv("coefficients_outgoing_trips_from_spain_v0.5.csv",index=False)

In [None]:
#coeff_incoming.to_csv("coefficients_incoming_trips_to_spain_v0.5.csv",index=False)