In [1]:
import pandas as pd
import geopandas as gpd
import os

pd.set_option('display.max_columns', None)
os.chdir(r"C:\Users\LMENENDEZ\GitHub\MultiModX")
print(os.getcwd())

C:\Users\LMENENDEZ\GitHub\MultiModX


# Calculate coefficients to assign ground paths for international trips

Steps to follow:
1. Download MND
2. Eliminate international trips that travel via ground and trips that use deprecated stations
3. Use the coefficients calculated with aena data to re-scale the trips
4. Assign a "MultiModX path" (i.e., a path composed of only MultiModX stations) to the remaining trips that reach airports via train
Question: What do we do with trips that use rail but for which I am unable to assign a MultiModX path.
5. Calculate the coefficient per first/last airport and path
6. Analyse results
7. Export results

## 1. Download MND

In [2]:
%load_ext autoreload

In [31]:
%autoreload
from script.trips_format import *

In [4]:
all_trips = pd.read_csv(
    r"G:\Unidades compartidas\04_PROYECTOS I+D+i\2023 MultiModX\iii) Project\WP4 Performance Assessment Solution\Demand data\Matrices MITMA\with_archetypes\20220922_28_trip_matrix_arc_pt_processed.csv.gz",
    compression="gzip",
    sep="|"
)

In [5]:
trips = all_trips[all_trips["date"] == 20220923].reset_index(drop=True).rename(columns={"origin_nut": "origin", "destination_nut": "destination"})

In [6]:
#associates each airport to the corresponding new NUTS
airports_to_NUTS={"airport_LPA":("ES705","Gran Canaria"),
                 "airport_FUE":("ES704","Fuerteventura"),
                 "airport_ACE":("ES708","Lanzarote"),
                 "airport_TFS":("ES709","Tenerife"),
                 "airport_TFN":("ES709","Tenerife"),
                 "airport_GMZ":("ES709","Tenerife"),
                 "airport_SPC":("ES707","La Palma"),
                 "airport_VDE":("ES703","El Hierro"),
                 "airport_PMI":("ES532","Mallorca"),
                 "airport_IBZ":("ES531","Eivissa i Formentera"),
                 "airport_MAH":("ES533","Menorca")}

In [7]:
trips=format_trips(trips, airports_to_NUTS)

17 columns were removed


## 2. Eliminate international trips that travel via ground and trips that use deprecated stations

In [14]:
# remove cercanías
trips=trips[~(((trips["origin"]=="ES424")&(trips["destination"]=="ES300"))|((trips["origin"]=="ES300")&(trips["destination"]=="ES424")))]

In [15]:
# location of "ALL" train stops given by UiC
# However this list is still incomplete
stops_loc=pd.read_csv(r"G:\Unidades compartidas\04_PROYECTOS I+D+i\2023 MultiModX\iii) Project\WP3 Scenario definition\Case study input data\Spain+abroad\v=0.1\infrastructure\rail_info\stops.txt").astype(str) # everything is a string here to match other formatting
stops_loc["stop_id"] = stops_loc["stop_id"].apply(lambda x: "00" + x) #to make they start with 00

In [16]:
trips.loc[:,"weird_stations"] = trips["node_sequence_reduced"].apply(
    lambda x: find_weird_stations(x, stops_loc))

In [17]:
unique_weird_stations=get_weird_stations(trips["weird_stations"])

In [18]:
MobA_stations_coord=gpd.read_file(r"G:\Unidades compartidas\04_PROYECTOS I+D+i\2023 MultiModX\iii) Project\WP3 Scenario definition\Case study input data\Spain+abroad\v=0.1\datos moba\train_stations\train_stations.shp")

In [19]:
# identifies all the un-localisable stations
nowhere_stations=set(unique_weird_stations)-set(MobA_stations_coord["ID"])
print(f"there are {len(nowhere_stations)} stations that are not in the data provided by MobA but appear in the trips dataframe")

there are 205 stations that are not in the data provided by MobA but appear in the trips dataframe


In [20]:
trips = trips[~trips["node_sequence_reduced"].apply(lambda x: any(station in x for station in nowhere_stations))]

In [21]:
trips_abroad=trips[(trips["origin"]=="abroad")|(trips["destination"]=="abroad")]

In [22]:
# creates a dictionary of mcc code to country acronym
international_codes=pd.read_csv(r"G:\Unidades compartidas\04_PROYECTOS I+D+i\2023 MultiModX\iii) Project\WP3 Scenario definition\Case study input data\Spain+abroad\v=0.1\infrastructure\countries mcc\mcc_to_nationality.txt", sep="|")
mcc_to_country=international_codes.set_index("mcc")["country"].to_dict()

In [23]:
trips_abroad=format_trips_abroad(trips_abroad,mcc_to_country)

## 3. Use the coefficients calculated with aena data to re-scale the trips

In [25]:
coeffs_incoming=pd.read_csv(r"G:\Unidades compartidas\04_PROYECTOS I+D+i\2023 MultiModX\iii) Project\WP3 Scenario definition\Case study input data\Spain+abroad\CS11\coefficients_to_rescale_MND\incoming_trips_coefficients_all.csv")

In [24]:
coeffs_outgoing=pd.read_csv(r"G:\Unidades compartidas\04_PROYECTOS I+D+i\2023 MultiModX\iii) Project\WP3 Scenario definition\Case study input data\Spain+abroad\CS11\coefficients_to_rescale_MND\outgoing_trips_coefficients_all.csv")

In [26]:
#we have slightly different countries in coeffs_incoming and coeffs_outgoing
countries_incoming=set(coeffs_incoming["origin"])

countries_outgoing=set(coeffs_outgoing["destination"])

countries_selected=set.union(countries_outgoing,countries_incoming)-(countries_outgoing-countries_incoming)-(countries_incoming-countries_outgoing)

In [28]:
coeffs_incoming=coeffs_incoming[coeffs_incoming["origin"].isin(countries_selected)]
coeffs_outgoing=coeffs_outgoing[coeffs_outgoing["destination"].isin(countries_selected)]

In [32]:
trips_abroad_selected=trips_abroad[(trips_abroad["origin"].isin(countries_selected))|(trips_abroad["destination"].isin(countries_selected))]

In [33]:
trips_abroad_rescaled=rescale_trips(trips_abroad_selected,coeffs_incoming,coeffs_outgoing)

In [34]:
airport_codes=pd.read_csv(r"G:\Unidades compartidas\04_PROYECTOS I+D+i\2023 MultiModX\iii) Project\WP3 Scenario definition\Case study input data\Spain+abroad\v=0.1\infrastructure\airports_info\IATA_ICAO_Airport_codes_v1.3.csv")
iata_to_icao=airport_codes.set_index("IATA")["ICAO"].to_dict()

In [35]:
#change the entry_point or exit_point to icao code
trips_abroad_rescaled["entry_point"]=trips_abroad_rescaled["entry_point"].apply(lambda x: format_airports(x,iata_to_icao))
trips_abroad_rescaled["exit_point"]=trips_abroad_rescaled["exit_point"].apply(lambda x: format_airports(x,iata_to_icao))

In [36]:
trips_going_to_spain=trips_abroad_rescaled[trips_abroad_rescaled["exit_point"].isna()]
trips_leaving_spain=trips_abroad_rescaled[trips_abroad_rescaled["entry_point"].isna()]

In [37]:
trips_going_to_spain["trips"].sum()

np.float64(307718.2463604814)

In [39]:
trips_leaving_spain["trips"].sum()

np.float64(281570.2128004248)

## 4. Assign a "MultiModX path" (i.e., a path composed of only MultiModX stations) to the remaining trips that reach airports via train

In [40]:
# coordinates, geometry and other properties of all NUTS (in Europe?)
NUTS_coord=gpd.read_file(r"G:\Unidades compartidas\04_PROYECTOS I+D+i\2023 MultiModX\iii) Project\WP4 Performance Assessment Solution\Demand data\nuts3_2003_geom_10.gpkg")

In [41]:
# list of stations considered in MMX
train_stations_considered=pd.read_csv(r"G:\Unidades compartidas\04_PROYECTOS I+D+i\2023 MultiModX\iii) Project\WP3 Scenario definition\Case study input data\Spain+abroad\CS11\v=0.25\infrastructure\rail_info\rail_stations_considered_GTFS_2022v0.1.csv").astype(str)
train_stations_considered["stop_id"] = train_stations_considered["stop_id"].apply(lambda x: "00" + x) #to make they start with 00

In [42]:
# align coordinate reference system (crs)
NUTS_coord = NUTS_coord.to_crs(MobA_stations_coord.crs)

# Perform a spatial join to find which NUTS region each station belongs to
spatial_join = gpd.sjoin(MobA_stations_coord, NUTS_coord, how="left", predicate="within")

# Construct the dictionary. This dictionary tells us in which nuts is each station
station_to_nuts = dict(zip(spatial_join["ID"], spatial_join["geocode"]))

In [43]:
# adds a column for the moba acronym and another one with the nuts of the sation
train_stations_considered=format_train_stations_considered(train_stations_considered,station_to_nuts)

In [44]:
# list of all MMX stations
train_station_MMX=train_stations_considered["stop_id"].tolist()

In [45]:
trips_going_to_spain=process_node_sequence_MMX(trips_going_to_spain,train_station_MMX,iata_to_icao)

In [46]:
trips_leaving_spain=process_node_sequence_MMX(trips_leaving_spain,train_station_MMX,iata_to_icao)

In [51]:
trips_going_to_spain.sample(5)

Unnamed: 0,date,origin,origin_name,destination,destination_name,entry_point,exit_point,origin_purpose,destination_purpose,legs,nationality,archetype_0,archetype_1,archetype_2,archetype_3,archetype_4,archetype_5,n_legs,mode_sequence,node_sequence,start_node,end_node,type,road_legs,train_legs,plane_legs,node_sequence_reduced,start_node_reduced,end_node_reduced,mode_tp,trips,node_sequence_MMX
43943,20220923,FR,FR,ES130,Cantabria,LEMD,,NF,NF,P13*FRF33*2807921*None*airport_MAD*00-01*plane...,ES,0.618023,0.486928,0.243464,0.018728,0.037456,0.018728,4,plane-road-train-road,airport_MAD-train_10600-train_14100,airport_MAD,train_14100,international_O,2,1,1,airport_MAD-train_10600-train_14100,airport_MAD,train_14100,"['air', 'rail']",1.423327,"['LEMD', '007110600', '007114100']"
55596,20220923,FR,FR,ES617,Málaga,LEMG,,NF,H,P07*FR102*2906711*None*airport_AGP*04-06*plane...,ES,0.182444,0.319277,0.146162,0.014513,0.005183,0.011403,2,plane-road,airport_AGP,airport_AGP,airport_AGP,international_O,1,0,1,airport_AGP,airport_AGP,airport_AGP,['air'],0.678982,['LEMG']
54507,20220923,FR,FR,ES523,Valencia / València,LEVC,,NF,NF,P11*abroad_208*4615903*None*airport_VLC*00-01*...,FR,0.926626,0.74305,0.410863,0.0,0.026225,0.026225,2,plane-road,airport_VLC,airport_VLC,airport_VLC,international_O,1,0,1,airport_VLC,airport_VLC,airport_VLC,['air'],2.132989,['LEVC']
43373,20220923,FI,FI,ES617,Málaga,LEMG,,NF,NF,P19*abroad_244*2906711*None*airport_AGP*00-01*...,FI,3.966964,3.661813,1.830906,0.0,0.152576,0.152576,4,plane-road-train-road,airport_AGP-train_54520-train_54505,airport_AGP,train_54505,international_O,2,1,1,airport_AGP,airport_AGP,airport_AGP,"['air', 'rail']",9.764835,['LEMG']
58080,20220923,GB,GB,ES521,Alicante / Alacant,LEAL,,NF,NF,P23*abroad_234*0306507*None*airport_ALC*00-01*...,GB,16.660084,35.402679,8.330042,0.0,0.0,0.0,2,plane-road,airport_ALC,airport_ALC,airport_ALC,international_O,1,0,1,airport_ALC,airport_ALC,airport_ALC,['air'],60.392806,['LEAL']


In [53]:
trips_going_to_spain[trips_going_to_spain["node_sequence_MMX"].isna()]["trips"].sum()

np.float64(4411.286095643872)

In [54]:
trips_leaving_spain[trips_leaving_spain["node_sequence_MMX"].isna()]["trips"].sum()

np.float64(2181.437955926911)

In [55]:
# I will drop all trips that do not have a NaN for node_sequence_MMX
trips_going_to_spain=trips_going_to_spain[trips_going_to_spain["node_sequence_MMX"].notna()]
trips_leaving_spain=trips_leaving_spain[trips_leaving_spain["node_sequence_MMX"].notna()]

## 5. Calculate the coefficient per first/last airport and path

In [None]:
def modify_node_sequence_MMX_incoming_trips(node_sequence):
    # transforms the sequence from a string to a list
    node_sequence=ast.literal_eval(node_sequence)

    # initialise a new sequence and the train counter
    new_sequence=[]
    train_counter=0
    #iterates over the nodes in the node sequence
    for node in node_sequence:
        # checks if the node is an airport
        if re.fullmatch(r'^[A-Z]{4}$', node):
            # checks if there was a train previously, if not, it does not add the airport to the new sequence
            if train_counter==0:
                continue
            elif train_counter>0:
                new_sequence.append(node)
        # checks if the node is a train station. Adds that node to the new sequence
        elif node.isdigit():
            new_sequence.append(node)
            train_counter+=1
        else:
            print(f"problem with node {node}")

    #if new_sequence is empty it transforms it into the string egress. If it is not empty it transforms it into a string
    if not new_sequence:
        new_sequence="egress"
    else:
        new_sequence=str(new_sequence)

        
    return new_sequence



In [118]:
def modify_node_sequence_MMX_outgoing_trips(node_sequence):
    # transforms the sequence from a string to a list
    node_sequence=ast.literal_eval(node_sequence)

    # initialise a new sequence and the train counter
    new_sequence=[]
    train_counter=0
    #iterates over the nodes in the node sequence
    for node in reversed(node_sequence):
        # checks if the node is an airport
        if re.fullmatch(r'^[A-Z]{4}$', node):
            # checks if there was a train previously, if not, it does not add the airport to the new sequence
            if train_counter==0:
                continue
            elif train_counter>0:
                new_sequence.append(node)
        # checks if the node is a train station. Adds that node to the new sequence
        elif node.isdigit():
            new_sequence.append(node)
            train_counter+=1
        else:
            print(f"problem with node {node}")
    
    #if new_sequence is empty it transforms it into the string access. If it is not empty it transforms it into a string
    if not new_sequence:
        new_sequence="access"
    else:
        # re-reverse the sequence to obtain the original order
        new_sequence=list(reversed(new_sequence))
        new_sequence=str(new_sequence)

    return new_sequence

In [119]:
trips_going_to_spain["node_sequence_ground"]=trips_going_to_spain["node_sequence_MMX"].apply(lambda row: modify_node_sequence_MMX_incoming_trips(row))
trips_leaving_spain["node_sequence_ground"]=trips_leaving_spain["node_sequence_MMX"].apply(lambda row: modify_node_sequence_MMX_outgoing_trips(row))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trips_going_to_spain["node_sequence_ground"]=trips_going_to_spain["node_sequence_MMX"].apply(lambda row: modify_node_sequence_MMX_incoming_trips(row))


In [93]:
trips_going_to_spain[trips_going_to_spain["node_sequence_ground"]!="egress"].sample(5)

Unnamed: 0,date,origin,origin_name,destination,destination_name,entry_point,exit_point,origin_purpose,destination_purpose,legs,nationality,archetype_0,archetype_1,archetype_2,archetype_3,archetype_4,archetype_5,n_legs,mode_sequence,node_sequence,start_node,end_node,type,road_legs,train_legs,plane_legs,node_sequence_reduced,start_node_reduced,end_node_reduced,mode_tp,trips,node_sequence_MMX,node_sequence_ground
69562,20220923,RO,RO,ES243,Zaragoza,LEGE,,NF,NF,P07*abroad_226*17233_AM*None*airport_GRO*00-01...,PL,24.710439,3.834378,5.325526,0.0,2.13021,0.426042,4,plane-road-train-road,airport_GRO-train_79300-train_04040,airport_GRO,train_04040,international_O,2,1,1,airport_GRO-train_79300-train_04040,airport_GRO,train_70600,"['air', 'rail']",36.426596,"['LEGE', '007179300', '007104040']","['007179300', '007104040']"
52449,20220923,FR,FR,ES512,Girona,LEBL,,NF,H,P14*FR102*0816904*None*airport_BCN*00-01*plane...,ES,0.739773,1.997387,1.072671,0.184943,0.0,0.184943,4,plane-road-train-road,airport_BCN-train_71801-train_79300,airport_BCN,train_79300,international_O,2,1,1,airport_BCN-train_71801-train_79300,airport_BCN,train_79300,"['air', 'rail']",4.179716,"['LEBL', '007171801', '007179300']","['007171801', '007179300']"
60147,20220923,IT,IT,ES511,Barcelona,LEGE,,NF,NF,P11*abroad_222*17233_AM*None*airport_GRO*00-01...,IT,5.041334,3.776194,3.583923,0.176889,0.384541,0.253797,4,plane-road-train-road,airport_GRO-train_79300-train_71801,airport_GRO,train_71801,international_O,2,1,1,airport_GRO-train_79300-train_71801,airport_GRO,train_71801,"['air', 'rail']",13.216678,"['LEGE', '007179300', '007171801']","['007179300', '007171801']"
64762,20220923,PT,PT,ES220,Navarra,LEBB,,NF,H,P22*PT11A*48904_AM*None*airport_BIO*00-01*plan...,ES,0.020399,0.01511,0.009066,0.001133,0.002267,0.001511,4,plane-road-train-road,airport_BIO-train_13200-train_11200,airport_BIO,train_11200,international_O,2,1,1,airport_BIO-train_13200-train_11200,airport_BIO,train_11200,"['air', 'rail']",0.049486,"['LEBB', '007113200', '007111200']","['007113200', '007111200']"
52204,20220923,FR,FR,ES511,Barcelona,LEGE,,NF,NF,P10*abroad_208*17233_AM*None*airport_GRO*00-01...,FR,0.271201,0.203142,0.192799,0.009516,0.020687,0.013653,4,plane-road-train-road,airport_GRO-train_79300-train_71801,airport_GRO,train_71801,international_O,2,1,1,airport_GRO-train_79300-train_71801,airport_GRO,train_71801,"['air', 'rail']",0.710996,"['LEGE', '007179300', '007171801']","['007179300', '007171801']"


In [120]:
trips_leaving_spain[trips_leaving_spain["node_sequence_ground"]=="access"].sample(5)

Unnamed: 0,date,origin,origin_name,destination,destination_name,entry_point,exit_point,origin_purpose,destination_purpose,legs,nationality,archetype_0,archetype_1,archetype_2,archetype_3,archetype_4,archetype_5,n_legs,mode_sequence,node_sequence,start_node,end_node,type,road_legs,train_legs,plane_legs,node_sequence_reduced,start_node_reduced,end_node_reduced,mode_tp,trips,node_sequence_MMX,node_sequence_ground
18191,20220923,ES300,Madrid,PT,PT,,LEMD,NF,NF,P00*2807920*2807921*None*airport_MAD*None*road...,ES,0.65554,0.27903,0.110525,0.01377,0.051095,0.023192,2,road-plane,airport_MAD,airport_MAD,airport_MAD,international_D,1,0,1,airport_MAD,airport_MAD,airport_MAD,['air'],1.133153,['LEMD'],access
42359,20220923,ES618,Sevilla,PT,PT,,LEZL,H,NF,P18*4109109*4109109*None*airport_SVQ*None*road...,ES,0.466701,0.14949,0.08386,0.007292,0.016407,0.034638,2,road-plane,airport_SVQ,airport_SVQ,airport_SVQ,international_D,1,0,1,airport_SVQ,airport_SVQ,airport_SVQ,['air'],0.758389,['LEZL'],access
14199,20220923,ES300,Madrid,FR,FR,,LEMD,NF,NF,P07*2807921*2807921*None*airport_MAD*None*road...,ES,0.605572,0.223331,0.055833,0.030064,0.060128,0.017179,2,road-plane,airport_MAD,airport_MAD,airport_MAD,international_D,1,0,1,airport_MAD,airport_MAD,airport_MAD,['air'],0.992107,['LEMD'],access
32279,20220923,ES521,Alicante / Alacant,FR,FR,,LEAL,O,NF,P14*0306507*0306507*None*airport_ALC*00-01*roa...,ES,0.411551,0.448224,0.150766,0.012224,0.004075,0.024449,2,road-plane,airport_ALC,airport_ALC,airport_ALC,international_D,1,0,1,airport_ALC,airport_ALC,airport_ALC,['air'],1.051288,['LEAL'],access
32154,20220923,ES521,Alicante / Alacant,FR,FR,,LEAL,NF,NF,P17*0306503*0306507*None*airport_ALC*00-01*roa...,FR,0.212894,0.479011,0.053223,0.0,0.0,0.0,2,road-plane,airport_ALC,airport_ALC,airport_ALC,international_D,1,0,1,airport_ALC,airport_ALC,airport_ALC,['air'],0.745129,['LEAL'],access


In [111]:
def coefficient_calculator(trips,iata_to_icao):
    trips=trips.copy()
    trips["train_leg"]=0
    if trips["entry_point"].isna().any():
        raise ValueError("missing values for entry point, check dataframe")
    else:
        for idx,row in trips.iterrows():
            if row["mode_tp"] == "['air']":
                continue
            else:
                modes=row["mode_sequence"].split("-")
                modes_filtered=[mode for mode in modes if mode not in ["bus","road"]]
                for idx2,mode in enumerate(modes_filtered):
                    if mode != "plane":
                        trips.loc[idx, "entry_point"]=row["node_sequence"].split("-")[idx2-1]
                        if mode == "train":
                            trips.loc[idx,"train_leg"]=1
                        break
    
        trips["entry_point"]=trips["entry_point"].str.replace("airport_","")
        trips["entry_point"]=trips["entry_point"].replace(iata_to_icao)
        
        #remove columns
        trips=trips.drop(["origin","origin_name",'date', 'exit_point', 'origin_purpose', 'destination_purpose',
       'legs', 'nationality', 'archetype_0', 'archetype_1', 'archetype_2',
       'archetype_3', 'archetype_4', 'archetype_5', 'n_legs', 'mode_sequence',
       'node_sequence', 'start_node', 'end_node', 'type', 'road_legs',
       'train_legs', 'plane_legs', 'node_sequence_reduced',
       'start_node_reduced', 'end_node_reduced', 'mode_tp',"node_sequence_MMX"],axis=1)
        
        #groupby and create the coefficient
        trips=trips.groupby(["destination","destination_name","entry_point","train_leg","node_sequence_ground"]).sum().reset_index()
        trips['total_trips'] =trips.groupby('entry_point')['trips'].transform('sum')
        trips["coeff"]=trips["trips"]/trips["total_trips"]

        #reorder
        column_order=["entry_point"]+trips.columns.drop(["entry_point"]).tolist()
        trips=trips.reindex(columns=column_order)
        trips=trips.rename(columns={"entry_point":"last_airport"})
        trips=trips.sort_values(["last_airport","destination"])
        return trips
    

In [108]:
def coefficient_calculator_outgoing(trips,iata_to_icao):
    trips=trips.copy()
    trips["train_leg"]=0
    if trips["exit_point"].isna().any():
        raise ValueError("missing values for exit point, check dataframe")
    else:
        for idx,row in trips.iterrows():
            if row["mode_tp"] == "['air']":
                continue
            else:
                nodes=row["node_sequence"].split("-")
                exit_point=None

                for node in reversed(nodes):
                    if node.startswith("airport"):
                        exit_point=node
                    else:
                        if node.startswith("train"):
                            trips.loc[idx,"train_leg"]=1
                        break
                
                trips.loc[idx,"exit_point"]=exit_point


    
        trips["exit_point"]=trips["exit_point"].str.replace("airport_","")
        trips["exit_point"]=trips["exit_point"].replace(iata_to_icao)
        
        #remove columns
        trips=trips.drop(["destination","destination_name",'date', 'entry_point', 'origin_purpose', 'destination_purpose',
       'legs', 'nationality', 'archetype_0', 'archetype_1', 'archetype_2',
       'archetype_3', 'archetype_4', 'archetype_5', 'n_legs', 'mode_sequence',
       'node_sequence', 'start_node', 'end_node', 'type', 'road_legs',
       'train_legs', 'plane_legs', 'node_sequence_reduced',
       'start_node_reduced', 'end_node_reduced', 'mode_tp',"node_sequence_MMX"],axis=1)
        
        #groupby and create the coefficient
        trips=trips.groupby(["origin","origin_name","exit_point","train_leg","node_sequence_ground"]).sum().reset_index()
        trips['total_trips'] =trips.groupby('exit_point')['trips'].transform('sum')
        trips["coeff"]=trips["trips"]/trips["total_trips"]

        #reorder
        column_order=["exit_point"]+trips.columns.drop(["exit_point"]).tolist()
        trips=trips.reindex(columns=column_order)
        trips=trips.rename(columns={"exit_point":"first_airport"})
        trips=trips.sort_values(["first_airport","origin"])
        return trips

In [127]:
coeff_incoming=coefficient_calculator(trips_going_to_spain,iata_to_icao)

In [128]:
coeff_leaving=coefficient_calculator_outgoing(trips_leaving_spain,iata_to_icao)

## 6. Analyse results

In [129]:
coeff_incoming["trips"].sum()

np.float64(303306.96026483754)

In [130]:
trips_going_to_spain["trips"].sum()

np.float64(303306.96026483754)

In [131]:
trips_leaving_spain["trips"].sum()

np.float64(279388.77484449785)

In [132]:
coeff_leaving["trips"].sum()

np.float64(279388.77484449785)

I am not losing trips in the process

In [None]:
coeff_incoming.groupby()

Unnamed: 0,last_airport,destination,destination_name,train_leg,node_sequence_ground,trips,total_trips,coeff
27,LEAL,ES114,Pontevedra,1,"['007160911', '007122100']",8.034224,37010.471287,0.000217
35,LEAL,ES120,Asturias,0,egress,1.766597,37010.471287,0.000048
117,LEAL,ES241,Huesca,0,egress,2.742340,37010.471287,0.000074
151,LEAL,ES300,Madrid,0,egress,9.195157,37010.471287,0.000248
152,LEAL,ES300,Madrid,1,"['007160911', '007117000']",5.019661,37010.471287,0.000136
...,...,...,...,...,...,...,...,...
492,LEZL,ES617,Málaga,0,egress,5.258203,5663.341747,0.000928
493,LEZL,ES617,Málaga,1,"['007151003', '007154413']",1.668329,5663.341747,0.000295
506,LEZL,ES618,Sevilla,0,egress,4953.937943,5663.341747,0.874738
507,LEZL,ES618,Sevilla,1,egress,33.211490,5663.341747,0.005864


In [136]:
coeff_incoming

Unnamed: 0,last_airport,destination,destination_name,train_leg,node_sequence_ground,trips,total_trips,coeff
27,LEAL,ES114,Pontevedra,1,"['007160911', '007122100']",8.034224,37010.471287,0.000217
35,LEAL,ES120,Asturias,0,egress,1.766597,37010.471287,0.000048
117,LEAL,ES241,Huesca,0,egress,2.742340,37010.471287,0.000074
151,LEAL,ES300,Madrid,0,egress,9.195157,37010.471287,0.000248
152,LEAL,ES300,Madrid,1,"['007160911', '007117000']",5.019661,37010.471287,0.000136
...,...,...,...,...,...,...,...,...
492,LEZL,ES617,Málaga,0,egress,5.258203,5663.341747,0.000928
493,LEZL,ES617,Málaga,1,"['007151003', '007154413']",1.668329,5663.341747,0.000295
506,LEZL,ES618,Sevilla,0,egress,4953.937943,5663.341747,0.874738
507,LEZL,ES618,Sevilla,1,egress,33.211490,5663.341747,0.005864


In [143]:
node_sequence_ground_incoming=coeff_incoming.drop(["trips","total_trips","coeff"],axis=1)
node_sequence_ground_incoming["number_of_ground_options"]=node_sequence_ground_incoming.groupby(
    ['last_airport', 'destination']
)['node_sequence_ground'].transform('nunique')

In [146]:
node_sequence_ground_incoming["number_of_ground_options"].describe()

count    536.000000
mean       1.729478
std        1.035604
min        1.000000
25%        1.000000
50%        1.000000
75%        2.000000
max        6.000000
Name: number_of_ground_options, dtype: float64

In [154]:
node_sequence_ground_incoming.max()

last_airport                  LEZL
destination                  ES709
destination_name             Ávila
train_leg                        1
node_sequence_ground        egress
number_of_ground_options         6
dtype: object

In [147]:
node_sequence_ground_incoming[node_sequence_ground_incoming["number_of_ground_options"]>=3]

Unnamed: 0,last_airport,destination,destination_name,train_leg,node_sequence_ground,number_of_ground_options
92,LEBB,ES220,Navarra,0,egress,3
93,LEBB,ES220,Navarra,1,"['007111200', '007181108']",3
94,LEBB,ES220,Navarra,1,"['007113200', '007111200']",3
107,LEBB,ES230,La Rioja,0,egress,3
108,LEBB,ES230,La Rioja,1,"['007113200', '007111200']",3
...,...,...,...,...,...,...
357,LEVC,ES521,Alicante / Alacant,1,['007160911'],3
358,LEVC,ES521,Alicante / Alacant,1,"['007165200', '007103216']",3
470,LEZL,ES616,Jaén,0,egress,3
471,LEZL,ES616,Jaén,1,"['007151003', '007103100']",3


In [150]:
node_sequence_ground_outgoing=coeff_leaving.drop(["trips","total_trips","coeff"],axis=1)
node_sequence_ground_outgoing["number_of_ground_options"]=node_sequence_ground_outgoing.groupby(
    ['first_airport', 'origin']
)['node_sequence_ground'].transform('nunique')

In [153]:
node_sequence_ground_outgoing["number_of_ground_options"].describe()

count    465.000000
mean       1.722581
std        0.988308
min        1.000000
25%        1.000000
50%        1.000000
75%        2.000000
max        5.000000
Name: number_of_ground_options, dtype: float64

In [155]:
node_sequence_ground_outgoing.max()

first_airport                 LEZL
origin                       ES709
origin_name                  Ávila
train_leg                        1
node_sequence_ground        access
number_of_ground_options         5
dtype: object

Conclusion: most airports and origin/destination have only a handful of possible ground paths to reach them. This is good for us. 

## 7. Export results

In [None]:
coeff_leaving.to_csv("coefficients_outgoing_trips_from_spain_v0.3.csv",index=False)

In [158]:
coeff_incoming.to_csv("coefficients_incoming_trips_to_spain_v0.3.csv",index=False)