In [4]:
import pandas as pd
import sys
import os

In [8]:
# Current working directory of the kernel process.
# NOTE(review): this is only the notebook's own folder (e.g. /path/to/app/folder_notebook)
# when the kernel was started there — confirm before relying on it.
notebook_dir = os.getcwd()

# Parent of the working directory, expected to be the 'app' project root.
main_dir = os.path.abspath(os.path.join(notebook_dir, '..'))
# Database file location, resolved relative to the project root.
db_path = os.path.join(main_dir, 'data', 'planet_fone.db')
# Make the project root importable exactly once; the import below relies on this,
# so it must stay after the sys.path update.
if main_dir not in sys.path:
    sys.path.append(main_dir)
    
from utils import sql


In [9]:
# Fetch the two source tables through the project's sql helper, using db_path.
# NOTE(review): the first table is requested as "travel_logistic" (singular) while the
# variable is named travel_logistics_df — confirm the table name against the database.
travel_logistics_df = sql.get_table("travel_logistic", db_path)
fone_geography_df = sql.get_table("fone_geography", db_path)

In [4]:
# Get a random list of n code_6 values from a geography DataFrame
def get_random_code_6(df, n, random_state=None):
    """Return ``n`` values drawn at random, without replacement, from ``df['code_6']``.

    Args:
        df: DataFrame that has a ``code_6`` column.
        n: Number of values to draw. pandas raises ``ValueError`` when ``n``
            is larger than the number of rows.
        random_state: Optional seed (or random generator) forwarded to
            ``Series.sample``. Pass a fixed integer to get the same draw on
            every run; the default ``None`` keeps the draw unseeded.

    Returns:
        list: The sampled ``code_6`` values, in the order they were drawn.
    """
    return df['code_6'].sample(n=n, random_state=random_state).tolist()

In [5]:
# Draw 20 circuit codes from the geography table. No seed is passed here,
# so the list shown below changes from run to run.
random_circuits = get_random_code_6(df=fone_geography_df, n=20)
random_circuits

['BELSPA',
 'ITAIMO',
 'SPACAT',
 'FRAMAG',
 'SPAVAL',
 'JAPOYA',
 'MONMON',
 'USALAS',
 'USAIND',
 'AUSSPI',
 'RUSSOC',
 'UKGSIL',
 'GERNüR',
 'MEXMEX',
 'AUSMEL',
 'INDUTT',
 'ARGBUE',
 'ITAMON',
 'GERHOC',
 'BRASAO']

In [6]:
# Build one row (as a dictionary) per consecutive pair of circuits
def create_legs_dataframe(circuit_list):
    """Pair every circuit with the one that follows it.

    Despite the name, the result is a plain list of row dictionaries, not a
    DataFrame; a list with fewer than two circuits gives an empty list.

    Args:
        circuit_list: Ordered sequence of circuit codes.

    Returns:
        list[dict]: One ``{"from_circuit": ..., "to_circuit": ...}`` entry per
        adjacent pair, in sequence order.
    """
    legs = []
    for origin, destination in zip(circuit_list, circuit_list[1:]):
        legs.append({"from_circuit": origin, "to_circuit": destination})
    return legs

# Turn the row dictionaries into the legs DataFrame (one row per consecutive circuit pair)
circuit_df = pd.DataFrame(data=create_legs_dataframe(circuit_list=random_circuits))
circuit_df

Unnamed: 0,from_circuit,to_circuit
0,BELSPA,ITAIMO
1,ITAIMO,SPACAT
2,SPACAT,FRAMAG
3,FRAMAG,SPAVAL
4,SPAVAL,JAPOYA
5,JAPOYA,MONMON
6,MONMON,USALAS
7,USALAS,USAIND
8,USAIND,AUSSPI
9,AUSSPI,RUSSOC


In [7]:
# Add the leg code column to the DataFrame
def add_codes(df):
    """Attach a ``codes`` column of the form ``<from_circuit>-<to_circuit>``.

    The column is written onto ``df`` itself, and that same object is returned.

    Args:
        df: DataFrame with ``from_circuit`` and ``to_circuit`` columns.

    Returns:
        The input DataFrame, now carrying the ``codes`` column.
    """
    separator = '-'
    origin = df['from_circuit']
    destination = df['to_circuit']
    df['codes'] = origin.add(separator).add(destination)
    return df


# Add the '<from>-<to>' leg code to every row
circuit_df = circuit_df.pipe(add_codes)
circuit_df

Unnamed: 0,from_circuit,to_circuit,codes
0,BELSPA,ITAIMO,BELSPA-ITAIMO
1,ITAIMO,SPACAT,ITAIMO-SPACAT
2,SPACAT,FRAMAG,SPACAT-FRAMAG
3,FRAMAG,SPAVAL,FRAMAG-SPAVAL
4,SPAVAL,JAPOYA,SPAVAL-JAPOYA
5,JAPOYA,MONMON,JAPOYA-MONMON
6,MONMON,USALAS,MONMON-USALAS
7,USALAS,USAIND,USALAS-USAIND
8,USAIND,AUSSPI,USAIND-AUSSPI
9,AUSSPI,RUSSOC,AUSSPI-RUSSOC


In [8]:
# Look up the travel_logistics_df rows for each leg
def add_travel_logistics_columns(df):
    """Return the ``travel_logistics_df`` rows matching the leg codes in ``df``.

    The lookup uses the ``codes`` column of the DataFrame passed in, not a
    notebook-level variable, so the function gives the same answer for any
    caller. The result consists of the logistics rows only (other columns of
    ``df`` are not carried over), ordered like ``df['codes']``, with
    ``truck_viable`` renamed to ``truck_feasible`` and ``distance_km`` renamed
    to ``air_distance_km``.

    Args:
        df: DataFrame with a ``codes`` column.

    Returns:
        A new DataFrame with ``codes`` as the first column followed by the
        logistics columns.

    Raises:
        KeyError: If a code in ``df`` has no row in ``travel_logistics_df``.
    """
    leg_codes = df['codes']
    matching_rows = travel_logistics_df[travel_logistics_df['codes'].isin(leg_codes)]
    # Selecting by label with the ordered codes puts the rows in the order of the legs.
    ordered_rows = matching_rows.set_index('codes').loc[leg_codes].reset_index()
    return ordered_rows.rename(
        columns={"truck_viable": "truck_feasible", "distance_km": "air_distance_km"}
    )

# From here on circuit_df holds the frame returned by the logistics lookup
circuit_df = circuit_df.pipe(add_travel_logistics_columns)
circuit_df

Unnamed: 0,codes,id,from_id,from_circuit,to_id,to_circuit,air_distance_km,transport_mode,needs_air,truck_distance_km,truck_feasible,truck_emissions,air_emissions,effort_score
0,BELSPA-ITAIMO,956,21,Spa-Francorchamps,16,Imola,803.328659,,0,1049.31,1,44071.02,337398.0,94.4379
1,ITAIMO-SPACAT,715,16,Imola,10,Catalunya,828.38291,,0,1152.06,1,48386.52,347920.8,103.6854
2,SPACAT-FRAMAG,442,10,Catalunya,20,Magny Cours,593.050712,,0,758.74,1,31867.08,249081.3,68.2866
3,FRAMAG-SPAVAL,918,20,Magny Cours,26,Valencia,870.619772,,0,1125.97,1,47290.74,365660.3,101.3373
4,SPAVAL-JAPOYA,1209,26,Valencia,35,Oyama,10704.148116,,1,,0,,4495742.0,10704.148116
5,JAPOYA-MONMON,1630,35,Oyama,32,Monte Carlo,9945.576818,,1,,0,,4177142.0,9945.576818
6,MONMON-USALAS,1489,32,Monte Carlo,33,Las Vegas,9407.250243,,1,,0,,3951045.0,9407.250243
7,USALAS-USAIND,1527,33,Las Vegas,23,Indianapolis,2553.836573,,0,2978.95,1,125115.9,1072611.0,268.1055
8,USAIND-AUSSPI,1045,23,Indianapolis,11,Spielberg,7591.523802,,1,,0,,3188440.0,7591.523802
9,AUSSPI-RUSSOC,491,11,Spielberg,22,Sochi,2005.769629,,0,3788.16,0,159102.72,842423.2,2005.769629


In [9]:
def drop_unnecessary_columns(df):
    """Return ``df`` without the columns listed in ``unwanted``.

    The input is left untouched; a new DataFrame is returned. Because every
    listed column must exist, a missing one makes pandas raise ``KeyError``.

    Args:
        df: DataFrame containing all of the columns to remove.

    Returns:
        A new DataFrame with the remaining columns in their original order.
    """
    unwanted = [
        'needs_air',
        'transport_mode',
        'effort_score',
        "codes",
        "air_emissions",
        "truck_emissions",
        "id",
    ]
    trimmed = df.drop(columns=unwanted)
    return trimmed

# Keep only the columns that drop_unnecessary_columns leaves in place
circuit_df = circuit_df.pipe(drop_unnecessary_columns)
circuit_df

Unnamed: 0,from_id,from_circuit,to_id,to_circuit,air_distance_km,truck_distance_km,truck_feasible
0,21,Spa-Francorchamps,16,Imola,803.328659,1049.31,1
1,16,Imola,10,Catalunya,828.38291,1152.06,1
2,10,Catalunya,20,Magny Cours,593.050712,758.74,1
3,20,Magny Cours,26,Valencia,870.619772,1125.97,1
4,26,Valencia,35,Oyama,10704.148116,,0
5,35,Oyama,32,Monte Carlo,9945.576818,,0
6,32,Monte Carlo,33,Las Vegas,9407.250243,,0
7,33,Las Vegas,23,Indianapolis,2553.836573,2978.95,1
8,23,Indianapolis,11,Spielberg,7591.523802,,0
9,11,Spielberg,22,Sochi,2005.769629,3788.16,0


In [10]:
# Add from_lat, from_lon, to_lat and to_lon
def add_lat_long(df):
    """Append the coordinates of both ends of each leg.

    Coordinates come from ``fone_geography_df`` (``id``, ``latitude``,
    ``longitude``) and are joined with left merges on ``from_id`` and
    ``to_id``, so every row of ``df`` is kept and unmatched ids give NaN.

    The geography columns are renamed before merging. This way a ``df`` that
    already carries its own ``id`` column is handled correctly: that column is
    kept as is, instead of colliding with the geography ``id`` and breaking
    the clean-up step.

    Args:
        df: DataFrame with ``from_id`` and ``to_id`` columns.

    Returns:
        A new DataFrame with ``from_lat``, ``from_lon``, ``to_lat`` and
        ``to_lon`` appended, in that order.
    """
    def _coordinates_for(prefix):
        # Geography coordinates relabelled so they join on '<prefix>_id'
        # and arrive as '<prefix>_lat' / '<prefix>_lon'.
        return fone_geography_df[['id', 'latitude', 'longitude']].rename(
            columns={
                'id': prefix + '_id',
                'latitude': prefix + '_lat',
                'longitude': prefix + '_lon',
            }
        )

    return (
        df
        .merge(_coordinates_for('from'), on='from_id', how='left')
        .merge(_coordinates_for('to'), on='to_id', how='left')
    )

# Attach the start and end coordinates of each leg
circuit_df = circuit_df.pipe(add_lat_long)
circuit_df

Unnamed: 0,from_id,from_circuit,to_id,to_circuit,air_distance_km,truck_distance_km,truck_feasible,from_lat,from_lon,to_lat,to_lon
0,21,Spa-Francorchamps,16,Imola,803.328659,1049.31,1,50.4455,5.9708,44.3439,11.7167
1,16,Imola,10,Catalunya,828.38291,1152.06,1,44.3439,11.7167,41.57,2.2611
2,10,Catalunya,20,Magny Cours,593.050712,758.74,1,41.57,2.2611,46.8642,3.1633
3,20,Magny Cours,26,Valencia,870.619772,1125.97,1,46.8642,3.1633,39.4589,-0.3317
4,26,Valencia,35,Oyama,10704.148116,,0,39.4589,-0.3317,35.3711,138.9278
5,35,Oyama,32,Monte Carlo,9945.576818,,0,35.3711,138.9278,43.7347,7.4206
6,32,Monte Carlo,33,Las Vegas,9407.250243,,0,43.7347,7.4206,36.1699,-115.1398
7,33,Las Vegas,23,Indianapolis,2553.836573,2978.95,1,36.1699,-115.1398,39.7876,-86.2392
8,23,Indianapolis,11,Spielberg,7591.523802,,0,39.7876,-86.2392,47.2196,14.7649
9,11,Spielberg,22,Sochi,2005.769629,3788.16,0,47.2196,14.7649,43.4057,39.9578


In [11]:
# Absolute coordinate differences between the two ends of a leg
def get_deltas(df):
    """Add ``delta_lon`` and ``delta_lat`` columns to ``df`` and return it.

    Each delta is the absolute difference between the ``from_*`` and ``to_*``
    value of the same row. The columns are written onto ``df`` itself.

    NOTE(review): ``delta_lon`` is the plain difference of the two longitude
    values, with no wrap-around at +/-180 degrees — confirm that this is what
    downstream consumers expect.

    Args:
        df: DataFrame with ``from_lon``, ``to_lon``, ``from_lat``, ``to_lat``.

    Returns:
        The input DataFrame with the two delta columns set.
    """
    df['delta_lon'] = df['from_lon'].sub(df['to_lon']).abs()
    df['delta_lat'] = df['from_lat'].sub(df['to_lat']).abs()
    return df

# Add the absolute longitude / latitude differences per leg
circuit_df = circuit_df.pipe(get_deltas)
circuit_df


Unnamed: 0,from_id,from_circuit,to_id,to_circuit,air_distance_km,truck_distance_km,truck_feasible,from_lat,from_lon,to_lat,to_lon,delta_lon,delta_lat
0,21,Spa-Francorchamps,16,Imola,803.328659,1049.31,1,50.4455,5.9708,44.3439,11.7167,5.7459,6.1016
1,16,Imola,10,Catalunya,828.38291,1152.06,1,44.3439,11.7167,41.57,2.2611,9.4556,2.7739
2,10,Catalunya,20,Magny Cours,593.050712,758.74,1,41.57,2.2611,46.8642,3.1633,0.9022,5.2942
3,20,Magny Cours,26,Valencia,870.619772,1125.97,1,46.8642,3.1633,39.4589,-0.3317,3.495,7.4053
4,26,Valencia,35,Oyama,10704.148116,,0,39.4589,-0.3317,35.3711,138.9278,139.2595,4.0878
5,35,Oyama,32,Monte Carlo,9945.576818,,0,35.3711,138.9278,43.7347,7.4206,131.5072,8.3636
6,32,Monte Carlo,33,Las Vegas,9407.250243,,0,43.7347,7.4206,36.1699,-115.1398,122.5604,7.5648
7,33,Las Vegas,23,Indianapolis,2553.836573,2978.95,1,36.1699,-115.1398,39.7876,-86.2392,28.9006,3.6177
8,23,Indianapolis,11,Spielberg,7591.523802,,0,39.7876,-86.2392,47.2196,14.7649,101.0041,7.432
9,11,Spielberg,22,Sochi,2005.769629,3788.16,0,47.2196,14.7649,43.4057,39.9578,25.1929,3.8139


In [12]:
# add is_island for both ends of each leg
def add_island_info(df):
    """Append ``from_is_island`` and ``to_is_island`` to each leg.

    The flags come from the ``is_island`` column of ``fone_geography_df`` and
    are joined with left merges on ``from_id`` and ``to_id``, so every row of
    ``df`` is kept and unmatched ids give NaN.

    The geography columns are renamed before merging. This way a ``df`` that
    already carries its own ``id`` column is handled correctly: that column is
    kept as is, instead of colliding with the geography ``id`` and breaking
    the clean-up step.

    Args:
        df: DataFrame with ``from_id`` and ``to_id`` columns.

    Returns:
        A new DataFrame with ``from_is_island`` and ``to_is_island`` appended,
        in that order.
    """
    def _island_flags_for(prefix):
        # Island flag relabelled so it joins on '<prefix>_id'
        # and arrives as '<prefix>_is_island'.
        return fone_geography_df[['id', 'is_island']].rename(
            columns={'id': prefix + '_id', 'is_island': prefix + '_is_island'}
        )

    with_origin = pd.merge(df, _island_flags_for('from'), on='from_id', how='left')
    return pd.merge(with_origin, _island_flags_for('to'), on='to_id', how='left')

# Flag whether each end of a leg is on an island
circuit_df = circuit_df.pipe(add_island_info)
circuit_df

Unnamed: 0,from_id,from_circuit,to_id,to_circuit,air_distance_km,truck_distance_km,truck_feasible,from_lat,from_lon,to_lat,to_lon,delta_lon,delta_lat,from_is_island,to_is_island
0,21,Spa-Francorchamps,16,Imola,803.328659,1049.31,1,50.4455,5.9708,44.3439,11.7167,5.7459,6.1016,0,0
1,16,Imola,10,Catalunya,828.38291,1152.06,1,44.3439,11.7167,41.57,2.2611,9.4556,2.7739,0,0
2,10,Catalunya,20,Magny Cours,593.050712,758.74,1,41.57,2.2611,46.8642,3.1633,0.9022,5.2942,0,0
3,20,Magny Cours,26,Valencia,870.619772,1125.97,1,46.8642,3.1633,39.4589,-0.3317,3.495,7.4053,0,0
4,26,Valencia,35,Oyama,10704.148116,,0,39.4589,-0.3317,35.3711,138.9278,139.2595,4.0878,0,1
5,35,Oyama,32,Monte Carlo,9945.576818,,0,35.3711,138.9278,43.7347,7.4206,131.5072,8.3636,1,0
6,32,Monte Carlo,33,Las Vegas,9407.250243,,0,43.7347,7.4206,36.1699,-115.1398,122.5604,7.5648,0,0
7,33,Las Vegas,23,Indianapolis,2553.836573,2978.95,1,36.1699,-115.1398,39.7876,-86.2392,28.9006,3.6177,0,0
8,23,Indianapolis,11,Spielberg,7591.523802,,0,39.7876,-86.2392,47.2196,14.7649,101.0041,7.432,0,0
9,11,Spielberg,22,Sochi,2005.769629,3788.16,0,47.2196,14.7649,43.4057,39.9578,25.1929,3.8139,0,0


In [13]:
# add leg id from travel_logistics_df
def add_leg_id(df):
    """Set ``df['id']`` to the ``travel_logistics_df`` id of each leg.

    Legs are matched on the ``(from_id, to_id)`` pair. The ids are written
    back by row position, so the result is correct whatever index labels
    ``df`` carries. Only the two key columns of ``df`` take part in the
    lookup, so an ``id`` column already present on ``df`` is simply
    overwritten. If the logistics table lists a pair more than once, the
    first listed id is used; a pair with no match gets NaN.

    Args:
        df: DataFrame with ``from_id`` and ``to_id`` columns.

    Returns:
        The input DataFrame with its ``id`` column set (written in place).
    """
    key_columns = ['from_id', 'to_id']
    # One id per pair keeps the left merge at exactly one output row per leg.
    leg_ids = (
        travel_logistics_df[key_columns + ['id']]
        .drop_duplicates(subset=key_columns)
    )
    matched = df[key_columns].merge(leg_ids, on=key_columns, how='left')
    # Assign positionally: the merge result has a fresh 0..n-1 index that need
    # not match the labels of df.
    df['id'] = matched['id'].to_numpy()
    return df
# Bring the logistics leg id back onto each row
circuit_df = circuit_df.pipe(add_leg_id)
circuit_df


Unnamed: 0,from_id,from_circuit,to_id,to_circuit,air_distance_km,truck_distance_km,truck_feasible,from_lat,from_lon,to_lat,to_lon,delta_lon,delta_lat,from_is_island,to_is_island,id
0,21,Spa-Francorchamps,16,Imola,803.328659,1049.31,1,50.4455,5.9708,44.3439,11.7167,5.7459,6.1016,0,0,956
1,16,Imola,10,Catalunya,828.38291,1152.06,1,44.3439,11.7167,41.57,2.2611,9.4556,2.7739,0,0,715
2,10,Catalunya,20,Magny Cours,593.050712,758.74,1,41.57,2.2611,46.8642,3.1633,0.9022,5.2942,0,0,442
3,20,Magny Cours,26,Valencia,870.619772,1125.97,1,46.8642,3.1633,39.4589,-0.3317,3.495,7.4053,0,0,918
4,26,Valencia,35,Oyama,10704.148116,,0,39.4589,-0.3317,35.3711,138.9278,139.2595,4.0878,0,1,1209
5,35,Oyama,32,Monte Carlo,9945.576818,,0,35.3711,138.9278,43.7347,7.4206,131.5072,8.3636,1,0,1630
6,32,Monte Carlo,33,Las Vegas,9407.250243,,0,43.7347,7.4206,36.1699,-115.1398,122.5604,7.5648,0,0,1489
7,33,Las Vegas,23,Indianapolis,2553.836573,2978.95,1,36.1699,-115.1398,39.7876,-86.2392,28.9006,3.6177,0,0,1527
8,23,Indianapolis,11,Spielberg,7591.523802,,0,39.7876,-86.2392,47.2196,14.7649,101.0041,7.432,0,0,1045
9,11,Spielberg,22,Sochi,2005.769629,3788.16,0,47.2196,14.7649,43.4057,39.9578,25.1929,3.8139,0,0,491


In [14]:
def add_intercontinental_flag(df):
    """
    Flag each leg whose two ends lie on different continents.

    Continents are looked up in fone_geography_df (taken from the notebook
    environment) by id. The looked-up values are held only in temporary
    Series; no from_continent/to_continent columns are added to df.

    An id that is missing from the geography table maps to NaN, and NaN never
    compares equal, so such a leg is flagged 1.

    Args:
        df: The travel logistics dataframe (must contain 'from_id' and 'to_id' columns)

    Returns:
        The same dataframe, with an integer is_intercontinental column (1 or 0) written onto it
    """
    # id -> continent lookup built from the geography table
    continent_by_id = fone_geography_df.set_index('id')['continent']
    continent_by_id = continent_by_id.to_dict()

    origin_continent = df['from_id'].map(continent_by_id)
    destination_continent = df['to_id'].map(continent_by_id)

    crosses_continents = origin_continent != destination_continent
    df['is_intercontinental'] = crosses_continents.astype(int)
    return df

# Mark the legs that change continent
circuit_df = circuit_df.pipe(add_intercontinental_flag)
circuit_df

Unnamed: 0,from_id,from_circuit,to_id,to_circuit,air_distance_km,truck_distance_km,truck_feasible,from_lat,from_lon,to_lat,to_lon,delta_lon,delta_lat,from_is_island,to_is_island,id,is_intercontinental
0,21,Spa-Francorchamps,16,Imola,803.328659,1049.31,1,50.4455,5.9708,44.3439,11.7167,5.7459,6.1016,0,0,956,0
1,16,Imola,10,Catalunya,828.38291,1152.06,1,44.3439,11.7167,41.57,2.2611,9.4556,2.7739,0,0,715,0
2,10,Catalunya,20,Magny Cours,593.050712,758.74,1,41.57,2.2611,46.8642,3.1633,0.9022,5.2942,0,0,442,0
3,20,Magny Cours,26,Valencia,870.619772,1125.97,1,46.8642,3.1633,39.4589,-0.3317,3.495,7.4053,0,0,918,0
4,26,Valencia,35,Oyama,10704.148116,,0,39.4589,-0.3317,35.3711,138.9278,139.2595,4.0878,0,1,1209,1
5,35,Oyama,32,Monte Carlo,9945.576818,,0,35.3711,138.9278,43.7347,7.4206,131.5072,8.3636,1,0,1630,1
6,32,Monte Carlo,33,Las Vegas,9407.250243,,0,43.7347,7.4206,36.1699,-115.1398,122.5604,7.5648,0,0,1489,1
7,33,Las Vegas,23,Indianapolis,2553.836573,2978.95,1,36.1699,-115.1398,39.7876,-86.2392,28.9006,3.6177,0,0,1527,0
8,23,Indianapolis,11,Spielberg,7591.523802,,0,39.7876,-86.2392,47.2196,14.7649,101.0041,7.432,0,0,1045,1
9,11,Spielberg,22,Sochi,2005.769629,3788.16,0,47.2196,14.7649,43.4057,39.9578,25.1929,3.8139,0,0,491,0


In [15]:
def add_consecutive_races_column(df):
    """
    Adds consecutive_races_in_region column that counts consecutive non-intercontinental races.

    Every row inside an unbroken run of is_intercontinental == 0 receives the
    length of that run; rows with any other value receive 0. Missing flags
    are treated as 1. Finally the first row of the frame is reset to 0; the
    other rows of the same run keep the full run length.

    The computation is positional, so it gives the same result whatever index
    labels df carries, and it never adds rows.

    Args:
        df: Dataframe containing is_intercontinental column

    Returns:
        The same dataframe with the consecutive_races_in_region column set (written in place)
    """
    column = 'consecutive_races_in_region'
    flags = df['is_intercontinental'].fillna(1).astype(int)

    if len(df) == 0:
        df[column] = []
        return df

    # 1 where the leg stays within a region, 0 otherwise, on a fresh 0..n-1 index.
    in_region = pd.Series(flags.to_numpy() == 0).astype(int)
    # A new run starts wherever the value differs from the previous row
    # (the first row always starts one, because its diff is NaN).
    run_id = in_region.diff().ne(0).cumsum()
    # Summing the 1s of a run gives its length; runs of 0s sum to 0.
    run_length = in_region.groupby(run_id).transform('sum')

    # Ensure the first race in the sequence is not counted as consecutive.
    run_length.iloc[0] = 0

    df[column] = run_length.to_numpy()
    return df

# Add the length of each run of legs that stay on one continent
circuit_df = circuit_df.pipe(add_consecutive_races_column)
circuit_df

Unnamed: 0,from_id,from_circuit,to_id,to_circuit,air_distance_km,truck_distance_km,truck_feasible,from_lat,from_lon,to_lat,to_lon,delta_lon,delta_lat,from_is_island,to_is_island,id,is_intercontinental,consecutive_races_in_region
0,21,Spa-Francorchamps,16,Imola,803.328659,1049.31,1,50.4455,5.9708,44.3439,11.7167,5.7459,6.1016,0,0,956,0,0
1,16,Imola,10,Catalunya,828.38291,1152.06,1,44.3439,11.7167,41.57,2.2611,9.4556,2.7739,0,0,715,0,4
2,10,Catalunya,20,Magny Cours,593.050712,758.74,1,41.57,2.2611,46.8642,3.1633,0.9022,5.2942,0,0,442,0,4
3,20,Magny Cours,26,Valencia,870.619772,1125.97,1,46.8642,3.1633,39.4589,-0.3317,3.495,7.4053,0,0,918,0,4
4,26,Valencia,35,Oyama,10704.148116,,0,39.4589,-0.3317,35.3711,138.9278,139.2595,4.0878,0,1,1209,1,0
5,35,Oyama,32,Monte Carlo,9945.576818,,0,35.3711,138.9278,43.7347,7.4206,131.5072,8.3636,1,0,1630,1,0
6,32,Monte Carlo,33,Las Vegas,9407.250243,,0,43.7347,7.4206,36.1699,-115.1398,122.5604,7.5648,0,0,1489,1,0
7,33,Las Vegas,23,Indianapolis,2553.836573,2978.95,1,36.1699,-115.1398,39.7876,-86.2392,28.9006,3.6177,0,0,1527,0,1
8,23,Indianapolis,11,Spielberg,7591.523802,,0,39.7876,-86.2392,47.2196,14.7649,101.0041,7.432,0,0,1045,1,0
9,11,Spielberg,22,Sochi,2005.769629,3788.16,0,47.2196,14.7649,43.4057,39.9578,25.1929,3.8139,0,0,491,0,3
