In [1]:
import pandas as pd
import sys
import os

In [2]:
# Get the absolute path of the directory where the notebook is currently running
# This usually is /path/to/app/folder_notebook
notebook_dir = os.getcwd()

# Get the absolute path of the 'app' directory (parent of folder_notebook)
main_dir = os.path.abspath(os.path.join(notebook_dir, '..'))
db_path = os.path.join(main_dir, 'data', 'planet_fone.db')
if main_dir not in sys.path:
    sys.path.append(main_dir)
    
from utils import sql


In [3]:
def generate_features(
    codes: list = None,
    n_random: int = None,
    year: int = 2026,
    seed: int = None,
    verbose: bool = False
) -> pd.DataFrame:
    """
    Generate features for a sequence of circuits.

    Parameters:
        codes (list): List of circuit codes to use as the calendar. If None, random circuits are selected.
        n_random (int): Number of random circuits to select if codes is None.
        year (int): The year for which to generate features (used to compute 'from_thirty').
        seed (int): Random seed for reproducibility when sampling random circuits.
        verbose (bool): If True, prints information about the process.

    Returns:
        pd.DataFrame: DataFrame containing features for each leg in the calendar.
    """

    travel_logistics_df = sql.get_table('travel_logistic', db_path)
    fone_geography_df = sql.get_table('fone_geography', db_path)

    if year is not None:
        from_thirty = 2030 - year

    if n_random is not None and n_random > 0 and codes is None:
        if seed is None:
            seed = 42
        calendar = fone_geography_df['code_6'].sample(n_random, random_state=seed).to_list()
        if verbose:
            print(f"Randomly selected {n_random} circuits: {calendar}")
    elif codes is not None:
        calendar = list(codes)
        if verbose:
            print(f"Using provided codes: {calendar}")
    else:
        raise ValueError("Either 'codes' must be provided or 'n_random' must be > 0.")

    legs = [{'from_circuit': calendar[i],
             'to_circuit': calendar[i + 1]} for i in range(len(calendar) - 1)]
    legs = pd.DataFrame(legs)
    legs['codes'] = legs['from_circuit'].astype(str) + '-' + legs['to_circuit'].astype(str)

    features_df = travel_logistics_df[travel_logistics_df['codes'].isin(legs['codes'])].set_index('codes').loc[legs['codes']].reset_index()
    features_df = features_df.rename(columns={"truck_viable": "truck_feasible", "distance_km": "air_distance_km"})
    features_df = features_df.drop(columns=['needs_air', 'transport_mode', 'effort_score', "air_emissions", "truck_emissions"])

    features_df['from_thirty'] = from_thirty

    if verbose:
        print(f"Generated features for {len(features_df)} legs.")

    return features_df

In [4]:
# this func is an example of how to use the function and gives you test data to play around with
generate_features(n_random=20)

Unnamed: 0,codes,id,from_id,from_circuit,to_id,to_circuit,air_distance_km,truck_distance_km,truck_feasible,from_thirty
0,SINMAR-NOROSL,656,14,Marina Bay,46,Oslo,10056.660499,,0,4
1,NOROSL-CANVAN,2162,46,Oslo,48,Vancouver,7181.440994,,0,4
2,CANVAN-NIGLAG,2254,48,Vancouver,45,Lagos,11944.624001,,0,4
3,NIGLAG-AZEBAK,2086,45,Lagos,18,Baku,5948.505734,,0,4
4,AZEBAK-QATLUS,826,18,Baku,28,Lusail,1661.348675,2653.1,0,4
5,QATLUS-FRALEC,1296,28,Lusail,27,Le Castellet,4562.22255,,0,4
6,FRALEC-SPAVAL,1248,27,Le Castellet,26,Valencia,662.19692,893.36,1,4
7,SPAVAL-MONMON,1206,26,Valencia,32,Monte Carlo,800.431221,1026.77,1,4
8,MONMON-FRAMAG,1477,32,Monte Carlo,20,Magny Cours,481.49075,742.77,1,4
9,FRAMAG-GERHOC,906,20,Magny Cours,13,Hockenheim,485.629655,678.19,1,4


In [5]:
def fetch_training_data(verbose=False) -> pd.DataFrame:

    fone_calendar_df = sql.get_table('fone_calendar', db_path)
    fone_geography_df = sql.get_table('fone_geography', db_path)

    fone_calendar_df = fone_calendar_df[fone_calendar_df['leg_emissions'] > 0]

    training_data = pd.DataFrame()
    for year in fone_calendar_df['year'].unique():
        year_data = fone_calendar_df[fone_calendar_df['year'] == year]
        if len(year_data) > 0:
            codes = year_data.merge(fone_geography_df, how='left', left_on='geo_id', right_on='id')['code_6']
            if verbose:
                print(f"Year {year}: {len(codes)} codes")
            features = generate_features(codes=codes, year=year, verbose=verbose)
            features['leg_emissions'] = year_data['leg_emissions'].values[1:]
            training_data = pd.concat([training_data, features], ignore_index=True)

    if verbose:
        print(f"Total training samples: {len(training_data)}")

    return training_data

In [6]:
fetch_training_data( verbose=True) # this is the new training dataset

Year 2000: 16 codes
Using provided codes: ['AUSMEL', 'BRASAO', 'ITAIMO', 'UKGSIL', 'SPACAT', 'GERNüR', 'MONMON', 'CANMON', 'FRAMAG', 'AUSSPI', 'GERHOC', 'HUNBUD', 'BELSPA', 'ITAMON', 'USAIND', 'JAPSUZ']
Generated features for 15 legs.
Year 2001: 16 codes
Using provided codes: ['AUSMEL', 'MALKUA', 'BRASAO', 'ITAIMO', 'SPACAT', 'AUSSPI', 'MONMON', 'CANMON', 'GERNüR', 'FRAMAG', 'UKGSIL', 'GERHOC', 'HUNBUD', 'BELSPA', 'ITAMON', 'USAIND']
Generated features for 15 legs.
Year 2002: 16 codes
Using provided codes: ['AUSMEL', 'MALKUA', 'BRASAO', 'ITAIMO', 'SPACAT', 'AUSSPI', 'MONMON', 'CANMON', 'GERNüR', 'UKGSIL', 'FRAMAG', 'GERHOC', 'HUNBUD', 'BELSPA', 'ITAMON', 'USAIND']
Generated features for 15 legs.
Year 2003: 15 codes
Using provided codes: ['AUSMEL', 'MALKUA', 'BRASAO', 'ITAIMO', 'SPACAT', 'AUSSPI', 'MONMON', 'CANMON', 'GERNüR', 'FRAMAG', 'UKGSIL', 'GERHOC', 'HUNBUD', 'ITAMON', 'USAIND']
Generated features for 14 legs.
Year 2004: 17 codes
Using provided codes: ['AUSMEL', 'MALKUA', 'BAHSAK

Unnamed: 0,codes,id,from_id,from_circuit,to_id,to_circuit,air_distance_km,truck_distance_km,truck_feasible,from_thirty,leg_emissions
0,AUSMEL-BRASAO,239,6,Melbourne,4,São Paulo,13062.910869,,0,30,15704.375394
1,BRASAO-ITAIMO,156,4,São Paulo,16,Imola,9612.464691,,0,30,233.936829
2,ITAIMO-UKGSIL,706,16,Imola,1,Silverstone,1273.383116,1591.00,1,30,237.024619
3,UKGSIL-SPACAT,9,1,Silverstone,10,Catalunya,1194.496386,1612.00,1,30,185.302671
4,SPACAT-GERNüR,458,10,Catalunya,36,Nürburgring,1039.309617,1260.24,1,30,167.133234
...,...,...,...,...,...,...,...,...,...,...,...
422,SINMAR-USAAUS,625,14,Marina Bay,15,Austin,15842.999006,,0,6,137.841562
423,USAAUS-MEXMEX,674,15,Austin,17,Mexico City,1201.867707,1508.45,1,6,7544.880460
424,MEXMEX-BRASAO,756,17,Mexico City,4,São Paulo,7430.971654,,0,6,9938.587665
425,BRASAO-USALAS,173,4,São Paulo,33,Las Vegas,9788.539873,,0,6,13215.309929


In [None]:
def regression_model():
    """
    Placeholder function for regression model.
    This function should be implemented to train a regression model on the training data.
    it uses fetch_training_data() to get the training data.
    returns the trained model so it can be used for predictions.
    i.e. regression_model = regression_model()
    usage: regression_model.predict(X_test) where X_test is the test data (a record with all features used in the training data).
    """
    pass