#### Entrega GoMotion

Realizamos los imports necesarios.

In [1]:
import os
import sys


def find_project_root(start_dir, marker="src"):
    """Return the project root for a notebook whose working directory is ``start_dir``.

    The notebook is expected to live one level below the project root
    (e.g. ``GoMotion/notebooks``). If ``start_dir`` already contains the
    ``marker`` directory (the ``src`` package imported further down), it is
    already the root and is returned unchanged; otherwise its parent is returned.

    This makes the cell safe to re-run: once the working directory has been
    moved to the root, a second run no longer climbs another level up.
    """
    if os.path.isdir(os.path.join(start_dir, marker)):
        return start_dir
    return os.path.dirname(start_dir)


# 1. Absolute path of the directory the notebook kernel is currently in.
current_dir = os.getcwd()

# 2. Resolve the project root (the parent folder on a fresh kernel,
#    the current folder if this cell has already been run).
project_root = find_project_root(current_dir)

# 3. Put the root on sys.path so that "import src...." works from a subfolder.
if project_root not in sys.path:
    sys.path.append(project_root)

# 4. Make the root the working directory so relative paths such as
#    "data/file.csv" resolve against the project, not the notebook folder.
os.chdir(project_root)

In [None]:
import os
# Silence TensorFlow's C++ logging. The variable is set here, before keras is
# imported below, so it is already in the environment when the backend loads.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# NOTE(review): the output saved with this cell is
# "ModuleNotFoundError: No module named 'intensities'". That suggests one of the
# modules under src/ imports a sibling by its bare name instead of through the
# `src` package — confirm and fix it in that module.
import pandas as pd
import src.intensities as intensities
import src.event_encoder as event_encoder
import src.llm_scraper as llm_scraper
import datetime
import src.data_filler as data_filler
import src.hyperparameter_optimizer as hyperparameter_optimizer
import keras
import numpy as np
from src.metadata_manager import MetadataManager
from src.pipeline import check_and_load_data, process_scraped_events

ModuleNotFoundError: No module named 'intensities'

Empezamos a ejecutar el pipeline

In [None]:
# Do not truncate wide DataFrames when they are displayed.
pd.set_option("display.max_columns", None)

# Reference timestamp for this run; later cells compare dates against it.
TODAY = datetime.datetime.today()

# Metadata handle; later cells read and update pipeline state through
# manager.get(...) / manager.set(...).
manager = MetadataManager()

Computamos las intensidades de los días que ya tenemos

In [None]:
# The first cell moves the working directory to the project root, so the cached
# file is addressed like every other data file in this notebook ("data/...").
# NOTE(review): this assumes intensities.process_df(download=True) stores its
# result at this same location — confirm against src/intensities.py.
INTENSITIES_PATH = "data/intensities.csv"

if os.path.exists(INTENSITIES_PATH):
    # Cached intensities exist: no need to load the (big) merged data file.
    intensities_df = pd.read_csv(INTENSITIES_PATH)
else:
    merged_data = check_and_load_data(False)
    # Compute the intensity of each day-barri pair following the mathematical
    # formulation of the task.
    intensities_df = intensities.process_df(merged_data, download=True, verbose=1)

# Needed by the event-scraping cell below, so it has to be defined on BOTH
# branches (it used to be set only when the CSV was read from disk).
barri_list = intensities_df["barri"].unique()

Cargamos los datos de eventos y vacaciones. Pasamos los eventos por el codificador.

In [None]:
# Build the event encoder (and the encoded events file) if it does not exist yet.
encoder_created = False
if not os.path.exists("data/encoded_events.csv"):
    # Both raw inputs are required; fail early with a message naming the one missing.
    if not os.path.exists("data/events.csv"):
        raise FileNotFoundError("Event data is missing")
    if not os.path.exists("data/holidays.csv"):
        raise FileNotFoundError("Holiday data is missing")

    # Copy the precomputed events and holidays to the "all_*" working files,
    # which later cells extend with newly scraped rows.
    all_events = pd.read_csv("data/events.csv")
    all_events.to_csv("data/all_events.csv", index=False)
    all_holidays = pd.read_csv("data/holidays.csv")
    all_holidays.to_csv("data/all_holidays.csv", index=False)

    # Run event_encoder.py: it creates an encoder (which projects existing events
    # to a 5-dimensional latent space) and processes all events for the given data.
    event_encoder.main(manager)

    # If the encoder is rebuilt its weights might have changed, so the
    # regression model has to be retrained further down.
    encoder_created = True

Si ejecutamos este pipeline más de una vez, es posible que falten eventos o vacaciones futuras (el scraper solo recoge eventos con una semana de antelación), que tendremos que recopilar con nuestro scraper.

In [None]:
# This part only matters when the pipeline is run on several days: if the last
# check happened before today, new events may have been published since then.
last_checked = datetime.datetime.strptime(
    manager.get("last_day_event_checked"), "%Y-%m-%d"
).date()

if last_checked < TODAY.date():
    # --- 1. Collect -------------------------------------------------------
    (new_events, new_holidays) = process_scraped_events(
        llm_scraper.scrape_week_ahead(),
        barri_list,
    )

    all_events = pd.read_csv("data/all_events.csv")
    all_holidays = pd.read_csv("data/all_holidays.csv")

    # Keep only rows later than what is already stored, to avoid duplicates.
    event_cutoff = all_events["day"].max()
    holiday_cutoff = all_holidays["day"].max()
    new_events = new_events[new_events["day"] > event_cutoff]
    new_holidays = new_holidays[new_holidays["day"] > holiday_cutoff]

    all_events = pd.concat([all_events, new_events])
    all_holidays = pd.concat([all_holidays, new_holidays])

    # --- 2. Encode --------------------------------------------------------
    # The encoder model is only loaded when there is something to encode.
    encoded_events = None
    if len(new_events) > 0:
        encoder = keras.models.load_model("models/encoder.keras")
        encoder_max_len = int(manager.get("encoder_max_len"))
        # The trailing 5 is presumably the size of the latent space
        # (enc1..enc5) — TODO confirm against event_encoder.predict.
        new_encoded_events = event_encoder.predict(
            new_events, encoder, encoder_max_len, 5
        )
        encoded_events = pd.concat(
            [pd.read_csv("data/encoded_events.csv"), new_encoded_events]
        )

    # --- 3. Persist -------------------------------------------------------
    # Everything is written only after the encoding step succeeded, and the
    # metadata is updated last. A failure above therefore leaves the files and
    # the "last checked" date untouched, so the next run retries the same day
    # instead of silently keeping events that were stored but never encoded.
    if encoded_events is not None:
        encoded_events.to_csv("data/encoded_events.csv", index=False)
    all_events.to_csv("data/all_events.csv", index=False)
    all_holidays.to_csv("data/all_holidays.csv", index=False)
    manager.set("last_day_event_checked", TODAY.strftime("%Y-%m-%d"))

Creamos el modelo predictivo

In [None]:
# (Re)train the regressor when the encoder was rebuilt in this run or when no
# trained model is stored yet.
if encoder_created or not os.path.exists("models/regressor.joblib"):
    # Work on a new frame: parsing "day" here must not modify intensities_df,
    # which belongs to an earlier cell.
    data = intensities_df.assign(day=pd.to_datetime(intensities_df["day"]))

    # Add the weather features first.
    data = data_filler.add_weather_features(
        data, datetime.datetime(year=2022, month=12, day=31), datetime.date.today()
    )

    # Add the remaining features (training version).
    data_processed = hyperparameter_optimizer.create_features(data)

    # Store the features for all rows, without the encoded-event columns.
    # The meaning of the second positional argument (False) is defined by
    # create_features and is not visible from this notebook.
    df_to_save = hyperparameter_optimizer.create_features(data, False).drop(
        columns=["enc1", "enc2", "enc3", "enc4", "enc5"]
    )
    df_to_save.to_csv("data/data_processed.csv", index=False)

    # Chronological train/test split.
    split_date = datetime.datetime(year=2025, month=1, day=1)
    train = data_processed.loc[data_processed["day"] < split_date].copy()
    test = data_processed.loc[data_processed["day"] >= split_date].copy()

    # Rows with missing values are removed from the training portion only.
    train = train.dropna()

    # Search space: one list of candidate values per hyperparameter.
    hyperspace = [
        [10],      # weights base
        [0.0005],  # learning rate
        [9],       # tree depth
    ]
    # The arguments 0 and [] are passed exactly as before; their meaning is
    # defined by grid_search.
    hyperparameter_optimizer.grid_search(
        hyperspace, 0, [], hyperparameter_optimizer.features, train, test
    )


Abrimos el dashboard

In [None]:
# Start the Streamlit dashboard through the shell. os.system waits for the
# command to finish, so this cell keeps running while the dashboard is open.
DASHBOARD_COMMAND = "streamlit run src/dashboard.py"
os.system(DASHBOARD_COMMAND)