In [82]:
import numpy as np
import pandas as pd
import polars as pl
import tensorflow as tf
import wandb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from wandb.integration.keras import WandbCallback


# Read data

In [83]:
# Load the semicolon-delimited modelling dataset with explicit dtypes for the
# listed columns, then keep only the first 200,000 rows.
df = pl.read_csv(
    "D:\\PJATK_MGR\\bus_delay_project\\df_for_modelling_v2.csv",
    separator=";",
    schema_overrides={
        "line": pl.Utf8,
        "stop_id": pl.Int16,
        "stop_lat": pl.Float32,
        "stop_lon": pl.Float32,
        "stop_seq": pl.Int16,
        "arrival_hour": pl.Int8,
        "delay": pl.Int32,
    },
).head(200_000)

In [84]:
def preprocess_data(_df):
    """Clean the raw frame and add an integer code for each bus line.

    Steps, in order:
      1. Drop every row that contains a null value.
      2. Keep only rows whose ``delay`` lies between -3600 and 3600
         (presumably seconds, i.e. +/- one hour -- TODO confirm the unit).
      3. Cast ``is_weekday`` and ``is_holiday`` to ``Int8``.
      4. Add ``line_encoded``: the 0-based position of each distinct ``line``
         value in sorted order.

    Note that the encoding is derived from the rows passed in, so the same
    line can receive a different code when the function is run on a
    different subset of the data.

    Args:
        _df: polars DataFrame containing at least the columns ``line``,
            ``delay``, ``is_weekday`` and ``is_holiday``.

    Returns:
        A new polars DataFrame with the filtered rows and the extra
        ``line_encoded`` column.
    """
    _df = _df.drop_nulls()
    _df = _df.filter(pl.col("delay").is_between(-60*60, 60*60)) # drop extreme values

    _df = _df.with_columns(
        pl.col("is_weekday").cast(pl.Int8),
        pl.col("is_holiday").cast(pl.Int8),
    )

    # `with_row_count` is deprecated in polars; `with_row_index` is its
    # replacement and numbers the sorted unique lines from 0.
    unique_lines = _df.select("line").unique().sort("line").with_row_index(name="line_encoded")
    # The lookup is built from this same frame, so the left join gives every
    # row a code.
    _df = _df.join(unique_lines, on="line", how="left")

    return _df

def prepare_training_data(_df):
    """Build standardised train / validation / test arrays from a cleaned frame.

    The feature matrix is taken from the module-level ``selected_columns``
    list and the target is the ``delay`` column. The data is split 67% train
    and 33% hold-out, and the hold-out part is then halved into validation
    and test sets (both splits use ``random_state=42``). A ``StandardScaler``
    is fitted on the training features only and applied to all three feature
    sets; the target is left unscaled.

    The fitted scaler is not returned, so callers cannot reuse it to
    transform new data.

    Args:
        _df: polars DataFrame containing every column in ``selected_columns``
            plus ``delay``.

    Returns:
        Tuple ``(X_train, y_train, X_valid, y_valid, X_test, y_test)`` of
        numpy arrays; the ``y_*`` arrays have a single column.
    """
    X_train_full = _df.select(selected_columns).to_numpy()
    y_train_full = _df.select(["delay"]).to_numpy()

    X_train, X_valid, y_train, y_valid = train_test_split(
        X_train_full, y_train_full, test_size=0.33, random_state=42)
    X_valid, X_test, y_valid, y_test = train_test_split(
        X_valid, y_valid, test_size=0.5, random_state=42)

    # Sanity diagnostics before scaling.
    print("NaNs in X_train:", np.isnan(X_train).sum())
    print("Infs in X_train:", np.isinf(X_train).sum())
    print("NaNs in y_train:", np.isnan(y_train).sum())
    print("Infs in y_train:", np.isinf(y_train).sum())
    # Report on the frame that was passed in, not on the notebook-global `df`.
    print("Delay extreme values:", _df["delay"].min(), _df["delay"].max())

    print(X_train.shape, y_train.shape, X_valid.shape, y_valid.shape)

    # Fit the scaler on the training split only so that validation and test
    # statistics do not leak into training.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_valid = scaler.transform(X_valid)
    X_test = scaler.transform(X_test)
    return X_train, y_train, X_valid, y_valid, X_test, y_test
    

In [85]:
# Feature columns fed to the model, in the order they appear in the
# feature matrix.
selected_columns = [
    "is_weekday", "arrival_hour", "is_holiday",
    "stop_lat", "stop_lon",
    "line_encoded", "stop_seq", "detection_type",
]

# Preprocess data

In [86]:
# Clean the loaded frame, then derive the scaled train / validation / test arrays.
df = preprocess_data(df)
(
    X_train, y_train,
    X_valid, y_valid,
    X_test, y_test,
) = prepare_training_data(df)

  unique_lines = _df.select("line").unique().sort("line").with_row_count(name="line_encoded")


NaNs in X_train: 0
Infs in X_train: 0
NaNs in y_train: 0
Infs in y_train: 0
Delay extreme values: -477 3420
(108170, 8) (108170, 1) (26639, 8) (26639, 1)


# Initialize wandb run

In [87]:
# Hyperparameters and run metadata to record alongside the run.
wandb_config = {
    "learning_rate": 0.005,
    "architecture": "2 Layer NN with 32 neurons each",
    "epochs": 15,
    "optimizer": "SGD",
    "train_size": X_train.shape[0],
}

# Start the run: `entity` is the wandb account/team that owns the project and
# `project` is where this run is logged.
run = wandb.init(
    entity="dsc-pjatk-warsaw",
    project="bus-punctuality",
    config=wandb_config,
)

0,1
epoch,▁▂▃▄▅▅▆▇█

0,1
epoch,8.0
loss,
rmse,
val_loss,
val_rmse,


# Define model

In [88]:
# Fully connected regressor: two hidden ReLU layers of 32 units and a single
# linear output unit. The input width follows the feature matrix.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(32, activation="relu"),
    tf.keras.layers.Dense(32, activation="relu"),
    tf.keras.layers.Dense(1)
])

# Read the learning rate from the run config rather than repeating the literal,
# so the value logged to wandb is always the value actually used for training.
model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=run.config["learning_rate"]),
              loss="mse",
              metrics=[tf.keras.metrics.RootMeanSquaredError(name='rmse')]
              )


# `summary()` prints the table itself and returns None; wrapping it in print()
# only appends a stray "None" line to the output.
model.summary()


Model: "sequential_22"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_66 (Dense)            (None, 32)                288       
                                                                 
 dense_67 (Dense)            (None, 32)                1056      
                                                                 
 dense_68 (Dense)            (None, 1)                 33        
                                                                 
Total params: 1,377
Trainable params: 1,377
Non-trainable params: 0
_________________________________________________________________
None


# Log mapping to wandb

In [89]:
# Read the mapping file once and attach it to the run as a wandb Table.
# NOTE(review): nothing in this notebook writes 'line_mapping.csv'; presumably it
# holds the line -> line_encoded lookup -- confirm that it matches the encoding
# produced by preprocess_data for the rows used here.
line_mapping = pd.read_csv('line_mapping.csv')

wandb.log({"line_mapping": wandb.Table(dataframe=line_mapping)})

# Train model

In [90]:
# Train with mini-batches of 32 and report metrics to wandb after each epoch.
# The epoch count is read from the run config so the logged hyperparameter and
# the real training length cannot diverge.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_valid, y_valid),
    batch_size=32,
    epochs=run.config["epochs"],
    callbacks=[WandbCallback()],
)

Epoch 1/15


[34m[1mwandb[0m: Adding directory to artifact (D:\PROJEKTY\BusPunctuality\training\wandb\run-20250420_230841-mkq2ttc1\files\model-best)... Done. 0.0s


Epoch 2/15
Epoch 3/15


[34m[1mwandb[0m: Adding directory to artifact (D:\PROJEKTY\BusPunctuality\training\wandb\run-20250420_230841-mkq2ttc1\files\model-best)... Done. 0.1s


Epoch 4/15
Epoch 5/15
Epoch 6/15


[34m[1mwandb[0m: Adding directory to artifact (D:\PROJEKTY\BusPunctuality\training\wandb\run-20250420_230841-mkq2ttc1\files\model-best)... Done. 0.0s


Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15


[34m[1mwandb[0m: Adding directory to artifact (D:\PROJEKTY\BusPunctuality\training\wandb\run-20250420_230841-mkq2ttc1\files\model-best)... Done. 0.0s


Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


# Evaluate model

In [91]:
# Evaluate the model on the held-out test set.
# `evaluate` returns the loss followed by the compiled metrics. The loss is MSE
# and the only compiled metric is RootMeanSquaredError (name "rmse"), so the
# second value is an RMSE and is named and logged as such.
test_loss, test_rmse = model.evaluate(X_test, y_test, verbose=2)
print(f"Test loss: {test_loss}")
print(f"Test rmse: {test_rmse}")
# log to wandb
wandb.log({"test_loss": test_loss, "test_rmse": test_rmse})

833/833 - 2s - loss: 18136.7910 - rmse: 134.6729 - 2s/epoch - 2ms/step
Test loss: 18136.791015625
Test mse: 134.67291259765625


In [92]:
# Mark the wandb run created by wandb.init() as finished.
run.finish()

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
loss,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁
rmse,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁
test_loss,▁
test_mse,▁
val_loss,▁▂▁█▅▁▁▁▂▁▁▆▁▅▃
val_rmse,▁▂▁█▅▁▁▁▂▁▁▆▁▅▃

0,1
best_epoch,9.0
best_val_loss,19253.74609
epoch,14.0
loss,20257.89258
rmse,142.33022
test_loss,18136.79102
test_mse,134.67291
val_loss,19257.2793
val_rmse,138.7706
