In [14]:
from src.utils.paths import DATA_DIR
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [15]:
df = pd.read_csv(f'{DATA_DIR}/train.csv', index_col=0)
df

Unnamed: 0,date,temperature,relative_humidity,dew_point,apparent_temperature,precipitation_probability,rain,surface_pressure,bike_stands,available_bike_stands
0,2023-06-25 19:07:30+00:00,25.1,45,12.4,24.7,0.0,0.0,984.3,22,8
1,2023-06-25 20:25:41+00:00,24.2,43,10.9,23.3,,0.0,984.5,22,8
2,2023-06-25 20:12:20+00:00,24.2,43,10.9,23.3,0.0,0.0,984.5,22,12
3,2023-06-25 19:33:48+00:00,24.2,43,10.9,23.3,0.0,0.0,984.5,22,7
4,2023-06-25 19:55:29+00:00,24.2,43,10.9,23.3,,0.0,984.5,22,8
...,...,...,...,...,...,...,...,...,...,...
18679,2023-10-03 15:23:54+00:00,26.1,56,16.7,27.8,0.0,0.0,986.2,22,10
18678,2023-10-03 15:26:03+00:00,26.1,56,16.7,27.8,,0.0,986.2,22,13
18683,2023-10-03 14:53:38+00:00,26.1,56,16.7,27.8,0.0,,986.2,22,20
18690,2023-10-03 15:34:04+00:00,26.5,54,16.4,28.1,0.0,0.0,985.8,22,11


### Preprocess data

In [16]:
df.drop(columns=['bike_stands', 'date'], inplace=True)

In [17]:
TARGET_COL = 'available_bike_stands'

In [18]:
missing = df.isnull().sum()
missing

temperature                  1869
relative_humidity               0
dew_point                       0
apparent_temperature            0
precipitation_probability    1308
rain                         2804
surface_pressure                0
available_bike_stands           0
dtype: int64

In [19]:
numeric_cols = df.select_dtypes(include='number').columns
# df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())
df.fillna(df.mean(), inplace=True)
df

Unnamed: 0,temperature,relative_humidity,dew_point,apparent_temperature,precipitation_probability,rain,surface_pressure,available_bike_stands
0,25.1,45,12.4,24.7,0.000000,0.000000,984.3,8
1,24.2,43,10.9,23.3,20.173608,0.000000,984.5,8
2,24.2,43,10.9,23.3,0.000000,0.000000,984.5,12
3,24.2,43,10.9,23.3,0.000000,0.000000,984.5,7
4,24.2,43,10.9,23.3,20.173608,0.000000,984.5,8
...,...,...,...,...,...,...,...,...
18679,26.1,56,16.7,27.8,0.000000,0.000000,986.2,10
18678,26.1,56,16.7,27.8,20.173608,0.000000,986.2,13
18683,26.1,56,16.7,27.8,0.000000,0.147677,986.2,20
18690,26.5,54,16.4,28.1,0.000000,0.000000,985.8,11


In [20]:
TARGET_COL_INDEX = df.columns.get_loc(TARGET_COL)

In [21]:
from typing import Tuple, List

def train_test_split(data: pd.DataFrame, test_size: float) -> Tuple[pd.DataFrame, pd.DataFrame]:
    test_size = int(len(data) * test_size)
    return data.iloc[:-test_size], data.iloc[-test_size:]

In [22]:
train, test = train_test_split(df, 0.2)

In [23]:
scaler = MinMaxScaler()
train = scaler.fit_transform(train)
test = scaler.transform(test)
# df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
# df = scaler.fit_transform(df)
# df
df

Unnamed: 0,temperature,relative_humidity,dew_point,apparent_temperature,precipitation_probability,rain,surface_pressure,available_bike_stands
0,25.1,45,12.4,24.7,0.000000,0.000000,984.3,8
1,24.2,43,10.9,23.3,20.173608,0.000000,984.5,8
2,24.2,43,10.9,23.3,0.000000,0.000000,984.5,12
3,24.2,43,10.9,23.3,0.000000,0.000000,984.5,7
4,24.2,43,10.9,23.3,20.173608,0.000000,984.5,8
...,...,...,...,...,...,...,...,...
18679,26.1,56,16.7,27.8,0.000000,0.000000,986.2,10
18678,26.1,56,16.7,27.8,20.173608,0.000000,986.2,13
18683,26.1,56,16.7,27.8,0.000000,0.147677,986.2,20
18690,26.5,54,16.4,28.1,0.000000,0.000000,985.8,11


### Prepare time series

In [24]:
import numpy as np

def create_time_series(data: pd.DataFrame, window_size: int, target_col_index: int) -> Tuple[np.array, np.array]:
    X, y = [], []
    for i in range(len(data) - window_size - 1):
        X.append(data[i:i+window_size])
        y.append(data[i+window_size, target_col_index])
    return np.array(X), np.array(y)

In [25]:
# train, test = train_test_split(df, 0.2)
X_train, y_train = create_time_series(train, 48, TARGET_COL_INDEX)
X_test, y_test = create_time_series(test, 48, TARGET_COL_INDEX)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((14905, 48, 8), (14905,), (3689, 48, 8), (3689,))

### Prepare model

In [26]:
from keras.layers import GRU, Dense, Dropout, BatchNormalization
from keras import Sequential

def create_model(input_shape: Tuple[int, int]) -> Sequential:
    model = Sequential([
        GRU(128, return_sequences=True, input_shape=input_shape),
        Dropout(0.2),
        GRU(64, return_sequences=True),
        Dropout(0.2),
        GRU(32),
        BatchNormalization(),
        Dense(32, activation='relu'),
        Dense(1)
    ])
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

In [27]:
model = create_model((X_train.shape[1], X_train.shape[2]))

In [28]:
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1a3a82f2a00>

In [None]:
model.save('gru_model.keras')