In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

df = pd.read_parquet('../data/rucphen_precipitation_clean.parquet')
df

Unnamed: 0,timestamp,precipitation,percentage
0,2019-01-01 00:00:00,0.000000,7.895
1,2019-01-01 00:05:00,0.000000,7.895
2,2019-01-01 00:10:00,0.000000,7.895
3,2019-01-01 00:15:00,0.000000,7.895
4,2019-01-01 00:20:00,0.000000,7.895
...,...,...,...
498217,2023-10-12 23:35:00,0.010010,2.632
498218,2023-10-12 23:40:00,0.039978,2.632
498219,2023-10-12 23:45:00,0.049988,2.632
498220,2023-10-12 23:50:00,0.039978,2.632


In [3]:
start_timestamp = pd.to_datetime('2022-01-01 08:00:00')
end_timestamp = pd.to_datetime('2023-01-01 08:00:00')

# Create a boolean mask for the specified time range
mask = (df['timestamp'] >= start_timestamp) & (df['timestamp'] < end_timestamp)

# Apply the mask to get the desired slice of the DataFrame
df = df[mask]
df.fillna(0, inplace=True)

In [4]:
# Datetime featurs
# Extracting features
df_features = df.copy()
df_features['hour'] = df_features['timestamp'].dt.hour
df_features['dayofweek'] = df_features['timestamp'].dt.dayofweek
df_features['isweekend'] = df_features['timestamp'].dt.weekday >= 5  # True if it's Saturday or Sunday, False otherwise
df_features['month'] = df_features['timestamp'].dt.month

df_features['isweekend'] = df_features['isweekend'].astype(int)
df_features.drop('timestamp', axis=1, inplace=True)

# Lags
# Sort the DataFrame by timestamp
# df = df.sort_values(by='timestamp')

# Define the lags you want
lags = [1, 2, 3, 6]

# Add lag features for 'precipitation'
for lag in lags:
    df_features[f'precipitation_lag_{lag}'] = df_features['precipitation'].shift(lag)


# Drop rows with NaN values introduced by the lag operation
df_features = df_features.dropna()

# Reset index if needed
df_features = df_features.reset_index(drop=True)


df_features.head()

Unnamed: 0,precipitation,percentage,hour,dayofweek,isweekend,month,precipitation_lag_1,precipitation_lag_2,precipitation_lag_3,precipitation_lag_6
0,0.0,0.0,8,5,1,1,0.0,0.0,0.0,0.0
1,0.0,0.0,8,5,1,1,0.0,0.0,0.0,0.0
2,0.0,0.0,8,5,1,1,0.0,0.0,0.0,0.0
3,0.0,0.0,8,5,1,1,0.0,0.0,0.0,0.0
4,0.0,0.0,8,5,1,1,0.0,0.0,0.0,0.0


In [5]:
# Extract features and target from the DataFrame
X = df_features.loc[:, df_features.columns != 'percentage']
y = df_features['percentage']

# # Convert to PyTorch tensors
# X = torch.tensor(X.values, dtype=torch.float32)
# y = torch.tensor(y.values, dtype=torch.float32)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Standardize Data
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [6]:
import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostRegressor
from sklearn.kernel_approximation import Nystroem
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from tpot.export_utils import set_param_recursive

# Average CV score on the training set was: -0.10462254361904769
exported_pipeline = make_pipeline(
    PolynomialFeatures(degree=2, include_bias=False, interaction_only=False),
    Nystroem(gamma=0.45, kernel="laplacian", n_components=9),
    AdaBoostRegressor(learning_rate=0.1, loss="exponential", n_estimators=100)
)
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 42)

exported_pipeline.fit(X_train, y_train)
results = exported_pipeline.predict(X_test)



In [7]:

mse = mean_squared_error(y_test, results)

print(f"Mean Squared Error (MSE): {mse:.2f}")


Mean Squared Error (MSE): 645.28


In [8]:
for i in range(0, len(results)):
    print(f"Predicted: {results[i]:.2f}, Actual: {y_test.iloc[i]:.2f}")

Predicted: 21.17, Actual: 5.26
Predicted: 21.17, Actual: 5.26
Predicted: 21.17, Actual: 5.26
Predicted: 21.17, Actual: 5.26
Predicted: 21.17, Actual: 5.26
Predicted: 21.17, Actual: 5.26
Predicted: 21.17, Actual: 5.26
Predicted: 21.17, Actual: 5.26
Predicted: 21.17, Actual: 5.26
Predicted: 21.17, Actual: 5.26
Predicted: 21.17, Actual: 5.26
Predicted: 21.17, Actual: 5.26
Predicted: 21.17, Actual: 5.26
Predicted: 21.17, Actual: 5.26
Predicted: 21.17, Actual: 5.26
Predicted: 21.17, Actual: 5.26
Predicted: 21.17, Actual: 5.26
Predicted: 21.17, Actual: 5.26
Predicted: 21.17, Actual: 5.26
Predicted: 21.17, Actual: 5.26
Predicted: 21.17, Actual: 5.26
Predicted: 21.17, Actual: 5.26
Predicted: 21.17, Actual: 5.26
Predicted: 21.17, Actual: 5.26
Predicted: 21.17, Actual: 5.26
Predicted: 21.17, Actual: 5.26
Predicted: 21.17, Actual: 5.26
Predicted: 21.17, Actual: 5.26
Predicted: 21.17, Actual: 5.26
Predicted: 21.17, Actual: 5.26
Predicted: 21.17, Actual: 5.26
Predicted: 21.17, Actual: 5.26
Predicte

In [None]:
#exports weights
import pickle

with open('weights_without_percentage.pkl', 'wb') as f:
    pickle.dump(exported_pipeline, f)
    