In [30]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from reservoirpy.nodes import Reservoir, Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report as creport
from imblearn.over_sampling import SMOTE 
import shap
from xgboost import XGBClassifier

In [31]:
def one_hot_encode(df, drop_original=True, prefix_sep="_"):

    non_numeric_cols = df.select_dtypes(exclude=["number"]).columns.tolist()
    
    if not non_numeric_cols:
        return df.copy()

    df_encoded = pd.get_dummies(df, columns=non_numeric_cols, prefix_sep=prefix_sep)

    return df_encoded

def find_high_corr_features(df, target_col, threshold=0.9):

    # Drop non-numeric columns
    numeric_df = df.select_dtypes(include='number')

    if target_col not in numeric_df.columns:
        raise ValueError(f"Target column '{target_col}' is not numeric or missing from the DataFrame.")

    # Compute correlations
    corrs = numeric_df.corr()[target_col].drop(target_col)

    # Find strong correlations
    high_corr_features = corrs[abs(corrs) > threshold].sort_values(ascending=False)

    if high_corr_features.empty:
        print("✅ No features are correlated above the threshold.")
    else:
        print("⚠️ High correlation detected:")
        print(high_corr_features)

    return list(high_corr_features.index)

### 12 Hours

In [32]:
df = pd.read_csv('data/clean_df.csv')
df['date'] = pd.to_datetime(df['date'])
df['time'] = (df['date'] - df['date'].min()).dt.total_seconds() / 3600

find_high_corr_features(df, 'ext_gust_window_72')

In [33]:
df.drop(['high_wind_event','valid_time','date'], axis=1, inplace=True)
df.drop(['ext_gust_window_48','ext_gust_window_72','ext_gust_window_168', 'ext_gust_window_720'], axis=1, inplace=True)
df = one_hot_encode(df)
df_y = df['ext_gust_window_12']
df_x = df.drop(['ext_gust_window_12'], axis=1)

In [34]:
X_train, X_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.20, random_state=42)

scaler_X = StandardScaler()
X_train = scaler_X.fit_transform(X_train)
X_test = scaler_X.transform(X_test)

In [35]:
# base

xgb_model = XGBClassifier(n_estimators=2, max_depth=2, learning_rate=1, objective='binary:logistic')
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)
print(creport(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      5245
           1       0.65      0.15      0.24       103

    accuracy                           0.98      5348
   macro avg       0.82      0.57      0.61      5348
weighted avg       0.98      0.98      0.98      5348



In [36]:
reservoir = Reservoir(units=5000, sr=0.8)  
X_train_reservoir = reservoir.run(X_train, reset=True)  

classifier = LogisticRegression()
classifier.fit(X_train_reservoir, y_train)
 
X_test_reservoir = reservoir.run(X_test, reset=True)
y_pred_probs = classifier.predict_proba(X_test_reservoir)[:, 1] 
y_pred = (y_pred_probs > 0.5).astype(int) 

print(creport(y_test, y_pred))

Running Reservoir-9: 100%|██████████████████████████████████████████████████████████| 21390/21390 [00:41<00:00, 517.12it/s]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Running Reservoir-9: 100%|████████████████████████████████████████████████████████████| 5348/5348 [00:10<00:00, 504.39it/s]

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5245
           1       0.83      0.76      0.79       103

    accuracy                           0.99      5348
   macro avg       0.91      0.88      0.89      5348
weighted avg       0.99      0.99      0.99      5348






In [37]:
#### Sanity Check

In [38]:
from sklearn.datasets import make_hastie_10_2
x, y = make_hastie_10_2(n_samples=24000, random_state=42)

In [39]:
y = [0 if i == -1 else 1 for i in y]

In [40]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

scaler_X = StandardScaler()
X_train = scaler_X.fit_transform(X_train)
X_test = scaler_X.transform(X_test)

In [41]:
reservoir = Reservoir(units=5000, sr=0.8)  
X_train_reservoir = reservoir.run(X_train, reset=True)  

classifier = LogisticRegression()
classifier.fit(X_train_reservoir, y_train)

reservoir = Reservoir(units=5000, sr=0.8)  
X_test_reservoir = reservoir.run(X_test, reset=True)
y_pred_probs = classifier.predict_proba(X_test_reservoir)[:, 1] 
y_pred = (y_pred_probs > 0.5).astype(int) 


print(creport(y_test, y_pred))

Running Reservoir-10: 100%|█████████████████████████████████████████████████████████| 19200/19200 [00:32<00:00, 597.35it/s]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Running Reservoir-11: 100%|███████████████████████████████████████████████████████████| 4800/4800 [00:08<00:00, 596.75it/s]

              precision    recall  f1-score   support

           0       0.48      0.38      0.42      2439
           1       0.47      0.57      0.51      2361

    accuracy                           0.47      4800
   macro avg       0.47      0.47      0.47      4800
weighted avg       0.47      0.47      0.47      4800






In [42]:
####

In [43]:
### 48 Hours

In [44]:
df = pd.read_csv('data/clean_df.csv')
df['date'] = pd.to_datetime(df['date'])
df['time'] = (df['date'] - df['date'].min()).dt.total_seconds() / 3600

df.drop(['high_wind_event','valid_time','date'], axis=1, inplace=True)
df.drop(['ext_gust_window_12','ext_gust_window_72','ext_gust_window_168', 'ext_gust_window_720'], axis=1, inplace=True)
df = one_hot_encode(df)
df_y = df['ext_gust_window_48']
df_x = df.drop(['ext_gust_window_48'], axis=1)

In [45]:
X_train, X_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.20, random_state=42)

scaler_X = StandardScaler()
X_train = scaler_X.fit_transform(X_train)
X_test = scaler_X.transform(X_test)

In [46]:
reservoir = Reservoir(units=5000, sr=0.8)  
X_train_reservoir = reservoir.run(X_train, reset=True)  

classifier = LogisticRegression()
classifier.fit(X_train_reservoir, y_train)

X_test_reservoir = reservoir.run(X_test, reset=True)
y_pred_probs = classifier.predict_proba(X_test_reservoir)[:, 1] 
y_pred = (y_pred_probs > 0.5).astype(int) 

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(creport(y_test, y_pred))

Running Reservoir-12: 100%|█████████████████████████████████████████████████████████| 21390/21390 [00:41<00:00, 510.14it/s]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Running Reservoir-12: 100%|███████████████████████████████████████████████████████████| 5348/5348 [00:10<00:00, 510.19it/s]


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5223
           1       0.85      0.86      0.86       125

    accuracy                           0.99      5348
   macro avg       0.92      0.93      0.93      5348
weighted avg       0.99      0.99      0.99      5348



In [47]:
### 72 Hours

In [48]:
df = pd.read_csv('data/clean_df.csv')
df['date'] = pd.to_datetime(df['date'])
df['time'] = (df['date'] - df['date'].min()).dt.total_seconds() / 3600

df.drop(['high_wind_event','valid_time','date'], axis=1, inplace=True)
df.drop(['ext_gust_window_12','ext_gust_window_48','ext_gust_window_168', 'ext_gust_window_720'], axis=1, inplace=True)
df = one_hot_encode(df)
df_y = df['ext_gust_window_72']
df_x = df.drop(['ext_gust_window_72'], axis=1)

In [49]:
X_train, X_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.20, random_state=42)

scaler_X = StandardScaler()
X_train = scaler_X.fit_transform(X_train)
X_test = scaler_X.transform(X_test)

In [50]:
reservoir = Reservoir(units=5000, sr=0.8)  
X_train_reservoir = reservoir.run(X_train, reset=True)  

classifier = LogisticRegression()
classifier.fit(X_train_reservoir, y_train)

X_test_reservoir = reservoir.run(X_test, reset=True)
y_pred_probs = classifier.predict_proba(X_test_reservoir)[:, 1] 
y_pred = (y_pred_probs > 0.5).astype(int) 

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(creport(y_test, y_pred))

Running Reservoir-13: 100%|█████████████████████████████████████████████████████████| 21390/21390 [00:41<00:00, 516.28it/s]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Running Reservoir-13: 100%|███████████████████████████████████████████████████████████| 5348/5348 [00:10<00:00, 517.49it/s]


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5138
           1       0.91      0.92      0.91       210

    accuracy                           0.99      5348
   macro avg       0.95      0.96      0.96      5348
weighted avg       0.99      0.99      0.99      5348



In [51]:
### 168 Hours

In [52]:
df = pd.read_csv('data/clean_df.csv')
df['date'] = pd.to_datetime(df['date'])
df['time'] = (df['date'] - df['date'].min()).dt.total_seconds() / 3600

df.drop(['high_wind_event','valid_time','date'], axis=1, inplace=True)
df.drop(['ext_gust_window_12','ext_gust_window_48','ext_gust_window_72', 'ext_gust_window_720'], axis=1, inplace=True)
df = one_hot_encode(df)
df_y = df['ext_gust_window_168']
df_x = df.drop(['ext_gust_window_168'], axis=1)

In [53]:
X_train, X_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.20, random_state=42)

scaler_X = StandardScaler()
X_train = scaler_X.fit_transform(X_train)
X_test = scaler_X.transform(X_test)

In [54]:
reservoir = Reservoir(units=5000, sr=0.8)  
X_train_reservoir = reservoir.run(X_train, reset=True)  

classifier = LogisticRegression()
classifier.fit(X_train_reservoir, y_train)

reservoir = Reservoir(units=5000, sr=0.8)  
X_test_reservoir = reservoir.run(X_test, reset=True)
y_pred_probs = classifier.predict_proba(X_test_reservoir)[:, 1] 
y_pred = (y_pred_probs > 0.5).astype(int) 

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(creport(y_test, y_pred))

Running Reservoir-14: 100%|█████████████████████████████████████████████████████████| 21390/21390 [00:42<00:00, 508.84it/s]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Running Reservoir-15: 100%|███████████████████████████████████████████████████████████| 5348/5348 [00:10<00:00, 504.47it/s]

              precision    recall  f1-score   support

           0       0.91      0.61      0.73      4895
           1       0.08      0.39      0.14       453

    accuracy                           0.59      5348
   macro avg       0.50      0.50      0.43      5348
weighted avg       0.84      0.59      0.68      5348






In [55]:
### 1 month

In [56]:
df = pd.read_csv('data/clean_df.csv')
df['date'] = pd.to_datetime(df['date'])
df['time'] = (df['date'] - df['date'].min()).dt.total_seconds() / 3600

df.drop(['high_wind_event','valid_time','date'], axis=1, inplace=True)
df.drop(['ext_gust_window_12','ext_gust_window_48','ext_gust_window_72', 'ext_gust_window_168'], axis=1, inplace=True)
df = one_hot_encode(df)
df_y = df['ext_gust_window_720']
df_x = df.drop(['ext_gust_window_720'], axis=1)

In [57]:
X_train, X_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.20, random_state=42)

scaler_X = StandardScaler()
X_train = scaler_X.fit_transform(X_train)
X_test = scaler_X.transform(X_test)

In [58]:
reservoir = Reservoir(units=5000, sr=0.8)  
X_train_reservoir = reservoir.run(X_train, reset=True)  

classifier = LogisticRegression()
classifier.fit(X_train_reservoir, y_train)

reservoir = Reservoir(units=5000, sr=0.8)  
X_test_reservoir = reservoir.run(X_test, reset=True)
y_pred_probs = classifier.predict_proba(X_test_reservoir)[:, 1] 
y_pred = (y_pred_probs > 0.5).astype(int) 

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(creport(y_test, y_pred))

Running Reservoir-16: 100%|█████████████████████████████████████████████████████████| 21390/21390 [00:41<00:00, 511.78it/s]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Running Reservoir-17: 100%|███████████████████████████████████████████████████████████| 5348/5348 [00:10<00:00, 509.49it/s]

              precision    recall  f1-score   support

           0       0.86      0.61      0.72      4622
           1       0.14      0.39      0.20       726

    accuracy                           0.58      5348
   macro avg       0.50      0.50      0.46      5348
weighted avg       0.77      0.58      0.65      5348






In [59]:
find_high_corr_features(df, 'ext_gust_window_168')

ValueError: Target column 'ext_gust_window_168' is not numeric or missing from the DataFrame.

# let's try with SMOTE

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.15, random_state=42)

In [None]:
oversample = SMOTE(random_state=1234, sampling_strategy=0.5)

X_train, y_train = oversample.fit_resample(X_train, y_train) 

In [None]:
scaler_X = StandardScaler()

X_train = scaler_X.fit_transform(X_train)
X_test = scaler_X.transform(X_test)

reservoir = Reservoir(units=5000, sr=0.8)  
X_train_reservoir = reservoir.run(X_train)  

classifier = LogisticRegression()
classifier.fit(X_train_reservoir, y_train)

X_test_reservoir = reservoir.run(X_test)
y_pred_probs = classifier.predict_proba(X_test_reservoir)[:, 1] 
y_pred = (y_pred_probs > 0.5).astype(int) 

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(creport(y_test, y_pred))

In [None]:
from sklearn.metrics import confusion_matrix as cmatrix
print(cmatrix(y_test, y_pred))