In [1]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from reservoirpy.nodes import Reservoir
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report as creport
from sklearn.metrics import confusion_matrix as cmatrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
def one_hot_encode(df, drop_original=True, prefix_sep="_"):
    """One-hot encode every non-numeric column of ``df``.

    Columns are selected with ``select_dtypes(exclude=["number"])`` and
    expanded with ``pandas.get_dummies``. The input frame is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode.
    drop_original : bool, default True
        If True, the encoded source columns are replaced by their indicator
        columns. If False, the original columns are kept (in their original
        order) and the indicator columns are appended after them.
    prefix_sep : str, default "_"
        Separator placed between the source column name and the category
        value in each indicator column name.

    Returns
    -------
    pandas.DataFrame
        A new frame; a plain copy of ``df`` when there is nothing to encode.
    """
    non_numeric_cols = df.select_dtypes(exclude=["number"]).columns.tolist()

    if not non_numeric_cols:
        return df.copy()  # No categorical columns to encode

    if drop_original:
        return pd.get_dummies(df, columns=non_numeric_cols, prefix_sep=prefix_sep)

    # get_dummies always removes the columns it encodes, so build the indicator
    # columns separately and append them to keep the originals.
    # `columns=` is passed explicitly so that every selected column is encoded,
    # not only the object/category ones get_dummies picks by default.
    dummies = pd.get_dummies(
        df[non_numeric_cols], columns=non_numeric_cols, prefix_sep=prefix_sep
    )
    return pd.concat([df, dummies], axis=1)


In [3]:
# Read data/clean_df.csv into the working DataFrame used by the cells below.
df = pd.read_csv(filepath_or_buffer='data/clean_df.csv')

In [4]:
# Build the modelling table from `df` without mutating it, so this cell can be
# re-run safely (an in-place drop raises KeyError once the column is gone).
df_model = one_hot_encode(df.drop(columns=['high_wind_event']))

# Target vector and feature matrix.
df_y = df_model['ext_gust_12h']
df_x = df_model.drop(columns=['ext_gust_12h'])

In [5]:
# Hold out 15% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    df_x,
    df_y,
    test_size=0.15,
    random_state=42,
)

# Standardise features using statistics learned from the training rows only.
scaler_X = StandardScaler().fit(X_train)
X_train = scaler_X.transform(X_train)
X_test = scaler_X.transform(X_test)

In [None]:
# Baseline model: a very small XGBoost classifier.
# NOTE(review): this cell has no execution count, so it appears never to have
# been run; the import is kept local so the rest of the notebook does not
# depend on xgboost being installed.
from xgboost import XGBClassifier

xgb_model = XGBClassifier(n_estimators=2, max_depth=2, learning_rate=1, objective='binary:logistic')
xgb_model.fit(X_train, y_train)

# Store the baseline predictions under their own name so they are not mixed up
# with the `y_pred` produced by the reservoir cells, and actually report them.
y_pred_xgb = xgb_model.predict(X_test)

print(creport(y_test, y_pred_xgb))

In [6]:
# Reservoir feature expansion followed by a logistic-regression readout.
# A fixed seed makes the randomly initialised reservoir reproducible across runs.
# NOTE(review): Reservoir.run treats the rows as consecutive timesteps of one
# series, while train_test_split shuffles rows by default — confirm that this
# is the intended use.
reservoir = Reservoir(units=5000, sr=0.8, seed=42)
X_train_reservoir = reservoir.run(X_train)

# The default max_iter=100 stopped before convergence (see the
# "TOTAL NO. of ITERATIONS REACHED LIMIT" warning), so raise the cap.
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train_reservoir, y_train)

# Score the held-out rows and threshold the positive-class probability at 0.5.
X_test_reservoir = reservoir.run(X_test)
y_pred_probs = classifier.predict_proba(X_test_reservoir)[:, 1]
y_pred = (y_pred_probs > 0.5).astype(int)

# Scalar metrics for the positive class; the report below shows them per class.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(creport(y_test, y_pred))

Running Reservoir-0: 100%|██████████████████████████████████████████████████████████| 22727/22727 [03:16<00:00, 115.80it/s]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Running Reservoir-0: 100%|████████████████████████████████████████████████████████████| 4011/4011 [00:34<00:00, 115.46it/s]


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3669
           1       0.97      0.97      0.97       342

    accuracy                           0.99      4011
   macro avg       0.98      0.98      0.98      4011
weighted avg       0.99      0.99      0.99      4011



In [7]:
# Second experiment: repeat the pipeline after oversampling the training set with SMOTE.

In [8]:
# Re-create the split from the unscaled feature frame (same test fraction and
# seed as the first split), since X_train/X_test were overwritten by scaling.
X_train, X_test, y_train, y_test = train_test_split(
    df_x,
    df_y,
    test_size=0.15,
    random_state=42,
)

In [9]:
# Oversample the training rows only; the test split is left untouched.
oversample = SMOTE(sampling_strategy=0.5, random_state=1234)
X_train, y_train = oversample.fit_resample(X_train, y_train)

In [10]:
# Standardise features using statistics learned from the training rows only.
# NOTE(review): when run after the SMOTE cell, the scaler is fit on the
# oversampled training set and SMOTE itself ran on unscaled features —
# confirm that this ordering is intended.
scaler_X = StandardScaler()

X_train = scaler_X.fit_transform(X_train)
X_test = scaler_X.transform(X_test)

# Reservoir feature expansion; a fixed seed makes the randomly initialised
# reservoir reproducible across runs.
reservoir = Reservoir(units=5000, sr=0.8, seed=42)
X_train_reservoir = reservoir.run(X_train)

# The default max_iter=100 stopped before convergence (see the
# "TOTAL NO. of ITERATIONS REACHED LIMIT" warning), so raise the cap.
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train_reservoir, y_train)

# Score the held-out rows and threshold the positive-class probability at 0.5.
X_test_reservoir = reservoir.run(X_test)
y_pred_probs = classifier.predict_proba(X_test_reservoir)[:, 1]
y_pred = (y_pred_probs > 0.5).astype(int)

# Scalar metrics for the positive class; the report below shows them per class.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(creport(y_test, y_pred))

Running Reservoir-1: 100%|██████████████████████████████████████████████████████████| 31200/31200 [04:33<00:00, 114.08it/s]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Running Reservoir-1: 100%|████████████████████████████████████████████████████████████| 4011/4011 [00:34<00:00, 114.85it/s]

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3669
           1       0.97      0.98      0.97       342

    accuracy                           1.00      4011
   macro avg       0.98      0.99      0.99      4011
weighted avg       1.00      1.00      1.00      4011






In [11]:
# Confusion matrix for the current predictions
# (rows: true class, columns: predicted class).
# `cmatrix` is imported with the other sklearn.metrics names in the top import cell.
print(cmatrix(y_test, y_pred))

[[3657   12]
 [   6  336]]


In [13]:
# Save the feature column names, one per line, in df_x's column order.
with open("column_names.txt", "w") as names_file:
    names_file.writelines(f"{col}\n" for col in df_x.columns)