In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from reservoirpy.nodes import Reservoir
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report as creport
from imblearn.over_sampling import SMOTE 
import shap
from xgboost import XGBClassifier

In [2]:
def one_hot_encode(df, drop_original=True, prefix_sep="_"):

    non_numeric_cols = df.select_dtypes(exclude=["number"]).columns.tolist()
    
    if not non_numeric_cols:
        return df.copy()

    df_encoded = pd.get_dummies(df, columns=non_numeric_cols, prefix_sep=prefix_sep)

    return df_encoded


### 12 Hours

In [3]:
df = pd.read_csv('data/clean_df.csv')
df['date'] = pd.to_datetime(df['date'])
df['time'] = (df['date'] - df['date'].min()).dt.total_seconds() / 3600

In [4]:
df.drop(['high_wind_event','valid_time','date'], axis=1, inplace=True)
df.drop(['ext_gust_window_48','ext_gust_window_72','ext_gust_window_168'], axis=1, inplace=True)
df = one_hot_encode(df)
df_y = df['ext_gust_window_12']
df_x = df.drop(['ext_gust_window_12'], axis=1)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.20, random_state=42)

scaler_X = StandardScaler()
X_train = scaler_X.fit_transform(X_train)
X_test = scaler_X.transform(X_test)

In [6]:
# base

xgb_model = XGBClassifier(n_estimators=2, max_depth=2, learning_rate=1, objective='binary:logistic')
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)
print(creport(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.99      0.97      5068
           1       0.52      0.21      0.30       280

    accuracy                           0.95      5348
   macro avg       0.74      0.60      0.64      5348
weighted avg       0.93      0.95      0.94      5348



In [7]:
reservoir = Reservoir(units=5000, sr=0.8)  
X_train_reservoir = reservoir.run(X_train, reset=True)  

classifier = LogisticRegression()
classifier.fit(X_train_reservoir, y_train)

X_test_reservoir = reservoir.run(X_test, reset=True)
y_pred_probs = classifier.predict_proba(X_test_reservoir)[:, 1] 
y_pred = (y_pred_probs > 0.5).astype(int) 

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(creport(y_test, y_pred))

Running Reservoir-0: 100%|██████████████████████████████████████████████████████████| 21390/21390 [00:40<00:00, 526.69it/s]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Running Reservoir-0: 100%|████████████████████████████████████████████████████████████| 5348/5348 [00:10<00:00, 509.00it/s]

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5068
           1       0.93      0.95      0.94       280

    accuracy                           0.99      5348
   macro avg       0.96      0.97      0.97      5348
weighted avg       0.99      0.99      0.99      5348






In [None]:
reservoir = Reservoir(100, lr=0.5, sr=0.9)
readout = Ridge(ridge=1e-7)

esn_model = reservoir >> readout

esn_model = esn_model.fit(X_train, Y_train, warmup=10)

Y_pred = esn_model.run(X_test)

In [8]:
### 48 Hours

In [9]:
df = pd.read_csv('data/clean_df.csv')
df['date'] = pd.to_datetime(df['date'])
df['time'] = (df['date'] - df['date'].min()).dt.total_seconds() / 3600

df.drop(['high_wind_event','valid_time','date'], axis=1, inplace=True)
df.drop(['ext_gust_window_12','ext_gust_window_72','ext_gust_window_168'], axis=1, inplace=True)
df = one_hot_encode(df)
df_y = df['ext_gust_window_48']
df_x = df.drop(['ext_gust_window_48'], axis=1)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.20, random_state=42)

scaler_X = StandardScaler()
X_train = scaler_X.fit_transform(X_train)
X_test = scaler_X.transform(X_test)

In [11]:
reservoir = Reservoir(units=5000, sr=0.8)  
X_train_reservoir = reservoir.run(X_train, reset=True)  

classifier = LogisticRegression()
classifier.fit(X_train_reservoir, y_train)

X_test_reservoir = reservoir.run(X_test, reset=True)
y_pred_probs = classifier.predict_proba(X_test_reservoir)[:, 1] 
y_pred = (y_pred_probs > 0.5).astype(int) 

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(creport(y_test, y_pred))

Running Reservoir-1: 100%|██████████████████████████████████████████████████████████| 21390/21390 [00:41<00:00, 512.70it/s]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Running Reservoir-1: 100%|████████████████████████████████████████████████████████████| 5348/5348 [00:10<00:00, 511.15it/s]


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5024
           1       0.96      0.94      0.95       324

    accuracy                           0.99      5348
   macro avg       0.98      0.97      0.97      5348
weighted avg       0.99      0.99      0.99      5348



In [12]:
### 72 Hours

In [13]:
df = pd.read_csv('data/clean_df.csv')
df['date'] = pd.to_datetime(df['date'])
df['time'] = (df['date'] - df['date'].min()).dt.total_seconds() / 3600

df.drop(['high_wind_event','valid_time','date'], axis=1, inplace=True)
df.drop(['ext_gust_window_12','ext_gust_window_48','ext_gust_window_168'], axis=1, inplace=True)
df = one_hot_encode(df)
df_y = df['ext_gust_window_72']
df_x = df.drop(['ext_gust_window_72'], axis=1)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.20, random_state=42)

scaler_X = StandardScaler()
X_train = scaler_X.fit_transform(X_train)
X_test = scaler_X.transform(X_test)

In [15]:
reservoir = Reservoir(units=5000, sr=0.8)  
X_train_reservoir = reservoir.run(X_train, reset=True)  

classifier = LogisticRegression()
classifier.fit(X_train_reservoir, y_train)

X_test_reservoir = reservoir.run(X_test, reset=True)
y_pred_probs = classifier.predict_proba(X_test_reservoir)[:, 1] 
y_pred = (y_pred_probs > 0.5).astype(int) 

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(creport(y_test, y_pred))

Running Reservoir-2:  54%|███████████████████████████████                           | 11468/21390 [00:22<00:19, 519.03it/s]


KeyboardInterrupt: 

In [None]:
### 168 Hours

In [None]:
df = pd.read_csv('data/clean_df.csv')
df['date'] = pd.to_datetime(df['date'])
df['time'] = (df['date'] - df['date'].min()).dt.total_seconds() / 3600

df.drop(['high_wind_event','valid_time','date'], axis=1, inplace=True)
df.drop(['ext_gust_window_12','ext_gust_window_48','ext_gust_window_72'], axis=1, inplace=True)
df = one_hot_encode(df)
df_y = df['ext_gust_window_168']
df_x = df.drop(['ext_gust_window_168'], axis=1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.20, random_state=42)

scaler_X = StandardScaler()
X_train = scaler_X.fit_transform(X_train)
X_test = scaler_X.transform(X_test)

In [None]:
reservoir = Reservoir(units=5000, sr=0.8)  
X_train_reservoir = reservoir.run(X_train, reset=True)  

classifier = LogisticRegression()
classifier.fit(X_train_reservoir, y_train)

X_test_reservoir = reservoir.run(X_test, reset=True)
y_pred_probs = classifier.predict_proba(X_test_reservoir)[:, 1] 
y_pred = (y_pred_probs > 0.5).astype(int) 

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(creport(y_test, y_pred))

In [None]:
corr_matrix = df.corr()

In [None]:
corr_matrix

# let's try with SMOTE

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.15, random_state=42)

In [None]:
oversample = SMOTE(random_state=1234, sampling_strategy=0.5)

X_train, y_train = oversample.fit_resample(X_train, y_train) 

In [None]:
scaler_X = StandardScaler()

X_train = scaler_X.fit_transform(X_train)
X_test = scaler_X.transform(X_test)

reservoir = Reservoir(units=5000, sr=0.8)  
X_train_reservoir = reservoir.run(X_train)  

classifier = LogisticRegression()
classifier.fit(X_train_reservoir, y_train)

X_test_reservoir = reservoir.run(X_test)
y_pred_probs = classifier.predict_proba(X_test_reservoir)[:, 1] 
y_pred = (y_pred_probs > 0.5).astype(int) 

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(creport(y_test, y_pred))

In [None]:
from sklearn.metrics import confusion_matrix as cmatrix
print(cmatrix(y_test, y_pred))

In [None]:
with open("column_names.txt", "w") as f:
    for col in df_x.columns:
        f.write(f"{col}\n")