In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
sys.path.append('../definitions')
import def_model as mod_def
import definitions_EDA as eda
from sklearn.multioutput import MultiOutputRegressor
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV, TimeSeriesSplit,RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import roc_curve, auc
from scipy.fft import fft, rfftfreq
import joblib
import seaborn as sns
from sklearn.metrics import confusion_matrix
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn_genetic import GASearchCV
from sklearn_genetic.space import Continuous, Categorical


  from .autonotebook import tqdm as notebook_tqdm


Select dates, lag settings and hyperparameter search space

In [2]:
# Observation window handed to the data loader.
start_date, end_date = (pd.Timestamp(day) for day in ('2024-03-20', '2024-07-10'))

# Settings passed to mod_def.create_lagged_features.
n_lags = 5
# 900 rows, intended as a 3-hour horizon at one record every 12 seconds.
# NOTE(review): the data-loading cell runs the frame through
# eda.calculate_mean_of_five_in_chunks before lagging. If that helper averages
# five consecutive 12 s records, one row spans ~60 s and 900 rows would be
# ~15 h rather than 3 h -- confirm against the helper.
shift_steps = 900

# Deliberately small search space to keep the hyperparameter search cheap.
param_grid = dict(
    C=Continuous(1e-2, 10, distribution="log-uniform"),  # narrowed range for C
    kernel=Categorical(['linear', 'rbf']),  # only two kernel types to speed up the search
    gamma=Continuous(1e-4, 0.1, distribution="log-uniform"),  # reduced range for gamma
)

Import data

In [3]:
# Load the combined measurements for the configured date window.
df = mod_def.combine_resampled_data(start_date, end_date)
# Optional: restrict to a column subset while experimenting.
# df = df.iloc[:, 5:7]

# Downsample with the project's EDA helper. `df` is rebound to the result, so
# every later cell works on the downsampled frame.
# (presumably 1000 is the chunk size and 5 the averaging window -- confirm in definitions_EDA)
df_resampled = eda.calculate_mean_of_five_in_chunks(df, 1000, 5)
df = df_resampled

print("This is the head of the df: \n", df.head())
print("This is the shape:", df.shape)

# Class balance of the target column: count of 1-flags and their ratio to 0-flags (in percent).
flag_counts = df['flag'].value_counts()
n_flagged = flag_counts[1.0]
n_unflagged = flag_counts[0.0]
proportion_flag_1_to_0 = (n_flagged / n_unflagged) * 100
print(f"Flag = 1 counts: {n_flagged}")
print(f"Proportion of flag = 1 to flag = 0: {proportion_flag_1_to_0} %")

This is the head of the df: 
     NS_SQUID   Z_SQUID   NS_Fluxgate  EW_Fluxgate    Z_Fluxgate   H Component  \
0 -17.136570  0.276270  10934.364450   -42.370033 -22656.488219  10934.446542   
1 -17.113152  0.350281  10934.410213   -42.314737 -22656.308095  10934.492090   
2 -17.235085  0.185658  10934.400975   -42.445483 -22656.408618  10934.483360   
3 -17.030711 -0.242669  10934.482125   -42.588208 -22656.805620  10934.565063   
4 -16.934697 -0.295667  10934.616765   -42.609385 -22656.955439  10934.699784   

   flag  
0   0.0  
1   0.0  
2   0.0  
3   0.0  
4   0.0  
This is the shape: (163594, 7)
Flag = 1 counts: 8590
Proportion of flag = 1 to flag = 0: 5.542328810431709 %


Create synthetic example data

In [4]:
# Synthetic multivariate time series (3 variables) with a highly imbalanced
# binary label in the last column. For demonstration only; none of the cells
# visible in this notebook read `data` afterwards.
np.random.seed(42)
n_samples = 2000  # 2000 samples, about 6.6 hours at one record every 12 seconds
n_features = 3
data = np.random.rand(n_samples, n_features)

# Make the last column a real 0/1 label: 90% zeros followed by 10% ones.
# Without zeroing first, the leading 90% would keep uniform random floats
# instead of the 0 class the imbalance is supposed to simulate.
n_positive = n_samples // 10
data[:, -1] = 0
data[-n_positive:, -1] = 1

In [5]:
# print(df.iloc[:, 2:7])
# Build the lagged design matrix and the shifted target from the downsampled frame.
X, Y = mod_def.create_lagged_features(df, n_lags=n_lags, shift_steps=shift_steps)
print("This is the shape of X", X.shape)
print("This is the shape of Y", Y.shape)

# Chronological hold-out: the first 70% of rows train the model, the final 30%
# test it. No shuffling, so the temporal order is preserved.
test_size = 0.3
split_idx = int((1 - test_size) * len(X))
train_rows, test_rows = slice(None, split_idx), slice(split_idx, None)
X_train, Y_train = X[train_rows], Y[train_rows]
X_test, Y_test = X[test_rows], Y[test_rows]

# Fit the scaler on the training rows only, then apply the same statistics to
# both partitions so nothing from the test period leaks into training.
scaler_X = StandardScaler().fit(X_train)
X_train_scaled = scaler_X.transform(X_train)
X_test_scaled = scaler_X.transform(X_test)

This is the shape of X (162689, 41)
This is the shape of Y (162689,)


In [None]:
# Template estimator for the search. class_weight='balanced' re-weights the
# classes to counter the heavy 0/1 imbalance of the flag column.
svc = SVC(class_weight='balanced')

# Time series cross-validation; only 2 splits to reduce computation.
tscv = TimeSeriesSplit(n_splits=2)

# Genetic-algorithm search over `param_grid`.
# NOTE(review): scoring="accuracy" is dominated by the majority class on data
# with ~5% positives; consider "f1" or "balanced_accuracy" -- left unchanged
# here so existing results stay comparable.
evolved_svc = GASearchCV(
    estimator=svc,
    param_grid=param_grid,
    scoring="accuracy",
    population_size=20,  # Reduce population size
    generations=10,  # Reduce number of generations
    n_jobs=-1,  # Use all cores
    cv=tscv,  # Time series cross-validation
    verbose=True,
)

# Run the search on the scaled training partition.
evolved_svc.fit(X_train_scaled, Y_train)

# Best parameters from genetic algorithm
print("Best Parameters found by GA:", evolved_svc.best_params_)

# Predict the held-out partition with the best found parameters.
Y_pred = evolved_svc.predict(X_test_scaled)

# Classification report
print("Classification Report with Best Parameters:")
print(classification_report(Y_test, Y_pred))

# Plot true vs predicted by position. Converting to plain arrays drops any
# pandas index carried by the Y_test slice, which would otherwise shift the
# "True" curve along the x-axis relative to the 0-based predictions.
plt.plot(np.asarray(Y_test), label="True")
plt.plot(np.asarray(Y_pred), label="Predicted", linestyle='--')
plt.title("True vs Predicted Binary Labels (3-Hour Forecast)")
plt.xlabel("Test sample")
plt.ylabel("flag")
plt.legend()
plt.show()


Confusion Matrix

In [None]:
# Confusion matrix of the tuned model on the held-out partition.
cm_c = confusion_matrix(Y_test, Y_pred)
# Report the C chosen by the search. `svc` is only the template passed to
# GASearchCV, so its C is not the tuned value.
best_C = evolved_svc.best_params_["C"]
sns.heatmap(cm_c, annot=True, fmt='d').set_title(f"Confusion matrix of SVM with a C value of {best_C:.4g}")

ROC Plot


In [None]:
# Compute the ROC curve and its area for the positive class.
# roc_curve needs continuous scores to sweep the decision threshold; hard 0/1
# predictions collapse the curve to a single operating point. Use the tuned
# SVC's decision-function values instead.
Y_score = evolved_svc.decision_function(X_test_scaled)
fpr, tpr, _ = roc_curve(Y_test, Y_score)
roc_auc = auc(fpr, tpr)

# Plot ROC curve against the chance diagonal.
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

Write results to file

In [9]:
# Append the tuned hyperparameters and the classification report to the results file.
report = classification_report(Y_test, Y_pred)
# Read the values from the search result. `svc` is only the template estimator
# handed to GASearchCV, so its C/kernel are not the tuned ones.
best_params = evolved_svc.best_params_
# NOTE(review): absolute, machine-specific path -- consider a configurable results directory.
with open("/Users/tristan/Library/CloudStorage/OneDrive-StellenboschUniversity/Academics/Final_year/Semester_2/Skripsie/Report/images/results/reports/classification_report.txt", "a") as file:
    file.write(f"\nSVC Parameters:\nC: {best_params['C']}\nKernel: {best_params['kernel']}\n")
    file.write(f"Classification Report:\n{report}\n")

In [11]:
# # Check if the DataFrame has a time index
# if not isinstance(df.index, pd.DatetimeIndex):
#     # Create a time index based on the sampling rate
#     time_index = pd.date_range(start=0, periods=len(df), freq=f'{1/(1/12)}s')
#     df.index = time_index



# Perform the Fourier transform on each column
# fft_results = {}
# for column in df.columns[0:6]:
#     print(column)
#     ser = df[column].squeeze()
#     print(ser)
#     fft_data = fft(ser)
#     # fft_results[column] = fft_data
# print("GOt to this point!")
# # Create a DataFrame from the Fourier transform results
# fft_df = pd.DataFrame(fft_results)

# # Calculate the frequency axis based on the sampling rate and number of samples
# frequency_axis = np.fft.fftfreq(len(df), 1/(1/12))

# # Add the frequency axis as a new index
# fft_df.index = frequency_axis