In [4]:
import os
import math
import torch
import torch.nn as nn
import numpy as np
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import time
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
import math

In [5]:
# Custom Dataset
from sklearn.discriminant_analysis import StandardScaler


class CustomDataset(Dataset):
    def __init__(self, csv_path="data/dataset_train_2024.csv"):
        # Load data from CSV
        data = pd.read_csv(csv_path)
        
        # Extract features
        self.sequences_1 = data.iloc[:, 1:129].values * 100  # Columns 1-128 (1-based indexing)
        self.sequences_2 = data.iloc[:, 129:257].values * 100  # Columns 129-256
        self.extra_feature = data.iloc[:, 257].values.reshape(-1, 1)  # Column 257

        # Combine features
        all_features = np.hstack([self.sequences_1, self.sequences_2, self.extra_feature])
        
        # Normalize features
        self.scaler = StandardScaler()
        self.normalized_features = self.scaler.fit_transform(all_features)
        self.features = torch.tensor(self.normalized_features, dtype=torch.float32)


        # Encode labels
        self.label_encoder = LabelEncoder()
        self.labels = torch.tensor(self.label_encoder.fit_transform(data.iloc[:, -1]), dtype=torch.long)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]
    
    def inverseTransform(self, array):
        return self.label_encoder.inverse_transform(array)

In [None]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np

# Load the dataset
dataset = CustomDataset(csv_path="data/dataset_train_2024.csv")
features = dataset.features.numpy()  # Convert features to NumPy array
labels = dataset.labels.numpy()  # Convert labels to NumPy array

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42, stratify=labels
)

# Scale the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define SVM model and hyperparameter grid
svm = SVC()

param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': [2, 3, 4],  # Only relevant for 'poly' kernel
    'gamma': ['scale', 'auto'],  # RBF/Sigmoid-specific
}

# Perform Grid Search
grid_search = GridSearchCV(svm, param_grid, scoring='accuracy', cv=5, verbose=2)
grid_search.fit(X_train, y_train)

# Best parameters and accuracy
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.4f}")

# Evaluate on the test set
best_svm = grid_search.best_estimator_
y_pred = best_svm.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")


Fitting 5 folds for each of 96 candidates, totalling 480 fits
[CV] END ........C=0.1, degree=2, gamma=scale, kernel=linear; total time=  19.4s
[CV] END ........C=0.1, degree=2, gamma=scale, kernel=linear; total time=  19.2s
[CV] END ........C=0.1, degree=2, gamma=scale, kernel=linear; total time=  18.7s
[CV] END ........C=0.1, degree=2, gamma=scale, kernel=linear; total time=  21.2s
[CV] END ........C=0.1, degree=2, gamma=scale, kernel=linear; total time=  16.9s
[CV] END ..........C=0.1, degree=2, gamma=scale, kernel=poly; total time=   9.2s
[CV] END ..........C=0.1, degree=2, gamma=scale, kernel=poly; total time=   9.3s
[CV] END ..........C=0.1, degree=2, gamma=scale, kernel=poly; total time=  10.0s
[CV] END ..........C=0.1, degree=2, gamma=scale, kernel=poly; total time=   8.8s
[CV] END ..........C=0.1, degree=2, gamma=scale, kernel=poly; total time=   9.0s
[CV] END ...........C=0.1, degree=2, gamma=scale, kernel=rbf; total time=  12.8s
[CV] END ...........C=0.1, degree=2, gamma=scal