In [43]:
%load_ext autoreload
%autoreload 2
import numpy as np 
from logistic_regression import CustomeLogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import torch
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import PredefinedSplit
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import load_breast_cancer
#neural network
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
torch.manual_seed(42)
np.random.seed(42)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [44]:
class Linear_SVM_Regularize:
    def __init__(self, C= 1, learning_rate = 0.001, epochs = 200, batch_size = 64, lmda = 0.1):
        self.C = C
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.batch_size = batch_size
        self.lmda = lmda
    
    def fit(self, X, y):
        X = torch.tensor(X, dtype= torch.float32)
        y = torch.tensor(y, dtype = torch.float32)

        num_feature = X.shape[1]
        self.W = torch.randn(num_feature, 1, requires_grad= True, dtype = torch.float32)
        self.b = torch.randn(1, requires_grad= True, dtype = torch.float32)

        optimizer = torch.optim.SGD([self.W, self.b], lr = self.learning_rate)

        for epoch in range(self.epochs):
            dataloader = DataLoader(TensorDataset(X,y), batch_size= self.batch_size, shuffle= True)
            for batch_X, batch_y in dataloader:
                # torch.clamp() choose each element to be larger than 0, otherwise 0 
                hinge_loss = torch.mean(torch.clamp(1- batch_y * (batch_X @ self.W + self.b), min=0))
                reg_loss = 0.5 * self.lmda * torch.sum(self.W ** 2)/self.C #regularization loss
                loss = hinge_loss + reg_loss

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
    
    def predict(self, X):
        X = torch.tensor(X, dtype = torch.float32)
        return ((torch.sign(X @ self.W  +self.b)+1)/2).detach().numpy()
    
    def score(self, X , y):
        #torch.tensor(y, dtype= torch.float32) makes y into a tensor with size y.shape which is (750, )
        y = torch.tensor(y, dtype= torch.float32).unsqueeze(1)
        y_pred = torch.tensor(self.predict(X), dtype= torch.float32)
        return torch.mean((y_pred == y).float())


In [45]:
class Linear_SVM_L1(Linear_SVM_Regularize):
    def fit(self, X, y):
        X = torch.tensor(X, dtype= torch.float32)
        y = torch.tensor(y, dtype = torch.float32)

        num_feature = X.shape[1]
        self.W = torch.randn(num_feature, 1, requires_grad= True, dtype = torch.float32)
        self.b = torch.randn(1, requires_grad= True, dtype = torch.float32)

        optimizer = torch.optim.SGD([self.W, self.b], lr = self.learning_rate)

        for epoch in range(self.epochs):
            dataloader = DataLoader(TensorDataset(X,y), batch_size= self.batch_size, shuffle= True)
            for batch_X, batch_y in dataloader:
                # torch.clamp() choose each element to be larger than 0, otherwise 0 
                hinge_loss = torch.mean(torch.clamp(1- batch_y * (batch_X @ self.W + self.b), min=0))
                reg_loss = 0.5 * self.lmda * torch.sum(torch.abs(self.W)) #regularization loss
                loss = hinge_loss + reg_loss

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
    


In [46]:
class KernelSVM(Linear_SVM_Regularize):
    def __init__(self, kernel = 'linear', degree = 3, gamma = None, coef0 = 0, **kwargs):
        super().__init__(**kwargs)
        self.kernel = kernel
        self.degree = degree
        self.gamma = gamma
        self.coef0 = coef0
    
    def compute_kernel(self, X1, X2):
        if self.kernel == 'linear':
            return X1 @ X2.T
        
        elif self.kernel == 'rbf':
            gamma = self.gamma or 1/X1.shape[1]
            pairwise_sq_dists = torch.sum(X1 **2, dim=1, keepdim= True) - 2*X1@X2.T + torch.sum(X2 ** 2, dim = 1)
            return torch.exp(-gamma * pairwise_sq_dists)
        
        elif self.kernel == 'polynomial':
            return (X1@X2.T * self.gamma + self.coef0) ** self.degree

    def fit(self, X, y):
        self.X_train = torch.tensor(X, dtype= torch.float32)
        self.y_train = torch.tensor(y, dtype = torch.float32)
        K = self.compute_kernel(self.X_train, self.X_train)
        super().fit(K, y)
    
    def predict(self, X):
        X = torch.tensor(X, dtype = torch.float32)
        K = self.compute_kernel(X, self.X_train)
        return super().predict(K)

In [47]:
class KernelLogisticRegression:
    '''
    The KernelLogisticRegression class extends your original CustomeLogisticRegression 
    by introducing kernel functions for transforming the input data into a higher-dimensional space.
    '''
    def __init__(self, learning_rate, C= 1, kernel = 'linear', degree = 3, gamma = None, coef0 = 0 ,**kwargs):
        '''
        The class takes additional arguments like kernel, degree, gamma, and coef0, 
        which determine the type of kernel function to be used and its parameters.
        '''
        self.kernel = kernel
        self.degree = degree
        self.gamma = gamma
        self.coef0 = coef0
        self.learning_rate = learning_rate
        self.C = C
    
    def compute_kernel(self, X1, X2):
        if self.kernel == 'linear':
            return X1 @ X2.T
        elif self.kernel == 'rbf':
            gamma = self.gamma or 1/ X1.shape[1]
            pairwise_sq_dists = torch.sum(X1**2, dim = 1, keepdim = True) - 2*X1 @X2.T + torch.sum(X2**2, dim = 1)
            return torch.exp(-gamma*pairwise_sq_dists)
        
        elif self.kernel == 'polynomial':
            return (X1@X2.T *self.gamma + self.coef0) ** self.degree
        
    def fit(self, X, y):
        self.X_train = torch.tensor(X, dtype = torch.float32)
        self.y_train = torch.tensor(y, dtype = torch.float32)
        K = self.compute_kernel(self.X_train, self.X_train)
        scaler = StandardScaler().fit(K)
        #scaler = MinMaxScaler().fit(K)
        K_scaled = scaler.transform(K)
        self.scaler = scaler

        self.model = LogisticRegression(C = self.C, solver = 'lbfgs', max_iter= 1000)
        self.model.fit(K_scaled, y)
        #self.model.fit(K, y)

    def predict(self, X):
        X = torch.tensor(X, dtype= torch.float32)
        K = self.compute_kernel(X, self.X_train).numpy()
        K_scaled = self.scaler.transform(K)
        return self.model.predict(K_scaled)
        #return self.model.predict(K)
    
    def score(self, X, y):
        y_pred = self.predict(X)
        return np.mean(y_pred == y)
        

In [48]:
def best_linear(x_train, y_train, x_val, y_val, x_test, y_test, reg = 'L2'):
    best_lr = None
    best_epochs = None
    best_lambda = None
    best_batch = None
    best_model = None
    lrs = np.linspace(0.001, 0.041, 2)
    epochs_list = np.linspace(50, 250, 2)
    lambdas = np.linspace(0.01, 0.21, 2)
    batch_sizes = [32, 64]
    best_score = -1
    Cs = [1,2,3,5, 10, 20]
    for lr in lrs:
        for epo in epochs_list:
            for ld in lambdas:
                for batch in batch_sizes:
                    for c in Cs:
                        if reg == 'L2':
                            linear_svm_model = Linear_SVM_Regularize(C = c, learning_rate= lr,epochs= int(epo),batch_size= batch, lmda =  ld)
                        elif reg == 'L1':
                            linear_svm_model = Linear_SVM_L1(C = c, learning_rate= lr,epochs= int(epo),batch_size= batch, lmda =  ld)
                        linear_svm_model.fit(x_train, y_train)
                        score = linear_svm_model.score(x_val, y_val).item()
                        if score > best_score:
                            best_score = score
                            best_lr = lr
                            best_epochs = epo
                            best_lambda = ld
                            best_batch = batch
                            best_model = linear_svm_model
                        if best_score == 1:
                            break
    #score on test 
    print(f'highest test score is {best_model.score(x_test, y_test).item()} with {best_lr} lr, {best_lambda} lambda, {best_batch} best size, and {best_epochs} epochs')
    return best_model.score(x_test, y_test).item(), best_model

In [49]:
def best_svm_kernel(x_train, y_train, x_val, y_val, x_test, y_test, kernel, model):
    best_score = -1
    best_model = None

    lr = 0.0001
    epo = 70
    lambdas = np.linspace(0.01, 0.21, 4)
    batch  = 64
    gammas =  [0.01, 0.1, 1, 10]
    coefs = [1, 2, 3, 5, 7, 9]
    Cs = [1,2,3,5, 10, 20, 50, 70, 100]
    for ld in lambdas:
        if kernel == 'polynomial':
            for gamma in gammas:
                for coef in coefs:
                    for degree in [2,3,4]:
                        if model == 'KernelSVM':
                            kernelsvm_model = KernelSVM(learning_rate = lr, epochs = epo, batch_size = batch, lmda = ld, gamma= gamma, degree= degree, coef0= coef, kernel= kernel)
                        else:
                            kernelsvm_model = KernelLogisticRegression(learning_rate = lr, epochs = epo, batch_size = batch, lmda = ld, gamma= gamma, degree= degree, coef0= coef, kernel= kernel)
                        kernelsvm_model.fit(x_train, y_train)
                        score_kern = kernelsvm_model.score(x_val, y_val).item()
                        if score_kern > best_score:
                            best_score = score_kern
                            best_model = kernelsvm_model
        elif kernel =='rbf':
            for gamma in [0.0001, 0.001,0.005, 0.01, 0.15, 0.1, 1, 10]:
                for c in Cs:
                    if model == 'KernelSVM':
                        kernelsvm_model = KernelSVM(C = c, learning_rate = lr, epochs = epo, batch_size = batch, lmda = ld, gamma= gamma, kernel= kernel)
                    else:
                        kernelsvm_model = KernelLogisticRegression(C = c, learning_rate = lr, epochs = epo, batch_size = batch, lmda = ld, gamma= gamma, kernel= kernel)
                    kernelsvm_model.fit(x_train, y_train)
                    score_kern = kernelsvm_model.score(x_val, y_val).item()
                    if score_kern > best_score:
                        best_score = score_kern
                        best_model = kernelsvm_model
        elif kernel == 'linear':
            if model == 'KernelSVM':
                kernelsvm_model = KernelSVM(learning_rate = lr, epochs = epo, batch_size = batch, lmda = ld, kernel= kernel)
            else:
                kernelsvm_model = KernelLogisticRegression(learning_rate = lr, epochs = epo, batch_size = batch, lmda = ld, kernel= kernel)
            kernelsvm_model.fit(x_train, y_train)
            score_kern = kernelsvm_model.score(x_val, y_val).item()
            if score_kern > best_score:
                best_score = score_kern
                best_model = kernelsvm_model
            
        print(f"Best score {best_model.score(x_test, y_test)}")
    return best_model.score(x_test, y_test).item(), best_model

In [50]:
def logi_reg(x_train, y_train, x_val, y_val, x_test, y_test):
    best_epoch_lr = None
    best_lr_lr = None
    best_model = None
    best_accuracy = -1
    epochs = [100, 300, 600, 800]
    learning_rates = np.linspace(0.001, 0.1, 10)
    for epoch in epochs:
        for lr in learning_rates:
            lr_model = CustomeLogisticRegression(learning_rate = lr)
            lr_model.fit(x_train, y_train, epoch = epoch)
            accuracy = lr_model.accuracy(y_val, x_val)
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_epoch_lr = epoch
                best_lr_lr = lr
                best_model = lr_model
    print(f'{best_model.accuracy(y_test, x_test)} with learning rate {best_lr_lr} and {best_epoch_lr} epochs')
    return best_accuracy, best_model

2.3 Real data

In [51]:
scaler = StandardScaler()
data = load_breast_cancer()
X = data.data
y = data.target

if np.any(np.isnan(X)):
    print("There are missing values in the dataset.")

if np.any(np.isinf(X)):
    print("There are infinity values in the dataset.")
    
X = scaler.fit_transform(X)
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size= 0.2, random_state= 42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size= 0.25, random_state= 42)


linear svm and logistic regression

In [64]:
acc_linear, best_linear_svm = best_linear(X_train, y_train, X_val, y_val, X_test, y_test)
feature_importance = np.abs(best_linear_svm.W.detach().numpy().flatten())
top_important_features = np.argsort(feature_importance)[-10:]
print(f"L2 Regularized SVM Test Accuracy: {acc_linear:.4f}")
print(feature_importance)

highest test score is 0.8584070801734924 with 0.001 lr, 0.01 lambda, 64 best size, and 50.0 epochs
L2 Regularized SVM Test Accuracy: 0.8584
[0.8681871  1.5263065  2.3645935  0.96321374 0.91663784 2.0173714
 0.5558926  0.88757825 0.3621307  0.48202047 0.91064143 0.824216
 0.21472584 1.2552162  1.5512909  0.53053534 0.7989287  1.8691585
 0.03021394 1.4671551  0.62459046 0.45845327 1.4710674  1.006325
 0.02441451 0.3871482  0.01346058 1.4597906  1.5284353  0.01301512]


In [52]:
acc_linear_L1, best_linear_L1 = best_linear(X_train, y_train, X_val, y_val, X_test, y_test, reg = "L1")
feature_importance2 = np.abs(best_linear_L1.W.detach().numpy().flatten())
top_important_features2 = np.argsort(feature_importance2)[-10:]
print(f"L1 Regularized SVM Test Accuracy: {acc_linear_L1:.4f}")
print(feature_importance2)

highest test score is 0.8771929740905762 with 0.001 lr, 0.21 lambda, 64 best size, and 250.0 epochs
L1 Regularized SVM Test Accuracy: 0.8772
[1.0435293  1.2397331  0.65221024 0.06650675 1.613175   0.45472473
 0.72214925 0.02559206 0.09438893 1.257321   0.12434014 0.4478095
 0.4296006  1.6613344  0.39254898 0.6565991  1.0370171  0.52156913
 0.16434248 0.08603621 1.0617669  0.01676443 0.42827913 0.49123383
 1.3109468  0.19145648 1.1907551  0.12759489 0.1954511  0.6430857 ]


In [53]:
acc_logis, best_logis = logi_reg(X_train, y_train, X_val, y_val, X_test, y_test)
print(f"L1 Regularized SVM Test Accuracy: {acc_logis:.4f}")

0.5964912280701754 with learning rate 0.001 and 100 epochs
L1 Regularized SVM Test Accuracy: 0.5965


In [41]:
# Train a Kernel SVM with the selected features (RBF)
accuracy_rbf_svm, rbf_svm_model = best_svm_kernel(X_train, y_train, X_val, y_val, X_test, y_test, kernel = 'rbf', model = 'KernelSVM')
accuracy_rbf_svm

  X = torch.tensor(X, dtype= torch.float32)
  X = torch.tensor(X, dtype = torch.float32)


Best score 0.8157894611358643
Best score 0.8157894611358643
Best score 0.859649121761322
Best score 0.859649121761322


0.859649121761322

In [42]:
# Train a Kernel SVM with the selected features (Polynomial)
accuracy_poly_svm, rbf_poly_model = best_svm_kernel(X_train, y_train, X_val, y_val, X_test, y_test, kernel = 'polynomial', model = 'KernelSVM')
accuracy_poly_svm

  X = torch.tensor(X, dtype= torch.float32)
  X = torch.tensor(X, dtype = torch.float32)


Best score 0.8508771657943726
Best score 0.8508771657943726
Best score 0.8508771657943726
Best score 0.8859649300575256


0.8859649300575256

Selecting top features

In [65]:
top_important_features

array([13, 27, 19, 22,  1, 28, 14, 17,  5,  2], dtype=int64)

In [54]:
top_important_features2

array([ 6, 16,  0, 20, 26,  1,  9, 24,  4, 13], dtype=int64)

In [71]:
# Select the top features
X_train_selected = X_train[:, top_important_features2]
X_val_selected = X_val[:, top_important_features2]
X_test_selected = X_test[:, top_important_features2]

In [74]:
# Train a SVM with the selected features
accuracy_rbf_svm, rbf_svm_model = best_linear(X_train_selected, y_train, X_val_selected, y_val, X_test_selected, y_test, reg= 'L1')
accuracy_rbf_svm

highest test score is 0.7168141603469849 with 0.001 lr, 0.01 lambda, 64 best size, and 50.0 epochs


0.7168141603469849

In [56]:
# Train a Kernel SVM with the selected features (RBF)
accuracy_rbf_svm, rbf_svm_model = best_svm_kernel(X_train_selected, y_train, X_val_selected, y_val, X_test_selected, y_test, kernel = 'rbf', model = 'KernelSVM')
accuracy_rbf_svm

  X = torch.tensor(X, dtype= torch.float32)
  X = torch.tensor(X, dtype = torch.float32)


Best score 0.8070175647735596
Best score 0.8070175647735596
Best score 0.8684210777282715
Best score 0.8684210777282715


0.8684210777282715

In [57]:
# Train a Kernel SVM with the selected features (Polynomial)
accuracy_poly_svm, rbf_poly_model = best_svm_kernel(X_train_selected, y_train, X_val_selected, y_val, X_test_selected, y_test, kernel = 'polynomial', model = 'KernelSVM')
accuracy_poly_svm

  X = torch.tensor(X, dtype= torch.float32)
  X = torch.tensor(X, dtype = torch.float32)


Best score 0.6666666865348816
Best score 0.6666666865348816
Best score 0.8245614171028137
Best score 0.8245614171028137


0.8245614171028137

logistic regression with kernel svm

In [58]:
accuracy_rbf_logis_svm, rbf_logis_svm_model = best_svm_kernel(X_train, y_train, X_val, y_val, X_test, y_test, 'rbf', model = 'KernelLogisticRegression')
accuracy_rbf_logis_svm

Best score 0.9824561403508771
Best score 0.9824561403508771
Best score 0.9824561403508771
Best score 0.9824561403508771


0.9824561403508771

In [59]:
accuracy_poly_logis_svm, poly_logis_svm_model = best_svm_kernel(X_train, y_train, X_val, y_val, X_test, y_test, 'polynomial', model = 'KernelLogisticRegression')
accuracy_poly_logis_svm

Best score 0.956140350877193
Best score 0.956140350877193
Best score 0.956140350877193
Best score 0.956140350877193


0.956140350877193

knn vs neural network

In [60]:
#Knn Classifiers
def knn(X_combined, y_combined, x_test, y_test, split):
    knn = KNeighborsClassifier()
    knn_grid = {'n_neighbors': np.arange(1, 30)}
    knn_grid_search = GridSearchCV(knn, knn_grid, cv = split)
    knn_grid_search.fit(X_combined, y_combined)

    best_k = knn_grid_search.best_estimator_
    knn_accuracy = best_k.score(x_test, y_test)
    print(knn_accuracy)
    return knn_accuracy, best_k

In [61]:
#knn
X_combined = np.vstack((X_train, X_val))
y_combined = np.hstack((y_train, y_val))
# Create predefinded 
test_fold = np.concatenate([
    np.full(X_train.shape[0], -1),
    np.zeros(X_val.shape[0])
])
predefined_split = PredefinedSplit(test_fold)

knn_accuracy, best_knn_model= knn(X_combined, y_combined, X_test, y_test, predefined_split)
knn_accuracy

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


0.9473684210526315


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


0.9473684210526315

In [62]:
class CircleData(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype = torch.float32)
        self.y = torch.tensor(y, dtype= torch.long)

    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, index):
        return self.X[index], self.y[index]
        
class Net(nn.Module):
    def __init__(self, input_size, hidden_size1, hidden_size2, num_classes, dropout_rate):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.fc3 = nn.Linear(hidden_size2, num_classes)
        self.dropout = nn.Dropout(dropout_rate)
    
        # Initialize
        nn.init.uniform_(self.fc1.weight,-1, 1)
        nn.init.uniform_(self.fc2.weight,-1, 1)
        nn.init.uniform_(self.fc3.weight,-1,1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = torch.relu(self.fc2(x))
        x = self.dropout(x)
        x = F.softmax(self.fc3(x), dim = 1)
        return x

In [63]:
###
batch_size = 128
num_epoch = 200
learning_rate = 0.015
weight_decay = 0.0005
dropout_rate = 0.3
hidden_size1 = 600
hidden_size2= 400
early_stopping_patience = 10
input_size = X_train_val.shape[1]
num_classes = 2

skf = StratifiedKFold(n_splits= 5)

for train_index, test_index in skf.split(X,y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    train_data = CircleData(X_train, y_train)
    test_data = CircleData(X_test, y_test)

    #Create data loaders
    train_loader = DataLoader(dataset= train_data, batch_size= batch_size, shuffle= True)
    test_loader = DataLoader(dataset= test_data, batch_size = batch_size, shuffle= False)
    
    model = Net(input_size= input_size, hidden_size1= hidden_size1, hidden_size2= hidden_size2, num_classes= num_classes, dropout_rate= dropout_rate)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr= learning_rate, weight_decay= weight_decay)

    best_test_loss = float('inf')
    early_stopping_counter = 0

    #train model 
for epoch in range(num_epoch):
    model.train()
    for i, (features, labels) in enumerate(train_loader):
        #flatten
        features = features.reshape(-1, input_size)
        #forward pass
        outputs = model(features)
        loss = criterion(outputs, labels)

        #backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    model.eval()
    test_loss = 0
    correct = 0

    with torch.no_grad():
        for features, labels in test_loader:
            features = features.reshape(-1, input_size)
            outputs = model(features)
            loss = criterion(outputs, labels)

            test_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            correct += (predicted == labels).sum().item()
        
        accuracy = correct / len(test_data)
        test_loss /= len(test_loader)
    print(f"Epoch {epoch + 1}/{num_epoch}, Accuracy: {accuracy:.3f}")

    #Early stopping 
    if test_loss < best_test_loss:
        best_test_loss = test_loss
        early_stopping_counter = 0
    else:
        early_stopping_counter += 1
        if early_stopping_counter >= early_stopping_patience:
            print(f'Early stopping at epoch {epoch + 1}')
            break
print(f'Fold completed with the test accuracy {accuracy}.')


Epoch 1/200, Accuracy: 0.876
Epoch 2/200, Accuracy: 0.850
Epoch 3/200, Accuracy: 0.841
Epoch 4/200, Accuracy: 0.876
Epoch 5/200, Accuracy: 0.920
Epoch 6/200, Accuracy: 0.929
Epoch 7/200, Accuracy: 0.956
Epoch 8/200, Accuracy: 0.965
Epoch 9/200, Accuracy: 0.965
Epoch 10/200, Accuracy: 0.965
Epoch 11/200, Accuracy: 0.965
Epoch 12/200, Accuracy: 0.965
Epoch 13/200, Accuracy: 0.965
Epoch 14/200, Accuracy: 0.947
Epoch 15/200, Accuracy: 0.938
Epoch 16/200, Accuracy: 0.912
Epoch 17/200, Accuracy: 0.956
Epoch 18/200, Accuracy: 0.956
Early stopping at epoch 18
Fold completed with the test accuracy 0.9557522123893806.
