## Introduction
In this notebook, optimal hyperparameters will be selected and the performance of both models will be evaluated.

### Imports
The analysis commences with the necessary imports.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
from pathlib import Path

# Locate the project root: the nearest directory, starting at the current
# working directory and walking upward, that contains a "src" folder.
start_dir = Path.cwd()
project_root = next(
    (candidate for candidate in (start_dir, *start_dir.parents)
     if (candidate / "src").exists()),
    None,
)
if project_root is None:
    # The parent of the filesystem root is the root itself, so an unbounded
    # upward walk would spin forever when no "src" directory exists. Fail
    # loudly instead of hanging the kernel.
    raise FileNotFoundError(
        f'No "src" directory found in {start_dir} or any of its parents'
    )

# Make the project's modules importable. Skip the append when the entry is
# already present so re-running this cell does not pile up duplicates.
src_dir = str(project_root / "src")
if src_dir not in sys.path:
    sys.path.append(src_dir)

from model_selection import grid_search_cv
from models import SVM, LogisticRegression
from util import calculate_accuracy, calculate_metrics

### Notebook Parameters

In [None]:
# Metric name passed as `scoring` to every grid_search_cv call in this notebook.
METRIC = 'accuracy'
# Passed as `cv` to grid_search_cv (presumably the number of cross-validation folds).
CV = 5
# Seed passed as `random_state` to grid_search_cv and to the SVM learning-curve runs.
RANDOM_STATE = 1

### Data Loading
The data will be loaded.

In [None]:
# Read the four pre-split CSV tables (paths are relative to the working directory).
X_train_df = pd.read_csv('../data/processed/X_train.csv')
y_train_df = pd.read_csv('../data/processed/y_train.csv')
X_test_df = pd.read_csv('../data/processed/X_test.csv')
y_test_df = pd.read_csv('../data/processed/y_test.csv')

# Binarise the target: quality >= 6 becomes +1, everything else -1.
y_train, y_test = (
    np.where(labels_df['quality'] >= 6, 1, -1)
    for labels_df in (y_train_df, y_test_df)
)

# Feature matrices as plain NumPy arrays.
X_train, X_test = X_train_df.to_numpy(), X_test_df.to_numpy()

# Linear Models Evaluation

## Hyperparameter Tuning

### SVM
For SVMs, two primary parameters require optimization: the number of iterations (*n_iters*) and the regularization parameter lambda (*lambda_param*). For cross-validation, the number of folds typically ranges from 5 to 10; 5 folds are used here. According to SVM theory, the higher the number of iterations, the lower the error should be.

In [None]:
# Candidate values explored by the grid search (the iteration list is reused
# later as the x-axis of the SVM learning curves).
svm_n_iters_list = [100, 200, 500, 1000, 2000, 5000]
svm_lambda_param_list = [1e-1, 1e-2, 1e-3, 1e-4]

svm_param_grid = dict(
    n_iters=svm_n_iters_list,
    lambda_param=svm_lambda_param_list,
)

svm_best_params, svm_best_metrics = grid_search_cv(
    SVM,
    svm_param_grid,
    X_train,
    y_train,
    cv=CV,
    scoring=METRIC,
    random_state=RANDOM_STATE,
)
print(f'SVM best parameter: {svm_best_params}')
print(f'SVM best metrics: {svm_best_metrics}')

# Keep the selected values for the learning-curve and evaluation cells.
svm_n_iters, svm_lambda_param = svm_best_params['n_iters'], svm_best_params['lambda_param']

### Logistic Regression
As with SVMs, the parameters include *n_iters* and *lambda_param*; however, this model additionally incorporates the learning rate parameter (*learning_rate*).

In [None]:
# Candidate values explored by the grid search (the iteration list is reused
# later as the x-axis of the logistic regression learning curves).
lr_n_iters_list = [2, 5, 10, 20, 50]
lr_lambda_param_list = [1e-1, 1e-2, 1e-3, 1e-4]
lr_learning_rate_list = [1e-1, 1e-2, 1e-3, 1e-4]

lr_param_grid = dict(
    n_iters=lr_n_iters_list,
    lambda_param=lr_lambda_param_list,
    learning_rate=lr_learning_rate_list,
)

lr_best_params, lr_best_metrics = grid_search_cv(
    LogisticRegression,
    lr_param_grid,
    X_train,
    y_train,
    cv=CV,
    scoring=METRIC,
    random_state=RANDOM_STATE,
)
print(f'Logistic Regression best parameter: {lr_best_params}')
print(f'Logistic Regression best metrics: {lr_best_metrics}')

# Keep the selected values for the learning-curve and evaluation cells.
lr_n_iters, lr_lambda_param, lr_learning_rate = (
    lr_best_params['n_iters'],
    lr_best_params['lambda_param'],
    lr_best_params['learning_rate'],
)

## Learning Curves
It is particularly valuable to analyze the learning curves of the various algorithms to observe how and when convergence occurs.

### Helper Functions

In [None]:
def plot_learning_curve(model_class, X_train, y_train, X_test, y_test, iterations_list, figname=None, **model_kwargs):
    """Plot train and test accuracy against the number of training iterations.

    For every value in ``iterations_list`` a fresh ``model_class`` instance is
    built with ``n_iters`` set to that value (plus ``model_kwargs``), fitted on
    the training data, and scored with ``calculate_accuracy`` on both splits.
    Each pair of scores is printed, then both curves are drawn on one figure.

    Args:
        model_class: Callable returning an object with ``fit(X, y)`` and
            ``predict(X)``; it must accept an ``n_iters`` keyword.
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels.
        iterations_list: Sequence of ``n_iters`` values; also used as the
            x-coordinates of the plot.
        figname: If given, the figure is also saved to
            ``../plots/<figname>.pdf`` before being shown.
        **model_kwargs: Extra keyword arguments forwarded to ``model_class``.
    """
    scores = {'train': [], 'test': []}

    for n_iter in iterations_list:
        # A new model per point: each point is a full training run, not a resume.
        fitted = model_class(n_iters=n_iter, **model_kwargs)
        fitted.fit(X_train, y_train)

        pred_on_train = fitted.predict(X_train)
        pred_on_test = fitted.predict(X_test)

        acc_train = calculate_accuracy(pred_on_train, y_train)
        acc_test = calculate_accuracy(pred_on_test, y_test)

        scores['train'].append(acc_train)
        scores['test'].append(acc_test)

        print(f"Iter {n_iter}: Train={acc_train:.3f}, Test={acc_test:.3f}")

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.plot(iterations_list, scores['train'], 'o-', label='Training', color='blue')
    ax.plot(iterations_list, scores['test'], 'o-', label='Test', color='red')

    ax.set_xlabel('Iteration')
    ax.set_ylabel('Accuracy')
    ax.set_title('Learning Curve')
    ax.legend()
    ax.grid(True, alpha=0.3)

    # Save before show(): the figure may no longer be available afterwards.
    if figname is not None:
        fig.savefig(f'../plots/{figname}.pdf')

    plt.show()

### SVM

In [None]:
# Linear SVM: one full training run per candidate iteration count, using the
# selected lambda and a fixed seed.
plot_learning_curve(
    SVM,
    X_train,
    y_train,
    X_test,
    y_test,
    svm_n_iters_list,
    figname='SVM learning curve',
    lambda_param=svm_lambda_param,
    random_state=RANDOM_STATE,
)

### Logistic Regression

In [None]:
# Linear logistic regression: one full training run per candidate iteration
# count, using the selected lambda and learning rate.
# NOTE(review): no random_state is passed here, unlike the SVM call - confirm
# LogisticRegression is deterministic or accepts a seed.
plot_learning_curve(
    LogisticRegression,
    X_train,
    y_train,
    X_test,
    y_test,
    lr_n_iters_list,
    figname='LR learning curve',
    lambda_param=lr_lambda_param,
    learning_rate=lr_learning_rate,
)

### Conclusions
It is evident that SVM requires significantly more iterations than logistic regression, which is expected given that logistic regression updates parameters for each example at every iteration. Additionally, SVM performance has improved, while logistic regression exhibits slight overfitting tendencies as iterations progress.

## Evaluation

### Helper Functions

In [None]:
# Metrics dict of every plot_metrics call, in call order. The final summary
# cell reads this list by position, so it assumes each evaluation cell ran
# exactly once after this cell (re-running this cell resets the list).
metrics_for_print = []


def plot_metrics(predictions, y_test, figname=None):
    """Show a bar chart of the scores next to a normalised confusion matrix.

    The metrics are computed with ``calculate_metrics(predictions, y_test)``
    and appended to the module-level ``metrics_for_print`` list as a side
    effect.

    Args:
        predictions: Predicted labels.
        y_test: True labels.
        figname: If given, the figure is also saved to
            ``../plots/<figname>.pdf`` before being shown.
    """
    metrics = calculate_metrics(predictions, y_test)
    metrics_for_print.append(metrics)

    fig, (bar_ax, cm_ax) = plt.subplots(1, 2, figsize=(15, 5))

    # Left panel: the four scalar scores, each bar annotated with its value.
    labelled_keys = [
        ('Accuracy', 'accuracy'),
        ('Precision', 'precision'),
        ('Recall', 'recall'),
        ('F1', 'f1'),
    ]
    names = [label for label, _ in labelled_keys]
    values = [metrics[key] for _, key in labelled_keys]

    bar_ax.bar(names, values, color=['skyblue', 'lightcoral', 'lightgreen', 'orange'])
    bar_ax.set_ylim(0, 1)
    bar_ax.set_title('Metrics')

    for position, value in enumerate(values):
        bar_ax.text(position, value + 0.02, f'{value:.3f}', ha='center')

    # Right panel: confusion matrix (rows = actual, columns = predicted),
    # expressed as a share of all samples.
    counts = np.array(
        [
            [metrics['tn'], metrics['fp']],
            [metrics['fn'], metrics['tp']],
        ],
        dtype=float,
    )
    shares = counts / counts.sum()

    class_labels = ['Bad', 'Good']
    sns.heatmap(shares, annot=True, fmt='.2%', cmap='Blues', ax=cm_ax, xticklabels=class_labels, yticklabels=class_labels)

    cm_ax.set_title('Confusion Matrix (%)')
    cm_ax.set_xlabel('Predicted')
    cm_ax.set_ylabel('Actual')

    fig.tight_layout()

    if figname is not None:
        fig.savefig(f'../plots/{figname}.pdf')

    plt.show()


### SVM

In [None]:
# Refit the linear SVM with the tuned hyperparameters and score it on the test
# split. Arguments are passed by keyword so the call does not depend on the
# constructor's positional order. The seed is fixed, as in the SVM
# learning-curve run, so the reported metrics are reproducible across runs.
svm = SVM(n_iters=svm_n_iters, lambda_param=svm_lambda_param, random_state=RANDOM_STATE)
svm.fit(X_train, y_train)
predictions = svm.predict(X_test)
plot_metrics(predictions, y_test, 'SVM stats')

### Logistic Regression

In [None]:
# Refit the linear logistic regression with the tuned hyperparameters and
# score it on the test split. Arguments are passed by keyword (as in the
# kernel evaluation cell) so the call does not depend on the constructor's
# positional order.
lr = LogisticRegression(
    n_iters=lr_n_iters,
    lambda_param=lr_lambda_param,
    learning_rate=lr_learning_rate,
)
lr.fit(X_train, y_train)
predictions = lr.predict(X_test)
plot_metrics(predictions, y_test, 'LR stats')

### Conclusions

Performance visualization indicates that logistic regression generally demonstrates superior performance on this specific dataset.

# Kernel Models Evaluation

## Hyperparameter Tuning

### SVM
Non-linear kernel models will now be tuned with the previous best parameters.

In [None]:
# Value passed as `n_averaged_states` to the kernel SVM, here and in the later kernel cells.
N_AVERAGED_STATES = 10

# Polynomial-kernel grid: same iteration / lambda candidates as the linear
# search, with a single fixed degree.
svm_k_param_grid = dict(
    kernel=['poly'],
    n_iters=svm_n_iters_list,
    lambda_param=svm_lambda_param_list,
    degree=[2],
    n_averaged_states=[N_AVERAGED_STATES],
)

svm_k_best_params, svm_k_best_metrics = grid_search_cv(
    SVM,
    svm_k_param_grid,
    X_train,
    y_train,
    cv=CV,
    scoring=METRIC,
    random_state=RANDOM_STATE,
)
print(f'SVM best parameter: {svm_k_best_params}')
print(f'SVM best metrics: {svm_k_best_metrics}')

# NOTE(review): only 'degree' is read back. The n_iters / lambda_param chosen
# by this search are not used by the later kernel cells, which reuse
# svm_n_iters / svm_lambda_param from the linear search - confirm this is intended.
svm_degree = svm_k_best_params['degree']

### Logistic Regression

In [None]:
# Polynomial-kernel grid for logistic regression: same iteration / lambda /
# learning-rate candidates as the linear search, with a single fixed degree.
lr_k_param_grid = {
    'kernel': ['poly'],
    'n_iters': lr_n_iters_list,
    'lambda_param': lr_lambda_param_list,
    'learning_rate': lr_learning_rate_list,
    'degree': [2],
}

lr_k_best_params, lr_k_best_metrics = grid_search_cv(
    LogisticRegression,
    lr_k_param_grid,
    X_train,
    y_train,
    cv=CV,
    scoring=METRIC,
    random_state=RANDOM_STATE,
)
# These results belong to logistic regression; the labels previously said "SVM".
print(f'Logistic Regression best parameter: {lr_k_best_params}')
print(f'Logistic Regression best metrics: {lr_k_best_metrics}')

# NOTE(review): only 'degree' is read back. The other values chosen by this
# search are not used by the later kernel cells, which reuse the
# hyperparameters from the linear search - confirm this is intended.
lr_degree = lr_k_best_params['degree']


## Learning Curves
It is particularly valuable to analyze the learning curves of the various algorithms to observe how and when convergence occurs.

### SVM

In [None]:
# Polynomial-kernel SVM: one full training run per candidate iteration count,
# with the same lambda and seed as the linear learning curve.
plot_learning_curve(
    SVM,
    X_train,
    y_train,
    X_test,
    y_test,
    svm_n_iters_list,
    figname='k-SVM learning curve',
    lambda_param=svm_lambda_param,
    kernel='poly',
    degree=svm_degree,
    n_averaged_states=N_AVERAGED_STATES,
    random_state=RANDOM_STATE,
)

### Logistic Regression

In [None]:
# Polynomial-kernel logistic regression: one full training run per candidate
# iteration count, with the same lambda and learning rate as the linear curve.
plot_learning_curve(
    LogisticRegression,
    X_train,
    y_train,
    X_test,
    y_test,
    lr_n_iters_list,
    figname='k-LR learning curve',
    lambda_param=lr_lambda_param,
    learning_rate=lr_learning_rate,
    kernel='poly',
    degree=lr_degree,
)

### Conclusions
It is evident that SVM requires significantly more iterations than logistic regression, which is expected given that logistic regression updates parameters for each example at every iteration. Additionally, SVM performance has improved, while logistic regression exhibits slight overfitting tendencies as iterations progress.

## Evaluation

### SVM

In [None]:
# Refit the polynomial-kernel SVM and score it on the test split. The seed is
# fixed, as in the kernel learning-curve run, so the reported metrics are
# reproducible across runs.
svm = SVM(
    n_iters=svm_n_iters,
    lambda_param=svm_lambda_param,
    kernel='poly',
    degree=svm_degree,
    n_averaged_states=N_AVERAGED_STATES,
    random_state=RANDOM_STATE,
)
svm.fit(X_train, y_train)
predictions = svm.predict(X_test)
plot_metrics(predictions, y_test, 'k-SVM stats')

### Logistic Regression

In [None]:
# Refit the polynomial-kernel logistic regression and score it on the test split.
lr = LogisticRegression(
    n_iters=lr_n_iters,
    lambda_param=lr_lambda_param,
    learning_rate=lr_learning_rate,
    kernel='poly',
    degree=lr_degree,
)
lr.fit(X_train, y_train)
predictions = lr.predict(X_test)
plot_metrics(predictions, y_test, 'k-LR stats')

### Conclusion
With kernel methods, SVM performance is similar, while logistic regression performance improves slightly. It is also notable that recall is consistently higher than precision for both models. Accuracy is approximately 75%, which is a satisfactory result given the baseline implied by the dataset's class imbalance (60-40).

In [None]:
# One comma-separated line of test accuracies. Assuming the notebook was run
# once top to bottom, metrics_for_print holds SVM, LR, k-SVM, k-LR in that
# order, so this prints SVM, k-SVM, LR, k-LR.
summary_order = (0, 2, 1, 3)
print(",".join(f"{metrics_for_print[idx]['accuracy']:.3f}" for idx in summary_order))