## Exercise 1

For the wine dataset and the k-NN method, write code that selects the best hyperparameter (the number of neighbors k) based on validation performance.


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_wine

# Load the wine dataset
wine = load_wine()
X = wine.data
y = wine.target

# Hold out 25% of the data as the test set, then split the remainder again
# (25% of it) to obtain a validation set. The arrays from the first split are
# kept under their own names (X_trainval / y_trainval) so that the final model
# can be refit on training + validation data together.
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.25, random_state=42)

# Candidate values for the number of neighbors
k_values = [1, 3, 5, 7, 9]

# Fit one model per candidate on the training set only and record its
# accuracy on the validation set.
val_accuracy = {}
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    accuracy = knn.score(X_val, y_val)
    val_accuracy[k] = accuracy

# Pick the candidate with the highest validation accuracy. max() returns the
# first maximal key, so ties go to the smallest k tried first, and a best_k is
# always selected even if every accuracy happens to be 0.
best_k = max(val_accuracy, key=val_accuracy.get)
best_accuracy = val_accuracy[best_k]

# Train the final model with the best hyperparameter on the combined
# training and validation data (everything except the test set).
final_knn = KNeighborsClassifier(n_neighbors=best_k)
final_knn.fit(X_trainval, y_trainval)

# Evaluate the final model once on the untouched test set
test_accuracy = final_knn.score(X_test, y_test)

print(f"Best k: {best_k}")
print(f"Validation set accuracy with the best k: {best_accuracy}")
print(f"Test set accuracy with the best k: {test_accuracy}")

Best k: 3
Validation set accuracy with the best k: 0.7352941176470589
Test set accuracy with the best k: 0.7555555555555555


## Exercise 2

A better way of performing model validation is to apply k-fold cross-validation. See `sklearn.model_selection.cross_val_score` and `sklearn.model_selection.ShuffleSplit` to implement k-fold cross-validation with k=2 and k=3.

In [None]:

# In this code, we use ShuffleSplit to create cross-validation objects with 2 and 3 splits (k=2 and k=3),
# and we run cross-validation with each of them for every candidate number of neighbors.
# We select the number of neighbors with the highest mean cross-validation accuracy,
# and we also keep track of which splitting scheme (labelled 2-fold or 3-fold) produced that best result.

from sklearn.model_selection import cross_val_score, ShuffleSplit
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_wine

# Load the wine dataset
wine = load_wine()
X = wine.data
y = wine.target

# Candidate values for the number of neighbors
k_values = [1, 3, 5, 7, 9]

# Best configuration found so far. best_cv is initialised here as well so the
# final print statements cannot hit an undefined name when no score beats 0.
best_k = None
best_cv = None
best_accuracy = 0

# Splitters with 2 and 3 splits, each holding out 25% of the data per split.
# NOTE(review): per the scikit-learn docs ShuffleSplit draws random train/test
# splits rather than disjoint folds, so the "2-fold"/"3-fold" labels are
# approximate -- confirm this is what the exercise intends.
cv_2fold = ShuffleSplit(n_splits=2, test_size=0.25, random_state=42)
cv_3fold = ShuffleSplit(n_splits=3, test_size=0.25, random_state=42)
cv_schemes = (('2-fold', cv_2fold), ('3-fold', cv_3fold))

# For every candidate k, score the classifier under each splitting scheme and
# keep the (k, scheme) pair with the highest mean accuracy. The comparison is
# strict, so on ties the configuration seen first is kept.
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)

    for cv_name, cv in cv_schemes:
        mean_accuracy = cross_val_score(knn, X, y, cv=cv).mean()

        if mean_accuracy > best_accuracy:
            best_k = k
            best_accuracy = mean_accuracy
            best_cv = cv_name

print(f"Best k: {best_k}")
print(f"Best cross-validation method: {best_cv}")
print(f"Mean cross-validation accuracy with the best k: {best_accuracy}")

Best k: 1
Best cross-validation method: 2-fold
Mean cross-validation accuracy with the best k: 0.7555555555555555
