# Lab 3 : Classification

In [None]:
import numpy as np
import pylab as plt
import scipy as sp
import pandas as pd
import seaborn as sns
import sklearn as sk

## 1) Loading the data

In [None]:
# Load the digits archive: the 'xt'/'yt' arrays are used as the training
# split and the 'x'/'y' arrays as the test split.
data_digits = np.load('digits.npz')

# Inputs are divided by 255 (presumably mapping 8-bit pixel values to [0, 1]);
# label arrays are flattened to 1-D vectors.
x2_train, y2_train = data_digits['xt'] / 255.0, data_digits['yt'].ravel()
x2_test, y2_test = data_digits['x'] / 255.0, data_digits['y'].ravel()


In [None]:
# Open the Pima archive
data_pima = np.load('pima.npz')

# Show the name of every array stored in the archive, one per line
matrix_names = data_pima.files
for name in matrix_names:
    print(name)

# Extract the feature matrix, the labels and the variable names in one pass
x1, y1, varnames = (data_pima[key] for key in ('xall', 'yall', 'varnames'))

In [None]:
# Tabulate the Pima features (one named column per variable) and show summary statistics
df = pd.DataFrame(data=x1, columns=varnames)
df.describe()

In [None]:
# Summary statistics of the labels, held in a single 'Résultat' column
df_y1 = pd.DataFrame(data=y1, columns=['Résultat'])
df_y1.describe()

In [None]:
# Reshape the flattened training digits into 28x28 images.
# NOTE: the notebook only defines x2_train / y2_train (there is no x2 / y2),
# so the preview uses the training split.
# Assumes each sample holds 28 * 28 = 784 pixel values — TODO confirm against digits.npz.
x2_images = x2_train.reshape(-1, 28, 28)

# Show the first five digits side by side, each titled with its label
plt.figure(figsize=(10, 6))
for i in range(5):
    plt.subplot(1, 5, i + 1)
    plt.imshow(x2_images[i], cmap='gray')
    plt.title(f"Class: {y2_train[i]}")
    plt.axis('off')

plt.show()

## 2) Predicting Diabetes on the Pima dataset


### 2.1 Know the data


In [None]:
# Attach the labels as a 'Class' column so that seaborn can colour every
# point of the pairwise scatterplots by its class.
df['Class'] = y1
sns.pairplot(data=df, hue='Class', palette='tab10')
plt.show()


**Which variables seem to help predict the class? Do those variables make sense from a
medical perspective?**

In [None]:
from sklearn.model_selection import train_test_split

# Number of samples kept for training/validation; every remaining sample goes to the test set
n_train_validation = 300

# Separate features (all columns but 'Class') from labels and split with a fixed seed
x_train_val, x_test, y_train_val, y_test = train_test_split(
    df.drop(columns='Class'),
    df['Class'],
    test_size=len(df) - n_train_validation,
    random_state=42,
)

# Report the shape of each resulting set
print("Training/Validation Set (X):", x_train_val.shape)
print("Training/Validation Set (Y):", y_train_val.shape)
print("\nTest Set (X):", x_test.shape)
print("Test Set (Y):", y_test.shape)

In [None]:
# Per-feature mean and variance of the unscaled training/validation data
mean_before_scaling, variance_before_scaling = x_train_val.mean(), x_train_val.var()

In [None]:
# Show the per-feature statistics of the data before scaling
print("Original Data - Mean:", mean_before_scaling, sep="\n")
print("Original Data - Variance:", variance_before_scaling, sep="\n")

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics on the training/validation split only, then
# apply the very same transformation to both splits so that no information
# from the test set leaks into the preprocessing.
scaler = StandardScaler()
x_train_val_scaled = scaler.fit(x_train_val).transform(x_train_val)
x_test_scaled = scaler.transform(x_test)

# Per-feature variances after scaling
print("Variances of scaled training/validation data:")
print(x_train_val_scaled.var(axis=0))

print("Variances of scaled test data:")
print(x_test_scaled.var(axis=0))


In [None]:
# One row of plots per feature: raw values on the left, scaled values on the right
num_features = x_train_val.shape[1]
fig, axes = plt.subplots(num_features, 2, figsize=(12, 2 * num_features))

for feature_index in range(num_features):
    feature_name = varnames[feature_index]

    # Left panel: distribution before scaling
    raw_ax = axes[feature_index, 0]
    raw_ax.hist(x_train_val.values[:, feature_index], bins=30, color='b', alpha=0.7, label='Original Data')
    raw_ax.set_title(f'Feature: {feature_name} - Before Scaling')
    raw_ax.legend()

    # Right panel: distribution after scaling
    scaled_ax = axes[feature_index, 1]
    scaled_ax.hist(x_train_val_scaled[:, feature_index], bins=30, color='r', alpha=0.7, label='Scaled Data')
    scaled_ax.set_title(f'Feature: {feature_name} - After Scaling')
    scaled_ax.legend()

plt.tight_layout()
plt.show()


### 2.2 Bayesian decision and linear classification


#### LDA

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score, roc_auc_score, make_scorer

In [None]:
# Baseline: LDA with default hyper-parameters, fitted on the scaled training/validation data
lda = LinearDiscriminantAnalysis()
lda.fit(x_train_val_scaled, y_train_val)

# Hard predictions on the test data, used for the accuracy
y_pred = lda.predict(x_test_scaled)
accuracy_default_lda = accuracy_score(y_true=y_test, y_pred=y_pred)

# Column 1 of predict_proba (second entry of lda.classes_) is the score used for the AUC
y_prob = lda.predict_proba(x_test_scaled)[:, 1]
auc_default_lda = roc_auc_score(y_true=y_test, y_score=y_prob)

In [None]:
# Test-set metrics of the default LDA, rounded to two decimals
print(
    f"Accuracy: {accuracy_default_lda:.2f}",
    f"AUC: {auc_default_lda:.2f}",
    sep="\n",
)

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Create an LDA classifier
lda = LinearDiscriminantAnalysis()

# Search space. The 'svd' solver does not support shrinkage (scikit-learn
# raises NotImplementedError for any non-None value), so it gets its own
# sub-grid; shrinkage is only explored with 'lsqr' and 'eigen'.
param_grid = [
    {'solver': ['svd']},
    {'solver': ['lsqr', 'eigen'], 'shrinkage': np.linspace(0, 1, 100)},
]

# Use the built-in 'roc_auc' scorer: it ranks samples with the continuous
# scores of the classifier, whereas make_scorer(roc_auc_score) would compute
# the AUC from hard 0/1 predictions and under-use the model's ranking.
grid_search = GridSearchCV(lda, param_grid, scoring='roc_auc', verbose=1, n_jobs=-1)

# Fit the GridSearchCV to the training data
grid_search.fit(x_train_val_scaled, y_train_val)

In [None]:
# Hyper-parameters selected by the grid search
print(f"Best parameters: {grid_search.best_params_}")

In [None]:
# Winner of the grid search and the hyper-parameters that produced it
best_lda, best_params = grid_search.best_estimator_, grid_search.best_params_

# Refit on the full training/validation data, then score the test data:
# hard labels for the accuracy, column 1 of predict_proba for the AUC
y_pred = best_lda.fit(x_train_val_scaled, y_train_val).predict(x_test_scaled)
y_prob_best_lda = best_lda.predict_proba(x_test_scaled)[:, 1]


In [None]:
# Test-set metrics of the tuned LDA.
# The AUC must use the tuned model's scores (y_prob_best_lda); y_prob holds
# the scores of the default LDA and would silently report the wrong model.
accuracy_best_lda = accuracy_score(y_test, y_pred)
auc_best_lda = roc_auc_score(y_test, y_prob_best_lda)

In [None]:
# Test-set metrics of the tuned LDA, rounded to two decimals
print(
    f"accuracy_best_lda: {accuracy_best_lda:.2f}",
    f"auc_best_lda: {auc_best_lda:.2f}",
    sep="\n",
)

#### QDA

In [None]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [None]:
# Baseline: QDA with default hyper-parameters, fitted on the scaled training/validation data
qda = QuadraticDiscriminantAnalysis()
qda.fit(x_train_val_scaled, y_train_val)

# Hard predictions on the test data, used for the accuracy
y_pred_qda = qda.predict(x_test_scaled)
accuracy_default_qda = accuracy_score(y_true=y_test, y_pred=y_pred_qda)

# Column 1 of predict_proba is the score used for the AUC
y_prob_qda = qda.predict_proba(x_test_scaled)[:, 1]
auc_default_qda = roc_auc_score(y_true=y_test, y_score=y_prob_qda)


In [None]:
# Test-set metrics of the default QDA, rounded to two decimals
print(
    f"QDA Accuracy: {accuracy_default_qda:.2f}",
    f"QDA AUC: {auc_default_qda:.2f}",
    sep="\n",
)

In [None]:
# Create a QDA classifier
qda = QuadraticDiscriminantAnalysis()

# Define the range of values for the 'reg_param' parameter
param_grid = {'reg_param': [0.001, 0.01, 0.1, 1.0]}

# ROC AUC scorer. The built-in 'roc_auc' string makes the search rank samples
# with the classifier's continuous scores; make_scorer(roc_auc_score) would
# feed hard class predictions to roc_auc_score and distort the AUC.
roc_auc_scorer = 'roc_auc'

# Create a GridSearchCV object (5-fold cross-validation)
grid_search = GridSearchCV(qda, param_grid, scoring=roc_auc_scorer, cv=5)

# Fit the GridSearchCV to the training data
grid_search.fit(x_train_val_scaled, y_train_val)

# Get the best estimator and best parameter
best_qda = grid_search.best_estimator_
best_reg_param = grid_search.best_params_['reg_param']

# Evaluate the best estimator on the test data
best_qda.fit(x_train_val_scaled, y_train_val)
y_prob_best_qda = best_qda.predict_proba(x_test_scaled)[:, 1]

In [None]:
# Test-set metrics of the tuned QDA: AUC from the class-1 scores,
# accuracy from the hard predictions
auc_best_qda = roc_auc_score(y_true=y_test, y_score=y_prob_best_qda)
accuracy_best_qda = accuracy_score(y_true=y_test, y_pred=best_qda.predict(x_test_scaled))

In [None]:
# Selected regularisation strength and its mean cross-validated score
print(
    f"Best reg_param: {best_reg_param}",
    f"Best AUC: {grid_search.best_score_:.2f}",
    sep="\n",
)

In [None]:
# Side-by-side test-set metrics of the tuned LDA and QDA
print(
    f"accuracy_best_lda: {accuracy_best_lda:.2f}",
    f"auc_best_lda: {auc_best_lda:.2f}",
    f"accuracy_best_qda: {accuracy_best_qda:.2f}",
    f"auc_best_qda: {auc_best_qda:.2f}",
    sep="\n",
)

#### Gaussian Naive Bayes (NB)

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Gaussian Naive Bayes with default settings, fitted on the scaled training/validation data
gnb = GaussianNB()
gnb.fit(x_train_val_scaled, y_train_val)

# Hard predictions on the test data, used for the accuracy
y_pred_gnb = gnb.predict(x_test_scaled)
accuracy_default_gnb = accuracy_score(y_true=y_test, y_pred=y_pred_gnb)

# Column 1 of predict_proba is the score used for the AUC
y_prob_gnb = gnb.predict_proba(x_test_scaled)[:, 1]
auc_default_gnb = roc_auc_score(y_true=y_test, y_score=y_prob_gnb)

In [None]:
# Test-set metrics of Gaussian Naive Bayes, rounded to two decimals
print(
    f"NB Accuracy: {accuracy_default_gnb:.2f}",
    f"NB AUC: {auc_default_gnb:.2f}",
    sep="\n",
)

In [None]:
# Tuned QDA and LDA test-set metrics, repeated here for comparison with NB
print(
    f"QDA Accuracy: {accuracy_best_qda:.2f}",
    f"QDA AUC: {auc_best_qda:.2f}",
    f"LDA Accuracy: {accuracy_best_lda:.2f}",
    f"LDA AUC: {auc_best_lda:.2f}",
    sep="\n",
)

#### Logistic regression


In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Baseline: logistic regression with default hyper-parameters
lr = LogisticRegression()
lr.fit(x_train_val_scaled, y_train_val)

# Hard predictions on the test data, used for the accuracy
y_pred_lr = lr.predict(x_test_scaled)
accuracy_default_lr = accuracy_score(y_true=y_test, y_pred=y_pred_lr)

# Column 1 of predict_proba is the score used for the AUC
y_prob_lr = lr.predict_proba(x_test_scaled)[:, 1]
auc_default_lr = roc_auc_score(y_true=y_test, y_score=y_prob_lr)

In [None]:
# Test-set metrics of the default logistic regression, rounded to two decimals
print(
    f"LR Accuracy: {accuracy_default_lr:.2f}",
    f"LR AUC: {auc_default_lr:.2f}",
    sep="\n",
)

In [None]:
# L1-penalised logistic regression (liblinear supports the 'l1' penalty)
lr = LogisticRegression(penalty='l1', solver='liblinear')

# Inverse regularisation strengths to explore: 1e-3, 1e-2, ..., 1e3
param_grid = {'C': np.logspace(-3, 3, 7)}

# ROC AUC scorer. The built-in 'roc_auc' string makes the search rank samples
# with the classifier's continuous scores; make_scorer(roc_auc_score) would
# feed hard class predictions to roc_auc_score and distort the AUC.
roc_auc_scorer = 'roc_auc'

# Create a GridSearchCV object (5-fold cross-validation)
grid_search = GridSearchCV(lr, param_grid, scoring=roc_auc_scorer, cv=5)

# Fit the GridSearchCV to the training data
grid_search.fit(x_train_val_scaled, y_train_val)

# Get the best estimator and best parameter
best_lr = grid_search.best_estimator_
best_C = grid_search.best_params_['C']

# Evaluate the best estimator on the test data
best_lr.fit(x_train_val_scaled, y_train_val)
y_prob_best_lr = best_lr.predict_proba(x_test_scaled)[:, 1]

accuracy_best_lr = accuracy_score(y_test, best_lr.predict(x_test_scaled))
auc_best_lr = roc_auc_score(y_test, y_prob_best_lr)

In [None]:
# Selected C and its mean cross-validated score
print(
    f"Best C: {best_C}",
    f"Best AUC: {grid_search.best_score_:.2f}",
    sep="\n",
)

In [None]:
# Test-set metrics of the tuned logistic regression, rounded to two decimals
print(
    f"accuracy_best_lr: {accuracy_best_lr:.2f}",
    f"auc_best_lr: {auc_best_lr:.2f}",
    sep="\n",
)

**What is the best decision method so far? Is the best model linear (LDA, LR) or quadratic (QDA, NB)?**


In [None]:
# One column per model; rows follow the index given to the DataFrame below.
# GNB has no tuned variant, so its default metrics are repeated in the "Best" rows.
data_linear = dict(
    LDA=[accuracy_default_lda, auc_default_lda, accuracy_best_lda, auc_best_lda],
    QDA=[accuracy_default_qda, auc_default_qda, accuracy_best_qda, auc_best_qda],
    GNB=[accuracy_default_gnb, auc_default_gnb, accuracy_default_gnb, auc_default_gnb],
    LR=[accuracy_default_lr, auc_default_lr, accuracy_best_lr, auc_best_lr],
)

# Assemble the summary table
df_linear = pd.DataFrame(
    data_linear,
    index=["Accuracy Default", "AUC Default", "Accuracy Best", "AUC Best"],
)

# Centre both the cells and the headers for a nicer rendering
styled_df_linear = df_linear.style.set_properties(**{'text-align': 'center'})
styled_df_linear.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])

# Display the styled table
styled_df_linear

In [None]:
# Per-class histograms of the predicted score, one panel per model
plt.figure(figsize=(16, 6))

# (panel title prefix, class-1 scores on the test set), in display order
score_panels = (
    ('LDA', y_prob_best_lda),
    ('QDA', y_prob_best_qda),
    ('GNB', y_prob_gnb),
    ('LR', y_prob_best_lr),
)

for panel_index, (model_name, model_scores) in enumerate(score_panels, start=1):
    plt.subplot(1, 4, panel_index)
    # Blue: samples whose true label is -1; red: samples whose true label is 1
    plt.hist(model_scores[y_test == -1], color='blue', alpha=0.5, label='Class -1')
    plt.hist(model_scores[y_test == 1], color='red', alpha=0.5, label='Class 1')
    plt.title(f'{model_name} Score Distribution')
    plt.xlabel('Scores')
    plt.legend()

plt.tight_layout()
plt.show()


**Interpret the weights of a good linear model. What is the effect of each variable on the risk of diabetes?
Does it make medical sense?**


### 2.3 Nonlinear methods


#### RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

In [None]:
# Random Forest with default hyper-parameters. The seed is fixed so that the
# reported metrics are reproducible across "Restart & Run All" executions
# (the forest is otherwise re-randomised on every run).
rf = RandomForestClassifier(random_state=42)

# Fit the RF classifier on the training data
rf.fit(x_train_val_scaled, y_train_val)

# Make predictions on the test data
y_pred_rf = rf.predict(x_test_scaled)

# Calculate accuracy of RF on the test data
accuracy_default_rf = accuracy_score(y_test, y_pred_rf)

# Calculate AUC (Area Under the ROC Curve) for RF on the test data,
# using column 1 of predict_proba as the score
y_prob_rf = rf.predict_proba(x_test_scaled)[:, 1]
auc_default_rf = roc_auc_score(y_test, y_prob_rf)

In [None]:
# Test-set metrics of the default Random Forest, rounded to two decimals
print(
    f"RF default Accuracy: {accuracy_default_rf:.2f}",
    f"RF default AUC: {auc_default_rf:.2f}",
    sep="\n",
)

In [None]:
# Define the parameter distribution to search.
# 'auto' is no longer an accepted value of max_features in recent scikit-learn
# releases (it was an alias of 'sqrt' for classifiers), so any candidate that
# sampled it would fail to fit; only the still-valid options are kept.
param_dist = {
    'n_estimators': randint(50, 200),
    # None (unlimited depth) plus five reproducible random depths in [10, 30),
    # converted to plain Python ints for cleaner printing of best_params
    'max_depth': [None] + randint(10, 30).rvs(5, random_state=42).tolist(),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 4),
    'max_features': ['sqrt', 'log2']
}

# Create the Random Forest classifier
rf = RandomForestClassifier(random_state=42)

# Create a RandomizedSearchCV instance: 10 sampled candidates, 5-fold CV, accuracy
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist,
                                   n_iter=10, scoring='accuracy', cv=5, random_state=42)

# Fit the random search to the training data
random_search.fit(x_train_val_scaled, y_train_val)

# Get the best parameters and the corresponding model (already refitted on all training data)
best_params = random_search.best_params_
best_rf = random_search.best_estimator_

# Make predictions on the test data with the best model
y_pred_rf = best_rf.predict(x_test_scaled)

# Calculate accuracy on the test data
accuracy_best_rf = accuracy_score(y_test, y_pred_rf)

# Calculate AUC (Area Under the ROC Curve) for RF on the test data
y_prob_best_rf = best_rf.predict_proba(x_test_scaled)[:, 1]
auc_best_rf = roc_auc_score(y_test, y_prob_best_rf)


In [None]:
# Selected hyper-parameters and test-set metrics of the tuned Random Forest
print(
    f"Best Parameters: {best_params}",
    f"\nBest RF Accuracy: {accuracy_best_rf:.2f}",
    f"Best RF AUC: {auc_best_rf:.2f}",
    sep="\n",
)

#### SVC

In [None]:
from sklearn.svm import SVC

In [None]:
# Baseline: SVC with default hyper-parameters
svc = SVC()
svc.fit(x_train_val_scaled, y_train_val)

# Hard predictions on the test data, used for the accuracy
y_pred_svc = svc.predict(x_test_scaled)
accuracy_default_svc = accuracy_score(y_true=y_test, y_pred=y_pred_svc)

# The signed decision-function value serves as the ranking score for the AUC
y_prob_svc = svc.decision_function(x_test_scaled)
auc_default_svc = roc_auc_score(y_true=y_test, y_score=y_prob_svc)


In [None]:
# Test-set metrics of the default SVC, rounded to two decimals
print(
    f"SVC default Accuracy: {accuracy_default_svc:.2f}",
    f"SVC default AUC: {auc_default_svc:.2f}",
    sep="\n",
)

In [None]:
# Candidate values sampled by the randomized search
param_dist = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100],             # regularization parameter
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],  # kernel type
    gamma=['scale', 'auto', 0.1, 1, 10, 100],     # kernel coefficient
    degree=[2, 3, 4],                             # degree of the polynomial kernel
)

# 10 sampled candidates, 5-fold cross-validation, selection on accuracy
random_search = RandomizedSearchCV(
    SVC(),
    param_dist,
    n_iter=10,
    scoring='accuracy',
    cv=5,
    random_state=42,
)
random_search.fit(x_train_val_scaled, y_train_val)

# Winning configuration and the estimator refitted with it
best_params_svc, best_svc = random_search.best_params_, random_search.best_estimator_

# Test-set accuracy from hard predictions
y_pred_best_svc = best_svc.predict(x_test_scaled)
accuracy_best_svc = accuracy_score(y_true=y_test, y_pred=y_pred_best_svc)

# Test-set AUC from the decision-function values
y_prob_best_svc = best_svc.decision_function(x_test_scaled)
auc_best_svc = roc_auc_score(y_true=y_test, y_score=y_prob_best_svc)

In [None]:
# Selected hyper-parameters and test-set metrics of the tuned SVC
print(
    f"Best SVC Parameters: {best_params_svc}",
    f"\nBest SVC Accuracy: {accuracy_best_svc:.2f}",
    f"Best SVC AUC: {auc_best_svc:.2f}",
    sep="\n",
)

#### MLPClassifier

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
# MLP with default architecture and a generous iteration budget. The seed is
# fixed so that weight initialisation and mini-batch shuffling, and therefore
# the reported metrics, are reproducible across runs.
mlp = MLPClassifier(max_iter=10000, random_state=42)

# Fit the MLP classifier on the training data
mlp.fit(x_train_val_scaled, y_train_val)

# Make predictions on the test data
y_pred_mlp = mlp.predict(x_test_scaled)

# Calculate accuracy of MLP on the test data
accuracy_default_mlp = accuracy_score(y_test, y_pred_mlp)

# Calculate AUC (Area Under the ROC Curve) for MLP on the test data,
# using column 1 of predict_proba as the score
y_prob_mlp = mlp.predict_proba(x_test_scaled)[:, 1]
auc_default_mlp = roc_auc_score(y_test, y_prob_mlp)

In [None]:
# Test-set metrics of the default MLP, rounded to two decimals
print(
    f"MLP default Accuracy: {accuracy_default_mlp:.2f}",
    f"MLP default AUC: {auc_default_mlp:.2f}",
    sep="\n",
)

In [None]:
# Candidate values for the search (2 x 2 x 2 x 1 = 8 combinations)
param_dist = dict(
    hidden_layer_sizes=[(100,), (50, 25)],
    activation=['logistic', 'tanh'],
    alpha=[0.0001, 0.001],
    max_iter=[2000],
)

# Base MLP: early stopping holds out 10% of the training data and stops
# after 10 iterations without improvement
mlp = MLPClassifier(
    hidden_layer_sizes=(100,),
    max_iter=2000,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=10,
    random_state=42,
)

# 8 sampled candidates, 5-fold cross-validation, selection on accuracy
random_search = RandomizedSearchCV(
    mlp,
    param_dist,
    n_iter=8,
    scoring='accuracy',
    cv=5,
    random_state=42,
)
random_search.fit(x_train_val_scaled, y_train_val)

# Winning configuration and the estimator refitted with it
best_params_mlp, best_mlp = random_search.best_params_, random_search.best_estimator_

# Test-set accuracy from hard predictions
y_pred_best_mlp = best_mlp.predict(x_test_scaled)
accuracy_best_mlp = accuracy_score(y_true=y_test, y_pred=y_pred_best_mlp)

# Test-set AUC from column 1 of predict_proba
y_prob_best_mlp = best_mlp.predict_proba(x_test_scaled)[:, 1]
auc_best_mlp = roc_auc_score(y_true=y_test, y_score=y_prob_best_mlp)

In [None]:
# Selected hyper-parameters and test-set metrics of the tuned MLP
print(
    f"Best MLP Parameters: {best_params_mlp}",
    f"\nBest MLP Accuracy: {accuracy_best_mlp:.2f}",
    f"Best MLP AUC: {auc_best_mlp:.2f}",
    sep="\n",
)

#### GradientBoostingClassifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Baseline: gradient boosting with default hyper-parameters
gbc = GradientBoostingClassifier()
gbc.fit(x_train_val_scaled, y_train_val)

# Hard predictions on the test data, used for the accuracy
y_pred_gbc = gbc.predict(x_test_scaled)
accuracy_default_gbc = accuracy_score(y_true=y_test, y_pred=y_pred_gbc)

# Column 1 of predict_proba is the score used for the AUC
y_prob_gbc = gbc.predict_proba(x_test_scaled)[:, 1]
auc_default_gbc = roc_auc_score(y_true=y_test, y_score=y_prob_gbc)

In [None]:
# Test-set metrics of the default gradient boosting model, rounded to two decimals
print(
    f"GBC default Accuracy: {accuracy_default_gbc:.2f}",
    f"GBC default AUC: {auc_default_gbc:.2f}",
    sep="\n",
)

In [None]:
# Define the parameter distribution to search.
# 'auto' is no longer an accepted value of max_features in recent scikit-learn
# releases, so any candidate that sampled it would fail to fit; only the
# still-valid options are kept.
param_dist = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

# Create a Gradient Boosting Classifier
gbc = GradientBoostingClassifier(random_state=42)

# Create a RandomizedSearchCV instance: 10 sampled candidates, 5-fold CV, accuracy
random_search = RandomizedSearchCV(estimator=gbc, param_distributions=param_dist,
                                   n_iter=10, scoring='accuracy', cv=5, random_state=42)

# Fit the random search to the training data
random_search.fit(x_train_val_scaled, y_train_val)

# Get the best parameters and the corresponding model (already refitted on all training data)
best_params_gbc = random_search.best_params_
best_gbc = random_search.best_estimator_

# Make predictions on the test data with the best model
y_pred_best_gbc = best_gbc.predict(x_test_scaled)

# Calculate accuracy on the test data
accuracy_best_gbc = accuracy_score(y_test, y_pred_best_gbc)

# Calculate AUC on the test data
y_prob_best_gbc = best_gbc.predict_proba(x_test_scaled)[:, 1]
auc_best_gbc = roc_auc_score(y_test, y_prob_best_gbc)

In [None]:
# Selected hyper-parameters and test-set metrics of the tuned gradient boosting model
print(
    f"Best GBC Parameters: {best_params_gbc}",
    f"\nBest GBC Accuracy: {accuracy_best_gbc:.2f}",
    f"Best GBC AUC: {auc_best_gbc:.2f}",
    sep="\n",
)

### 2.4 Comparison and interpretation


In [None]:
# One column per non-linear model; rows follow the index given to the DataFrame below
data_non_linear = dict(
    RF=[accuracy_default_rf, auc_default_rf, accuracy_best_rf, auc_best_rf],
    SVC=[accuracy_default_svc, auc_default_svc, accuracy_best_svc, auc_best_svc],
    MLP=[accuracy_default_mlp, auc_default_mlp, accuracy_best_mlp, auc_best_mlp],
    GBC=[accuracy_default_gbc, auc_default_gbc, accuracy_best_gbc, auc_best_gbc],
)

# Assemble the summary table of the non-linear models
df_non_linear = pd.DataFrame(
    data_non_linear,
    index=["Accuracy Default", "AUC Default", "Accuracy Best", "AUC Best"],
)

# Centred styling of the non-linear table (built here but not displayed by this cell)
styled_df_non_linear = df_non_linear.style.set_properties(**{'text-align': 'center'})
styled_df_non_linear.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])

# Put linear and non-linear models side by side; the suffix would only be
# applied to non-linear columns whose name clashes with a linear one
df_methods = df_linear.join(df_non_linear, rsuffix='_non_linear')

# Display the combined (unstyled) DataFrame
df_methods

**Which model is best from a medical/practical perspective? Do we need non-linearity in this application?**

#### Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [None]:
def calculate_fnr(confusion_matrix):
    """
    Calculate the False Negative Rate (FNR) from a confusion matrix.

    The FNR is FN / (FN + TP): the share of truly positive samples that were
    predicted negative. The matrix is read with rows as true classes and
    columns as predicted classes, the positive class being at index 1
    (the layout this notebook obtains from sklearn's confusion_matrix).

    Parameters:
    - confusion_matrix: array-like (numpy.ndarray or nested sequence)
        The confusion matrix; entries [1, 0] and [1, 1] are used.

    Returns:
    - fnr: float
        The False Negative Rate, or nan when the matrix contains no positive
        sample (FN + TP == 0) and the rate is therefore undefined.
    """
    # Accept plain nested lists as well as arrays
    matrix = np.asarray(confusion_matrix)

    TP = matrix[1, 1]  # True Positives
    FN = matrix[1, 0]  # False Negatives

    positives = FN + TP
    if positives == 0:
        # No positive sample: avoid a 0/0 division and its runtime warning
        return float('nan')

    return float(FN / positives)

In [None]:
# Hard predictions of the tuned logistic regression on the test data
y_pred_best_model = best_lr.predict(x_test_scaled)

# Confusion matrix (rows: true class, columns: predicted class) and its FNR
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred_best_model)
FNR = calculate_fnr(conf_matrix)

# Draw the matrix with the classifier's own class labels on the axes
conf_matrix_plot = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=best_lr.classes_)
conf_matrix_plot.plot()
print(f"\nFalse Negative Rate (FNR): {FNR:.4f}")

**Since a false negative can have an important medical impact, propose a new threshold for the predicted
score that leads to an FNR of less than 10% (this can be done by manually changing the value of the
intercept_ in the trained classifier).**

In [None]:
# Set the desired FNR (e.g., 10% or 0.10)
desired_fnr = 0.10

# predict_proba[:, 1] is the probability of best_lr.classes_[1], which is the
# class treated as "positive" throughout this section
positive_label = best_lr.classes_[1]

# Choose the probability threshold so that about `desired_fnr` of the truly
# positive samples score below it, i.e. the desired_fnr-quantile of the
# positives' scores. It is estimated on the training/validation split so the
# test FNR below stays an honest estimate; lower desired_fnr if the test FNR
# comes out above the target.
# (A logit value such as -log(1/fnr - 1) is negative and cannot be compared
# with probabilities: every sample would be predicted positive.)
train_val_scores = best_lr.predict_proba(x_train_val_scaled)[:, 1]
positive_scores = train_val_scores[np.asarray(y_train_val) == positive_label]
threshold = np.quantile(positive_scores, desired_fnr)

# Apply the threshold to the test probabilities and map the 0/1 decisions
# back to the classifier's own labels so they are comparable with y_test
test_scores = best_lr.predict_proba(x_test_scaled)[:, 1]
y_pred_new_threshold = best_lr.classes_[(test_scores >= threshold).astype(int)]

# Calculate the confusion matrix with the new threshold (fixed label order: negative, positive)
conf_matrix_new_threshold = confusion_matrix(y_test, y_pred_new_threshold, labels=best_lr.classes_)

# Calculate the new FNR with the modified threshold
new_fnr = calculate_fnr(conf_matrix_new_threshold)
print(f"New False Negative Rate (FNR): {new_fnr:.4f}")

In [None]:
# Report the chosen threshold together with the FNR it yields
print("Threshold : ", threshold)
print(f"\nNew False Negative Rate (FNR): {new_fnr:.4f}")

# Draw the confusion matrix obtained with the new threshold,
# using the classifier's own class labels on the axes
conf_matrix_new_threshold_display = ConfusionMatrixDisplay(
    confusion_matrix=conf_matrix_new_threshold,
    display_labels=best_lr.classes_,
)
conf_matrix_new_threshold_display.plot()

## 3) Predicting Classes on the Digits dataset


In [None]:
# Shapes of the digits inputs, then of the label vectors
print("X training shape : ", x2_train.shape)
print("X test shape : ", x2_test.shape)
print("\nY training shape : ", y2_train.shape)
print("Y test shape : ", y2_test.shape)

### 3.1 Evaluate the different supervised methods


#### LinearDiscriminantAnalysis

In [None]:
# Baseline on the digits data: LDA with default hyper-parameters
lda = LinearDiscriminantAnalysis()
lda.fit(x2_train, y2_train)

# Predict the test digits and measure the share of correct labels
y2_pred = lda.predict(x2_test)
accuracy_default_lda_digits = accuracy_score(y_true=y2_test, y_pred=y2_pred)

In [None]:
# Report the test accuracy of the default LDA on the digits data, rounded to two decimals
print(f"LDA default Accuracy on Digits Dataset: {accuracy_default_lda_digits:.2f}")

In [None]:
# Solvers to compare; the grid overrides the solver set on the base model below
param_grid = dict(solver=['lsqr', 'eigen'])

# Base LDA model handed to the search
lda = LinearDiscriminantAnalysis(solver='lsqr')

# Exhaustive search, 5-fold cross-validation, selection on accuracy
grid_search = GridSearchCV(lda, param_grid, scoring='accuracy', cv=5)
grid_search.fit(x2_train, y2_train)

# Winning configuration and the estimator refitted with it
best_params_lda_digits, best_lda_digits = grid_search.best_params_, grid_search.best_estimator_

# Test accuracy of the selected model
y2_pred_best = best_lda_digits.predict(x2_test)
accuracy_best_lda_digits = accuracy_score(y_true=y2_test, y_pred=y2_pred_best)

In [None]:
# Selected solver and test accuracy of the tuned LDA on the digits data
print(
    f"Best LDA Parameters: {best_params_lda_digits}",
    f"Best LDA Accuracy on Digits Dataset: {accuracy_best_lda_digits:.2f}",
    sep="\n",
)

#### LogisticRegression

In [None]:
# Baseline on the digits data: logistic regression with a raised iteration budget
lr = LogisticRegression(max_iter=1000)
lr.fit(x2_train, y2_train)

# Predict the test digits and measure the share of correct labels
y2_pred = lr.predict(x2_test)
accuracy_default_lr_digits = accuracy_score(y_true=y2_test, y_pred=y2_pred)

In [None]:
# Report the test accuracy of the default logistic regression on the digits data, rounded to two decimals
print(f"LR default Accuracy on Digits Dataset: {accuracy_default_lr_digits:.2f}")

In [None]:
# Reduced search grid: 2 penalties x 3 values of C x 2 iteration budgets, liblinear only
param_grid = dict(
    penalty=['l1', 'l2'],
    C=[0.01, 1, 100],
    max_iter=[100, 1000],
    solver=['liblinear'],
)

# Base logistic regression handed to the search
lr = LogisticRegression()

# Exhaustive search, 5-fold cross-validation on accuracy, run on all available cores
grid_search = GridSearchCV(lr, param_grid, scoring='accuracy', cv=5, n_jobs=-1)
grid_search.fit(x2_train, y2_train)

# Winning configuration and the estimator refitted with it
best_params_lr, best_lr = grid_search.best_params_, grid_search.best_estimator_

# Test accuracy of the selected model
y2_pred_best = best_lr.predict(x2_test)
accuracy_best_lr_digits = accuracy_score(y_true=y2_test, y_pred=y2_pred_best)

In [None]:
# Selected hyper-parameters and test accuracy of the tuned logistic regression on the digits data
print(
    f"Best LR Parameters: {best_params_lr}",
    f"Best LR Accuracy on Digits Dataset: {accuracy_best_lr_digits:.2f}",
    sep="\n",
)

#### SVC

In [None]:
# Baseline on the digits data: SVC with default hyper-parameters
svc = SVC()
svc.fit(x2_train, y2_train)

# Predict the test digits and measure the share of correct labels
y2_pred = svc.predict(x2_test)
accuracy_default_svc_digits = accuracy_score(y_true=y2_test, y_pred=y2_pred)

In [None]:
# Report the test accuracy of the default SVC on the digits data, rounded to two decimals
print(f"SVC default Accuracy on Digits Dataset: {accuracy_default_svc_digits:.2f}")

In [None]:
# Search grid: 3 values of C x 2 kernels
param_grid = dict(
    C=[0.1, 1, 10],
    kernel=['linear', 'rbf'],
)

# Base SVC handed to the search
svc = SVC()

# Exhaustive search, 5-fold cross-validation, selection on accuracy
grid_search = GridSearchCV(svc, param_grid, scoring='accuracy', cv=5)
grid_search.fit(x2_train, y2_train)

# Winning configuration and the estimator refitted with it
best_params_svc, best_svc = grid_search.best_params_, grid_search.best_estimator_

# Test accuracy of the selected model
y2_pred_best_svc = best_svc.predict(x2_test)
accuracy_best_svc_digits = accuracy_score(y_true=y2_test, y_pred=y2_pred_best_svc)

In [None]:
# Selected hyper-parameters and test accuracy of the tuned SVC on the digits data
print(
    f"Best SVC Parameters: {best_params_svc}",
    f"Best SVC Accuracy on Digits Dataset: {accuracy_best_svc_digits:.2f}",
    sep="\n",
)

#### MLPClassifier

In [None]:
# MLP with default hyper-parameters on the digits data. The seed is fixed so
# that weight initialisation and mini-batch shuffling, and therefore the
# reported accuracy, are reproducible across runs.
mlp = MLPClassifier(random_state=42)
mlp.fit(x2_train, y2_train)

# Make predictions on the test data
y2_pred = mlp.predict(x2_test)

# Calculate accuracy on the test data
accuracy_default_mlp_digits = accuracy_score(y2_test, y2_pred)

In [None]:
# Report the test accuracy of the default MLP on the digits data, rounded to two decimals
print(f"MLP default Accuracy on Digits Dataset: {accuracy_default_mlp_digits:.2f}")

In [None]:
# Define the parameter distribution to search (2 x 2 x 2 x 1 = 8 combinations)
param_dist = {
    'hidden_layer_sizes': [(100,), (50, 25)],
    'activation': ['logistic', 'tanh'],
    'alpha': [0.0001, 0.001],
    'max_iter': [2000],
}

# Create an MLP classifier: early stopping holds out 10% of the training data
# and stops after 10 iterations without improvement
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=2000, early_stopping=True, validation_fraction=0.1, n_iter_no_change=10, random_state=42)

# Create a RandomizedSearchCV instance
random_search = RandomizedSearchCV(estimator=mlp, param_distributions=param_dist,
                                   n_iter=8, scoring='accuracy', cv=5, random_state=42)

# Fit the MLP random search to the training data.
# NOTE: this must be `random_search`; `grid_search` still refers to the SVC
# search of the previous section, so using it here would re-tune the SVC and
# report its results under the MLP names.
random_search.fit(x2_train, y2_train)

# Get the best parameters and the corresponding model
best_params_mlp = random_search.best_params_
best_mlp = random_search.best_estimator_

# Make predictions on the test data with the best model
y2_pred_best_mlp = best_mlp.predict(x2_test)

# Calculate accuracy on the test data
accuracy_best_mlp_digits = accuracy_score(y2_test, y2_pred_best_mlp)

In [None]:
# Selected hyper-parameters and test accuracy of the tuned MLP on the digits data
print(
    f"Best MLP Parameters: {best_params_mlp}",
    f"Best MLP Accuracy on Digits Dataset: {accuracy_best_mlp_digits:.2f}",
    sep="\n",
)

### 3.2 Interpreting the classifier


## Bonus: Convolutional Neural network (CNN)
