# AIML 231 Assignment One
> Shemaiah Rangitaawa `300601546`

## Contents
- #### [Classification with SKLearn](#part-one)
- #### [Accuracy by Hyperparameter Plots](#task-i)
- #### [Summary Tables](#task-ii)
- #### [ROC Curve](#task-iii)

## Install Requirements
> I had some issues with the latest version of kaleido; therefore, version 0.1.0 must be used.

In [None]:
# For training progress monitoring
!pip install tqdm

# For interactive graphs
!pip install plotly

# For plotly static image export
!conda install python-kaleido==0.1.0

## Imports

In [None]:
# Standard libraries
import os
import random

# Data manipulation and analysis
import numpy as np
import pandas as pd

# Visualization library - Plotly
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Scikit-learn
from sklearn import datasets, metrics
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.exceptions import ConvergenceWarning

# To suppress warnings
from sklearn.utils._testing import ignore_warnings

# Progress bar utility
from tqdm.notebook import tqdm

# NumPy print options
np.set_printoptions(precision=3)  # limit precision when printing arrays

## Load Data

In [None]:
# Dataset name -> location of its CSV file
csv_files = {
    "ionosphere": "data/ionosphere_new.csv",
    "steelplates": "data/steelplates_new.csv",
    "banknotes": "data/banknotes_new.csv",
}

# Dataset name -> (feature DataFrame, label Series); only files found on disk are loaded.
# NOTE: this name shadows the `datasets` module imported from sklearn above.
datasets = {}
for name, path in csv_files.items():
    if not os.path.exists(path):
        print(f"File {path} not found.")
        continue
    df = pd.read_csv(path)
    # The last column is taken as the label; every other column is a feature
    X, y = df.iloc[:, :-1], df.iloc[:, -1]
    datasets[name] = (X, y)

## Initialize Number of Trials

In [None]:
# Number of random train/test splits that test() evaluates for each dataset
nTrials = 50

# Classification using the SKLearn library <a class="anchor" id="part-one"></a>
> Refactored to use Plotly for interactive graphs and tqdm for progress bars.
> 
> Score results are saved to `data/pdframe/NameOfDataset_Classifier_Hyperparameter.csv` 

In [None]:
@ignore_warnings(category=ConvergenceWarning)
def test(dataname, classifier, controlName, controlOptions):
    """
    Score a classifier over repeated random splits for each value of one hyperparameter

    Reads the module-level `datasets` dictionary and `nTrials` counter. The
    results are also written to
    `data/pdframe/<dataname>_<classifier name>_<controlName>.csv`.

    Parameters:
    - dataname: Key of the dataset in `datasets`
    - classifier: The classifier class (not an instance) to be tested
    - controlName: The name of the constructor keyword (hyperparameter) to vary
    - controlOptions: The values to try for that hyperparameter

    Returns:
    - A DataFrame with one row per (trial, option) and the columns
      'Option', 'Score' and 'Trial'. For 'alpha' the 'Option' column holds
      log10 of the value; for 'kernel' it holds the value as a string.
    """
    # Fixed seeds so every call draws the same sequence of splits
    random.seed(100)
    np.random.seed(100)

    # Make sure the output directory exists before any work is done
    out_file = f"data/pdframe/{dataname}_{classifier.__name__}_{controlName}.csv"
    os.makedirs(os.path.dirname(out_file), exist_ok=True)

    # NOTE(review): the scaler is fit on all rows before splitting, so the
    # test half influences the scaling - confirm this is intended.
    features, labels = datasets[dataname]
    features = StandardScaler().fit_transform(features)

    records = []
    for trial in tqdm(range(nTrials), desc=f"{str(dataname).capitalize()}"):

        # One 50:50 split per trial, shared by every hyperparameter option
        X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.5)

        for option in controlOptions:
            # Build a fresh model with only the varied hyperparameter set
            model = classifier(**{controlName: option})
            model.fit(X_train, y_train)  # Train model
            records.append({
                'Option': option,
                'Score': model.score(X_test, y_test),  # Evaluate on the held-out half
                'Trial': trial,
            })

    results = pd.DataFrame(records)

    # Put 'alpha' on a log scale and make kernel names plain strings
    if controlName == 'alpha':
        results['Option'] = np.log10(results['Option'])
    elif controlName == 'kernel':
        results['Option'] = results['Option'].astype(str)

    results.to_csv(out_file, index=False)
    return results

    

def test_several_datasets(classifier, controlName, options,
                          datasets_to_test=("banknotes", "steelplates", "ionosphere")):
    """
    Test a classifier on several datasets and visualize the results

    Draws one subplot per dataset, each holding one box plot per
    hyperparameter option, shows the figure, and saves it to
    `plots/<classifier name>.pdf`.

    Parameters:
    - classifier: The classifier class to be tested
    - controlName: The name of the hyperparameter to vary
    - options: A list of options/values for the hyperparameter
    - datasets_to_test: Names of the datasets to evaluate, one subplot each,
                        in the given order. Defaults to the three assignment datasets

    Raises:
    - ValueError: if `datasets_to_test` is empty
    """
    dataset_names = list(datasets_to_test)
    if not dataset_names:
        raise ValueError("datasets_to_test must contain at least one dataset name")

    # Column count and subplot titles come from the same list that drives the
    # loop, so the titles can never disagree with the plotted data
    fig = make_subplots(rows=1, cols=len(dataset_names), subplot_titles=dataset_names)

    # Test each dataset
    for col, dataset in enumerate(dataset_names, start=1):
        scores_df = test(dataset, classifier, controlName, options)

        # Add a box plot for each option's scores to the subplot
        for option in scores_df['Option'].unique():
            option_scores = scores_df[scores_df['Option'] == option]['Score']
            fig.add_trace(
                go.Box(y=option_scores, name=str(option), showlegend=False),
                row=1, col=col
            )

        # Label the x-axis of this subplot with the hyperparameter name
        fig.update_xaxes(title=controlName, row=1, col=col)

    # Only the left-most subplot carries the y-axis title
    fig.update_yaxes(title="Accuracy", row=1, col=1)

    # Update layout and display the figure
    fig.update_layout(height=500, width=1100, title_text=f"Performance of {classifier.__name__}", showlegend=False)
    fig.show()

    os.makedirs("plots", exist_ok=True)

    # Saving to PDF for vector format
    file_name = f"plots/{classifier.__name__}.pdf"
    fig.write_image(file_name)
    print(f"Figure saved to {file_name}")

# Plot Generation <a class="anchor" id="task-i"></a>

> Results are organized to show the accuracy and hyperparameter settings of the six classifiers, across the three different datasets.

## KNN

In [None]:
# Vary the number of neighbours from 1 to 5
test_several_datasets(KNeighborsClassifier, "n_neighbors", range(1, 6))

## Logistic Regression

In [None]:
# Vary the C parameter of logistic regression
test_several_datasets(LogisticRegression, "C", [0.1, 0.5, 1.0, 2.0, 5.0])

## Decision Tree Classifier

In [None]:
# Vary the maximum tree depth from 1 to 10
test_several_datasets(DecisionTreeClassifier, "max_depth", range(1, 11))

## Random Forest Classifier

In [None]:
# Vary the maximum depth of the forest's trees from 1 to 10
test_several_datasets(RandomForestClassifier, "max_depth", range(1, 11))

## MLPClassifier

In [None]:
# Vary alpha over several orders of magnitude (test() plots it on a log10 scale)
test_several_datasets(MLPClassifier, "alpha", [1e-5, 1e-3, 0.1, 10.0])

## Support Vector Classification (SVC)

In [None]:
# Compare the four SVC kernels
test_several_datasets(SVC, "kernel", ['linear', 'poly', 'rbf', 'sigmoid'])

# Summary Tables <a class="anchor" id="task-ii"></a>
> `create_summary_tables()` reads the CSV files saved by the hyperparameter tests above and builds two DataFrames: the lowest mean test error for each classifier on each dataset, and the hyperparameter value that achieved it.

In [None]:
def create_summary_tables(datasets, classifier_hyperparams, base_path="data/pdframe"):
    """
    Generate summary tables showing the lowest mean test errors and corresponding
    hyperparameters for various classifiers across different datasets.

    Scores are read from `<base_path>/<dataset>_<classifier>_<hyperparam>.csv`,
    which must have the columns 'Option' and 'Score'. Test error is computed as
    1 - Score and averaged per option. A file that is missing, or that contains
    no usable scores, is reported with a printed message and its table cell is
    left as NaN. Both tables are also saved as CSV under
    `<base_path>/summary_tables/`.

    Parameters:
    - datasets: A list of dataset names to be analyzed
    - classifier_hyperparams: A dictionary where keys are classifier names and values are the names of the hyperparameters tested
    - base_path: The base path where the CSV files with test scores are stored. Defaults to "data/pdframe"

    Returns:
    - A tuple of two DataFrames: (lowest_mean_errors, corresponding_hyperparams).
      - lowest_mean_errors: DataFrame with classifiers as rows, datasets as columns, and the lowest mean test error as values
      - corresponding_hyperparams: DataFrame with classifiers as rows, datasets as columns, and the 'Option' values
                                   (exactly as stored in the CSV) that led to the lowest mean test error
    """
    # Initialize classifiers list and DataFrames for storing summary results
    classifiers = list(classifier_hyperparams.keys())
    lowest_mean_errors = pd.DataFrame(columns=datasets, index=classifiers)
    corresponding_hyperparams = pd.DataFrame(columns=datasets, index=classifiers)

    for dataset in datasets:
        for classifier, hyperparam in classifier_hyperparams.items():

            # Construct the file path for the CSV file containing test scores
            file_path = f"{base_path}/{dataset}_{classifier}_{hyperparam}.csv"

            # A missing file is reported and skipped; its cells stay NaN
            if not os.path.isfile(file_path):
                print(f"File not found: {file_path}")
                continue

            scores_df = pd.read_csv(file_path)

            # Mean test error (1 - score) for each hyperparameter option.
            # Options whose mean is NaN are dropped; idxmin()/min() would
            # ignore them anyway.
            mean_errors = pd.Series(dtype=float)
            if not scores_df.empty:
                test_errors = 1 - scores_df['Score']
                mean_errors = test_errors.groupby(scores_df['Option']).mean().dropna()

            # idxmin() raises on an empty Series, so a file without usable
            # scores is skipped the same way as a missing file
            if mean_errors.empty:
                print(f"No scores found in: {file_path}")
                continue

            # Record the best option and its mean error for this classifier/dataset
            lowest_mean_errors.at[classifier, dataset] = mean_errors.min()
            corresponding_hyperparams.at[classifier, dataset] = mean_errors.idxmin()

    os.makedirs(f"{base_path}/summary_tables", exist_ok=True)

    # Save the tables
    lowest_mean_errors.to_csv(f"{base_path}/summary_tables/lowest_mean_errors.csv", index=True)
    corresponding_hyperparams.to_csv(f"{base_path}/summary_tables/corresponding_hyperparams.csv", index=True)

    # Return the summary tables
    return lowest_mean_errors, corresponding_hyperparams

### Initialize Hyperparameter Map and Generate Summary Tables

In [None]:
# Classifier class name -> hyperparameter that was varied for it.
# Together with the dataset name, these select the score CSV to read.
classifier_hyperparams = dict(
    DecisionTreeClassifier="max_depth",
    KNeighborsClassifier="n_neighbors",
    LogisticRegression="C",
    MLPClassifier="alpha",
    RandomForestClassifier="max_depth",
    SVC="kernel",
)

# Summarise every dataset that was loaded into `datasets`
lowest_mean_errors_table, corresponding_hyperparams_table = create_summary_tables(
    list(datasets),
    classifier_hyperparams,
)

## Lowest Mean Test Errors

In [None]:
# Display the lowest mean test error per classifier (rows) and dataset (columns)
lowest_mean_errors_table

## Hyperparameter Values for Lowest Mean Test Errors

In [None]:
# Display the hyperparameter option that gave each lowest mean test error
corresponding_hyperparams_table

# ROC Curve <a class="anchor" id="task-iii"></a>

In [None]:
import random
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import RocCurveDisplay

# Seed the global RNGs; the forest below is created without its own random_state
random.seed(100)
np.random.seed(100)

X, y = datasets["ionosphere"]

# Standardize the feature values.
# NOTE(review): the scaler is fit on all rows before the split, so the test
# rows influence the scaling - confirm this is intended.
scaler = StandardScaler()
scaler.fit(X)
X_std = scaler.transform(X)

# 70:30 train-test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X_std, y, test_size=0.3, random_state=100
)

# Train a random forest limited to depth 5
clf = RandomForestClassifier(max_depth=5)
clf.fit(X_train, y_train)

# Generate and display the ROC curve on the held-out 30%
roc_disp = RocCurveDisplay.from_estimator(clf, X_test, y_test)

In [None]:
# Train an RBF-kernel SVC on the same split as the previous cell
clf = SVC(kernel='rbf')
clf.fit(X_train, y_train)

# Generate and display the ROC curve on the held-out test set
roc_disp = RocCurveDisplay.from_estimator(clf, X_test, y_test)