KNN Classifier Model

In [13]:
# Libraries and modules

import os
from pathlib import Path
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt


# Configuration and feature extraction

RANDOM_STATE = 42                                   # Seed handed to the train/test split

# Project root: step one level up whenever the working-directory path contains 'notebooks'
BASE_DIR = Path.cwd().parent if 'notebooks' in str(Path.cwd()) else Path.cwd()
DATA_DIR = BASE_DIR.joinpath('data')
# NOTE: despite the _DIR suffix this is the path of the features CSV file itself
FEATURES_DIR = DATA_DIR.joinpath('features', 'birdcall_features_song.csv')

def extract_features(features_path):
    """Load the pre-computed feature table from a CSV file.

    Args:
        features_path: Location of the CSV file (anything ``pd.read_csv`` accepts).

    Returns:
        The DataFrame produced by ``pd.read_csv`` with its default settings.
    """
    return pd.read_csv(features_path)

In [14]:
# Preprocessing and scaling

def preprocess_data(df):
    """Build the standardized feature matrix and the target vector.

    Args:
        df: DataFrame holding a 'species' column (the target) plus the
            feature columns. Only numeric feature columns are used.

    Returns:
        Tuple ``(X_scaled, y, scaler)``:
            X_scaled: result of ``StandardScaler.fit_transform`` on the numeric features.
            y: the 'species' column of ``df``.
            scaler: the fitted ``StandardScaler`` instance.

    Raises:
        KeyError: if ``df`` has no 'species' column.
    """
    y = df['species']                               # Target variable

    # Remove the target BEFORE picking numeric columns: if 'species' is stored
    # as integer class ids it would otherwise be selected as a feature and
    # leak the label into X.
    X = df.drop(columns=['species']).select_dtypes(include=[np.number])

    # NOTE(review): the scaler is fit on every row passed in. When this runs
    # before the train/test split, test rows contribute to the mean/std used
    # for scaling. Consider fitting on the training rows only (e.g. inside a
    # sklearn Pipeline) -- not changed here to keep the return contract.
    scaler = StandardScaler()                       # Standardize features
    X_scaled = scaler.fit_transform(X)

    return X_scaled, y, scaler


# Data split --> stratified train and test sets

def split_data(X, y):
    """Split features and labels into train and test partitions.

    30% of the rows go to the test partition, class proportions of ``y`` are
    preserved (stratified), and the shuffle is seeded with ``RANDOM_STATE``.

    Returns:
        Whatever ``train_test_split`` returns for two inputs:
        ``X_train, X_test, y_train, y_test``.
    """
    split_options = {
        'test_size': 0.30,
        'stratify': y,
        'random_state': RANDOM_STATE,
    }
    return train_test_split(X, y, **split_options)

In [None]:
# Train KNN Classifier --> tune hyperparameters

def train_knn(X_train, y_train):
    """Tune and fit a KNN classifier with a cross-validated grid search.

    Searches ``n_neighbors`` in 50..200 and both weighting schemes using
    5-fold cross-validation scored by accuracy. The neighbor range is capped
    so that no candidate exceeds the number of rows available in a CV training
    fold; otherwise such candidates fail at predict time with
    "Expected n_neighbors <= n_samples", yield NaN scores, and the reported
    "best" parameters are meaningless.

    Args:
        X_train: Training feature matrix (must support ``len``).
        y_train: Training labels.

    Returns:
        The best estimator found, refit on the full training data.

    Raises:
        ValueError: if the training set is too small to leave at least one
            row in a CV training fold.
    """
    n_splits = 5
    k_low, k_high = 50, 200

    n_samples = len(X_train)
    # A K-fold test fold holds at most ceil(n_samples / n_splits) rows, so every
    # training fold has at least this many rows available as neighbors.
    largest_test_fold = -(-n_samples // n_splits)
    max_k = n_samples - largest_test_fold
    if max_k < 1:
        raise ValueError(
            f"Training set too small for {n_splits}-fold KNN grid search: n_samples = {n_samples}"
        )

    # Shrink the search window only when the data cannot support it;
    # with enough samples the grid is exactly 50..200.
    k_high = min(k_high, max_k)
    k_low = min(k_low, k_high)

    param_grid = {
        'n_neighbors': list(range(k_low, k_high + 1)),  # Number of neighbors
        'weights': ['uniform', 'distance'],             # Weight function
    }

    grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=n_splits, scoring='accuracy')
    grid.fit(X_train, y_train)

    print(f"Best parameters: {grid.best_params_}")
    print(pd.DataFrame(grid.cv_results_)[['params', 'mean_test_score']])
    print(f"Best score: {grid.best_score_}")

    return grid.best_estimator_


# Evaluate Model

def evaluate_model(model, X_test, y_test):
    """Report test-set performance of a fitted classifier.

    Prints the classification report for the model's predictions on
    ``X_test`` and shows a confusion-matrix plot whose rows/columns are the
    unique labels found in ``y_test``.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test feature matrix.
        y_test: True test labels.
    """
    predictions = model.predict(X_test)

    print("\nClassification Report:")
    print(classification_report(y_test, predictions))

    # Same label set drives both the matrix ordering and the tick labels
    class_labels = np.unique(y_test)
    matrix = confusion_matrix(y_test, predictions, labels=class_labels)

    display = ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=class_labels)
    display.plot(xticks_rotation=45, cmap='Blues')
    plt.title("Confusion Matrix")
    plt.tight_layout()
    plt.show()

In [23]:
# Main function

def main():
    """Run the whole pipeline: load, preprocess, split, tune, evaluate.

    Reads the features CSV at ``FEATURES_DIR``, trains the tuned KNN model on
    the training partition, prints/plots the evaluation on the test partition,
    and finally prints train and test accuracy.
    """
    # The fitted scaler returned by preprocess_data is not needed here
    X_scaled, y, _ = preprocess_data(extract_features(FEATURES_DIR))

    partitions = split_data(X_scaled, y)
    X_train, X_test, y_train, y_test = partitions

    model = train_knn(X_train, y_train)
    evaluate_model(model, X_test, y_test)

    # Comparing both scores gives a quick over-/under-fitting check
    print(f"Train accuracy: {model.score(X_train, y_train):.2f}")
    print(f"Test accuracy: {model.score(X_test, y_test):.2f}")

# Entry point: run the full pipeline when this code executes as the main module
if __name__ == "__main__":
    main()

Best parameters: {'n_neighbors': 10001, 'weights': 'uniform'}
                                          params  mean_test_score
0   {'n_neighbors': 10001, 'weights': 'uniform'}              NaN
1  {'n_neighbors': 10001, 'weights': 'distance'}              NaN
Best score: nan


Traceback (most recent call last):
  File "c:\UPF\Taller de Tecnologia Musical\Birdify\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\UPF\Taller de Tecnologia Musical\Birdify\.venv\Lib\site-packages\sklearn\metrics\_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\UPF\Taller de Tecnologia Musical\Birdify\.venv\Lib\site-packages\sklearn\metrics\_scorer.py", line 353, in _score
    y_pred = method_caller(estimator, "predict", X)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\UPF\Taller de Tecnologia Musical\Birdify\.venv\Lib\site-packages\sklearn\metrics\_scorer.py", line 86, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  

ValueError: Expected n_neighbors <= n_samples,  but n_samples = 1750, n_neighbors = 10001