In [None]:
# Sonar Signal Classification using K-Nearest Neighbors
# This project classifies sonar signals to distinguish between rocks and metal objects

# Import necessary libraries
import pandas as pd          # For data manipulation and analysis
import numpy as np          # For numerical computations
import matplotlib.pyplot as plt  # For plotting and visualization
import seaborn as sb        # For statistical data visualization

In [None]:
# Read the sonar dataset from the CSV file into a DataFrame
DATA_FILE = 'sonar.all-data.csv'
df = pd.read_csv(DATA_FILE)

In [None]:
# Preview the first 5 rows to get a feel for the columns and values
df.head(n=5)

In [None]:
# Count how many samples fall in each class (R: Rock, M: Metal)
label_column = df['Label']
label_column.value_counts()

In [None]:
# Encode the string labels as a binary target: Rock -> 0, Metal -> 1
label_to_target = {'R': 0, 'M': 1}
df['target'] = df['Label'].map(label_to_target)

In [None]:
# Preview the frame again to confirm the new 'target' column lines up with 'Label'
df.head(n=5)

In [None]:
# Count missing values per column
df.isna().sum()

In [None]:
# Build the model inputs:
#   X - every column except the two label columns ('target' and 'Label')
#   y - the binary 'target' column
label_columns = ['target', 'Label']
X = df.drop(columns=label_columns)
y = df['target']

In [None]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing (test_size=0.2) and train on the
# remaining 80%; random_state is fixed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Import required modules for preprocessing and model building
from sklearn.preprocessing import StandardScaler  # For feature scaling
from sklearn.pipeline import Pipeline            # For creating ML pipelines
from sklearn.neighbors import KNeighborsClassifier  # KNN classifier
from sklearn.model_selection import GridSearchCV   # For hyperparameter tuning

In [None]:
# Assemble the two-step pipeline:
#   1. 'scaler' - StandardScaler standardizes each feature (KNN is distance-based,
#      so features need comparable scales)
#   2. 'knn'    - KNeighborsClassifier with its default settings
scaler_step = ('scaler', StandardScaler())
knn_step = ('knn', KNeighborsClassifier())
operations = [scaler_step, knn_step]
pipe = Pipeline(steps=operations)

In [None]:
# Candidate neighbor counts to evaluate: every integer k from 1 through 30
k_values = [*range(1, 31)]

In [None]:
# Parameter grid for GridSearchCV.
# The 'knn__' prefix routes n_neighbors to the pipeline step named 'knn'.
param_grid = dict(knn__n_neighbors=k_values)

In [None]:
# Configure the grid search over the pipeline:
#   cv=5               - 5-fold cross-validation for each candidate
#   scoring='accuracy' - candidates are ranked by accuracy
model = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)

In [None]:
# Run the grid search on the training split (fits every candidate in param_grid)
model.fit(X=X_train, y=y_train)

In [None]:
# Predict the held-out test samples with the fitted grid-search model
y_pred = model.predict(X=X_test)

In [None]:
# Look up the winning n_neighbors value selected by the grid search
best_k = model.best_params_['knn__n_neighbors']

In [None]:
# Import metrics for model evaluation
from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay, classification_report

In [None]:
# Report the k chosen by the grid search
best_k_message = f'Best k estimated by Grid Search Cross Validation is {best_k}.\n'
print(best_k_message)

In [None]:
# Per-class precision, recall and f1-score for the tuned model on the test set
tuned_report = classification_report(y_test, y_pred)
print(f'Classification report:\n\n{tuned_report}')

In [None]:
# Error rate = share of misclassified test samples, expressed as a percentage.
# Scale to percent *before* rounding so the two decimal places survive; rounding
# the fraction to 2 places first would collapse the result to a whole percent.
err = round(float(1 - accuracy_score(y_test, y_pred)) * 100, 2)
print(f'Error:\n\n{err:.2f}%')

In [None]:
# Plot the confusion matrix for the tuned model on the test set.
# from_estimator draws the figure and returns a display object; formatting that
# object into a string only yields its repr, so print just the heading and let
# matplotlib render the plot.
print('Confusion matrix:\n')
tuned_cm = ConfusionMatrixDisplay.from_estimator(model, X_test, y_test)
tuned_cm.ax_.set_title('Tuned KNN pipeline (test set)')
plt.show()

In [None]:
# Inspect every parameter of the refitted best pipeline
best_pipeline = model.best_estimator_
best_pipeline.get_params()

In [None]:
# Tabulate the cross-validation results and preview the first 5 rows
cv_results = pd.DataFrame(model.cv_results_)
cv_results.head(n=5)

In [None]:
# Plot mean cross-validation accuracy against k.
# The x-axis uses the actual n_neighbors values from cv_results_ rather than the
# DataFrame's positional index (which starts at 0 and would be off by one from k).
cv_scores = pd.DataFrame(model.cv_results_)
fig, ax = plt.subplots()
ax.plot(
    cv_scores['param_knn__n_neighbors'].astype(int),
    cv_scores['mean_test_score'],
    marker='o',
)
ax.set(
    xlabel='k (n_neighbors)',
    ylabel='Mean cross-validation accuracy',
    title='Cross-validation accuracy vs. k',
)
plt.show()

In [None]:
# Banner introducing the baseline run: KNN trained with a fixed k=5
separator = "=" * 50
print("\n" + separator)
print("COMPARISON: KNN with fixed k=5")
print(separator)

In [None]:
# Baseline for comparison: plain KNN with k=5, fitted directly on the raw
# (unscaled) training features; fit() returns the estimator itself
model_fixed = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
model_fixed

In [None]:
# Predict the held-out test samples with the fixed-k baseline
y_pred_fixed = model_fixed.predict(X=X_test)

In [None]:
# Per-class precision, recall and f1-score for the fixed-k baseline on the test set
fixed_report = classification_report(y_test, y_pred_fixed)
print(f'Classification report:\n\n{fixed_report}')

In [None]:
# Error rate of the fixed-k baseline, expressed as a percentage.
# Scale to percent *before* rounding so the two decimal places survive; rounding
# the fraction to 2 places first would collapse the result to a whole percent.
err_fixed = round(float(1 - accuracy_score(y_test, y_pred_fixed)) * 100, 2)
print(f'Error:\n\n{err_fixed:.2f}%')

In [None]:
# Plot the confusion matrix for the fixed-k baseline on the test set.
# from_estimator draws the figure and returns a display object; formatting that
# object into a string only yields its repr, so print just the heading and let
# matplotlib render the plot.
print('Confusion matrix:\n')
fixed_cm = ConfusionMatrixDisplay.from_estimator(model_fixed, X_test, y_test)
fixed_cm.ax_.set_title('Fixed k=5 KNN without scaling (test set)')
plt.show()

# Key takeaways:
# 1. StandardScaler is crucial for KNN as it's distance-based
# 2. GridSearchCV helps find optimal hyperparameters
# 3. Cross-validation provides robust model evaluation
# 4. The baseline differs from the tuned pipeline in both scaling and k, so the comparison
#    shows their combined effect; isolating either one needs a run that changes only that factor