In [31]:
import numpy as np
from collections import Counter

class SimpleKNNClassifier:
    """Minimal k-nearest-neighbours classifier using Euclidean distance.

    ``fit`` memorises the training set; ``predict`` labels every query sample
    with the most frequent class among its ``k`` closest training samples.

    Parameters
    ----------
    k : int, default 3
        Number of neighbours that vote on each prediction. If ``k`` exceeds
        the number of training samples, all training samples vote.
    """

    def __init__(self, k=3):
        self.k = k
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        """Store the training samples and their labels.

        Parameters
        ----------
        X : array-like
            Training samples, one per entry along the first axis.
        y : array-like
            Class labels, one per training sample.

        Raises
        ------
        ValueError
            If ``X`` is empty or ``X`` and ``y`` differ in length.
        """
        # Coerce to float so that list inputs work and unsigned-integer
        # features cannot wrap around when differences are squared.
        X = np.asarray(X, dtype=float)
        # Coerce labels to an ndarray so lookups are always positional; a
        # pandas Series would otherwise be indexed by label in ``predict``.
        y = np.asarray(y)

        if len(X) == 0:
            raise ValueError("Cannot fit on an empty training set")
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )

        self.X_train = X
        self.y_train = y

    def euclidean_distance(self, x1, x2):
        """Return the Euclidean distance between two samples."""
        difference = np.asarray(x1, dtype=float) - np.asarray(x2, dtype=float)
        return np.sqrt(np.sum(difference ** 2))

    def predict(self, X):
        """Predict a class label for every sample in ``X``.

        Parameters
        ----------
        X : array-like
            Query samples, one per entry along the first axis.

        Returns
        -------
        numpy.ndarray
            One predicted label per query sample.

        Raises
        ------
        ValueError
            If the model has not been fitted, the training set is empty, or
            ``k`` is not a positive integer.
        """
        if self.X_train is None or self.y_train is None:
            raise ValueError("predict() called before fit()")
        if not isinstance(self.k, (int, np.integer)) or self.k < 1:
            raise ValueError(f"k must be a positive integer, got {self.k!r}")

        # Re-coerce in case the attributes were assigned directly, not via fit.
        X_train = np.asarray(self.X_train, dtype=float)
        y_train = np.asarray(self.y_train)
        n_train = len(X_train)
        if n_train == 0:
            raise ValueError("Cannot predict with an empty training set")

        X = np.asarray(X, dtype=float)

        predictions = []
        for sample in X:
            # Distance from this query sample to every training sample at once.
            difference = (X_train - sample).reshape(n_train, -1)
            distances = np.sqrt(np.sum(difference ** 2, axis=1))

            # A stable sort keeps equidistant neighbours in training order,
            # matching the behaviour of a stable list sort on (dist, label).
            nearest = np.argsort(distances, kind="stable")[:self.k]

            # Majority vote; on a tie the label seen first (closest) wins.
            votes = Counter(y_train[nearest])
            predictions.append(votes.most_common(1)[0][0])

        return np.array(predictions)

# Data preprocessing
def preprocess_data(data, features=None, target='species'):
    """Prepare a feature matrix and label vector for classification.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw table containing the feature columns and the target column.
    features : list of str, optional
        Columns to use as features. Defaults to the four numeric measurement
        columns ``culmen_length_mm``, ``culmen_depth_mm``,
        ``flipper_length_mm`` and ``body_mass_g``.
    target : str, default 'species'
        Column holding the class labels.

    Returns
    -------
    X_normalized : numpy.ndarray
        Features standardised column-wise to zero mean and unit standard
        deviation. Constant columns become all zeros.
    y : numpy.ndarray
        Labels for the rows that were kept.

    Notes
    -----
    A row is dropped if *any* column is missing, not only the selected ones.
    The mean and standard deviation are computed over every remaining row, so
    a train/test split made afterwards shares the same scaling statistics.
    """
    if features is None:
        features = ['culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'body_mass_g']
    else:
        features = list(features)

    # Remove rows with missing values
    clean_data = data.dropna()

    X = clean_data[features].to_numpy(dtype=float)
    y = clean_data[target].values

    # Normalize features. A constant column has std == 0, which would turn the
    # whole column into NaN (0 / 0); divide such columns by 1 instead.
    column_mean = X.mean(axis=0)
    column_std = X.std(axis=0)
    safe_std = np.where(column_std == 0, 1.0, column_std)
    X_normalized = (X - column_mean) / safe_std

    return X_normalized, y

# Model evaluation on a random train/test split
def evaluate_model(X, y, test_size=0.2, k=3, random_state=None):
    """Train a k-NN classifier on a random split and report test accuracy.

    Parameters
    ----------
    X : array-like
        Feature matrix, one sample per row.
    y : array-like
        Class labels, one per sample.
    test_size : float, default 0.2
        Fraction of the samples held out for testing; the count is
        ``int(len(X) * test_size)``.
    k : int, default 3
        Number of neighbours used by the classifier.
    random_state : int or None, default None
        Seed for the shuffle. ``None`` draws from NumPy's global random state
        (so ``np.random.seed`` still controls it); an integer makes the split
        reproducible independently of the global state.

    Returns
    -------
    accuracy : float
        Fraction of held-out samples that were classified correctly.
    model : SimpleKNNClassifier
        The classifier fitted on the training portion.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length, or if the split would leave the
        training set or the test set empty.
    """
    # Positional ndarray indexing; a pandas Series would be indexed by label.
    X = np.asarray(X)
    y = np.asarray(y)
    n_samples = len(X)
    if len(y) != n_samples:
        raise ValueError(
            f"X and y must have the same length, got {n_samples} and {len(y)}"
        )

    # Randomly shuffle data
    if random_state is None:
        indices = np.random.permutation(n_samples)
    else:
        indices = np.random.default_rng(random_state).permutation(n_samples)

    n_test = int(n_samples * test_size)
    if n_test < 1 or n_test >= n_samples:
        raise ValueError(
            f"test_size={test_size!r} yields {n_test} test sample(s) out of "
            f"{n_samples}; both the train and the test split must be non-empty"
        )

    # Split into train and test sets
    test_indices = indices[:n_test]
    train_indices = indices[n_test:]

    X_train, y_train = X[train_indices], y[train_indices]
    X_test, y_test = X[test_indices], y[test_indices]

    # Train and evaluate model
    model = SimpleKNNClassifier(k=k)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Calculate accuracy
    accuracy = np.mean(predictions == y_test)

    return accuracy, model

In [32]:
import pandas as pd

# Read and preprocess data
data = pd.read_csv('penguins.csv')
X_normalized, y = preprocess_data(data)

# Seed NumPy's global RNG so the random train/test split, and therefore the
# reported accuracy, is the same on every Restart & Run All.
np.random.seed(42)

# Train and evaluate model
accuracy, model = evaluate_model(X_normalized, y)
print(f"Model accuracy: {accuracy:.2f}")

Model accuracy: 1.00
