In [41]:
import pandas as pd
import numpy as np

In [76]:
#Problem 1
#1 Load the data

df = pd.read_csv("diabetes.csv")

# Preview of the first rows of the dataset
print(f"Dataset:\n {df.head()} \n")

# Shape is reported as (rows, columns)
print(f"The shape of dataset is: {df.shape}\n")

# Per-column count of null entries
print(f"The null values are:\n{df.isna().sum()}\n")

# Summary statistics, printed on the line after the heading
print("Summary Statistics:", df.describe(), sep="\n")

Dataset:
    Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1   

The shape of dataset is: (768, 9)

The null values are:
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                      

In [51]:
#2 Handle missing data
# Zeros in these columns are treated as placeholders for missing measurements,
# so they are first converted to NaN to make them visible to pandas.
missing_data = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
df[missing_data] = df[missing_data].replace(0, np.nan)

# Fill the gaps with each column's median so that no NaN reaches the
# distance computations later on (a single NaN feature makes the whole
# Euclidean distance NaN). Re-running this cell is harmless: once imputed,
# the columns contain neither zeros nor NaN.
# NOTE(review): the median is computed over the full dataset, before the
# train/test split — confirm this is acceptable for the assignment.
df[missing_data] = df[missing_data].fillna(df[missing_data].median())

# Verify: every column should now report 0 nulls.
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [56]:
#3 Feature Engineering
# Target column and feature matrix (every column except the target).
y = df["Outcome"]
x = df.drop(columns="Outcome")

# Fixed seed so the shuffle (and therefore the split) is reproducible.
np.random.seed(42)
shuffled_indices = np.random.permutation(len(df))

# 70% of the rows go to training, the remainder to testing.
train_size = int(0.7 * len(df))
train_indices, test_indices = shuffled_indices[:train_size], shuffled_indices[train_size:]

# Positional selection, because the indices are row positions rather than labels.
x_train, x_test = x.iloc[train_indices], x.iloc[test_indices]
y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]

print("Training set size:", x_train.shape)
print("Testing set size:", x_test.shape)

Training set size: (537, 8)
Testing set size: (231, 8)


In [81]:
#4 Implement KNN

# Ultra-simple KNN for quick understanding
import numpy as np

# Simple KNN in one function
def simple_knn(x_train, y_train, x_test, k=3):
    """Classify each row of ``x_test`` by majority vote of its k nearest training rows.

    Distances are Euclidean. Inputs may be numpy arrays, nested lists or pandas
    objects; they are converted to numpy arrays first, so rows and labels are
    always matched by position (never by a pandas index label).

    Args:
        x_train: 2-D array-like of training features, one row per sample.
        y_train: 1-D array-like of training labels, same length as ``x_train``.
        x_test: 2-D array-like of samples to classify, one row per sample.
        k: Number of nearest neighbours that vote. If it exceeds the number of
            training rows, all training rows vote.

    Returns:
        numpy.ndarray with one predicted label per row of ``x_test``. On a tied
        vote, the tied label whose first occurrence is closest to the test row wins.

    Raises:
        ValueError: if ``k`` is smaller than 1, the training set is empty, or
            ``x_train`` and ``y_train`` have different lengths.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    # Work on the function's own arguments (not on notebook globals) and on
    # plain arrays so positional indexing is well defined for pandas inputs.
    train_points = np.asarray(x_train, dtype=float)
    train_labels = np.asarray(y_train)
    test_points = np.asarray(x_test, dtype=float)

    if len(train_points) == 0:
        raise ValueError("x_train must contain at least one sample")
    if len(train_points) != len(train_labels):
        raise ValueError("x_train and y_train must have the same length")

    predictions = []

    for test_point in test_points:
        # Euclidean distance from this test row to every training row at once.
        distances = np.sqrt(np.sum((train_points - test_point) ** 2, axis=1))

        # Stable sort keeps equally distant neighbours in training order.
        nearest = np.argsort(distances, kind="stable")[:k]

        # Majority vote; dict insertion order makes the closest label win ties.
        votes = {}
        for label in train_labels[nearest]:
            votes[label] = votes.get(label, 0) + 1

        predictions.append(max(votes, key=votes.get))

    return np.array(predictions)

# Simple scaling
def simple_scale(x_train, x_test):
    """Standardize both sets using the training set's per-column mean and std.

    Args:
        x_train: Training features; must provide ``.mean(axis=0)`` and
            ``.std(axis=0)`` (e.g. a numpy array or pandas DataFrame).
        x_test: Test features with the same columns as ``x_train``.

    Returns:
        Tuple ``(x_train_scaled, x_test_scaled)``. Both are scaled with the
        training statistics only, so nothing is learned from the test set.
    """
    mean = x_train.mean(axis=0)
    std = x_train.std(axis=0)
    # Constant columns have std 0; divide by 1 instead so they map to 0
    # rather than NaN/inf. np.where avoids mutating the computed std in place.
    std = np.where(std == 0, 1, std)

    # Scale the arguments that were passed in, not notebook-level globals.
    x_train_scaled = (x_train - mean) / std
    x_test_scaled = (x_test - mean) / std

    return x_train_scaled, x_test_scaled

# Quick test
if __name__ == "__main__":
    # Toy data: two clusters on the diagonal, labelled 0 (low) and 1 (high).
    # NOTE: these are module-level names, so the y_train bound here replaces
    # any y_train assigned earlier in the same session.
    X_train = np.array([[v, v] for v in (1, 2, 3, 8, 9)])
    y_train = np.array([0] * 3 + [1] * 2)
    X_test = np.array([[v, v] for v in (1.5, 8.5)])

    # Classify the two probe points with their 2 nearest neighbours
    preds = simple_knn(X_train, y_train, X_test, k=2)
    print(f"Predictions: {preds}")  # Should be [0, 1]

Predictions: [0 1]


In [75]:
#Problem 2
#1 repeat the classification task

def standardize(X_train, X_test):
    """Z-score both sets with the training set's per-column mean and std.

    Columns whose training std is exactly 0 are divided by 1e-8 instead, so
    the division never hits zero.

    Returns:
        Tuple ``(X_train_scaled, X_test_scaled)``.
    """
    center = np.mean(X_train, axis=0)
    spread = np.std(X_train, axis=0)
    # Swap exact zeros for a tiny positive divisor
    safe_spread = np.where(spread == 0, 1e-8, spread)

    def _zscore(features):
        return (features - center) / safe_spread

    return _zscore(X_train), _zscore(X_test)

def normalize(X_train, X_test):
    """Min-max scale both sets using the training set's per-column min and max.

    Columns whose training range is exactly 0 are divided by 1.0 instead, so
    the division never hits zero.

    Returns:
        Tuple ``(X_train_scaled, X_test_scaled)``.
    """
    lowest = np.min(X_train, axis=0)
    span = np.max(X_train, axis=0) - lowest
    # Constant columns: fall back to a divisor of 1.0
    safe_span = np.where(span == 0, 1.0, span)

    def _rescale(features):
        return (features - lowest) / safe_span

    return _rescale(X_train), _rescale(X_test)

# Function to compare KNN performance with and without scaling
def compare_scaling_effects(X_train, y_train, X_test, y_test):
    """Print and collect KNN accuracy for raw, z-scored and min-max scaled features.

    For each k in [1, 3, 5] and each feature representation, predictions come
    from ``knn_predict`` and are scored with ``calculate_accuracy``; both are
    expected to be defined elsewhere in the notebook (not visible here).

    Returns:
        dict mapping ``'original_k<k>'``, ``'standardized_k<k>'`` and
        ``'normalized_k<k>'`` to the corresponding accuracy.
    """
    k_values = [1, 3, 5]
    results = {}

    def _evaluate(key_for, train_features, test_features):
        # Score every k on one feature representation, record and report it.
        for k in k_values:
            predicted = knn_predict(train_features, y_train, test_features, k)
            score = calculate_accuracy(y_test, predicted)
            results[key_for(k)] = score
            print(f"k={k}: Accuracy = {score:.2%}")

    # Test 1: features exactly as given
    print("\n1. ORIGINAL DATA (No Scaling):")
    print("-" * 30)
    _evaluate(lambda k: f'original_k{k}', X_train, X_test)

    # Test 2: z-score standardization (this section prints no separator line)
    print("\n2. STANDARDIZED DATA (Z-score):")
    X_train_std, X_test_std = standardize(X_train, X_test)
    _evaluate(lambda k: f'standardized_k{k}', X_train_std, X_test_std)

    # Test 3: min-max normalization
    print("\n3. NORMALIZED DATA (Min-Max):")
    print("-" * 30)
    X_train_norm, X_test_norm = normalize(X_train, X_test)
    _evaluate(lambda k: f'normalized_k{k}', X_train_norm, X_test_norm)

    return results

# Create a more realistic dataset to demonstrate scaling effects
def create_realistic_data():
    """Build a seeded 3-class, 2-feature toy dataset and split it 70/30.

    Feature 1 is drawn around 5 / 10 / 15 (std 1) and feature 2 around
    1000 / 2000 / 3000 (std 100) for classes 0 / 1 / 2, so the two features
    live on very different scales. Each class gets ``100 // 3`` samples.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)`` of numpy arrays.
    """
    np.random.seed(42)

    n_samples = 100
    per_class = n_samples // 3

    # (feature-1 mean, feature-2 mean) for classes 0, 1, 2. The draws happen
    # class by class, feature 1 before feature 2, which fixes the random stream.
    class_centers = [(5, 1000), (10, 2000), (15, 3000)]
    class_blocks = []
    for small_mean, large_mean in class_centers:
        small_feature = np.random.normal(small_mean, 1, per_class)
        large_feature = np.random.normal(large_mean, 100, per_class)
        class_blocks.append(np.column_stack([small_feature, large_feature]))

    # Stack the classes and build the matching label vector
    X = np.vstack(class_blocks)
    y = np.repeat(np.arange(len(class_centers)), per_class)

    # Shuffle features and labels with the same permutation
    order = np.random.permutation(len(X))
    X, y = X[order], y[order]

    # First 70% of the shuffled rows train, the rest test
    cut = int(0.7 * len(X))
    return X[:cut], y[:cut], X[cut:], y[cut:]

# Run comparison with realistic data
def main():
    """Generate the toy dataset, print its scale statistics, and run the scaling comparison."""
    X_train, y_train, X_test, y_test = create_realistic_data()
    first_feature = X_train[:, 0]
    second_feature = X_train[:, 1]

    print("\nDataset Statistics (Original):")
    print(f"X_train shape: {X_train.shape}")
    print(f"X_test shape: {X_test.shape}")

    print(f"\nFeature ranges in training data:")
    print(f"Feature 1: Min={first_feature.min():.2f}, Max={first_feature.max():.2f}")
    print(f"Feature 2: Min={second_feature.min():.2f}, Max={second_feature.max():.2f}")

    print(f"\nFeature standard deviations:")
    print(f"Feature 1: Std={first_feature.std():.2f}")
    print(f"Feature 2: Std={second_feature.std():.2f}")

    # Run comparison; the returned accuracy dict is not used here
    compare_scaling_effects(X_train, y_train, X_test, y_test)
# Run the demo: prints the training-set statistics, then the accuracy comparison.
main()


Dataset Statistics (Original):
X_train shape: (69, 2)
X_test shape: (30, 2)

Feature ranges in training data:
Feature 1: Min=3.28, Max=16.87
Feature 2: Min=804.03, Max=3272.02

Feature standard deviations:
Feature 1: Std=4.15
Feature 2: Std=836.34

1. ORIGINAL DATA (No Scaling):
k=1: Accuracy = 100.00%
k=3: Accuracy = 100.00%
k=5: Accuracy = 100.00%

2. STANDARDIZED DATA (Z-score):
k=1: Accuracy = 100.00%
k=3: Accuracy = 100.00%
k=5: Accuracy = 100.00%

3. NORMALIZED DATA (Min-Max):
k=1: Accuracy = 100.00%
k=3: Accuracy = 100.00%
k=5: Accuracy = 100.00%
