**Problem 1: Perform a classification task with KNN from scratch.**

In the diabetes dataset, certain columns may contain 0 where it is biologically invalid (e.g., BMI, Glucose, etc.).

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

In [2]:
# Load dataset
df = pd.read_csv('diabetes.csv')
# Display first few rows
print(df.head())

# Check data types, missing values, and summary statistics.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()
print(df.isnull().sum())
print(df.describe())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768

# 2. Handle Missing Data:

In [None]:
# Check for missing values and handle appropriately.
# A zero in these columns is treated as a missing measurement: it is converted to
# NaN and then filled with the median of the remaining (non-zero) values.
columns_to_impute = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
for col in columns_to_impute:
    with_gaps = df[col].replace(0, np.nan)
    # Assign the filled column back to the frame. Calling
    # `df[col].fillna(..., inplace=True)` would operate on a temporary Series
    # (chained assignment) and may leave `df` unchanged on recent pandas versions.
    df[col] = with_gaps.fillna(with_gaps.median())

# Verify no missing values
print(df.isnull().sum())

# 3. Feature Engineering:

In [None]:
# Target column on its own, every other column as a feature
y = df['Outcome']
X = df.drop(columns=['Outcome'])

# Manual, order-preserving split: the first 70% of rows train, the rest test
train_size = int(len(X) * 0.7)
X_train = X.iloc[:train_size]
X_test = X.iloc[train_size:]
y_train = y.iloc[:train_size]
y_test = y.iloc[train_size:]

# Report how many rows landed in each part
print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

# Implement KNN:

In [None]:
# Euclidean distance function
def euclidean_distance(point1, point2):
    """
    Return the straight-line (L2) distance between two feature vectors.

    Args:
    point1 (numpy array or pandas Series): Coordinates of the first point.
    point2 (numpy array or pandas Series): Coordinates of the second point.

    Returns:
    float: Square root of the summed squared coordinate differences.
    """
    # Per-feature gap between the two points
    delta = point1 - point2
    # Square each gap and add them up before taking the root
    squared_total = np.sum(delta ** 2)
    return np.sqrt(squared_total)

# KNN prediction for a single query
def knn_predict_single(query, X_train, y_train, k=3):
    """
    Predict the class for a single query using KNN.

    Args:
    query (numpy array or pandas Series): The query data point (one value per feature).
    X_train (pandas DataFrame or 2-D array-like): The training data, one row per sample.
    y_train (pandas Series or 1-D array-like): Non-negative integer labels, aligned
        by position with the rows of X_train.
    k (int): The number of nearest neighbors to consider. Default is 3.
        Values larger than the training set use every training sample.

    Returns:
    int: The predicted class label for the query (the smallest label wins a tied vote).

    Raises:
    ValueError: If k is smaller than 1, the training set is empty, or
        X_train and y_train have different lengths.
    """
    # A negative k would silently slice "all but the last |k|" neighbours, and
    # k == 0 would fail later with an opaque argmax error, so reject both here.
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")

    # Work on plain arrays so labels are picked by position, not by index label
    train_points = np.asarray(X_train)
    labels = np.asarray(y_train)
    if len(train_points) == 0:
        raise ValueError("X_train must contain at least one sample")
    if len(train_points) != len(labels):
        raise ValueError("X_train and y_train must have the same number of samples")

    # Euclidean distance from the query to every training row in one broadcast
    differences = train_points - np.asarray(query)
    distances = np.sqrt(np.sum(differences ** 2, axis=1))

    # Stable sort: equidistant neighbours keep their training order, so the
    # result is deterministic when distances tie
    nearest = np.argsort(distances, kind="stable")[:k]

    # Majority vote among the k nearest labels
    return np.bincount(labels[nearest]).argmax()

# KNN prediction for all test samples
def knn_predict(X_test, X_train, y_train, k=3):
    """
    Classify every row of the test set with KNN.

    Args:
    X_test (pandas DataFrame): Samples to classify, one per row.
    X_train (pandas DataFrame): The training data.
    y_train (pandas Series): The labels for the training data.
    k (int): Number of neighbors that vote on each prediction. Default is 3.

    Returns:
    numpy array: One predicted class label per test row, in row order.
    """
    predicted_labels = []
    # Each test row is classified independently by knn_predict_single
    for test_row in X_test.values:
        predicted_labels.append(knn_predict_single(test_row, X_train, y_train, k))
    return np.array(predicted_labels)

# Measure how well KNN (k=3) does on the unscaled test rows
try:
    # Classify every test row, then take the share of matching labels as a percentage
    predictions = knn_predict(X_test, X_train, y_train, k=3)
    accuracy = 100 * (predictions == y_test.values).mean()
    print(f"Accuracy: {accuracy:.2f}%")
except Exception as e:
    # Any failure is reported as a single readable line instead of a traceback
    print(f"An error occurred during the prediction or accuracy calculation: {e}")


# Problem 2: Experimentation


### Repeat the classification task:

In [3]:
# 1. Manually scale the data
def scale_data(X):
    """
    Scale the feature matrix X using Z-score normalization (standardization).

    Each column is shifted by its mean and divided by its sample standard
    deviation (the pandas default). Columns with no usable spread (a constant
    column, or a spread that cannot be computed) are only centered, so they do
    not turn into NaN/inf through a division by zero.

    Args:
    X (pandas DataFrame): The feature matrix to be scaled.

    Returns:
    pandas DataFrame: The scaled feature matrix, or None if scaling failed
        (the error is printed).
    """
    try:
        # Calculate the mean and standard deviation for each feature
        mean = X.mean(axis=0)
        std = X.std(axis=0)

        # Keep strictly positive spreads; use 1 everywhere else (0 or NaN) so the
        # division below leaves those columns as plain centered values
        safe_std = std.where(std > 0, 1.0)

        # Apply Z-score normalization (standardization)
        return (X - mean) / safe_std
    except Exception as e:
        # Callers rely on the None return to detect a failed scaling step
        print(f"An error occurred during data scaling: {e}")
        return None

# Apply scaling to the feature matrix X
# NOTE(review): the scaling statistics are computed over all rows, including the
# rows that later form the test set - consider fitting them on the training rows only.
X_scaled = scale_data(X)
scaled_predictions = None  # stays None unless scaling and prediction both succeed

if X_scaled is None:
    # scale_data reports its own error and returns None on failure
    print("Data scaling failed.")
else:
    # 2. Split the data into training and testing sets (using a 70%-30% split as per the problem).
    # The split is positional, so the first 70% of rows train and the rest test.
    train_size = int(0.7 * len(X))  # 70% training data
    X_train_scaled, X_test_scaled = X_scaled.iloc[:train_size], X_scaled.iloc[train_size:]
    y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

    # 3. Train and test KNN on scaled data using the previously defined knn_predict function
    try:
        scaled_predictions = knn_predict(X_test_scaled, X_train_scaled, y_train, k=3)
    except Exception as e:
        print(f"An error occurred during KNN prediction: {e}")

if scaled_predictions is not None:
    # Accuracy = share of test rows whose predicted label matches the true label, in percent
    scaled_accuracy = (scaled_predictions == y_test.values).mean() * 100
    # Print the accuracy on scaled data
    print(f"Accuracy on scaled data: {scaled_accuracy:.2f}%")


NameError: name 'X' is not defined

## Problem 3: Experimentation with k

### 1. Vary the number of neighbors - k:
- Run the KNN model on both the original and scaled datasets for a range of:
  - $k = 1, 2, 3, \dots, 15$
- For each $k$, record:
  - Accuracy.
  - Time taken to make predictions.

### 2. Visualize the Results:
- Plot the following graphs:
  - k vs. Accuracy for original and scaled datasets.
  - k vs. Time Taken for original and scaled datasets.

### 3. Analyze and Discuss:
- Discuss how the choice of $k$ affects the accuracy and computational cost.
- Identify the optimal $k$ based on your analysis.

In [4]:
import time
import matplotlib.pyplot as plt

# Helper: time a single KNN run and score it against the true labels
def _time_knn_run(x_test, x_train, y_train, y_test, k):
    """Run knn_predict once and return (accuracy in percent, elapsed seconds)."""
    # perf_counter is the clock meant for measuring short intervals
    start = time.perf_counter()
    predictions = knn_predict(x_test, x_train, y_train, k=k)
    elapsed = time.perf_counter() - start
    accuracy = (predictions == y_test.values).mean() * 100
    return accuracy, elapsed


# Function to compute and print accuracy and time for each k
def evaluate_knn_k_values(x_train, y_train, x_test, y_test, x_train_scaled, x_test_scaled, y_train_scaled, k_values):
    """
    Compare KNN on the original and the scaled data for several values of k.

    For every k the model is run once on each dataset; accuracy and prediction
    time are printed, then both are plotted against k (accuracy on the left,
    time on the right).

    Args:
    x_train (pandas DataFrame): Original training features.
    y_train (pandas Series): Labels for x_train.
    x_test (pandas DataFrame): Original test features.
    y_test (pandas Series): True labels of the test rows (used for both datasets).
    x_train_scaled (pandas DataFrame): Scaled training features.
    x_test_scaled (pandas DataFrame): Scaled test features.
    y_train_scaled (pandas Series): Labels for x_train_scaled.
    k_values (iterable of int): The neighbor counts to try.

    Returns:
    None: Results are printed and shown as a figure.
    """
    # Materialize once: k_values is iterated here and again when plotting,
    # which would silently yield nothing for a one-shot iterator
    ks = list(k_values)

    original_accuracies = []
    scaled_accuracies = []
    original_times = []
    scaled_times = []

    # Loop through each k value
    for k in ks:
        # Original dataset
        accuracy, elapsed_time = _time_knn_run(x_test, x_train, y_train, y_test, k)
        original_accuracies.append(accuracy)
        original_times.append(elapsed_time)
        print(f"Original Dataset - k = {k}: Accuracy = {accuracy:.2f}%, Time Taken = {elapsed_time:.4f}s")

        # Scaled dataset
        accuracy, elapsed_time = _time_knn_run(x_test_scaled, x_train_scaled, y_train_scaled, y_test, k)
        scaled_accuracies.append(accuracy)
        scaled_times.append(elapsed_time)
        print(f"Scaled Dataset - k = {k}: Accuracy = {accuracy:.2f}%, Time Taken = {elapsed_time:.4f}s")

    # Plotting results: one figure, two side-by-side panels
    fig, (ax_accuracy, ax_time) = plt.subplots(1, 2, figsize=(12, 6))

    # Accuracy Plot
    ax_accuracy.plot(ks, original_accuracies, label='Original Dataset', marker='o')
    ax_accuracy.plot(ks, scaled_accuracies, label='Scaled Dataset', marker='o')
    ax_accuracy.set_xlabel('k (Number of Neighbors)')
    ax_accuracy.set_ylabel('Accuracy (%)')
    ax_accuracy.set_title('k vs. Accuracy')
    ax_accuracy.legend()
    ax_accuracy.grid(True)

    # Time Plot
    ax_time.plot(ks, original_times, label='Original Dataset', marker='o')
    ax_time.plot(ks, scaled_times, label='Scaled Dataset', marker='o')
    ax_time.set_xlabel('k (Number of Neighbors)')
    ax_time.set_ylabel('Time Taken (seconds)')
    ax_time.set_title('k vs. Time Taken')
    ax_time.legend()
    ax_time.grid(True)

    fig.tight_layout()
    plt.show()

# Neighbor counts to try: k = 1 through 15
k_values = range(1, 16)

# Run the comparison; the scaled run reuses y_train because scaling only
# changes the features, not the labels
evaluate_knn_k_values(
    x_train=X_train,
    y_train=y_train,
    x_test=X_test,
    y_test=y_test,
    x_train_scaled=X_train_scaled,
    x_test_scaled=X_test_scaled,
    y_train_scaled=y_train,
    k_values=k_values,
)

NameError: name 'X_train' is not defined