3 To - Do Exercise:

For the provided dataset:

• diabetes.csv


Complete the following Problems.

In [None]:
# Mount Google Drive at /content/drive so the notebook can read files stored there.
# NOTE: google.colab is only importable inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read diabetes.csv from the mounted Drive folder into a DataFrame named df.
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/diabetes.csv')

In [None]:
# Print a concise summary of df (columns, non-null counts and dtypes).
df.info()

Problem - 1: Perform a classification task with knn from scratch.
1. Load the Dataset:

• Read the dataset into a pandas DataFrame.

• Display the first few rows and perform exploratory data analysis (EDA) to understand the dataset
(e.g., check data types, missing values, summary statistics).

2. Handle Missing Data:

• Handle any missing values appropriately, either by dropping or imputing them based on the data.

3. Feature Engineering:

• Separate the feature matrix (X) and target variable (y).

• Perform a train-test split from scratch using a 70%–30% ratio.

4. Implement KNN:

• Build the KNN algorithm from scratch (no libraries like scikit-learn for KNN).

• Compute distances using Euclidean distance.

• Write functions for:

– Predicting the class for a single query.

– Predicting classes for all test samples.

• Evaluate the performance using accuracy.

In [None]:
import pandas as pd
import numpy as np
from collections import Counter

# 1. Load the Dataset
def load_and_explore_dataset(file_path):
    """Read a CSV file into a DataFrame and print a short exploratory summary.

    The summary printed to stdout consists of the first rows, the column
    data types, the per-column count of missing values, and the summary
    statistics produced by ``DataFrame.describe``.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The loaded pandas DataFrame.
    """
    # Read the dataset
    df = pd.read_csv(file_path)

    # Display the first few rows and perform EDA
    print("First few rows:")
    print(df.head())

    print("\nData Types:")
    print(df.dtypes)

    print("\nMissing Values:")
    print(df.isnull().sum())

    print("\nSummary Statistics:")
    print(df.describe())

    return df

# 2. Handle Missing Data
def handle_missing_data(df, target_column):
    """Drop non-target object columns and remove or impute missing values.

    Columns are processed in order. A column with more than 10% missing
    values (measured on the frame before any row is removed) is imputed with
    the mean of its values in the rows still kept at that point; for any
    other column, the rows in which it is missing are dropped.

    Args:
        df: Input DataFrame. It is not modified.
        target_column: Name of the target column; it is kept even when its
            dtype is ``object``.

    Returns:
        A new DataFrame without the dropped columns/rows and with the
        imputed values filled in. The original index labels are preserved.
    """
    # Drop categorical columns except the target
    categorical_columns = df.select_dtypes(include=['object']).columns
    df = df.drop(columns=[col for col in categorical_columns if col != target_column])

    # Percentage of missing values per column, computed once up front
    missing_pct = df.isnull().mean() * 100

    # Row mask of samples that survive the "drop rows" rule so far
    keep = np.ones(len(df), dtype=bool)
    for column in df.columns:
        if missing_pct[column] > 10:
            # Impute with the mean of the rows still kept. Assign the result
            # back explicitly: ``df[column].fillna(..., inplace=True)`` is
            # chained assignment and does not update ``df`` under pandas
            # copy-on-write.
            df[column] = df[column].fillna(df.loc[keep, column].mean())
        else:
            # Drop rows where this sparsely-missing column is empty
            keep &= df[column].notna().to_numpy()
    return df[keep]

# 3. Feature Engineering
def split_features_target(df, target_column):
    """Separate a DataFrame into a feature matrix and a target vector.

    Args:
        df: DataFrame holding both the features and the target.
        target_column: Name of the column to use as the target.

    Returns:
        A tuple ``(X, y)``: ``X`` holds the values of every column except
        ``target_column`` and ``y`` holds the values of ``target_column``.
    """
    feature_frame = df.drop(columns=[target_column])
    target_series = df[target_column]
    return feature_frame.values, target_series.values

def train_test_split(X, y, test_size=0.3, random_state=42):
    """Shuffle the samples and split them into train and test partitions.

    Args:
        X: Feature matrix (array-like) with one sample per row.
        y: Target values (array-like) with one entry per sample.
        test_size: Fraction of the samples assigned to the test partition,
            between 0 and 1 inclusive.
        random_state: Seed for the shuffle; ``None`` gives a fresh,
            non-reproducible shuffle.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)`` of NumPy arrays.

    Raises:
        ValueError: If ``X`` and ``y`` differ in length or ``test_size`` is
            outside ``[0, 1]``.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same number of samples, got {len(X)} and {len(y)}"
        )
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size}")

    # A private RandomState yields the same permutation as seeding the global
    # generator would, without disturbing np.random for the rest of the program.
    rng = np.random.RandomState(random_state)
    indices = np.arange(len(X))
    rng.shuffle(indices)

    split_idx = int(len(X) * (1 - test_size))
    train_indices, test_indices = indices[:split_idx], indices[split_idx:]

    return X[train_indices], X[test_indices], y[train_indices], y[test_indices]

# 4. Implement KNN
def euclidean_distance(point1, point2):
    """Return the Euclidean (L2) distance between two points.

    Args:
        point1: First point, as a NumPy array.
        point2: Second point, as a NumPy array compatible with ``point1``.

    Returns:
        The square root of the summed squared coordinate differences.
    """
    delta = point1 - point2
    squared_total = np.sum(delta ** 2)
    return np.sqrt(squared_total)

def knn_predict_single(X_train, y_train, query, k=3):
    """Predict the class of one query point by majority vote of its k nearest neighbours.

    Distances are Euclidean. Neighbours at equal distance are ranked by their
    position in ``X_train`` (stable sort), and a tied vote is resolved in
    favour of the class that appears first among the nearest neighbours, so
    the result is deterministic.

    Args:
        X_train: Training samples (array-like), one sample per row.
        y_train: Labels of the training samples, indexable by row position.
        query: The sample to classify, with the same features as a training row.
        k: Number of neighbours that vote; values larger than the number of
            training samples use every training sample.

    Returns:
        The most common label among the k nearest training samples.

    Raises:
        ValueError: If ``k`` is smaller than 1 or ``X_train`` is empty.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")

    # Flatten each sample to a feature vector so one broadcasted subtraction
    # yields the difference between the query and every training row.
    train_matrix = np.asarray(X_train)
    if len(train_matrix) == 0:
        raise ValueError("X_train must contain at least one sample")
    train_matrix = train_matrix.reshape(len(train_matrix), -1)
    query_vector = np.asarray(query).reshape(-1)

    # Compute Euclidean distances from the query to all training points
    distances = np.sqrt(np.sum((train_matrix - query_vector) ** 2, axis=1))

    # Get the indices of the k-nearest neighbors (stable => reproducible ties)
    k_indices = np.argsort(distances, kind="stable")[:k]

    # Get the labels of the k-nearest neighbors
    k_nearest_labels = [y_train[i] for i in k_indices]

    # Determine the most common class label
    return Counter(k_nearest_labels).most_common(1)[0][0]

def knn_predict(X_train, y_train, X_test, k=3):
    """Predict a class label for every sample in ``X_test``.

    Args:
        X_train: Training samples, one per row.
        y_train: Labels of the training samples.
        X_test: Samples to classify, iterated row by row.
        k: Number of neighbours used for each prediction.

    Returns:
        A list with one predicted label per test sample, in input order.
    """
    predictions = []
    for query in X_test:
        label = knn_predict_single(X_train, y_train, query, k)
        predictions.append(label)
    return predictions

def accuracy_score(y_true, y_pred):
    """Return the fraction of predictions that match the true labels.

    Args:
        y_true: True labels (array-like).
        y_pred: Predicted labels (array-like) of the same shape as ``y_true``.

    Returns:
        The accuracy as a float between 0 and 1.

    Raises:
        ValueError: If the inputs differ in shape or are empty.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )
    if y_true.size == 0:
        raise ValueError("accuracy is undefined for empty inputs")
    # The mean of the boolean match mask is the share of correct predictions.
    return np.mean(y_true == y_pred)

# Main Workflow
def main(file_path="/content/drive/MyDrive/diabetes.csv", target_column="Outcome", k=5):
    """Run the full KNN workflow: load, clean, split, predict and report accuracy.

    Args:
        file_path: Path of the CSV dataset to load.
        target_column: Name of the label column in the dataset.
        k: Number of neighbours used by the KNN classifier.

    Raises:
        KeyError: If ``target_column`` is not a column of the loaded dataset.
    """
    # NOTE(review): "Outcome" is the label column of the commonly distributed
    # Pima Indians diabetes.csv; the previous value "Survived" belongs to the
    # Titanic dataset. Confirm against the df.info() output for this file.

    # Load and explore dataset
    df = load_and_explore_dataset(file_path)

    # Fail early with a clear message instead of deep inside the split step
    if target_column not in df.columns:
        raise KeyError(
            f"target column {target_column!r} not found; available columns: {list(df.columns)}"
        )

    # Handle missing data
    df_cleaned = handle_missing_data(df, target_column)

    # Display cleaned data
    print("\nData after processing:")
    print(df_cleaned.head())
    print("\nMissing values after processing:")
    print(df_cleaned.isnull().sum())

    # Feature engineering
    X, y = split_features_target(df_cleaned, target_column)
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # Implement KNN
    y_pred = knn_predict(X_train, y_train, X_test, k)

    # Evaluate performance
    acc = accuracy_score(y_test, y_pred)
    print(f"\nAccuracy: {acc:.2f}")

# Run the workflow only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()
