In [3]:
import pandas as pd
import numpy as np

# Load the dataset
file_path = '/content/drive/MyDrive/Dataset/diabetes.csv'
df = pd.read_csv(file_path)

# Replace zero values in specific columns with NaN and then impute with column mean
columns_with_zeros = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[columns_with_zeros] = df[columns_with_zeros].replace(0, np.nan)
df.fillna(df.mean(), inplace=True)

# Separate features (X) and target variable (y)
X = df.drop(columns=['Outcome'])
y = df['Outcome']

# Perform a 70-30 train-test split manually
train_size = int(0.7 * len(df))
shuffled_indices = np.random.permutation(len(df))
train_indices = shuffled_indices[:train_size]
test_indices = shuffled_indices[train_size:]

X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]

# Function to compute Euclidean distance
def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2) ** 2))

# Function to predict the class for a single query point
def predict_single(X_train, y_train, query, k=5):
    distances = []
    for i in range(len(X_train)):
        dist = euclidean_distance(X_train.iloc[i], query)
        distances.append((dist, y_train.iloc[i]))
    distances = sorted(distances, key=lambda x: x[0])
    k_nearest_neighbors = [label for _, label in distances[:k]]
    prediction = np.argmax(np.bincount(k_nearest_neighbors))
    return prediction

# Function to predict for all test samples
def predict(X_train, y_train, X_test, k=5):
    predictions = []
    for i in range(len(X_test)):
        pred = predict_single(X_train, y_train, X_test.iloc[i], k)
        predictions.append(pred)
    return predictions

# Predict on the test set
k = 5
y_pred = predict(X_train, y_train, X_test, k)

# Calculate accuracy
accuracy = np.mean(y_pred == y_test) * 100
print(f"Accuracy of KNN (k={k}): {accuracy:.2f}%")


Accuracy of KNN (k=5): 71.86%


In [4]:
from sklearn.preprocessing import MinMaxScaler

# Step 1: Scale the feature matrix
scaler = MinMaxScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Train-test split for scaled data
X_train_scaled, X_test_scaled = X_scaled.iloc[train_indices], X_scaled.iloc[test_indices]

# Function to predict using KNN (reused from Problem 1)
def predict_scaled(X_train, y_train, X_test, k=5):
    predictions = []
    for i in range(len(X_test)):
        pred = predict_single(X_train, y_train, X_test.iloc[i], k)
        predictions.append(pred)
    return predictions

# Predict on the scaled test set
y_pred_scaled = predict_scaled(X_train_scaled, y_train, X_test_scaled, k)

# Calculate accuracy on scaled data
accuracy_scaled = np.mean(y_pred_scaled == y_test) * 100

# Step 2: Comparative Analysis
print(f"Accuracy of KNN on original dataset (k={k}): {accuracy:.2f}%")
print(f"Accuracy of KNN on scaled dataset (k={k}): {accuracy_scaled:.2f}%")

# Discussion
if accuracy_scaled > accuracy:
    print("\nScaling improved the KNN performance because the algorithm is distance-based and sensitive to feature magnitudes.")
elif accuracy_scaled < accuracy:
    print("\nScaling decreased the KNN performance, possibly due to loss of natural feature distribution.")
else:
    print("\nScaling had no impact on the KNN performance for this dataset.")


Accuracy of KNN on original dataset (k=5): 71.86%
Accuracy of KNN on scaled dataset (k=5): 72.29%

Scaling improved the KNN performance because the algorithm is distance-based and sensitive to feature magnitudes.


In [None]:
import time
import matplotlib.pyplot as plt

# Lists to store results
results_original = {'k': [], 'accuracy': [], 'time': []}
results_scaled = {'k': [], 'accuracy': [], 'time': []}

# Function to measure performance for a given k
def evaluate_knn(X_train, y_train, X_test, y_test, k):
    start_time = time.time()
    y_pred = predict(X_train, y_train, X_test, k)
    end_time = time.time()
    accuracy = np.mean(y_pred == y_test) * 100
    elapsed_time = end_time - start_time
    return accuracy, elapsed_time

# Experiment with k for both original and scaled datasets
for k in range(1, 16):
    # Evaluate for original dataset
    acc, elapsed = evaluate_knn(X_train, y_train, X_test, y_test, k)
    results_original['k'].append(k)
    results_original['accuracy'].append(acc)
    results_original['time'].append(elapsed)

    # Evaluate for scaled dataset
    acc, elapsed = evaluate_knn(X_train_scaled, y_train, X_test_scaled, y_test, k)
    results_scaled['k'].append(k)
    results_scaled['accuracy'].append(acc)
    results_scaled['time'].append(elapsed)

# Plot k vs. Accuracy
plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 1)
plt.plot(results_original['k'], results_original['accuracy'], label="Original Dataset", marker='o')
plt.plot(results_scaled['k'], results_scaled['accuracy'], label="Scaled Dataset", marker='s')
plt.title("k vs. Accuracy")
plt.xlabel("Number of Neighbors (k)")
plt.ylabel("Accuracy (%)")
plt.legend()

# Plot k vs. Time Taken
plt.subplot(1, 2, 2)
plt.plot(results_original['k'], results_original['time'], label="Original Dataset", marker='o')
plt.plot(results_scaled['k'], results_scaled['time'], label="Scaled Dataset", marker='s')
plt.title("k vs. Time Taken")
plt.xlabel("Number of Neighbors (k)")
plt.ylabel("Time Taken (seconds)")
plt.legend()

plt.tight_layout()
plt.show()

# Discussion and Analysis
optimal_k_scaled = results_scaled['k'][np.argmax(results_scaled['accuracy'])]
optimal_k_original = results_original['k'][np.argmax(results_original['accuracy'])]

print(f"Optimal k for original dataset: {optimal_k_original}")
print(f"Optimal k for scaled dataset: {optimal_k_scaled}")

if optimal_k_scaled == optimal_k_original:
    print("\nThe optimal k is the same for both datasets.")
else:
    print("\nThe optimal k differs between the original and scaled datasets.")

print("\nDiscussion:")
print("- Smaller values of k tend to overfit the training data.")
print("- Larger values of k can smooth the decision boundary but may underfit.")
print("- Scaling generally improves the consistency of the results because KNN is sensitive to feature magnitudes.")


In [None]:
columns_with_zeros = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[columns_with_zeros] = df[columns_with_zeros].replace(0, np.nan)
df.fillna(df.mean(), inplace=True)

X = df.drop(columns=['Outcome'])
y = df['Outcome']

# Scale the features
scaler = MinMaxScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Train-test split
train_size = int(0.7 * len(df))
shuffled_indices = np.random.permutation(len(df))
train_indices = shuffled_indices[:train_size]
test_indices = shuffled_indices[train_size:]

X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]

X_train_scaled, X_test_scaled = X_scaled.iloc[train_indices], X_scaled.iloc[test_indices]

# Function to compute Euclidean distance
def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2) ** 2))

# Function for Weighted KNN
def weighted_knn(X_train, y_train, X_test, k=5):
    predictions = []
    for i in range(len(X_test)):
        distances = []
        for j in range(len(X_train)):
            dist = euclidean_distance(X_train.iloc[j], X_test.iloc[i])
            distances.append((dist, y_train.iloc[j]))
        distances = sorted(distances, key=lambda x: x[0])
        k_nearest = distances[:k]
        weights = [1 / (d[0] + 1e-5) for d in k_nearest]
        labels = [d[1] for d in k_nearest]
        weighted_vote = np.bincount(labels, weights=weights).argmax()
        predictions.append(weighted_vote)
    return predictions

# Dimensionality Reduction with PCA
pca = PCA(n_components=5)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

def predict(X_train, y_train, X_test, k=5):
    predictions = []
    for i in range(len(X_test)):
        distances = [(euclidean_distance(X_train.iloc[j], X_test.iloc[i]), y_train.iloc[j]) for j in range(len(X_train))]
        distances = sorted(distances, key=lambda x: x[0])
        k_nearest_labels = [label for _, label in distances[:k]]
        prediction = np.argmax(np.bincount(k_nearest_labels))
        predictions.append(prediction)
    return predictions

# KNN with FAISS (Approximate Nearest Neighbors)
def knn_faiss(X_train_np, y_train, X_test_np, k=5):
    index = faiss.IndexFlatL2(X_train_np.shape[1])
    index.add(X_train_np)
    _, neighbors = index.search(X_test_np, k)
    predictions = []
    for idxs in neighbors:
        nearest_labels = y_train.iloc[idxs]
        prediction = nearest_labels.value_counts().idxmax()
        predictions.append(prediction)
    return predictions

# Parallel KNN Prediction
def parallel_knn(X_train, y_train, X_test, k=5, n_jobs=-1):
    def predict_single_point(i):
        return predict_single(X_train, y_train, X_test.iloc[i], k)
    predictions = Parallel(n_jobs=n_jobs)(delayed(predict_single_point)(i) for i in range(len(X_test)))
    return predictions

# Evaluate and Compare Results
start = time.time()
y_pred_pca = predict(pd.DataFrame(X_train_pca), y_train, pd.DataFrame(X_test_pca), k=5)
accuracy_pca = np.mean(y_pred_pca == y_test) * 100
end = time.time()
print(f"PCA Accuracy: {accuracy_pca:.2f}%, Time: {end - start:.2f} seconds")

start = time.time()
X_train_np = X_train_scaled.to_numpy().astype('float32')
X_test_np = X_test_scaled.to_numpy().astype('float32')
y_pred_faiss = knn_faiss(X_train_np, y_train, X_test_np, k=5)
accuracy_faiss = np.mean(y_pred_faiss == y_test) * 100
end = time.time()
print(f"FAISS Accuracy: {accuracy_faiss:.2f}%, Time: {end - start:.2f} seconds")

start = time.time()
y_pred_weighted = weighted_knn(X_train_scaled, y_train, X_test_scaled, k=5)
accuracy_weighted = np.mean(y_pred_weighted == y_test) * 100
end = time.time()
print(f"Weighted KNN Accuracy: {accuracy_weighted:.2f}%, Time: {end - start:.2f} seconds")
