In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [None]:
# Load the diabetes dataset; the path is resolved relative to the working directory.
data = pd.read_csv(filepath_or_buffer="diabetes_prediction_new.csv")

In [None]:
# Peek at the first ten rows of the raw frame.
data.head(n=10)

In [None]:
# Count missing values per column; a full boolean frame is too large to read.
data.isnull().sum()

In [None]:
# Target is the `diabetes` column; features are every other column except `gender`.
y = data["diabetes"]
X = data.drop(["gender", "diabetes"], axis=1)

In [None]:
# First five feature rows, before any encoding.
X.head(n=5)

In [None]:
# First five target labels.
y.head(n=5)

In [None]:
# Swap the categorical text labels for integer codes anywhere they occur in X.
# NOTE(review): a label that is not listed here is left unchanged (still a
# string), which would make the later StandardScaler step fail -- confirm the
# CSV contains no other categories.
category_codes = {
    'not current': 2,
    'never': 3,
    'ever': 4,
    'former': 1,
    'No Info': 0,
}
X = X.replace(category_codes)

In [None]:
# First five feature rows after the labels were replaced by integer codes.
X.head(n=5)

In [25]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [26]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits (both become NumPy arrays).
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [27]:
def minkowski_distance(a, b, p=2):
    """Return the Minkowski distance of order ``p`` between two points.

    Parameters
    ----------
    a, b : sequence of numbers
        Coordinates of the two points (list, tuple, NumPy array or pandas
        Series). Only the first ``len(a)`` coordinates of ``b`` are used, so
        ``b`` may be longer than ``a``.
    p : int or float, default 2
        Order of the distance: ``p=1`` is the Manhattan distance and ``p=2``
        the Euclidean distance.

    Returns
    -------
    float
        ``(sum(|a_i - b_i| ** p)) ** (1 / p)``.

    Raises
    ------
    IndexError
        If ``b`` has fewer coordinates than ``a``.
    """
    # Work on plain arrays so coordinates are always read by position. With a
    # pandas Series that has non-integer labels, ``series[0]`` only works
    # through a deprecated positional fallback.
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)

    dim = len(a)
    if len(b) < dim:
        raise IndexError("b has fewer coordinates than a")

    distance = np.sum(np.abs(a - b[:dim]) ** p) ** (1 / p)
    return float(distance)
minkowski_distance(X.iloc[0], X.iloc[1], p=1)  # Manhattan distance between the first two rows

92.13

In [28]:
# Hand-picked query point for a manual nearest-neighbour walkthrough.
# NOTE(review): minkowski_distance compares only the first len(test_pt)
# coordinates of each row, and these values are compared against X as it is
# at this point (not the scaled arrays) -- confirm that is intended.
test_pt = [148, 85, 103, 71, 44]

# Iterate over the row values directly instead of passing index *labels* to
# ``.iloc``, which is only correct while X has a default 0..n-1 index.
distances = [minkowski_distance(test_pt, row) for row in X.to_numpy()]

df_dists = pd.DataFrame(data=distances, index=X.index, columns=['dist'])
df_dists.head()

Unnamed: 0,dist
0,165.017624
1,178.855311
2,192.707609
3,188.577047
4,167.255193


In [29]:
# Keep the three rows with the smallest distance to the query point.
df_nn = df_dists.sort_values('dist').head(3)
df_nn

Unnamed: 0,dist
81631,163.458137
73114,163.474281
28082,163.549533


In [30]:
from collections import Counter

# Majority vote: tally the labels of the nearest rows and show the most frequent one.
neighbour_labels = y[df_nn.index]
counter = Counter(neighbour_labels)
majority_label, _votes = counter.most_common()[0]
majority_label

1

In [None]:
def knn_predict(X_train, X_test, y_train, y_test, k, p):
    """Classify each row of ``X_test`` by majority vote of its ``k`` nearest training rows.

    Parameters
    ----------
    X_train : array-like of shape (n_train, n_features)
        Training feature rows.
    X_test : array-like of shape (n_test, n_features)
        Rows to classify.
    y_train : sequence of length n_train
        Training labels, aligned positionally with the rows of ``X_train``
        (a pandas Series or any array-like).
    y_test : object
        Unused; kept so existing calls keep working.
    k : int
        Number of neighbours that vote.
    p : int or float
        Order of the Minkowski distance (1 = Manhattan, 2 = Euclidean).

    Returns
    -------
    list
        One predicted label per row of ``X_test``, in order. If several labels
        tie for the most votes, the tied label whose first vote comes from the
        closer neighbour wins.
    """
    from collections import Counter

    train = np.asarray(X_train, dtype=float)
    queries = np.asarray(X_test, dtype=float)
    # Positional labels: row j of ``train`` belongs to ``labels[j]`` whatever
    # index ``y_train`` carries.
    labels = np.asarray(y_train)

    y_hat_test = []
    for query in queries:
        # Distance from this query to every training row in one vectorised
        # pass, instead of a Python loop per training point and per feature.
        dists = np.sum(np.abs(train - query) ** p, axis=1) ** (1 / p)

        # Stable sort keeps equally distant neighbours in training order.
        nearest = np.argsort(dists, kind='stable')[:k]

        # ``tolist`` yields plain Python scalars as the predicted labels.
        votes = Counter(labels[nearest].tolist())
        y_hat_test.append(votes.most_common()[0][0])

    return y_hat_test

y_hat_test = knn_predict(X_train, X_test, y_train, y_test, k=3, p=1)

# Show a short preview instead of flooding the output with every prediction.
print(y_hat_test[:20])


In [None]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted label equals the true label.
custom_knn_accuracy = accuracy_score(y_test, y_hat_test)
print(custom_knn_accuracy)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Cross-check against scikit-learn using the same settings as the hand-written
# knn_predict run (k=3, p=1 i.e. Manhattan distance), so the two accuracies
# are directly comparable.
clf = KNeighborsClassifier(n_neighbors=3, p=1)
clf.fit(X_train, y_train)
y_pred_test = clf.predict(X_test)

print(f"Sklearn KNN Accuracy: {accuracy_score(y_test, y_pred_test)}")


In [None]:
# Sweep k and record the test accuracy of the hand-written kNN for each value.
# Each k re-runs the whole prediction, so this cell is expensive on large data.
# Predictions stay local to the sweep so the earlier ``y_hat_test`` is kept.
k_values = range(1, 100)
accuracies = [
    accuracy_score(y_test, knn_predict(X_train, X_test, y_train, y_test, k, p=1))
    for k in k_values
]

fig, ax = plt.subplots(figsize=(8,6))
ax.plot(k_values, accuracies)
ax.set_xlabel('# of Nearest Neighbors (k)')
# accuracy_score returns a fraction in [0, 1], not a percentage.
ax.set_ylabel('Accuracy')
ax.set_title('kNN test accuracy vs. k')
plt.show()

In [None]:
# First five true labels of the held-out split.
y_test.head(n=5)

In [None]:
# A bare last expression uses the notebook's display instead of print().
y_test

In [None]:
import matplotlib.pyplot as plt

# Confusion matrix of true vs. predicted classes for the scikit-learn kNN
# model. For class labels this is what actually shows "actual vs. predicted";
# plotting a scaled feature against y_test does not involve predictions.
confusion = pd.crosstab(
    np.asarray(y_test),
    np.asarray(y_pred_test),
    rownames=['Actual Values'],
    colnames=['Predicted Values'],
)
counts = confusion.to_numpy()

fig, ax = plt.subplots(figsize=(9, 6))
ax.imshow(counts, cmap='Blues')
ax.set_xticks(range(counts.shape[1]))
ax.set_xticklabels(confusion.columns)
ax.set_yticks(range(counts.shape[0]))
ax.set_yticklabels(confusion.index)

# Write the count into each cell; the light box keeps it readable on dark cells.
for (row, col), count in np.ndenumerate(counts):
    ax.text(col, row, str(count), ha='center', va='center',
            bbox=dict(facecolor='white', alpha=0.7, edgecolor='none'))

ax.set_xlabel('Predicted Values')
ax.set_ylabel('Actual Values')
ax.set_title('kNN Diabetes Prediction')
plt.show()

In [None]:
def fitness_function(params):
    """Score a candidate ``n_neighbors`` value for minimisation.

    Fits a KNeighborsClassifier on the notebook-level ``X_train``/``y_train``
    and returns the negated accuracy on ``X_test``/``y_test``, so that a
    lower value means a better classifier.

    NOTE(review): the score is computed on the test split, so any tuning
    driven by this function has seen the test data -- consider a separate
    validation split.

    Parameters
    ----------
    params : mapping
        Must contain the key ``'n_neighbors'``; its value is converted with
        ``int()``.

    Returns
    -------
    float
        ``-accuracy``.
    """
    k = int(params['n_neighbors'])
    model = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    return -accuracy_score(y_test, model.predict(X_test))

In [None]:
# Inclusive (low, high) search interval for the number of neighbours.
n_neighbors_range = (1, 30)

def initialize_population(population_size):
    """Draw ``population_size`` random neighbour counts from ``n_neighbors_range``.

    Parameters
    ----------
    population_size : int
        Number of candidates to generate.

    Returns
    -------
    list of int
        Values drawn uniformly from the inclusive interval
        ``n_neighbors_range``; empty when ``population_size`` is 0.
    """
    # Imported here so the function does not depend on a notebook-level
    # ``import random`` having been run.
    import random

    low, high = n_neighbors_range
    return [random.randint(low, high) for _ in range(population_size)]


In [None]:
def OOA(SearchAgents, Max_iterations, lowerbound, upperbound, fitness):
    """Minimise ``fitness`` over the integers in ``[lowerbound, upperbound]``.

    Population-based search (presumably the Osprey Optimization Algorithm,
    judging by the name and the "fish" terminology -- TODO confirm). Every
    iteration each agent tries two moves and keeps a move only if it lowers
    its fitness:

    1. a step towards a randomly chosen agent with better fitness (or the
       best known position when no agent is better);
    2. a random step whose size shrinks as ``1 / t``.

    Parameters
    ----------
    SearchAgents : int
        Population size.
    Max_iterations : int
        Number of iterations; 0 returns the best of the initial population.
    lowerbound, upperbound : int
        Inclusive bounds of the search interval.
    fitness : callable
        Called as ``fitness({'n_neighbors': value})`` with an integer value;
        must return a number to minimise.

    Returns
    -------
    fbest : float
        Lowest fitness found.
    xbest : int
        Position that produced ``fbest``.
    best_so_far : numpy.ndarray of shape (Max_iterations,)
        ``best_so_far[t-1]`` is the best fitness known after iteration ``t``.
    """
    X = np.random.randint(lowerbound, upperbound + 1, size=(SearchAgents,))
    # Float dtype so fractional scores are never truncated, even if the
    # initial fitness values happen to be Python ints.
    fit = np.array([fitness({'n_neighbors': X[i]}) for i in range(SearchAgents)],
                   dtype=float)
    best_so_far = np.zeros(Max_iterations)

    # Start the incumbent from the initial population so it is defined even
    # when Max_iterations is 0.
    blocation = np.argmin(fit)
    xbest, fbest = X[blocation], fit[blocation]

    def _to_valid_position(value):
        # Clamp to the bounds, then truncate to an integer *before* scoring,
        # so the stored position and its stored fitness always correspond
        # (X is an integer array).
        value = np.minimum(np.maximum(value, lowerbound), upperbound)
        return int(value)

    def _try_move(i, candidate):
        # Greedy acceptance: keep the move only if it improves agent i.
        candidate = _to_valid_position(candidate)
        candidate_fit = fitness({'n_neighbors': candidate})
        if candidate_fit < fit[i]:
            X[i] = candidate
            fit[i] = candidate_fit

    for t in range(1, Max_iterations + 1):
        for i in range(SearchAgents):
            # Phase 1: move relative to a better agent picked at random.
            fish_position = np.where(fit < fit[i])[0]
            if fish_position.size == 0:
                selected_fish = xbest
            else:
                k = np.random.randint(fish_position.size)
                selected_fish = X[fish_position[k]]

            I = round(1 + np.random.rand())  # randomly 1 or 2
            _try_move(i, X[i] + np.random.rand() * (selected_fish - I * X[i]))

            # Phase 2: random step that shrinks as the iterations progress.
            _try_move(i, X[i] + (lowerbound + np.random.rand() * (upperbound - lowerbound)) / t)

        # Refresh the incumbent after the moves so the final iteration's
        # improvements are included in the returned result and the curve.
        blocation = np.argmin(fit)
        if fit[blocation] < fbest:
            xbest, fbest = X[blocation], fit[blocation]

        best_so_far[t-1] = fbest

    return fbest, xbest, best_so_far


population_size = 10
max_iterations = 50
lowerbound = n_neighbors_range[0]
upperbound = n_neighbors_range[1]

# OOA draws from NumPy's global random state; seed it so the search is repeatable.
np.random.seed(42)
Best_score, Best_pos, OOA_curve = OOA(population_size, max_iterations, lowerbound, upperbound, fitness_function)

print("Best Score:", Best_score)
print("Best Number of Neighbors:", Best_pos)
# fitness_function returns the negated accuracy, so flip the sign for reading.
print("Best Accuracy:", -Best_score)
