<a href="https://colab.research.google.com/github/Rahmamouradsayed/ML-models-from-scratch-/blob/main/KNN_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!pip install -U scikit-learn

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.preprocessing import Normalizer
from collections import Counter
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
# Import iris dataset
iris = datasets.load_iris()

# Access the features (X) and target variable (y)
X = iris.data
y = iris.target

# Convert the feature array into a DataFrame
iris_df = pd.DataFrame(X, columns=iris.feature_names)

# Add the target variable as a new column in the DataFrame
iris_df['target'] = y

print(iris_df.head())

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target  
0       0  
1       0  
2       0  
3       0  
4       0  


In [None]:
#preprocessing and data cleaning step

In [None]:
# Show the dataset infromation
iris_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   target             150 non-null    int64  
dtypes: float64(4), int64(1)
memory usage: 6.0 KB


In [None]:
#check for missing data
missing_data =iris_df.isnull().sum()
print(missing_data)

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
target               0
dtype: int64


In [None]:
#check duplicates
duplicate=iris_df.duplicated().sum()
print(" num_of_dublicate rows is :",duplicate)

 num_of_dublicate rows is : 1


In [None]:
#drop duplicates
drop_duplicate=iris_df.drop_duplicates(inplace=True)
print("num_of_rows after remove duplicate:",drop_duplicate)

num_of_rows after remove duplicate: None


In [None]:
#check the traing set size and test set size:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the training set size
train_size = X_train.shape[0]
print("Training set size:", train_size)

# Check the test set size
test_size = X_test.shape[0]
print("Test set size:", test_size)

Training set size: 120
Test set size: 30


In [None]:
def distance_ecu(normalized_X_train, normalized_X_test):
    """
    Input:
        - normalized_X_train: Corresponding to the training data
        - normalized_X_test: Corresponding to the test point

    Output:
        - distances: The distances between the test point and each point in the training data.
    """
    distances = []

    # Loop over the rows of normalized_X_train
    for row in range(len(normalized_X_train)):
        # Get the points one by one
        current_train_point = normalized_X_train[row]

        # Initialize the distance to zero
        current_distance = 0

        # Loop over the columns of the row
        for col in range(len(current_train_point)):
            current_distance += (current_train_point[col] - normalized_X_test[col]) ** 2

        current_distance = np.sqrt(current_distance)

        # Append the distances
        distances.append(current_distance)

    # Store distances in a DataFrame
    distances = pd.DataFrame(data=distances, columns=['distance'])
    return distances

In [None]:
def nearest_neighbors(distance_point, K):
    """
    Input:
        - distance_point: The distances between the test point and each point in the training data.
        - K: The number of neighbors

    Output:
        - df_nearest: The nearest K neighbors between the test point and the training data
    """
    # Sort distances using the sort_values function
    df_nearest = distance_point.sort_values(by='distance')

    # Take only the first K neighbors
    df_nearest = df_nearest.head(K)
    return df_nearest

In [None]:

def voting(df_nearest, y_train):
    """
    Input:
        - df_nearest: DataFrame containing the nearest K neighbors between the full training dataset and the test point
        - y_train: The labels of the training dataset

    Output:
        - y_pred: The prediction based on Majority Voting
    """
    # Use the Counter object to get the labels with K nearest neighbors
    counter_vote = Counter(y_train[df_nearest.index])

    # Perform Majority Voting
    y_pred = counter_vote.most_common(1)[0][0]

    return y_pred

In [None]:
def KNN_from_scratch(x_train, y_train, x_test, K):
    """
    Input:
    - x_train: The full training dataset
    - y_train: The labels of the training dataset
    - x_test: The full test dataset
    - K: The number of neighbors

    Output:
    - y_pred: The prediction for the whole test set based on Majority Voting
    """
    y_pred = []

    # Loop over all the test set and perform the three steps
    for test_point in x_test:
        # Step 1
        distance_point = distance_ecu(x_train, test_point)
        # Step 2
        df_nearest_point = nearest_neighbors(distance_point, K)
        # Step 3
        y_pred_point = voting(df_nearest_point, y_train)

        y_pred.append(y_pred_point)

    return y_pred

In [None]:
#test at k = 3
K1=3
y_pred_scratch = KNN_from_scratch(X_train,y_train,X_test,K)
print(y_pred_scratch)

[1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2, 0, 2, 2, 2, 2, 2, 0, 0]


In [None]:
K2=5
y_pred_scratch = KNN_from_scratch(X_train,y_train,X_test,K)
print(y_pred_scratch)

[1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2, 0, 2, 2, 2, 2, 2, 0, 0]


In [None]:
K3=7
y_pred_scratch = KNN_from_scratch(X_train,y_train,X_test,K)
print(y_pred_scratch)

[1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2, 0, 2, 2, 2, 2, 2, 0, 0]


In [None]:
accuracy_dict = {}
k_values=[3,5,7]
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)

    knn.fit(X_train, y_train)
    y_pred_sklearn = knn.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred_sklearn)
    accuracy_dict[k] = accuracy
    print(f"Accuracy for K = {k}: {accuracy}")

best_k = max(accuracy_dict, key=accuracy_dict.get)
best_accuracy = accuracy_dict[best_k]

print(f"\nBest K value: {best_k}")
print(f"Best accuracy: {best_accuracy}")

Accuracy for K = 3: 1.0
Accuracy for K = 5: 1.0
Accuracy for K = 7: 0.9666666666666667

Best K value: 3
Best accuracy: 1.0


In [None]:
# Create a Normalizer object
normalizer = Normalizer()

# Fit the Normalizer to the training set
scaler = normalizer.fit(X_train)

# Apply the Normalizer to the training set
normalized_X_train = scaler.transform(X_train)

# Apply the Normalizer to the test set
normalized_X_test = scaler.transform(X_test)

In [None]:
print("X train before Normalization")
print(X_train[0:5])
print("\nX train after Normalization")
print(normalized_X_train[0:5])

X train before Normalization
[[4.6 3.6 1.  0.2]
 [5.7 4.4 1.5 0.4]
 [6.7 3.1 4.4 1.4]
 [4.8 3.4 1.6 0.2]
 [4.4 3.2 1.3 0.2]]

X train after Normalization
[[0.77577075 0.60712493 0.16864581 0.03372916]
 [0.77381111 0.59732787 0.2036345  0.05430253]
 [0.76945444 0.35601624 0.50531337 0.16078153]
 [0.786991   0.55745196 0.26233033 0.03279129]
 [0.78609038 0.57170209 0.23225397 0.03573138]]


In [None]:
#test at k = 3
K1=3
y_pred_scratch = KNN_from_scratch(normalized_X_train,y_train,normalized_X_test,K)
print(y_pred_scratch)

[1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2, 0, 2, 2, 2, 2, 2, 0, 0]


In [None]:
K1=5
y_pred_scratch = KNN_from_scratch(normalized_X_train,y_train,normalized_X_test,K)
print(y_pred_scratch)

[1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2, 0, 2, 2, 2, 2, 2, 0, 0]


In [None]:
K1=7
y_pred_scratch = KNN_from_scratch(normalized_X_train,y_train,normalized_X_test,K)
print(y_pred_scratch)

[1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2, 0, 2, 2, 2, 2, 2, 0, 0]


In [None]:
accuracy_dict = {}
k_values=[3,5,7]
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)

    knn.fit(normalized_X_train, y_train)
    y_pred_sklearn = knn.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred_sklearn)
    accuracy_dict[k] = accuracy
    print(f"Accuracy for K = {k}: {accuracy}")

best_k = max(accuracy_dict, key=accuracy_dict.get)
best_accuracy = accuracy_dict[best_k]

print(f"\nBest K value: {best_k}")
print(f"Best accuracy: {best_accuracy}")

Accuracy for K = 3: 0.9666666666666667
Accuracy for K = 5: 1.0
Accuracy for K = 7: 1.0

Best K value: 5
Best accuracy: 1.0



In the case of the Iris dataset, normalization doesn't significantly affect accuracy because the data is balanced, consisting of the same type of units, leading to no substantial differences whether we use distance measures or normalization. However, normalization remains crucial as it can enhance accuracy in other cases by addressing outliers and improving precision in diverse scenarios.


