In [6]:
import math
from collections import Counter

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Function to calculate Euclidean distance between two vectors
def euclidean_distance(vector1, vector2):
    """Return the Euclidean (L2) distance between two vectors.

    Args:
        vector1: Iterable of numeric coordinates.
        vector2: Iterable of numeric coordinates.

    Returns:
        The square root of the summed squared coordinate differences.
        Coordinates are paired with ``zip``, so if the inputs differ in
        length the surplus coordinates of the longer one are ignored.
    """
    sum_of_squares = sum((a - b) ** 2 for a, b in zip(vector1, vector2))
    return math.sqrt(sum_of_squares)

# Function to calculate Manhattan distance between two vectors
def manhattan_distance(vector1, vector2):
    """Return the Manhattan (L1) distance between two vectors.

    Args:
        vector1: Iterable of numeric coordinates.
        vector2: Iterable of numeric coordinates.

    Returns:
        The sum of the absolute coordinate differences. Coordinates are
        paired with ``zip``, so if the inputs differ in length the surplus
        coordinates of the longer one are ignored.
    """
    absolute_gaps = [abs(p - q) for p, q in zip(vector1, vector2)]
    return sum(absolute_gaps)

# Function to convert categorical variables to numeric using label encoding
def label_encoding(data, column):
    """Replace the values of ``column`` with integer codes, in place.

    Each distinct value receives the position at which it appears in
    ``data[column].unique()`` (0, 1, 2, ...).

    Args:
        data: DataFrame holding the column to encode. It is modified in
            place; pass a copy to keep the original intact.
        column: Name of the column to encode.

    Returns:
        The same ``data`` object, with ``column`` overwritten by its codes.
    """
    distinct_values = data[column].unique()
    code_for = dict(zip(distinct_values, range(len(distinct_values))))
    data[column] = data[column].map(code_for)
    return data

# Function to convert categorical variables to numeric using One-Hot encoding
def one_hot_encoding(data, column):
    """Return a copy of ``data`` with ``column`` replaced by indicator columns.

    The indicator columns are named ``<column>_<value>`` and are appended
    after the remaining original columns. ``drop_first=True`` omits the
    indicator for the first category level, so a column with n distinct
    values yields n - 1 indicator columns.

    Args:
        data: DataFrame holding the column to encode. It is not modified.
        column: Name of the categorical column to expand.

    Returns:
        A new DataFrame without ``column`` and with the indicator columns
        added on the right.
    """
    indicators = pd.get_dummies(data[column], prefix=column, drop_first=True)
    remaining = data.drop(columns=column)
    return pd.concat([remaining, indicators], axis=1)

# Function to implement k-NN classifier
def knn_classifier(X_train, y_train, X_test, k, distance_fn=None):
    """Classify one query point by majority vote among its k nearest neighbours.

    Args:
        X_train: Training samples, one row per sample. May be a 2-D NumPy
            array, a pandas DataFrame (converted with ``to_numpy``), or any
            iterable of coordinate sequences.
        y_train: Training labels aligned positionally with ``X_train``. May
            be a pandas Series (read with ``.iloc``) or any sequence that
            supports integer indexing.
        X_test: A single query point (one sequence of coordinates).
        k: Number of neighbours that vote; must be at least 1. If ``k``
            exceeds the number of training samples, all samples vote.
        distance_fn: Optional callable ``(row, query) -> distance``.
            Defaults to ``euclidean_distance``.

    Returns:
        The most frequent label among the k nearest training samples. When
        several labels share the highest count, the one whose closest
        representative is nearest to the query wins, so the result does not
        depend on set/hash ordering.

    Raises:
        ValueError: If ``k`` is smaller than 1 or ``X_train`` has no rows.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")
    if distance_fn is None:
        # Resolved at call time so the default is the module's own metric.
        distance_fn = euclidean_distance

    # Normalise the containers: DataFrames iterate over column names, so take
    # the underlying row array; Series need positional (not label) indexing.
    rows = X_train.to_numpy() if hasattr(X_train, "to_numpy") else X_train
    label_at = y_train.iloc if hasattr(y_train, "iloc") else y_train

    distances = [
        (distance_fn(row, X_test), label_at[i]) for i, row in enumerate(rows)
    ]
    if not distances:
        raise ValueError("X_train must contain at least one sample")

    # list.sort is stable, so equidistant samples keep their training order.
    distances.sort(key=lambda pair: pair[0])
    neighbor_labels = [label for _, label in distances[:k]]

    # Counter keeps first-seen order among equal counts; because the labels
    # are listed nearest-first, ties go to the label of the closest neighbour.
    return Counter(neighbor_labels).most_common(1)[0][0]

# Main program
if __name__ == "__main__":
    # Read the raw records from disk.
    df = pd.read_csv('bmi.csv')

    # Separate the predictor columns (X) from the target column (y).
    X = df[['Gender', 'Height', 'Weight']]
    y = df['Index']

    # Build both encodings of 'Gender' from independent copies so that X
    # stays untouched (label_encoding writes into the frame it is given).
    X_label_encoded = label_encoding(X.copy(), 'Gender')
    X_one_hot_encoded = one_hot_encoding(X.copy(), 'Gender')

    # Hold out 20% of the one-hot encoded rows for testing, with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        X_one_hot_encoded,
        y,
        test_size=0.2,
        random_state=42,
    )

    # Fit the scaler on the training rows only, then apply it to the test rows.
    scaler = StandardScaler()
    X_train_std = scaler.fit_transform(X_train)
    X_test_std = scaler.transform(X_test)

    # Show the last ten rows of the raw frame and of each encoding.
    displays = (
        ("Original Data:", X),
        ("\nLabel Encoded Data:", X_label_encoded),
        ("\nOne-Hot Encoded Data:", X_one_hot_encoded),
    )
    for heading, frame in displays:
        print(heading)
        print(frame.tail(10))

    # The two final standardized test rows serve as the worked examples.
    second_last_row, last_row = X_test_std[-2], X_test_std[-1]

    # Euclidean distance between those two rows.
    euclidean_dist_example = euclidean_distance(second_last_row, last_row)
    print(
        "\nExample Euclidean Distance Calculation:",
        f"Euclidean Distance between last two rows in the standardized test set: {euclidean_dist_example}",
        sep="\n",
    )

    # Manhattan distance between the same two rows.
    manhattan_dist_example = manhattan_distance(second_last_row, last_row)
    print(
        "\nExample Manhattan Distance Calculation:",
        f"Manhattan Distance between last two rows in the standardized test set: {manhattan_dist_example}",
        sep="\n",
    )

    # Classify the last standardized test row with k-NN, using k=3.
    k_value = 3
    prediction = knn_classifier(X_train_std, y_train, last_row, k_value)

    # Report the predicted class.
    print(f"\nPrediction using k-NN with k={k_value}: {prediction}")



Original Data:
     Gender  Height  Weight
490  Female     164      59
491  Female     146     147
492  Female     198      50
493  Female     170      53
494    Male     152      98
495  Female     150     153
496  Female     184     121
497  Female     141     136
498    Male     150      95
499    Male     173     131

Label Encoded Data:
     Gender  Height  Weight
490       1     164      59
491       1     146     147
492       1     198      50
493       1     170      53
494       0     152      98
495       1     150     153
496       1     184     121
497       1     141     136
498       0     150      95
499       0     173     131

One-Hot Encoded Data:
     Height  Weight  Gender_Male
490     164      59            0
491     146     147            0
492     198      50            0
493     170      53            0
494     152      98            1
495     150     153            0
496     184     121            0
497     141     136            0
498     150      95         