In [1]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# ----------------------------
# Step 2: Load Dataset
# ----------------------------
df = pd.read_csv("multiclass.csv")   # <-- your dataset in Colab

print("First 5 rows of dataset:\n", df.head())
print("\nDataset Info:\n")
print(df.info())
print("\nSummary Statistics:\n", df.describe())


First 5 rows of dataset:
    Region  Fresh  Milk  Grocery  Frozen  Detergents_Paper  Delicassen  class
0       3  12669  9656     7561     214              2674        1338      2
1       3   7057  9810     9568    1762              3293        1776      2
2       3   6353  8808     7684    2405              3516        7844      2
3       3  13265  1196     4221    6404               507        1788      1
4       3  22615  5410     7198    3915              1777        5185      1

Dataset Info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 440 entries, 0 to 439
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Region            440 non-null    int64
 1   Fresh             440 non-null    int64
 2   Milk              440 non-null    int64
 3   Grocery           440 non-null    int64
 4   Frozen            440 non-null    int64
 5   Detergents_Paper  440 non-null    int64
 6   Delicassen        440 non-n

In [3]:

# ----------------------------
# Step 3: Basic Preprocessing
# ----------------------------

# 3.1 Check for null values
print("\nMissing values per column:\n", df.isnull().sum())

# If there are nulls, fill or drop
# Example: df.fillna(df.mean(), inplace=True)

# 3.2 Encoding categorical columns
# Assume last column is the target
target_col = df.columns[-1]

if df[target_col].dtype == 'object':   # if target is string labels
    encoder = LabelEncoder()
    df[target_col] = encoder.fit_transform(df[target_col])

# Also encode other categorical features if any
for col in df.columns:
    if df[col].dtype == 'object' and col != target_col:
        df[col] = LabelEncoder().fit_transform(df[col])

# 3.3 Split into features (X) and target (y)
X = df.drop(target_col, axis=1)
y = df[target_col]

# 3.4 Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print("\nBefore Scaling:\n", X.head())
print("\nAfter Scaling (first 5 rows):\n", X_scaled[:5])

# 3.5 Check for class imbalance
print("\nClass distribution:\n", y.value_counts())


Missing values per column:
 Region              0
Fresh               0
Milk                0
Grocery             0
Frozen              0
Detergents_Paper    0
Delicassen          0
class               0
dtype: int64

Before Scaling:
    Region  Fresh  Milk  Grocery  Frozen  Detergents_Paper  Delicassen
0       3  12669  9656     7561     214              2674        1338
1       3   7057  9810     9568    1762              3293        1776
2       3   6353  8808     7684    2405              3516        7844
3       3  13265  1196     4221    6404               507        1788
4       3  22615  5410     7198    3915              1777        5185

After Scaling (first 5 rows):
 [[ 0.59066829  0.05293319  0.52356777 -0.04111489 -0.58936716 -0.04356873
  -0.06633906]
 [ 0.59066829 -0.39130197  0.54445767  0.17031835 -0.27013618  0.08640684
   0.08915105]
 [ 0.59066829 -0.44702926  0.40853771 -0.0281571  -0.13753572  0.13323164
   2.24329255]
 [ 0.59066829  0.10011141 -0.62401993 -0.3929

In [4]:

# ----------------------------
# Step 4: Train-Test Split
# ----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

print("\nShapes:")
print("X_train:", X_train.shape, "X_test:", X_test.shape)


Shapes:
X_train: (352, 7) X_test: (88, 7)


In [8]:

# ----------------------------
# Step 5: Scratch Implementation of KNN
# ----------------------------
def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2) ** 2))

class KNN:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X, y):
      self.X_train = X
      self.y_train = np.array(y)   # ✅ convert to numpy array


    def predict(self, X):
        return [self._predict(x) for x in X]

    def _predict(self, x):
        # Calculate distance to all training points
        distances = [euclidean_distance(x, x_train) for x_train in self.X_train]

        # Get indices of k nearest neighbors
        k_idx = np.argsort(distances)[:self.k]

        # Extract labels of neighbors
        k_neighbor_labels = [self.y_train[i] for i in k_idx]

        # Majority vote
        most_common = max(set(k_neighbor_labels), key=k_neighbor_labels.count)
        return most_common

# Train and test scratch KNN
knn_scratch = KNN(k=5)
knn_scratch.fit(X_train, y_train)

scratch_preds = knn_scratch.predict(X_test)
scratch_accuracy = np.mean(scratch_preds == y_test)

print("\nScratch KNN Accuracy:", scratch_accuracy)


Scratch KNN Accuracy: 0.9545454545454546


In [9]:

# ----------------------------
# Step 6: KNN using Scikit-learn
# ----------------------------
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

sk_preds = knn.predict(X_test)
sk_accuracy = np.mean(sk_preds == y_test)

print("Scikit-learn KNN Accuracy:", sk_accuracy)

Scikit-learn KNN Accuracy: 0.9545454545454546
