## Day 3 - K Nearest Neighbour classification and Decision Trees

### 1. Setup

In [None]:
!pip install numpy pandas matplotlib seaborn scikit-learn


### 2. K Nearest Neighbour

**K-Nearest Neighbors (KNN)** is a supervised learning algorithm that predicts the output of a data point based on the majority class or average value of its k closest neighbors.
It is a non-parametric, distance-based method that works for both classification and regression.

#### 2.1 Loading the dataset

We have a dataset of Iris flowers with measurements for their petals and sepals.

Our goal is to train a model that can identify the species of a new flower simply by measuring its dimensions.

In [None]:
import pandas as pd                     # Import pandas for data handling
from sklearn.datasets import load_iris  # Import function to load the iris dataset

print("\nLoading Iris Dataset...")

# Fetch sklearn's bundled Iris data; its data, target and feature_names
# attributes are used below
iris = load_iris()

# Readable names for the integer class labels (0, 1, 2)
species_map = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}

# Build the frame in a single chain: the feature columns first, then the
# integer label, then the species name looked up from that label
df = (
    pd.DataFrame(iris.data, columns=iris.feature_names)
    .assign(species_id=iris.target)
    .assign(species_name=lambda frame: frame['species_id'].map(species_map))
)

# Preview the first five rows to see how the data looks
print("First 5 rows of the Iris data:", df.head(), sep="\n")

#### 2.2 Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Switch matplotlib to the 'ggplot' style sheet for better visibility
plt.style.use('ggplot')

# One 10x6-inch figure holding a single Axes to draw on
fig, ax = plt.subplots(figsize=(10, 6))

# Sepal length against sepal width, coloured by species
sns.scatterplot(
    data=df,
    x='sepal length (cm)',
    y='sepal width (cm)',
    hue='species_name',
    s=100,
    palette='bright',
    ax=ax,
)
ax.set_title("Iris Species Distribution: Sepal Dimensions")
ax.set_xlabel("Sepal Length (cm)")
ax.set_ylabel("Sepal Width (cm)")
ax.legend(title='Species')
ax.grid(True)
plt.show()

#### 2.3 Data Preprocessing


In [None]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Model inputs (X) are the two sepal measurements; the target (y) is the species name
feature_columns = ['sepal length (cm)', 'sepal width (cm)']
X = df[feature_columns]
y = df['species_name']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Learn the scaling statistics from the training rows only, then apply that
# same fitted scaler to both the training and the test split
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Show the first training row before and after scaling to verify
first_original = X_train.iloc[0]
first_scaled = X_train_scaled[0]
print("\nData Scaled.")
print(f"Original Feature Example: \n{first_original}")
print(f"Scaled Feature Example: {first_scaled}")


#### 2.4 Fitting the model

In [None]:

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import numpy as np

# Number of neighbours that vote on each prediction
k_value = 32

# Create the classifier and fit it on the scaled training data (fit returns the estimator)
knn = KNeighborsClassifier(n_neighbors=k_value).fit(X_train_scaled, y_train)

print(f"\nKNN Model trained with K={k_value}")

# Predict a species for every row of the scaled test set
pred = knn.predict(X_test_scaled)

print("\n--- Confusion Matrix ---", confusion_matrix(y_test, pred), sep="\n")
print("\n--- Classification Report ---", classification_report(y_test, pred), sep="\n")

# Accuracy, plus precision / recall / F1 using the 'weighted' average
acc = accuracy_score(y_test, pred)
precision, recall, f1 = (
    scorer(y_test, pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

print(f"\nAccuracy: {acc:.4f}")
print(f"Precision (weighted): {precision:.4f}")
print(f"Recall (weighted): {recall:.4f}")
print(f"F1 Score (weighted): {f1:.4f}")


#### 2.5 Choosing the right K value

In [None]:
# Exclusive upper bound of the sweep: K takes the values 1 .. range_of_k - 1 (1 to 34)
range_of_k = 35
k_values = range(1, range_of_k)

# Misclassification rate on the test set for each K.
# NOTE: because the error is measured on the test set, a K picked from this plot
# is tuned to that set; cross-validation on the training data would avoid that.
error_rate = []
for i in k_values:
    knn_i = KNeighborsClassifier(n_neighbors=i)
    knn_i.fit(X_train_scaled, y_train)
    pred_i = knn_i.predict(X_test_scaled)
    # Mean of the boolean mismatch mask = fraction of test rows predicted wrongly
    error_rate.append(np.mean(pred_i != y_test))

plt.figure(figsize=(10, 6))
plt.plot(k_values, error_rate, color='blue', linestyle='dashed', marker='o',
         markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')
plt.xticks(k_values)
print("\nPlotting Error Rate... Look for the lowest error or the 'elbow'.")
plt.show()

#### 2.6 New predictions

Let's pass in some measurements of our own, rather than rows from the dataset, and see what the model predicts!

In [None]:
# Sepal length and sepal width (cm) of the flower to classify
mystery_flower = [[5.1, 3.5]]

# The scaler was fitted on a DataFrame, so it remembers its column names.
# Wrapping the sample in a DataFrame with those same columns avoids
# scikit-learn's "X does not have valid feature names" warning.
mystery_frame = pd.DataFrame(mystery_flower, columns=scaler.feature_names_in_)
mystery_scaled = scaler.transform(mystery_frame)

# The KNN model was trained on scaled values, so predict on the scaled sample
prediction = knn.predict(mystery_scaled)
print(f"\nMystery Flower Prediction (Dimensions: {mystery_flower[0]}):")
print(f"The Model identifies this as: {prediction[0]}")

### 3. Decision Trees

Topics: Information Gain, Gini Impurity, Non-Parametric Modeling

#### 3.1 Understanding Gini Index 

Gini Index measures node impurity: it is the probability that a randomly chosen sample from the node would be misclassified if it were labeled at random according to the node's class distribution.


In [None]:

import numpy as np

def calculate_gini(labels):
    """Return the Gini impurity of a collection of class labels.

    Gini = 1 - sum(p_i ** 2), where p_i is the fraction of labels in class i.
      * 0.0 = perfect purity (all labels are the same class)
      * 0.5 = maximum impurity for two classes (a 50/50 split)

    Args:
        labels: Array-like of class labels (anything ``np.unique`` accepts).

    Returns:
        The impurity as a float. An empty collection contains nothing that
        could be misclassified, so it is reported as 0.0.
    """
    _, counts = np.unique(labels, return_counts=True)
    total = counts.sum()
    if total == 0:
        # No labels at all: without this guard the formula would yield 1 - 0 = 1.0,
        # i.e. "maximally impure", for an empty node.
        return 0.0
    probabilities = counts / total
    return 1 - np.sum(probabilities ** 2)

# Two toy crates: five bottles each of Wine A and Wine B (high impurity),
# and ten bottles of Wine A only (pure)
mixed_crate = [bottle for bottle in ('Wine A', 'Wine B') for _ in range(5)]
pure_crate = ['Wine A' for _ in range(10)]

# Score both crates up front, then report the results
gini_mixed = calculate_gini(mixed_crate)
gini_pure = calculate_gini(pure_crate)

print("\nPart 2: Decision Trees Started.")
print(f"Gini of Mixed Crate (50/50): {gini_mixed:.2f}")
print(f"Gini of Pure Crate (100/0):  {gini_pure:.2f}")
print("The Tree searches for splits that result in 'Pure Crates'.")


#### 3.2 Loading the dataset

We load the Wine dataset from Scikit-Learn.

13 Features (Alcohol, Malic Acid, Ash, Alkalinity, etc.)

Target: Cultivar Class (0, 1, or 2)

In [None]:

import pandas as pd
from sklearn.datasets import load_wine

from sklearn.model_selection import train_test_split

# Load sklearn's bundled Wine data and wrap it in a frame,
# storing the class label in a 'target' column
wine = load_wine()
df_wine = pd.DataFrame(wine.data, columns=wine.feature_names).assign(target=wine.target)

print("\n--- Wine Dataset Loaded ---")
print(f"Features: {len(wine.feature_names)}")
print(f"Target Classes: {wine.target_names}")
print("First 5 rows:", df_wine.head(), sep="\n")

# Inputs are all feature columns; the label is the 'target' column
X_wine = df_wine[wine.feature_names]
y_wine = df_wine['target']

# Split data: 20% of the rows are held out for testing, seeded for repeatability
X_train_w, X_test_w, y_train_w, y_test_w = train_test_split(
    X_wine,
    y_wine,
    test_size=0.2,
    random_state=42,
)



##### **MODEL 1: Gini impurity**

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Depth-limited tree (at most 3 levels) that scores splits with Gini impurity;
# the fixed random_state keeps the fitted tree repeatable
dt_gini = DecisionTreeClassifier(
    criterion='gini',
    max_depth=3,
    random_state=42,
).fit(X_train_w, y_train_w)

print("\nDecision Tree (Gini) Trained.")

#### 3.3 Visualization

In [None]:

from sklearn.tree import plot_tree, export_text
import matplotlib.pyplot as plt

# A large 16x10-inch canvas with a single Axes for the tree diagram
fig, ax = plt.subplots(figsize=(16, 10))

# Draw the fitted tree, labelling nodes with the wine feature and class names
plot_tree(
    dt_gini,
    feature_names=wine.feature_names,
    class_names=wine.target_names,
    filled=True,
    rounded=True,
    fontsize=10,
    ax=ax,
)
ax.set_title("Visualizing the Decision Tree Logic (Gini)")
plt.show()

#### 3.4 Evaluation

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Score the fitted tree on the held-out wine samples
pred_w = dt_gini.predict(X_test_w)
acc = accuracy_score(y_test_w, pred_w)
# zero_division=0 reports 0 (instead of warning) for a class that is never predicted
precision = precision_score(y_test_w, pred_w, average='weighted', zero_division=0)
recall = recall_score(y_test_w, pred_w, average='weighted', zero_division=0)
f1 = f1_score(y_test_w, pred_w, average='weighted', zero_division=0)

print(f"\nModel Accuracy on Test Data: {acc*100:.2f}%")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print("\nClassification Report:\n", classification_report(y_test_w, pred_w, target_names=wine.target_names, zero_division=0))

# Predict for a single sample.
# Indexing with a list (iloc[[0]]) keeps a one-row DataFrame, so the feature
# names the tree was fitted with are preserved. Converting the row to a bare
# NumPy array makes scikit-learn warn that X has no valid feature names.
sample_wine = X_test_w.iloc[[0]]
prediction_code = dt_gini.predict(sample_wine)[0]
prediction_label = wine.target_names[prediction_code]
print(f"Prediction for Sample Wine: {prediction_label}")