In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the Iris dataset. A missing file is reported by re-raising
# FileNotFoundError with a message that names the file actually requested
# ('Iris.csv'); calling exit() here would end the interpreter/kernel session
# instead of surfacing the error in the cell.
try:
    df = pd.read_csv('Iris.csv')
except FileNotFoundError as err:
    raise FileNotFoundError(
        "Error: 'Iris.csv' not found. Please make sure the file is in the same directory as the script."
    ) from err

print("Dataset loaded successfully:")
print(df.head())

# Remove the 'Id' column before modelling. Reassigning (rather than
# inplace=True) keeps every step an explicit "new frame from old frame".
df = df.drop('Id', axis=1)
X = df.drop('Species', axis=1)  # feature columns: everything except the label
y = df['Species']               # target labels

# Hold out 20% of the rows for testing. stratify=y asks train_test_split to
# stratify the split on the species labels; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=150,
    stratify=y,
)

# Print the shape of each piece of the split.
shape_report = (
    ("\nShape of training features (X_train):", X_train),
    ("Shape of testing features (X_test):", X_test),
    ("Shape of training target (y_train):", y_train),
    ("Shape of testing target (y_test):", y_test),
)
for caption, part in shape_report:
    print(caption, part.shape)

# Print the relative class frequencies of the full label column and of each
# side of the split so they can be compared by eye.
distribution_report = (
    ("\nClass distribution in the original dataset:", y),
    ("\nClass distribution in the training set:", y_train),
    ("\nClass distribution in the testing set:", y_test),
)
for caption, labels in distribution_report:
    print(caption)
    print(labels.value_counts(normalize=True))


Dataset loaded successfully:
   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0   1            5.1           3.5            1.4           0.2  Iris-setosa
1   2            4.9           3.0            1.4           0.2  Iris-setosa
2   3            4.7           3.2            1.3           0.2  Iris-setosa
3   4            4.6           3.1            1.5           0.2  Iris-setosa
4   5            5.0           3.6            1.4           0.2  Iris-setosa

Shape of training features (X_train): (120, 4)
Shape of testing features (X_test): (30, 4)
Shape of training target (y_train): (120,)
Shape of testing target (y_test): (30,)

Class distribution in the original dataset:
Species
Iris-setosa        0.333333
Iris-versicolor    0.333333
Iris-virginica     0.333333
Name: proportion, dtype: float64

Class distribution in the training set:
Species
Iris-versicolor    0.333333
Iris-setosa        0.333333
Iris-virginica     0.333333
Name: proportion, dtype: float64

In [13]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# -------------------------------
# 1. Load and Prepare the Data
# -------------------------------
df = pd.read_csv("Iris.csv")

# Drop ID column if present. errors="ignore" is what makes "if present" true:
# without it, drop() raises KeyError when the column is missing.
df = df.drop(columns="Id", errors="ignore")

# Encode species labels to numbers
label_encoder = LabelEncoder()
df["Species"] = label_encoder.fit_transform(df["Species"])

# Separate features and labels by column name, so the result does not depend
# on "Species" happening to be the last column of the file.
X = df.drop(columns="Species").values  # every column except the label
y = df["Species"].values               # encoded species label

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# -------------------------------
# 2. KNN Classifier from Scratch
# -------------------------------
class KNNClassifier:
    """k-nearest-neighbours classifier (Euclidean distance, majority vote).

    Inputs may be anything ``np.asarray`` can turn into a numeric 2-D array
    (nested lists, ndarrays, DataFrames); they are converted once so that
    row iteration and positional label lookup behave the same for all of them.
    """

    def __init__(self, k=3):
        """Create the classifier.

        Args:
            k: Number of neighbours that vote on each prediction (>= 1).

        Raises:
            ValueError: If ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError("k must be at least 1")
        self.k = k

    def fit(self, X_train, y_train):
        """Memorise the training samples and their labels.

        Args:
            X_train: Training features, one row per sample.
            y_train: Training labels, one per row of ``X_train``.

        Raises:
            ValueError: If the number of rows and labels differ.
        """
        X_train = np.asarray(X_train, dtype=float)
        y_train = np.asarray(y_train)
        if len(X_train) != len(y_train):
            raise ValueError("X_train and y_train must have the same number of samples")
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test):
        """Predict a label for every row of ``X_test``.

        Args:
            X_test: Query features, one row per sample.

        Returns:
            A list with one predicted label per query row.
        """
        X_test = np.asarray(X_test, dtype=float)
        predictions = []
        for test_point in X_test:
            # Distance from this query to every training row in one call.
            distances = np.linalg.norm(self.X_train - test_point, axis=1)
            # A stable sort keeps equidistant neighbours in training order,
            # so the choice of the k nearest is deterministic.
            k_indices = np.argsort(distances, kind="stable")[:self.k]
            k_nearest_labels = [self.y_train[i] for i in k_indices]
            # Labels are counted nearest-first; most_common(1) picks the
            # label with the highest count among the k neighbours.
            most_common = Counter(k_nearest_labels).most_common(1)[0][0]
            predictions.append(most_common)
        return predictions


# -------------------------------
# 3. Decision Tree from Scratch
# -------------------------------
class Node:
    """A single node of the decision tree.

    The tree builder creates split nodes with ``feature``, ``threshold``,
    ``left`` and ``right`` set, and leaf nodes with only ``value`` set.
    Every field defaults to ``None``.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        # Split description: which feature index is tested against which threshold.
        self.feature, self.threshold = feature, threshold
        # Child subtrees.
        self.left, self.right = left, right
        # Stored prediction; traversal stops at a node whose value is not None.
        self.value = value

class DecisionTreeClassifier:
    """Binary decision tree classifier grown greedily on Gini impurity.

    Each split sends samples with ``x[feature] < threshold`` to the left
    child and all others to the right child.
    """

    def __init__(self, max_depth=5):
        """Create the classifier.

        Args:
            max_depth: Maximum depth of the tree; ``None`` means the depth
                is not limited.
        """
        self.max_depth = max_depth

    def _gini(self, y):
        """Return the Gini impurity ``1 - sum(p_c ** 2)`` of the labels ``y``."""
        _, counts = np.unique(y, return_counts=True)
        probs = counts / len(y)
        return 1 - np.sum(probs ** 2)

    def _best_split(self, X, y):
        """Find the split with the lowest size-weighted Gini impurity.

        Candidate thresholds are the distinct values of each feature column.

        Returns:
            ``(feature_index, threshold)``, or ``(None, None)`` when every
            candidate would leave one side empty.
        """
        m, n = X.shape
        best_gini = 1
        split_idx, split_thresh = None, None

        for feature in range(n):
            column = X[:, feature]  # hoisted: reused for every threshold
            for threshold in np.unique(column):
                left = y[column < threshold]
                right = y[column >= threshold]

                # A split that leaves one side empty separates nothing.
                if len(left) == 0 or len(right) == 0:
                    continue

                gini = (len(left) * self._gini(left) + len(right) * self._gini(right)) / m

                # Strict comparison: the first best candidate wins ties.
                if gini < best_gini:
                    best_gini = gini
                    split_idx = feature
                    split_thresh = threshold
        return split_idx, split_thresh

    def _build_tree(self, X, y, depth=0):
        """Recursively grow the subtree for the samples ``X`` / ``y``."""
        classes, counts = np.unique(y, return_counts=True)
        majority_class = classes[np.argmax(counts)]

        # Stop on a pure node or when the (optional) depth limit is reached.
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if len(classes) == 1 or depth_exhausted:
            return Node(value=majority_class)

        feat, thresh = self._best_split(X, y)
        if feat is None:
            return Node(value=majority_class)

        left_idx = X[:, feat] < thresh
        right_idx = ~left_idx

        left = self._build_tree(X[left_idx], y[left_idx], depth + 1)
        right = self._build_tree(X[right_idx], y[right_idx], depth + 1)

        return Node(feat, thresh, left, right)

    def fit(self, X, y):
        """Grow the tree from the training data.

        Args:
            X: Training features, one row per sample (anything
                ``np.asarray`` converts to a 2-D array).
            y: Training labels, one per row of ``X``.

        Raises:
            ValueError: If ``X`` is not 2-D, is empty, or its row count
                differs from the number of labels.
        """
        # Convert once so column slicing and boolean masking work for
        # lists and DataFrames as well as ndarrays.
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be 2-dimensional (samples x features)")
        if len(X) == 0:
            raise ValueError("cannot fit a tree on an empty training set")
        if len(X) != len(y):
            raise ValueError("X and y must have the same number of samples")
        self.root = self._build_tree(X, y)

    def _predict(self, node, x):
        """Walk from ``node`` down to a leaf for sample ``x``; return its value."""
        if node.value is not None:
            return node.value
        if x[node.feature] < node.threshold:
            return self._predict(node.left, x)
        else:
            return self._predict(node.right, x)

    def predict(self, X_test):
        """Return a list with one predicted label per row of ``X_test``."""
        X_test = np.asarray(X_test)
        return [self._predict(self.root, x) for x in X_test]


# -------------------------------
# 4. Evaluate Classifiers
# -------------------------------
def accuracy(y_true, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Both arguments are converted to arrays first. Without that, comparing
    two plain lists with ``==`` yields a single bool rather than an
    element-wise result, and the score comes out wrong.

    Args:
        y_true: True labels.
        y_pred: Predicted labels, same shape as ``y_true``.

    Returns:
        The proportion of matching positions, between 0 and 1.

    Raises:
        ValueError: If the inputs differ in shape or are empty.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError("y_true and y_pred must have the same shape")
    if y_true.size == 0:
        raise ValueError("cannot compute accuracy of empty inputs")
    return np.mean(y_true == y_pred)


# k-NN: fit on the training split, predict the held-out rows, report the score
knn = KNNClassifier(k=3)
knn.fit(X_train, y_train)
knn_preds = knn.predict(X_test)
knn_score = accuracy(y_test, knn_preds)
print("🔹 KNN Accuracy:", knn_score)

# Decision tree: same procedure with a depth limit of 5
tree = DecisionTreeClassifier(max_depth=5)
tree.fit(X_train, y_train)
tree_preds = tree.predict(X_test)
tree_score = accuracy(y_test, tree_preds)
print("🔸 Decision Tree Accuracy:", tree_score)


🔹 KNN Accuracy: 1.0
🔸 Decision Tree Accuracy: 1.0


In [17]:
class DecisionNode:
    """Node of the decision tree: either a split or a leaf.

    Attributes:
        feature_idx: Index of the feature to split on.
        threshold: Threshold value for the split.
        left: Left subtree.
        right: Right subtree.
        value: Value if this is a leaf node.
    """

    def __init__(self, feature_idx=None, threshold=None, left=None, right=None, value=None):
        self.feature_idx, self.threshold = feature_idx, threshold
        self.left, self.right = left, right
        self.value = value

class DecisionTreeClassifier:
    """Binary decision tree classifier grown greedily on Gini impurity.

    Samples with ``x[feature_idx] < threshold`` go to the left child, all
    others to the right child.
    """

    def __init__(self, max_depth=None, min_samples_split=2):
        """Create the classifier.

        Args:
            max_depth: Maximum tree depth; ``None`` means no depth limit.
            min_samples_split: A node with fewer samples than this becomes
                a leaf.
        """
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None

    def fit(self, X, y):
        """Grow the tree from the training data.

        Args:
            X: Training features, one row per sample (anything
                ``np.asarray`` converts to a 2-D array).
            y: Training labels, one per row of ``X``.

        Raises:
            ValueError: If ``X`` is not 2-D, is empty, or its row count
                differs from the number of labels.
        """
        # Convert once so column slicing and boolean masking work for
        # lists and DataFrames as well as ndarrays.
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be 2-dimensional (samples x features)")
        if len(X) == 0:
            raise ValueError("cannot fit a tree on an empty training set")
        if len(X) != len(y):
            raise ValueError("X and y must have the same number of samples")
        self.tree = self._grow_tree(X, y)

    def _grow_tree(self, X, y, depth=0):
        """Recursively grow the subtree for the samples ``X`` / ``y``."""
        n_samples, n_features = X.shape
        n_classes = len(np.unique(y))

        # Stopping criteria
        if (self.max_depth is not None and depth >= self.max_depth) or \
           n_samples < self.min_samples_split or \
           n_classes == 1:
            return DecisionNode(value=self._most_common_label(y))

        # Find best split
        best_feature, best_threshold = self._best_split(X, y, n_features)

        # No candidate separates the samples (e.g. identical rows carrying
        # different labels): stop here with a majority-vote leaf.
        if best_feature is None:
            return DecisionNode(value=self._most_common_label(y))

        # Split the data
        left_indices = X[:, best_feature] < best_threshold
        right_indices = ~left_indices
        left = self._grow_tree(X[left_indices], y[left_indices], depth + 1)
        right = self._grow_tree(X[right_indices], y[right_indices], depth + 1)

        return DecisionNode(feature_idx=best_feature, threshold=best_threshold,
                            left=left, right=right)

    def _best_split(self, X, y, n_features):
        """Find the split with the lowest weighted Gini impurity.

        Candidate thresholds are the distinct values of each feature column.
        Candidates that would leave one side empty are skipped. Accepting
        one would make the caller recurse on an empty partition and on an
        unchanged copy of the node.

        Returns:
            ``(feature_idx, threshold)``, or ``(None, None)`` if no
            candidate puts at least one sample on each side.
        """
        best_gini = float('inf')
        best_feature = None
        best_threshold = None

        for feature_idx in range(n_features):
            column = X[:, feature_idx]  # hoisted: reused for every threshold
            for threshold in np.unique(column):
                left_indices = column < threshold
                n_left = int(np.count_nonzero(left_indices))
                if n_left == 0 or n_left == len(y):
                    continue

                gini = self._gini_impurity(y[left_indices], y[~left_indices])

                # Strict comparison: the first best candidate wins ties.
                if gini < best_gini:
                    best_gini = gini
                    best_feature = feature_idx
                    best_threshold = threshold

        return best_feature, best_threshold

    def _gini_impurity(self, left_y, right_y):
        """Return the size-weighted Gini impurity of a two-way partition.

        An empty side contributes nothing; two empty sides give ``0.0``.
        """
        n = len(left_y) + len(right_y)
        if n == 0:
            return 0.0

        weighted = 0.0
        for side in (left_y, right_y):
            if len(side) == 0:
                continue
            _, counts = np.unique(side, return_counts=True)
            probs = counts / len(side)
            weighted += (len(side) / n) * (1.0 - np.sum(probs ** 2))
        return weighted

    def _most_common_label(self, y):
        """Return the most frequent label in ``y``.

        Raises:
            ValueError: If ``y`` is empty.
        """
        if len(y) == 0:
            raise ValueError("cannot take the most common label of no samples")
        counts = Counter(y)
        return counts.most_common(1)[0][0]

    def predict(self, X):
        """Return an array with one predicted label per row of ``X``."""
        X = np.asarray(X)
        return np.array([self._predict_tree(x, self.tree) for x in X])

    def _predict_tree(self, x, node):
        """Walk from ``node`` down to a leaf for sample ``x``; return its value."""
        if node.value is not None:
            return node.value
        if x[node.feature_idx] < node.threshold:
            return self._predict_tree(x, node.left)
        else:
            return self._predict_tree(x, node.right)
