In [2]:
import pandas as pd
import numpy as np
from collections import Counter
import random

# ---------------------------
# 1. Data Preparation
# ---------------------------

# Defining the dataset
data = dict(
    age=[25, 45, 30, 50, 35, 60, 28, 40, 55, 33],
    weight=[70, 85, 65, 90, 75, 95, 68, 80, 88, 72],
    smoker=['No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 'No'],
    risk=['Low', 'High', 'Low', 'High', 'Medium', 'High', 'Low', 'Medium', 'High', 'Medium'],
)

df = pd.DataFrame(data)

# Show the raw table before any encoded columns are added.
print("Dataset:", df, sep="\n")

# Integer-encode the categorical columns so they can be used numerically.
# 'smoker': No -> 0, Yes -> 1
df['smoker_encoded'] = df['smoker'].map({'No': 0, 'Yes': 1})

# 'risk' (the target): High -> 0, Low -> 1, Medium -> 2
risk_mapping = dict(High=0, Low=1, Medium=2)
df['risk_encoded'] = df['risk'].map(risk_mapping)

# Feature matrix (age, weight, smoker flag) and target vector as NumPy arrays
X = df.loc[:, ['age', 'weight', 'smoker_encoded']].to_numpy()
y = df['risk_encoded'].to_numpy()

# Train-Test Split
def train_test_split_custom(X, y, test_size=0.3, random_state=None):
    """Shuffle the samples and split them into train and test subsets.

    Parameters
    ----------
    X : np.ndarray
        Feature matrix with one sample per row.
    y : np.ndarray
        Labels, indexed in parallel with the rows of ``X``.
    test_size : float
        Fraction of the samples placed in the test set. The test count is
        ``int(n_samples * test_size)``, i.e. it is truncated, not rounded.
    random_state : int or None
        Seed passed to ``np.random.seed`` before shuffling. Note that this
        seeds NumPy's *global* generator, so later ``np.random`` calls are
        affected too. ``None`` leaves the generator untouched.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    # Compare against None explicitly: a plain truthiness test would silently
    # ignore the perfectly valid seed 0.
    if random_state is not None:
        np.random.seed(random_state)

    n_samples = X.shape[0]
    indices = np.arange(n_samples)
    np.random.shuffle(indices)

    # Keep the fraction and the resulting count in separate names.
    n_test = int(n_samples * test_size)
    test_indices = indices[:n_test]
    train_indices = indices[n_test:]
    return X[train_indices], X[test_indices], y[train_indices], y[test_indices]

X_train, X_test, y_train, y_test = train_test_split_custom(
    X, y, test_size=0.3, random_state=42
)

# Report both partitions: features first, then the matching labels.
print("\nTraining set:", X_train, sep="\n")
print("\nTraining set labels:", y_train, sep="\n")
print("\nTesting set:", X_test, sep="\n")
print("\nTesting set labels:", y_test, sep="\n")

Dataset:
   age  weight smoker    risk
0   25      70     No     Low
1   45      85    Yes    High
2   30      65     No     Low
3   50      90    Yes    High
4   35      75     No  Medium
5   60      95    Yes    High
6   28      68     No     Low
7   40      80    Yes  Medium
8   55      88    Yes    High
9   33      72     No  Medium

Training set:
[[25 70  0]
 [40 80  1]
 [30 65  0]
 [33 72  0]
 [35 75  0]
 [50 90  1]
 [28 68  0]]

Training set labels:
[1 2 1 2 2 0 1]

Testing set:
[[55 88  1]
 [45 85  1]
 [60 95  1]]

Testing set labels:
[0 0 0]


# **Decision Tree**

In [3]:
# ---------------------------
# 2. Decision Tree Implementation
# ---------------------------

class DecisionTree:
    """Classification tree grown greedily by minimising weighted Gini impurity.

    Every internal node tests ``x[feature_index] <= threshold``. The fitted
    tree is stored in ``self.tree`` as nested dicts: internal nodes look like
    ``{'type': 'node', 'feature_index', 'threshold', 'left', 'right'}`` and
    leaves look like ``{'type': 'leaf', 'class': label}``.

    Parameters
    ----------
    max_depth : int or None
        Maximum depth of the tree (the root is depth 0). ``None`` means the
        depth is not limited.
    min_samples_split : int
        Minimum number of samples a node must contain to be split further.
        Nodes with fewer samples become leaves.
    """

    def __init__(self, max_depth=None, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None  # populated by fit()

    def fit(self, X, y):
        """Grow the tree from feature matrix ``X`` (2-D array) and labels ``y``."""
        self.tree = self._build_tree(X, y)

    def _gini(self, y):
        """Return the Gini impurity ``1 - sum(p_k ** 2)`` of the labels in ``y``."""
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        return 1 - sum(p ** 2 for p in probabilities)

    def _leaf(self, y):
        """Build a leaf node predicting the most frequent label in ``y``.

        ``np.unique`` returns labels sorted, so ties go to the smallest label.
        Unlike ``np.bincount`` this also works for labels that are not
        non-negative integers.
        """
        labels, counts = np.unique(y, return_counts=True)
        return {'type': 'leaf', 'class': labels[counts.argmax()]}

    def _best_split(self, X, y):
        """Find the (feature, threshold) pair with the lowest weighted Gini.

        Every distinct value of every feature is tried as a threshold.
        Returns a dict with keys ``feature_index``, ``threshold`` and ``score``,
        or ``None`` when no threshold separates the samples into two non-empty
        groups.
        """
        best_split = {'feature_index': None, 'threshold': None, 'score': float('inf')}
        n_samples, n_features = X.shape

        for feature_index in range(n_features):
            column = X[:, feature_index]
            for threshold in np.unique(column):
                left_mask = column <= threshold
                n_left = int(left_mask.sum())
                n_right = n_samples - n_left

                # A split is only useful if it actually separates the samples.
                # min_samples_split is deliberately NOT applied to the children
                # here: it limits which nodes may be split (see _build_tree),
                # not how small a resulting child may be.
                if n_left == 0 or n_right == 0:
                    continue

                left_gini = self._gini(y[left_mask])
                right_gini = self._gini(y[~left_mask])
                weighted_avg_gini = (n_left * left_gini + n_right * right_gini) / n_samples

                # Strict '<' keeps the first candidate found among equal scores.
                if weighted_avg_gini < best_split['score']:
                    best_split = {
                        'feature_index': feature_index,
                        'threshold': threshold,
                        'score': weighted_avg_gini
                    }
        return best_split if best_split['score'] < float('inf') else None

    def _build_tree(self, X, y, depth=0):
        """Recursively build the subtree for the samples ``(X, y)`` at ``depth``."""
        n_samples = X.shape[0]
        n_labels = len(np.unique(y))

        # Stop when the depth limit is reached, the node is pure, or it holds
        # too few samples to be split.
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if depth_exhausted or n_labels == 1 or n_samples < self.min_samples_split:
            return self._leaf(y)

        best_split = self._best_split(X, y)
        if best_split is None:
            return self._leaf(y)

        left_mask = X[:, best_split['feature_index']] <= best_split['threshold']
        right_mask = ~left_mask

        return {
            'type': 'node',
            'feature_index': best_split['feature_index'],
            'threshold': best_split['threshold'],
            'left': self._build_tree(X[left_mask], y[left_mask], depth + 1),
            'right': self._build_tree(X[right_mask], y[right_mask], depth + 1),
        }

    def predict(self, X):
        """Return an array with the predicted label for every row of ``X``.

        Raises
        ------
        RuntimeError
            If the tree has not been fitted yet.
        """
        if self.tree is None:
            raise RuntimeError("DecisionTree.predict() called before fit().")
        return np.array([self._predict_input(x, self.tree) for x in X])

    def _predict_input(self, x, node):
        """Walk from ``node`` down to a leaf for the single sample ``x``."""
        if node['type'] == 'leaf':
            return node['class']
        if x[node['feature_index']] <= node['threshold']:
            return self._predict_input(x, node['left'])
        else:
            return self._predict_input(x, node['right'])


# ---------------------------
# Decision Tree: Training and Evaluation
# ---------------------------

# Initialize and train Decision Tree
dt = DecisionTree(max_depth=3)
dt.fit(X_train, y_train)

# Label the held-out samples with the fitted tree
y_pred_dt = dt.predict(X_test)

# Accuracy = share of test samples whose predicted label matches the truth
accuracy_dt = np.equal(y_pred_dt, y_test).sum() / len(y_test)
print("\nDecision Tree Accuracy:", accuracy_dt)


Decision Tree Accuracy: 1.0


# **Random Forest**

In [4]:
# ---------------------------
# 3. Random Forest Implementation
# ---------------------------

class RandomForest:
    """Bagged ensemble of ``DecisionTree`` classifiers with majority voting.

    Each tree is trained on a bootstrap sample of the rows and on one random
    subset of the feature columns. The subset is drawn once per tree (not per
    split) and remembered on the tree as ``tree.feature_indices``.

    Parameters
    ----------
    n_trees : int
        Number of trees in the forest.
    max_depth : int or None
        Passed through to every ``DecisionTree``.
    min_samples_split : int
        Passed through to every ``DecisionTree``.
    max_features : {'sqrt', 'log2'}, int, or anything else
        Number of feature columns given to each tree: ``int(sqrt(n))``,
        ``int(log2(n))`` or the given integer. Any other value means all
        columns. The resulting count is clamped to ``[1, n]``.
    random_state : int or None
        Seed for a private ``np.random.RandomState`` used by ``fit``. ``None``
        (the default) draws from NumPy's global generator instead.
    """

    def __init__(self, n_trees=10, max_depth=None, min_samples_split=2, max_features='sqrt',
                 random_state=None):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_features = max_features
        self.random_state = random_state
        self.trees = []

    def _n_features_per_tree(self, n_total):
        """Translate ``self.max_features`` into a column count for ``n_total`` columns."""
        if self.max_features == 'sqrt':
            n_features = int(np.sqrt(n_total))
        elif self.max_features == 'log2':
            n_features = int(np.log2(n_total))
        elif isinstance(self.max_features, (int, np.integer)):
            n_features = int(self.max_features)
        else:
            n_features = n_total
        # int(log2(1)) == 0 would train trees on zero columns, and asking for
        # more columns than exist makes choice(replace=False) raise; clamp both.
        return min(max(n_features, 1), n_total)

    def fit(self, X, y):
        """Train ``n_trees`` trees on bootstrap samples of ``(X, y)``.

        ``X`` is a 2-D array (samples x features) and ``y`` holds the labels,
        indexed in parallel with the rows of ``X``.
        """
        # np.random (the module) and RandomState expose the same choice() API,
        # so the default path keeps using the global generator.
        rng = np.random if self.random_state is None else np.random.RandomState(self.random_state)

        n_samples, n_total = len(X), X.shape[1]
        n_features = self._n_features_per_tree(n_total)  # same for every tree

        self.trees = []
        for _ in range(self.n_trees):
            # Bootstrap: draw n_samples rows with replacement.
            indices = rng.choice(n_samples, n_samples, replace=True)
            # Random feature subset for this tree, drawn without replacement.
            feature_indices = rng.choice(n_total, n_features, replace=False)

            X_sample = X[indices][:, feature_indices]
            y_sample = y[indices]

            # Train Decision Tree
            tree = DecisionTree(max_depth=self.max_depth, min_samples_split=self.min_samples_split)
            tree.fit(X_sample, y_sample)
            # Remember which columns this tree saw so predict() can select them.
            tree.feature_indices = feature_indices
            self.trees.append(tree)

    def predict(self, X):
        """Return a list with the majority-vote label for every row of ``X``.

        Ties are resolved by ``Counter.most_common``, i.e. in favour of the
        label that appears first among the trees' predictions.

        Raises
        ------
        RuntimeError
            If the forest contains no trained trees.
        """
        if not self.trees:
            raise RuntimeError("RandomForest.predict() called before fit().")

        # Shape (n_trees, n_samples) -> transpose so each row holds one sample's votes.
        tree_preds = np.array([tree.predict(X[:, tree.feature_indices]) for tree in self.trees]).T
        return [Counter(row).most_common(1)[0][0] for row in tree_preds]


# Initialize and train Random Forest
rf = RandomForest(n_trees=5, max_depth=3, max_features='sqrt')
rf.fit(X_train, y_train)

# Majority-vote labels for the held-out samples
y_pred_rf = rf.predict(X_test)

# Accuracy = share of test samples whose predicted label matches the truth
accuracy_rf = np.equal(y_pred_rf, y_test).sum() / len(y_test)
print("Random Forest Accuracy:", accuracy_rf)

Random Forest Accuracy: 1.0


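Result:
Decision Tree Accuracy: 1.0
Random Forest Accuracy: 1.0

Note: the test split contains only three samples, all labelled High (encoded 0), so these accuracies are not a reliable estimate of model quality.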