In [57]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split


In [59]:
# Read the semicolon-separated CSV and report its raw shape and column names.
df = pd.read_csv("bank-additional-full.csv", sep=';', quotechar='"')
print("Initial Data Shape:", df.shape)
print("Columns:", df.columns)

# Rename the target column 'y' to 'subscribed' (reassignment instead of in-place mutation).
df = df.rename(columns={'y': 'subscribed'})

Initial Data Shape: (41188, 21)
Columns: Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')


In [61]:
# Encode the target as 0/1.
# The mapping also sends 1 -> 1 and 0 -> 0 so that re-running this cell is harmless:
# Series.map turns every value missing from the dict into NaN, which would otherwise
# wipe out the already-encoded target on a second execution.
df['subscribed'] = df['subscribed'].map({'yes': 1, 'no': 0, 1: 1, 0: 0})

# Drop every row holding the placeholder 'unknown' in any object-dtype column.
for col in df.columns:
    if df[col].dtype == 'object':
        df = df[df[col] != 'unknown']

# Boolean filtering returns a slice of the previous frame; take an explicit copy so
# later column assignments operate on an independent DataFrame.
df = df.copy()

print("After Dropping 'unknown':", df.shape)

After Dropping 'unknown': (30488, 21)


In [63]:
# Every remaining object-dtype column except the target is treated as categorical.
cat_cols = [
    name for name in df.columns
    if name != 'subscribed' and df[name].dtype == 'object'
]
print("Categorical Columns:", cat_cols)

# Label-encode each categorical column: categories are sorted and numbered 0..k-1
# in that sorted order, and the mapping is printed for reference.
for col in cat_cols:
    unique_vals = sorted(df[col].unique())
    encoding_dict = dict(zip(unique_vals, range(len(unique_vals))))
    df[col] = df[col].map(encoding_dict)
    print(f"Encoding for '{col}':", encoding_dict)

print("Data preview after encoding:\n", df.head())

Categorical Columns: ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
Encoding for 'job': {'admin.': 0, 'blue-collar': 1, 'entrepreneur': 2, 'housemaid': 3, 'management': 4, 'retired': 5, 'self-employed': 6, 'services': 7, 'student': 8, 'technician': 9, 'unemployed': 10}
Encoding for 'marital': {'divorced': 0, 'married': 1, 'single': 2}
Encoding for 'education': {'basic.4y': 0, 'basic.6y': 1, 'basic.9y': 2, 'high.school': 3, 'illiterate': 4, 'professional.course': 5, 'university.degree': 6}
Encoding for 'default': {'no': 0, 'yes': 1}
Encoding for 'housing': {'no': 0, 'yes': 1}
Encoding for 'loan': {'no': 0, 'yes': 1}
Encoding for 'contact': {'cellular': 0, 'telephone': 1}
Encoding for 'month': {'apr': 0, 'aug': 1, 'dec': 2, 'jul': 3, 'jun': 4, 'mar': 5, 'may': 6, 'nov': 7, 'oct': 8, 'sep': 9}
Encoding for 'day_of_week': {'fri': 0, 'mon': 1, 'thu': 2, 'tue': 3, 'wed': 4}
Encoding for 'poutcome': {'failure': 0, 'nonexistent': 1,

In [65]:
# Separate the target vector from the feature matrix; both are plain NumPy arrays.
y = df['subscribed'].values
X = df.drop(columns='subscribed').values

print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (30488, 20)
y shape: (30488,)


In [67]:
# Hold out a test_size=0.2 share of the rows; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Train shapes:", X_train.shape, y_train.shape)
print("Test shapes:", X_test.shape, y_test.shape)

Train shapes: (24390, 20) (24390,)
Test shapes: (6098, 20) (6098,)


In [69]:
def gini(y):
    """Return the Gini impurity of a collection of class labels.

    Computed as ``1 - sum(p_k ** 2)`` where ``p_k`` is the share of label ``k``.

    Parameters
    ----------
    y : sequence of hashable labels (list, tuple or 1-D NumPy array).

    Returns
    -------
    float
        0.0 for a pure or empty collection, larger values for more mixed labels.
    """
    n_samples = len(y)
    # An empty node carries no impurity. Without this guard the loop below never
    # runs and the function would report the maximum value 1, which also disagrees
    # with entropy() returning 0 for the same input.
    if n_samples == 0:
        return 0.0

    counts = Counter(y)
    impurity = 1
    for label in counts:
        prob_of_label = counts[label] / n_samples
        impurity -= prob_of_label ** 2
    return impurity

In [None]:
def entropy(y):
    """Return the base-2 Shannon entropy of a collection of class labels.

    Each distinct label contributes ``-p * log2(p)``, where ``p`` is the share
    of that label in ``y``. An empty ``y`` yields 0.
    """
    total = len(y)
    impurity = 0
    for count in Counter(y).values():
        share = count / total
        impurity -= share * np.log2(share)
    return impurity


In [71]:

class DecisionTree:
    """Binary decision tree classifier that splits on ``feature <= threshold``.

    The fitted tree is stored in ``self.tree`` as nested tuples: an internal
    node is ``(feature_index, threshold, left_subtree, right_subtree)`` and a
    leaf is the predicted label itself.

    Parameters
    ----------
    max_depth : int or None
        Nodes at this depth become leaves. ``None`` disables the depth limit.
    min_samples_split : int
        Nodes with fewer samples than this become leaves.
    min_samples_leaf : int
        A candidate split is only considered when both children receive at
        least this many samples.
    criterion : str
        ``"gini"`` selects the module-level ``gini`` function; any other value
        selects ``entropy``.
    verbose : bool
        When True (the default) progress lines are printed during fitting.
    """

    def __init__(self, max_depth=5, min_samples_split=2, min_samples_leaf=1, criterion="gini", verbose=True):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.criterion = gini if criterion == "gini" else entropy
        self.verbose = verbose
        self.tree = None

    def fit(self, X, y):
        """Build the tree from a 2-D NumPy feature array ``X`` and label array ``y``.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` differ in length or contain no samples.
        """
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same number of samples, got {len(X)} and {len(y)}"
            )
        if len(X) == 0:
            raise ValueError("Cannot fit a decision tree on an empty dataset")
        if self.verbose:
            print("Fitting Decision Tree...")
        self.tree = self._build_tree(X, y)

    def _build_tree(self, X, y, depth=0):
        """Recursively grow the subtree for the samples ``(X, y)`` at ``depth``."""
        n_samples, n_features = X.shape
        labels = set(y)
        if self.verbose:
            print(f"[Tree] Depth: {depth}, Samples: {n_samples}, Unique labels: {labels}")

        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if depth_exhausted or n_samples < self.min_samples_split or len(labels) == 1:
            return self._most_common_label(y)

        feat_idx, threshold = self._best_split(X, y, n_features)
        if feat_idx is None:
            # No split leaves at least min_samples_leaf samples on both sides.
            return self._most_common_label(y)

        left_idx = X[:, feat_idx] <= threshold
        right_idx = X[:, feat_idx] > threshold

        left = self._build_tree(X[left_idx], y[left_idx], depth + 1)
        right = self._build_tree(X[right_idx], y[right_idx], depth + 1)
        return (feat_idx, threshold, left, right)

    def _best_split(self, X, y, n_features):
        """Return ``(feature_index, threshold)`` of the highest-gain admissible split.

        A split is admissible only when both children hold at least
        ``min_samples_leaf`` samples (and at least one sample). Enforcing the
        limit here, rather than after the winner is chosen, lets the search fall
        back to the next-best split instead of giving up on the node.
        Returns ``(None, None)`` when no admissible split exists.
        """
        best_gain = -1
        split_idx, split_threshold = None, None
        min_leaf = max(1, self.min_samples_leaf)
        for feat_idx in range(n_features):
            column = X[:, feat_idx]
            for threshold in np.unique(column):
                left_idx = column <= threshold
                right_idx = column > threshold
                if left_idx.sum() < min_leaf or right_idx.sum() < min_leaf:
                    continue
                gain = self._information_gain(y, y[left_idx], y[right_idx])
                if gain > best_gain:
                    best_gain = gain
                    split_idx = feat_idx
                    split_threshold = threshold
        return split_idx, split_threshold

    def _information_gain(self, parent, left_child, right_child):
        """Impurity of ``parent`` minus the size-weighted impurity of its children."""
        weight_left = len(left_child) / len(parent)
        weight_right = len(right_child) / len(parent)
        gain = self.criterion(parent) - (
            weight_left * self.criterion(left_child) +
            weight_right * self.criterion(right_child)
        )
        return gain

    def _most_common_label(self, y):
        """Return the most frequent label in ``y`` (ties resolved by ``Counter.most_common``)."""
        counter = Counter(y)
        most_common = counter.most_common(1)[0][0]
        return most_common

    def predict(self, X):
        """Return an array with one predicted label per row of ``X``.

        Raises
        ------
        RuntimeError
            If the tree has not been fitted yet.
        """
        if self.tree is None:
            raise RuntimeError("DecisionTree must be fitted before calling predict")
        return np.array([self._predict_sample(x, self.tree) for x in X])

    def _predict_sample(self, x, tree):
        """Walk ``tree`` from its root for one sample ``x`` and return the leaf label."""
        node = tree
        while isinstance(node, tuple):
            feature_idx, threshold, left, right = node
            node = left if x[feature_idx] <= threshold else right
        return node

In [73]:
class RandomForest:
    """Bagged ensemble of ``DecisionTree`` classifiers combined by majority vote.

    Each tree is trained on a bootstrap sample (rows drawn with replacement,
    same size as the training set). All features are offered to every tree.

    Parameters
    ----------
    n_trees : int
        Number of trees to train.
    max_depth, min_samples_split, min_samples_leaf, criterion
        Forwarded unchanged to every ``DecisionTree``.
    random_state : int or None
        Seed for the bootstrap sampling. ``None`` (the default) draws from
        NumPy's global random state, so ``np.random.seed`` still applies.
    """

    def __init__(self, n_trees=10, max_depth=5, min_samples_split=2, min_samples_leaf=1, criterion="gini",
                 random_state=None):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.criterion = criterion
        self.random_state = random_state
        self.trees = []

    def _make_rng(self):
        """Return the random source used for bootstrap sampling."""
        if self.random_state is None:
            # Fall back to the global state so existing np.random.seed calls keep working.
            return np.random
        return np.random.RandomState(self.random_state)

    @staticmethod
    def _bootstrap_sample(X, y, rng):
        """Draw ``len(X)`` rows with replacement, keeping features and labels aligned."""
        idxs = rng.choice(len(X), len(X), replace=True)
        return X[idxs], y[idxs]

    def fit(self, X, y):
        """Train ``n_trees`` trees, each on its own bootstrap sample of ``(X, y)``.

        ``X`` and ``y`` must be NumPy arrays (they are indexed with an index array).

        Raises
        ------
        ValueError
            If ``X`` and ``y`` differ in length or contain no samples.
        """
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same number of samples, got {len(X)} and {len(y)}"
            )
        if len(X) == 0:
            raise ValueError("Cannot fit a random forest on an empty dataset")

        print(f"Training Random Forest with {self.n_trees} trees...")
        rng = self._make_rng()
        self.trees = []
        for i in range(self.n_trees):
            X_sample, y_sample = self._bootstrap_sample(X, y, rng)
            tree = DecisionTree(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                criterion=self.criterion
            )
            print(f"Training tree {i + 1}/{self.n_trees}")
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)

    def predict(self, X):
        """Return the majority-vote label for every row of ``X``.

        Ties go to the label that ``Counter.most_common`` lists first.

        Raises
        ------
        RuntimeError
            If the forest holds no fitted trees.
        """
        if not self.trees:
            raise RuntimeError("RandomForest has no fitted trees; call fit before predict")
        print("Predicting with Random Forest...")
        # Shape (n_trees, n_samples); transposing yields one row of votes per sample.
        tree_preds = np.array([tree.predict(X) for tree in self.trees])
        return np.array([Counter(votes).most_common(1)[0][0] for votes in tree_preds.T])

In [75]:
# The forest draws its bootstrap samples through NumPy's global random state, so
# seed it here; otherwise every run of this cell trains different trees and the
# predictions below cannot be reproduced.
np.random.seed(42)

rf = RandomForest(n_trees=10, max_depth=10, min_samples_split=2, min_samples_leaf=1, criterion="gini")
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

# Eyeball the first few predictions against the ground truth.
print("First 10 predictions:", y_pred[:10])
print("True labels:        ", y_test[:10])

Training Random Forest with 10 trees...
Training tree 1/10
Fitting Decision Tree...
[Tree] Depth: 0, Samples: 24390, Unique labels: {0, 1}
[Tree] Depth: 1, Samples: 3586, Unique labels: {0, 1}
[Tree] Depth: 2, Samples: 1361, Unique labels: {0, 1}
[Tree] Depth: 3, Samples: 1134, Unique labels: {0, 1}
[Tree] Depth: 4, Samples: 456, Unique labels: {0, 1}
[Tree] Depth: 5, Samples: 294, Unique labels: {0, 1}
[Tree] Depth: 6, Samples: 56, Unique labels: {0, 1}
[Tree] Depth: 7, Samples: 49, Unique labels: {0, 1}
[Tree] Depth: 8, Samples: 38, Unique labels: {0}
[Tree] Depth: 8, Samples: 11, Unique labels: {0, 1}
[Tree] Depth: 9, Samples: 1, Unique labels: {1}
[Tree] Depth: 9, Samples: 10, Unique labels: {0}
[Tree] Depth: 7, Samples: 7, Unique labels: {0, 1}
[Tree] Depth: 8, Samples: 2, Unique labels: {1}
[Tree] Depth: 8, Samples: 5, Unique labels: {0}
[Tree] Depth: 6, Samples: 238, Unique labels: {0}
[Tree] Depth: 5, Samples: 162, Unique labels: {0, 1}
[Tree] Depth: 6, Samples: 143, Unique lab