In [19]:
import pandas as pd
import numpy as np
from collections import Counter


In [20]:
# Read the dataset from the working directory. Ending the cell on a bare `df`
# makes the notebook render the frame as the cell output.
df = pd.read_csv('data.csv')
df

Unnamed: 0,age,salary,experience,credit_score,purchased
0,56,21920,26,375,1
1,69,126121,37,308,0
2,46,97219,25,495,1
3,32,96872,37,589,1
4,60,101132,24,360,1
...,...,...,...,...,...
4995,24,80726,22,784,1
4996,66,112906,29,317,0
4997,26,36532,20,777,0
4998,53,142689,9,787,0


In [21]:
# Feature matrix: every column of `df` except the target.
input_set = df.drop(columns='purchased')
# Target vector: the `purchased` column.
output_set = df['purchased']

In [22]:
def gini_index(groups, classes):
    """Return the size-weighted Gini impurity of a candidate split.

    Parameters
    ----------
    groups : iterable of 2-D arrays
        The partitions produced by a split; the last column of each array
        holds the class label.
    classes : iterable
        The label values to measure proportions for.

    Returns
    -------
    float
        Sum over non-empty groups of ``(1 - sum(p_c ** 2)) * (group_size / total_size)``.
        Empty groups contribute nothing, so a split with no rows at all
        yields ``0.0``.
    """
    total_rows = sum(len(part) for part in groups)
    impurity = 0.0
    for part in groups:
        part_rows = len(part)
        if part_rows:
            targets = part[:, -1]
            # Accumulate the squared class proportions for this partition.
            purity = 0.0
            for label in classes:
                share = (targets == label).sum() / part_rows
                purity += share * share
            # Weight the partition's impurity by its share of all rows.
            impurity += (1 - purity) * (part_rows / total_rows)
    return impurity


In [23]:
def split_data(idx, val, data):
    """Partition the rows of ``data`` on a threshold for one column.

    Parameters
    ----------
    idx : int
        Column index to compare.
    val : scalar
        Threshold value.
    data : 2-D array
        Rows to partition.

    Returns
    -------
    tuple of (left, right)
        ``left`` holds the rows where ``data[:, idx] < val``; ``right`` holds
        every other row.

    Notes
    -----
    The right group is the complement of the left mask rather than a second
    ``>=`` comparison. Both ``<`` and ``>=`` are False for NaN, so two
    independent comparisons would silently drop rows with a NaN in the split
    column from both groups. With the complement, every row lands in exactly
    one group, and NaN rows go right - the same side ``predict_tree`` sends
    them to.
    """
    goes_left = data[:, idx] < val
    return data[goes_left], data[~goes_left]


In [24]:
def best_split(data, n_features, n_candidates=50):
    """Search a random subset of features for the lowest-Gini threshold split.

    Parameters
    ----------
    data : 2-D array
        Rows to split; the last column holds the class label.
    n_features : int
        How many feature columns to draw (without replacement) as split
        candidates.
    n_candidates : int, optional
        Upper bound on the number of rows sampled per feature to supply
        candidate thresholds. Defaults to 50. Expected to be non-negative.

    Returns
    -------
    dict or label
        Normally ``{"index": feature, "value": threshold, "groups": (left, right)}``
        for the best candidate found. If no candidate was evaluated at all
        (no features drawn, or no rows sampled), the result of
        ``to_leaf(data)`` is returned instead, so callers must be prepared
        for a bare label.
    """
    classes = np.unique(data[:, -1])
    features = np.random.choice(data.shape[1]-1, n_features, replace=False)

    # Any real Gini value is at most 1, so the first candidate always wins against 1e9.
    best, val, score, groups = None, None, 1e9, None

    # Never ask for more candidate rows than the node actually has.
    n_rows = min(n_candidates, len(data))

    for f in features:
        # A fresh random set of rows supplies the thresholds for each feature.
        candidates = data[np.random.choice(len(data), n_rows, replace=False)]
        for threshold in candidates[:, f]:
            left, right = split_data(f, threshold, data)
            gini = gini_index((left, right), classes)

            # Strict comparison: on ties the earliest candidate is kept.
            if gini < score:
                best, val, score, groups = f, threshold, gini, (left, right)

    # Nothing was evaluated, so fall back to a majority-label leaf.
    if best is None:
        return to_leaf(data)

    return {"index": best, "value": val, "groups": groups}


In [25]:
def to_leaf(group):
    """Return the most frequent class label in ``group``.

    The label is read from the last column of the 2-D array ``group``.
    Raises ``IndexError`` when ``group`` has no rows.
    """
    ranked = Counter(group[:, -1]).most_common(1)
    winner, _ = ranked[0]
    return winner


In [40]:
def split_node(node, depth, max_depth):
    """Grow the subtree below ``node`` in place.

    Parameters
    ----------
    node : dict
        A split produced by ``best_split``; must still carry its ``'groups'``
        entry, which is removed here and replaced by ``'left'`` / ``'right'``.
    depth : int
        Depth of ``node``.
    max_depth : int
        Once ``depth >= max_depth`` both children become leaves.

    Returns
    -------
    None
        ``node`` is mutated; each child is either a nested split dict or a
        class label.
    """
    left, right = node.pop('groups')

    # Degenerate split: one side got nothing, so both children share one leaf.
    if left.size == 0 or right.size == 0:
        node['left'] = node['right'] = to_leaf(np.vstack((left, right)))
        return

    if depth >= max_depth:
        node['left'], node['right'] = to_leaf(left), to_leaf(right)
        return

    # Left is fully grown before right is split, keeping the random draws in
    # the same order as handling the two sides one after the other.
    for side, group in (('left', left), ('right', right)):
        child = best_split(group, int(np.sqrt(group.shape[1] - 1)))
        node[side] = child
        # best_split may hand back a bare label instead of a split dict;
        # a label is already a finished leaf and has no 'groups' to expand.
        if isinstance(child, dict):
            split_node(child, depth + 1, max_depth)


In [43]:
def build_tree(data, max_depth):
    """Build one decision tree from ``data``.

    Parameters
    ----------
    data : 2-D array
        Training rows; the last column holds the class label. The number of
        features tried at the root is ``int(sqrt(n_columns - 1))``.
    max_depth : int
        Depth limit passed to ``split_node``; the root counts as depth 1.

    Returns
    -------
    dict or label
        The root split dict with nested ``'left'`` / ``'right'`` children, or
        a bare class label when ``best_split`` could not produce a split.
    """
    root = best_split(data, int(np.sqrt(data.shape[1] - 1)))
    # best_split can fall back to a plain label; there is nothing to grow then.
    if isinstance(root, dict):
        split_node(root, 1, max_depth)
    return root


In [44]:
def predict_tree(node, row):
    """Route ``row`` down a tree and return the label of the leaf it reaches.

    Parameters
    ----------
    node : dict or label
        A split dict with ``'index'``, ``'value'``, ``'left'`` and ``'right'``
        keys, or a bare class label (a tree that is a single leaf).
    row : sequence
        Feature values, indexable by each node's ``'index'``.

    Returns
    -------
    label
        The leaf value reached. At every split the row goes left when
        ``row[index] < value`` and right otherwise.
    """
    # Walk down until something that is not a split dict is reached. This
    # also covers a tree that is just a leaf, which has nothing to index.
    while isinstance(node, dict):
        if row[node['index']] < node['value']:
            node = node['left']
        else:
            node = node['right']
    return node


In [45]:
def random_forest_train(X, y, n_trees=10, max_depth=5, seed=None):
    """Train a forest of decision trees on bootstrap resamples of the data.

    Parameters
    ----------
    X : 2-D array-like
        Feature matrix, one row per sample.
    y : 1-D array-like
        Class labels, one per row of ``X``.
    n_trees : int, optional
        Number of trees to build. Defaults to 10.
    max_depth : int, optional
        Depth limit handed to ``build_tree``. Defaults to 5.
    seed : int or None, optional
        When given, NumPy's global random state is seeded with it before any
        resampling so repeated calls produce the same forest. Note that this
        resets the global ``np.random`` state for the rest of the session.
        ``None`` (the default) leaves the random state untouched.

    Returns
    -------
    list
        One tree per entry, as returned by ``build_tree``.
    """
    if seed is not None:
        np.random.seed(seed)

    # Labels ride along as the last column so rows and labels are resampled together.
    data = np.column_stack((X, y))
    trees = []

    for _ in range(n_trees):
        # Bootstrap: draw len(data) row positions with replacement.
        sample = data[np.random.randint(0, len(data), len(data))]
        trees.append(build_tree(sample, max_depth))

    return trees


In [46]:
def random_forest_predict(trees, row):
    """Return the majority-vote prediction of ``trees`` for one ``row``.

    Each tree is evaluated with ``predict_tree`` and the most frequent
    label wins. Raises ``IndexError`` when ``trees`` is empty.
    """
    votes = Counter(predict_tree(tree, row) for tree in trees)
    winner, _ = votes.most_common(1)[0]
    return winner


In [47]:
# Fit a forest on the full dataset using random_forest_train's default
# n_trees and max_depth.
model = random_forest_train(input_set.values, output_set.values)
# A bare `model` renders the whole list of nested tree dicts as the cell output.
model


[{'index': np.int64(3),
  'value': np.int64(327),
  'left': {'index': np.int64(2),
   'value': np.int64(10),
   'left': {'index': np.int64(0),
    'value': np.int64(62),
    'left': {'index': np.int64(2),
     'value': np.int64(1),
     'left': {'index': np.int64(3),
      'value': np.int64(322),
      'left': np.int64(1),
      'right': np.int64(1)},
     'right': {'index': np.int64(3),
      'value': np.int64(309),
      'left': np.int64(0),
      'right': np.int64(0)}},
    'right': {'index': np.int64(2),
     'value': np.int64(3),
     'left': {'index': np.int64(1),
      'value': np.int64(101418),
      'left': np.int64(1),
      'right': np.int64(1)},
     'right': {'index': np.int64(1),
      'value': np.int64(59449),
      'left': np.int64(0),
      'right': np.int64(0)}}},
   'right': {'index': np.int64(0),
    'value': np.int64(53),
    'left': {'index': np.int64(2),
     'value': np.int64(19),
     'left': {'index': np.int64(0),
      'value': np.int64(41),
      'left': np.

In [48]:
# Sanity check: majority-vote prediction for the first row of the feature matrix.
random_forest_predict(model, input_set.values[0])


np.int64(1)

In [49]:
# Predict every row of the feature matrix.
# NOTE(review): `model` was fitted on these same rows, so these are in-sample predictions.
preds = [random_forest_predict(model, row) for row in input_set.values]


In [50]:
# NOTE(review): this cell repeats the preceding cell verbatim; it only recomputes `preds`.
preds = [random_forest_predict(model, row) for row in input_set.values]


In [51]:
# Shuffle the row positions, then cut the shuffled order at the 80% mark.
# NOTE(review): no random seed is set, so the split differs on every run.
idx = np.random.permutation(len(df))
split = int(0.8 * len(df))

# The first 80% of the shuffled positions train the model; the rest are held out.
train_idx = idx[:split]
test_idx = idx[split:]

# Index features and labels with the same positions so they stay aligned.
X_train = input_set.values[train_idx]
X_test = input_set.values[test_idx]
y_train = output_set.values[train_idx]
y_test = output_set.values[test_idx]


In [52]:
# Refit on the training split only, then predict the held-out rows.
# This rebinds `model` and `preds` from the earlier cells.
model = random_forest_train(X_train, y_train)
preds = [random_forest_predict(model, row) for row in X_test]

# Accuracy: the fraction of held-out rows whose predicted label equals the true label.
accuracy = sum(guess == truth for guess, truth in zip(preds, y_test)) / len(y_test)
accuracy



np.float64(0.484)

In [53]:
# Serialise the forest and the feature column names to pickle files in the
# working directory.
# NOTE(review): pickle files can execute code when loaded; only load these from a trusted source.
import pickle

# At this point `model` is the forest fitted on the training split.
with open("rf_model.pkl", "wb") as f:
    pickle.dump(model, f)

# The column names are written as a plain list, in the order of `input_set`'s columns.
with open("features.pkl", "wb") as f:
    pickle.dump(list(input_set.columns), f)
