In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# NOTE(review): absolute, machine-specific location of the training CSV;
# anyone else running this notebook has to point it at their own copy.
train_csv_path = "C:\\Users\\AK\\Downloads\\binary_classification_train.csv"
data = pd.read_csv(train_csv_path)
data.head()

Unnamed: 0,ID,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,...,Feature_12,Feature_13,Feature_14,Feature_15,Feature_16,Feature_17,Feature_18,Feature_19,Feature_20,Class
0,1,-36.963099,-15.014703,87.100435,101.52336,82.106571,-10.95618,0.592326,-51.919028,-90.650434,...,-81.183744,13.101921,18.05145,-40.606288,-39.697053,-13.870802,173.722987,-17.349169,-82.550844,0
1,2,-43.715674,18.847116,89.543406,-71.319314,35.597052,126.35857,-29.837495,-40.473764,-94.079238,...,-37.84826,7.457352,-77.420742,53.773718,-100.124294,9.87065,11.592519,-106.123605,-92.796421,1
2,3,-30.73755,-63.729643,106.081332,81.773948,112.769976,-12.425351,-29.913286,-41.7712,16.424511,...,-106.610289,5.930143,-30.177083,-138.969234,-56.054914,-12.790661,164.832498,-37.412902,-85.44115,0
3,4,-27.674757,-118.869495,135.605213,99.130189,50.947548,-63.704785,-7.353057,-58.140229,-80.209027,...,-149.056417,3.893419,-74.100869,-47.659832,-48.209817,-36.264323,59.001922,-59.064134,-78.538639,0
4,5,-28.654141,-77.746597,85.215365,50.374774,79.763207,-32.703048,-28.152031,-63.994794,-153.566789,...,-102.731465,11.160205,-13.395073,9.416237,2.649524,-43.578704,52.261888,-66.081738,-80.32511,0


In [3]:
# Hand-picked subset of the feature columns used as model inputs;
# the 'Class' column is the target.
feature_columns = [
    'Feature_6',
    'Feature_15',
    'Feature_16',
    'Feature_17',
    'Feature_18',
    'Feature_19',
]
X = data[feature_columns]
y = data['Class']

In [4]:
# Ordered (unshuffled) 80/20 hold-out: the first 80% of rows are used for
# training, the remaining rows form the development set.
split_length = int(0.8 * len(y))
X_train, y_train = X.iloc[:split_length].to_numpy(), y.iloc[:split_length].to_numpy()
X_dev, y_dev = X.iloc[split_length:].to_numpy(), y.iloc[split_length:].to_numpy()

In [5]:
# Standardize every feature to zero mean / unit variance using statistics
# computed on the training split only, then apply the same transform to the
# development split so no information leaks from it.
X_train_mean = np.mean(X_train, axis=0)
X_train_std = np.std(X_train, axis=0)
# A constant column has zero spread; dividing by it would fill the column with
# NaN/inf. Scale such columns by 1 instead, which leaves them centred at 0.
X_train_std[X_train_std == 0] = 1.0
X_train = (X_train - X_train_mean) / X_train_std
X_dev = (X_dev - X_train_mean) / X_train_std

In [6]:
def gini_impurity(y):
    """Return the Gini impurity ``1 - sum(p_k ** 2)`` of a label collection.

    Parameters
    ----------
    y : array-like
        Class labels. They are cast to ``int`` before counting, so they must
        be non-negative integers (or values that truncate to such), as
        required by ``np.bincount``.

    Returns
    -------
    float
        0.0 for a pure (single-class) or empty collection; larger values mean
        a more mixed collection.
    """
    y = np.asarray(y)
    # An empty node has nothing to misclassify. Without this guard the
    # formula below evaluates to 1.0, i.e. "maximally impure".
    if len(y) == 0:
        return 0.0
    counts = np.bincount(y.astype(int))
    probabilities = counts / len(y)
    return 1 - np.sum(probabilities**2)

In [7]:
def information_gain(y, left_y, right_y):
    """Return the impurity decrease achieved by splitting ``y`` in two.

    Despite the name, impurity is measured with the Gini index: the result is
    ``gini(y)`` minus the size-weighted average of ``gini(left_y)`` and
    ``gini(right_y)``.

    Parameters
    ----------
    y : array of labels in the parent node.
    left_y, right_y : arrays of labels in the two children.
    """
    n_total = len(y)
    impurity_after = 0.0
    for child in (left_y, right_y):
        # Each child contributes in proportion to its share of the parent.
        impurity_after += (len(child) / n_total) * gini_impurity(child)
    return gini_impurity(y) - impurity_after

In [8]:
def leaf(y):
    """Return the most frequent label in ``y`` (the prediction of a leaf node).

    Ties are resolved in favour of the smallest label, because ``np.unique``
    returns labels sorted and ``argmax`` picks the first maximum.
    """
    labels, tallies = np.unique(y, return_counts=True)
    majority_position = tallies.argmax()
    return labels[majority_position]

In [9]:
def best_split(X, y, min_samples_split, min_samples_leaf):
    """Find the single-feature threshold split with the largest Gini gain.

    For every feature the samples are sorted by that feature and every
    position between two *distinct* consecutive values is scored by the
    decrease in size-weighted Gini impurity. Class counts of all candidate
    children are obtained from one cumulative sum per feature, so Gini is not
    recomputed from scratch for every candidate.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
    y : array-like of shape (n_samples,)
        Class labels.
    min_samples_split : int
        Not used here (the caller checks it before asking for a split); kept
        so the call signature stays unchanged.
    min_samples_leaf : int
        Minimum number of samples each child must receive. Values below 1 are
        treated as 1 so a child can never be empty.

    Returns
    -------
    dict or None
        ``None`` when no valid split exists. Otherwise a dict with keys
        ``'feature'`` (column index), ``'threshold'``, ``'left_indices'`` and
        ``'right_indices'`` (row indices into ``X``/``y`` for the two
        children). Samples with ``value <= threshold`` belong to the left.
    """
    y = np.asarray(y)
    n_samples = len(y)
    # A split must leave at least one sample on each side, even if the caller
    # passes min_samples_leaf <= 0.
    leaf_min = max(1, int(min_samples_leaf))
    if n_samples < 2 * leaf_min:
        return None

    # One-hot encode the labels once; a cumulative sum over the rows (taken in
    # sorted feature order) then yields the class counts of every left child.
    classes, class_index = np.unique(y, return_inverse=True)
    one_hot = np.zeros((n_samples, len(classes)), dtype=np.int64)
    one_hot[np.arange(n_samples), class_index.reshape(-1)] = 1
    total_counts = one_hot.sum(axis=0)
    parent_gini = 1.0 - np.sum((total_counts / n_samples) ** 2)

    # Position i means: left child = first i sorted samples, right child = rest.
    # The upper bound is inclusive so the right child may hold exactly
    # leaf_min samples.
    positions = np.arange(leaf_min, n_samples - leaf_min + 1)

    best_gain = -1
    split = None
    for feature_idx in range(X.shape[1]):
        sorted_indices = X[:, feature_idx].argsort()
        sorted_X = X[sorted_indices, feature_idx]

        # A threshold cannot separate two equal feature values.
        candidates = positions[sorted_X[positions] != sorted_X[positions - 1]]
        if candidates.size == 0:
            continue

        prefix_counts = np.cumsum(one_hot[sorted_indices], axis=0)
        left_counts = prefix_counts[candidates - 1]
        right_counts = total_counts - left_counts
        left_n = candidates
        right_n = n_samples - candidates

        left_gini = 1.0 - np.sum((left_counts / left_n[:, None]) ** 2, axis=1)
        right_gini = 1.0 - np.sum((right_counts / right_n[:, None]) ** 2, axis=1)
        weighted_gini = (left_n / n_samples) * left_gini + (right_n / n_samples) * right_gini
        gains = parent_gini - weighted_gini

        # argmax returns the first maximum, and the comparison below is strict,
        # so the earliest best candidate wins ties.
        top = int(np.argmax(gains))
        if gains[top] > best_gain:
            i = int(candidates[top])
            lower, upper = sorted_X[i - 1], sorted_X[i]
            threshold = (lower + upper) / 2
            # Floating-point rounding can land the midpoint on `upper`; a
            # `<= threshold` test would then send `upper` to the left child at
            # prediction time. Fall back to `lower`, which separates the two.
            if threshold >= upper:
                threshold = lower
            best_gain = gains[top]
            split = {
                'feature': feature_idx,
                'threshold': threshold,
                'left_indices': sorted_indices[:i],
                'right_indices': sorted_indices[i:]
            }
    return split

In [10]:
def build_tree(X, y, max_depth=None, min_samples_split=2, min_samples_leaf=1, depth=0):
    """Recursively grow a binary decision tree.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
    y : ndarray of shape (n_samples,)
        Class labels.
    max_depth : int or None
        Maximum depth of the tree. ``None`` (the default) means no depth
        limit: growth stops only through the other criteria.
    min_samples_split : int
        A node with fewer samples than this becomes a leaf.
    min_samples_leaf : int
        Forwarded to ``best_split`` as the minimum child size.
    depth : int
        Depth of the current node; callers normally leave it at 0.

    Returns
    -------
    A label (the result of ``leaf(y)``) for a leaf node, or a dict with keys
    ``'feature'``, ``'threshold'``, ``'left'`` and ``'right'`` for an internal
    node, where ``'left'``/``'right'`` are subtrees of the same form.
    """
    n_samples = X.shape[0]
    # Comparing an int with None raises TypeError, so the "no limit" default
    # has to be checked explicitly.
    depth_exhausted = max_depth is not None and depth >= max_depth
    is_pure = len(np.unique(y)) == 1
    if depth_exhausted or n_samples < min_samples_split or is_pure:
        return leaf(y)

    split_info = best_split(X, y, min_samples_split, min_samples_leaf)
    if split_info is None:
        # No threshold separates the samples under the leaf-size constraint.
        return leaf(y)

    left_rows = split_info['left_indices']
    right_rows = split_info['right_indices']
    left_tree = build_tree(
        X[left_rows], y[left_rows],
        max_depth, min_samples_split, min_samples_leaf, depth + 1
    )
    right_tree = build_tree(
        X[right_rows], y[right_rows],
        max_depth, min_samples_split, min_samples_leaf, depth + 1
    )
    return {
        'feature': split_info['feature'],
        'threshold': split_info['threshold'],
        'left': left_tree,
        'right': right_tree
    }

In [11]:
def traverse_tree(x, tree):
    """Route one sample through the tree and return the label of its leaf.

    Internal nodes are dicts with keys 'feature', 'threshold', 'left' and
    'right'; anything that is not a dict is treated as a leaf value and
    returned as is. A sample goes left when its feature value is less than or
    equal to the node threshold, otherwise right.
    """
    node = tree
    while isinstance(node, dict):
        goes_left = x[node['feature']] <= node['threshold']
        node = node['left'] if goes_left else node['right']
    return node

In [12]:
def predict(X, tree):
    """Return an array holding the tree's prediction for every row of ``X``."""
    labels = []
    for row in X:
        labels.append(traverse_tree(row, tree))
    return np.array(labels)

In [13]:
def accuracy(y_true, y_pred):
    """Return the fraction of positions where ``y_pred`` equals ``y_true``.

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground-truth and predicted labels, compared element by element.
    """
    # Coerce to arrays first: `==` on two plain lists compares them as whole
    # objects and would yield a single bool instead of per-element matches.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return np.mean(y_true == y_pred)

In [14]:
# Hyperparameters of the tree grown below.
max_depth, min_samples_split, min_samples_leaf = 5, 10, 5

tree = build_tree(
    X_train,
    y_train,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

# Score the fitted tree on both splits.
y_train_pred, y_dev_pred = predict(X_train, tree), predict(X_dev, tree)
train_acc, dev_acc = accuracy(y_train, y_train_pred), accuracy(y_dev, y_dev_pred)

print(f"Training Accuracy: {train_acc * 100:.2f}%")
print(f"Development Accuracy: {dev_acc * 100:.2f}%")

Training Accuracy: 93.09%
Development Accuracy: 93.11%


In [15]:
def calculate_precision(y_true, y_pred):
    """Return the precision of the positive class (label 1): TP / (TP + FP).

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground-truth and predicted labels. Only true labels equal to 0 count
        towards false positives.

    Returns
    -------
    The precision, or 0 when nothing was predicted positive (TP + FP == 0).
    """
    # Coerce to arrays so list inputs are compared element-wise; `list == 1`
    # is a single False and would silently produce a precision of 0.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    tp = np.sum((y_true == 1) & (y_pred == 1))
    fp = np.sum((y_true == 0) & (y_pred == 1))
    return (tp / (tp + fp)) if (tp + fp) != 0 else 0

In [16]:
# Precision of the fitted tree on each split (development reported first).
precision_dev = calculate_precision(y_true=y_dev, y_pred=y_dev_pred)
precision_train = calculate_precision(y_true=y_train, y_pred=y_train_pred)
print(f"Precision for Development Set: {precision_dev}")
print(f"Precision for Training Set: {precision_train}")

Precision for Development Set: 0.9341228719467062
Precision for Training Set: 0.9380007283321194


In [17]:
def calculate_recall(y_true, y_pred):
    """Return the recall of the positive class (label 1): TP / (TP + FN).

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground-truth and predicted labels. Only predictions equal to 0 count
        towards false negatives.

    Returns
    -------
    The recall, or 0 when there are no counted positives (TP + FN == 0).
    """
    # Coerce to arrays so list inputs are compared element-wise; `list == 1`
    # is a single False and would silently produce a recall of 0.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    tp = np.sum((y_true == 1) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))
    return (tp / (tp + fn)) if (tp + fn) != 0 else 0

In [18]:
# Recall of the fitted tree on each split (development reported first).
recall_dev = calculate_recall(y_true=y_dev, y_pred=y_dev_pred)
recall_train = calculate_recall(y_true=y_train, y_pred=y_train_pred)
print(f"Recall for Development Set: {recall_dev}")
print(f"Recall for Training Set: {recall_train}")

Recall for Development Set: 0.8393747921516461
Recall for Training Set: 0.8392115337623198


In [19]:
def calculate_f1score(precision, recall):
    """Return the F1 score, the harmonic mean of precision and recall.

    Parameters
    ----------
    precision, recall : float
        Precision and recall of the same classifier on the same data.

    Returns
    -------
    ``2 * precision * recall / (precision + recall)``, or 0.0 when both
    inputs are 0 (the formula would otherwise divide by zero).
    """
    denominator = precision + recall
    if denominator == 0:
        return 0.0
    return (2 * precision * recall) / denominator

In [20]:
# F1 on the training split, from the training precision and recall.
f1score_train = calculate_f1score(precision=precision_train, recall=recall_train)
print(f"f1 for Training Set: {f1score_train}")

f1 for Training Set: 0.8858604531189544


In [21]:
# F1 on the development split, from the development precision and recall.
f1score_dev = calculate_f1score(precision=precision_dev, recall=recall_dev)
print(f"f1 for Development Set: {f1score_dev}")

f1 for Development Set: 0.8842179015589421
