In [1]:
import numpy as np
from collections import Counter

def entropy(y):
    hist = np.bincount(y)
    ps = hist / len(y)
    return -np.sum([p * np.log2(p) for p in ps if p > 0])

class Node:
    def __init__(
        self, feature=None, threshold=None, left=None, right=None, *, value=None
    ):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

    def is_leaf_node(self):
        return self.value is not None


class DT:
    def __init__(self, min_samples_split=1, max_depth=1, n_feats=None):
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.n_feats = n_feats
        self.root = None

    def fit(self, X, y):
        self.n_feats = X.shape[1] if not self.n_feats else min(self.n_feats, X.shape[1])
        self.root = self._grow_tree(X, y)

    def predict(self, X):
        return np.array([self._traverse_tree(x, self.root) for x in X])

    def _grow_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        n_labels = len(np.unique(y))

        if (
            depth >= self.max_depth
            or n_labels == 1
            or n_samples < self.min_samples_split
        ):
            leaf_value = self._most_common_label(y)
            return Node(value=leaf_value)

        feat_idxs = np.random.choice(n_features, self.n_feats, replace=False)
        best_feat, best_thresh = self._best_criteria(X, y, feat_idxs)
        left_idxs, right_idxs = self._split(X[:, best_feat], best_thresh)
        left = self._grow_tree(X[left_idxs, :], y[left_idxs], depth + 1)
        right = self._grow_tree(X[right_idxs, :], y[right_idxs], depth + 1)
        return Node(best_feat, best_thresh, left, right)

    def _best_criteria(self, X, y, feat_idxs):
        best_gain = -1
        split_idx, split_thresh = None, None
        for feat_idx in feat_idxs:
            X_column = X[:, feat_idx]
            thresholds = np.unique(X_column)
            for threshold in thresholds:
                gain = self._information_gain(y, X_column, threshold)

                if gain > best_gain:
                    best_gain = gain
                    split_idx = feat_idx
                    split_thresh = threshold

        return split_idx, split_thresh

    def _information_gain(self, y, X_column, split_thresh):
        parent_entropy = entropy(y)
        left_idxs, right_idxs = self._split(X_column, split_thresh)

        if len(left_idxs) == 0 or len(right_idxs) == 0:
            return 0
        n = len(y)
        n_l, n_r = len(left_idxs), len(right_idxs)
        e_l, e_r = entropy(y[left_idxs]), entropy(y[right_idxs])
        child_entropy = (n_l / n) * e_l + (n_r / n) * e_r
        ig = parent_entropy - child_entropy
        return ig

    def _split(self, X_column, split_thresh):
        left_idxs = np.argwhere(X_column <= split_thresh).flatten()
        right_idxs = np.argwhere(X_column > split_thresh).flatten()
        return left_idxs, right_idxs

    def _traverse_tree(self, x, node):
        if node.is_leaf_node():
            return node.value

        if x[node.feature] <= node.threshold:
            return self._traverse_tree(x, node.left)
        return self._traverse_tree(x, node.right)

    def _most_common_label(self, y):
        counter = Counter(y)
        most_common = counter.most_common(1)[0][0]
        return most_common

class ADA:
    def __init__(self, n_estimators=50):
        self.n_estimators = n_estimators
        self.models = []
        self.alphas = []

    def fit(self, X, y):
        n_samples = len(X)
        weights = np.ones(n_samples) / n_samples

        for _ in range(self.n_estimators):
            model = DT()
            model.fit(X, y)

            predictions = model.predict(X)

            err = np.sum(weights * (predictions != y))

            alpha = 0.5 * np.log((1 - err) / (err + 1e-10))

            weights *= np.exp(-alpha * y * predictions)
            weights /= np.sum(weights)

            self.models.append(model)
            self.alphas.append(alpha)

    def predict(self, X):
        preds = np.zeros(len(X))
        for model, alpha in zip(self.models, self.alphas):
            preds += alpha * model.predict(X)
        return np.sign(preds)

# Dataset 1

In [2]:
import pandas as pd

dataset = pd.read_csv("/content/project1_dataset1.txt", sep="\t")
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [3]:
x

array([[1.245e+01, 1.570e+01, 8.257e+01, ..., 1.741e-01, 3.985e-01,
        1.244e-01],
       [1.126e+01, 1.996e+01, 7.372e+01, ..., 9.314e-02, 2.955e-01,
        7.009e-02],
       [1.143e+01, 1.539e+01, 7.306e+01, ..., 8.476e-02, 2.676e-01,
        6.765e-02],
       ...,
       [1.450e+01, 1.089e+01, 9.428e+01, ..., 1.221e-01, 2.889e-01,
        8.006e-02],
       [1.236e+01, 1.854e+01, 7.901e+01, ..., 8.442e-02, 2.983e-01,
        7.185e-02],
       [1.193e+01, 2.153e+01, 7.653e+01, ..., 7.247e-02, 2.438e-01,
        8.541e-02]])

In [4]:
y

array([1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0,

In [5]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.1, random_state = 1)

In [6]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [7]:
x_train

array([[ 0.99066707,  0.92372376,  1.11661305, ...,  1.2841721 ,
        -0.96350722,  2.09762058],
       [ 2.60015848,  1.69074393,  2.7529165 , ...,  2.3925318 ,
         0.48551869,  0.19672436],
       [ 1.54225024,  0.89616614,  1.52261315, ...,  1.02700859,
        -0.5192685 , -0.43727208],
       ...,
       [-1.02049004, -0.16020895, -1.02165417, ..., -0.73583978,
        -1.1056636 , -0.3061004 ],
       [ 0.24107968, -0.12346547,  0.24883908, ..., -0.45356147,
        -0.07341438,  0.44595055],
       [ 0.49565653,  1.06610475,  0.48915834, ..., -0.03473083,
        -0.14287716, -1.15598356]])

In [8]:
x_test

array([[-0.53396539, -1.1982123 , -0.53281364, ..., -0.34302627,
        -0.35611174,  0.23061038],
       [-0.61033845,  0.33353158, -0.57013284, ...,  0.01940885,
        -0.6985794 ,  0.86296717],
       [-0.81399992,  0.1268495 , -0.81332281, ..., -1.4134882 ,
         0.63736756, -1.02754463],
       ...,
       [-1.32739657,  0.54939954, -1.31938758, ..., -1.15060994,
        -0.04110611, -0.05359492],
       [-1.01483277,  0.86860853, -1.02206427, ..., -1.24761021,
        -1.08304782, -0.88817472],
       [-0.59619529,  1.2819727 , -0.58079547, ...,  0.09009122,
        -0.48211399, -0.082562  ]])

In [9]:
y_train

array([1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,

In [10]:
y_test

array([0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0])

In [11]:
classifier = ADA()

classifier.fit(x_train, y_train)

In [12]:
y_pred = classifier.predict(x_test)

In [13]:
print(y_pred)
# print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 0. 0. 1. 0.
 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 1. 1.
 1. 1. 0. 1. 0. 0. 0. 0. 0.]


In [14]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[33  1]
 [ 7 16]]


In [15]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(X, y, model, n_splits=10):
    kf = []
    fold_size = len(X) // n_splits
    remainder = len(X) % n_splits
    start = 0
    for i in range(n_splits):
        if i < remainder:
            end = start + fold_size + 1
        else:
            end = start + fold_size
        test_indices = list(range(start, end))
        train_indices = list(set(range(len(X))) - set(test_indices))
        kf.append((train_indices, test_indices))
        start = end

    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        accuracy_scores.append(accuracy_score(y_test, y_pred))
        precision_scores.append(precision_score(y_test, y_pred))
        recall_scores.append(recall_score(y_test, y_pred))
        f1_scores.append(f1_score(y_test, y_pred))

    print("Accuracy: {:.2f}".format(np.mean(accuracy_scores)))
    print("Precision: {:.2f}".format(np.mean(precision_scores)))
    print("Recall: {:.2f}".format(np.mean(recall_scores)))
    print("F1 Score: {:.2f}".format(np.mean(f1_scores)))

evaluate_model(x, y, classifier)

Accuracy: 0.91
Precision: 0.85
Recall: 0.90
F1 Score: 0.87


# Dataset 2

In [16]:
dataset = pd.read_csv("/content/project1_dataset2.txt", sep="\t")
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [17]:
x

array([[123, 0.05, 4.61, ..., 23.23, 2.78, 16],
       [128, 0.5, 3.7, ..., 21.25, 22.73, 28],
       [114, 9.6, 2.51, ..., 25.67, 40.63, 46],
       ...,
       [138, 4.5, 2.85, ..., 24.78, 24.89, 56],
       [170, 7.6, 5.5, ..., 37.41, 6.17, 54],
       [128, 0.0, 10.58, ..., 28.41, 14.66, 48]], dtype=object)

In [18]:
y

array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1,

In [19]:
x[:,4]

array(['Absent', 'Present', 'Absent', 'Present', 'Present', 'Absent',
       'Present', 'Absent', 'Present', 'Absent', 'Present', 'Absent',
       'Absent', 'Present', 'Present', 'Present', 'Present', 'Present',
       'Present', 'Absent', 'Present', 'Absent', 'Absent', 'Present',
       'Present', 'Present', 'Present', 'Absent', 'Present', 'Absent',
       'Present', 'Present', 'Absent', 'Absent', 'Present', 'Present',
       'Present', 'Absent', 'Absent', 'Absent', 'Absent', 'Absent',
       'Present', 'Absent', 'Absent', 'Absent', 'Absent', 'Absent',
       'Absent', 'Absent', 'Absent', 'Absent', 'Present', 'Absent',
       'Absent', 'Absent', 'Present', 'Present', 'Present', 'Absent',
       'Present', 'Absent', 'Present', 'Absent', 'Present', 'Present',
       'Absent', 'Absent', 'Absent', 'Present', 'Present', 'Absent',
       'Absent', 'Present', 'Present', 'Absent', 'Absent', 'Present',
       'Absent', 'Present', 'Absent', 'Absent', 'Present', 'Absent',
       'Absent', 'Prese

In [20]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [4])], remainder='passthrough')
x = np.array(ct.fit_transform(x))

In [21]:
x[:,4]

array([4.61, 3.7, 2.51, 6.38, 4.69, 10.53, 4.01, 6.53, 3.17, 9.78, 2.58,
       2.63, 4.82, 7.18, 6.57, 5.09, 2.66, 7.63, 5.73, 5.63, 5.11, 4.75,
       1.96, 4.9, 4.89, 4.67, 5.62, 1.87, 6.99, 3.47, 3.12, 8.53, 6.68,
       1.71, 3.36, 5.97, 2.19, 5.41, 4.91, 3.17, 5.46, 4.94, 4.89, 4.99,
       4.16, 6.08, 7.1, 3.89, 3.17, 7.32, 2.68, 5.56, 4.42, 4.22, 5.9,
       2.36, 8.41, 3.65, 4.55, 1.88, 4.79, 2.73, 6.23, 3.96, 3.59, 3.63,
       3.66, 5.59, 4.68, 2.44, 7.22, 3.69, 3.84, 4.18, 3.2, 14.16, 3.98,
       2.44, 3.66, 6.26, 3.38, 5.9, 7.85, 1.82, 1.77, 8.46, 2.85, 3.57,
       6.63, 3.54, 8.29, 3.16, 4.9, 2.81, 6.33, 6.09, 3.22, 10.49, 3.68,
       1.96, 2.83, 4.14, 8.49, 11.17, 9.19, 1.86, 5.29, 5.04, 5.08, 6.41,
       7.04, 3.3, 2.05, 2.96, 2.51, 2.8, 3.24, 5.05, 2.28, 3.58, 5.59,
       3.52, 6.22, 6.13, 8.12, 3.02, 5.21, 5.35, 2.82, 5.05, 4.43, 5.32,
       3.57, 8.28, 6.65, 4.37, 1.07, 4.8, 1.94, 3.14, 3.95, 4.66, 3.26,
       6.41, 2.43, 1.88, 12.42, 6.06, 8.12, 6.58, 4.31, 5

In [22]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.1, random_state = 1)

In [23]:
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [24]:
classifier = ADA()

classifier.fit(x_train, y_train)

In [25]:
y_pred = classifier.predict(x_test)

In [26]:
print(y_pred)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [27]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[24  0]
 [23  0]]


In [28]:
evaluate_model(x, y, classifier)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.67
Precision: 0.42
Recall: 0.46
F1 Score: 0.43
