# Task 2

## Pre-processing Part


In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the two customer-churn CSV files into DataFrames.
# NOTE(review): `y` is read from the *testing* CSV, yet later cells pass it to
# DecisionTree.fit() as the label vector for `X`. It presumably should be the
# target column of the training file instead; confirm the column name and fix.
X = pd.read_csv('../../customer_churn_dataset-training-master.csv')
y = pd.read_csv('../../customer_churn_dataset-testing-master.csv')

In [None]:
class Node:
    """One node of a binary decision tree.

    A freshly created node only knows which samples it holds and how deep
    it sits; the split description, the split scores, the leaf label and
    the child links all start out unset and are filled in by the tree
    builder.
    """

    def __init__(self, data_indices, depth):
        # Position of the node and the samples assigned to it.
        self.depth = depth
        self.data_indices = data_indices

        # Split description and leaf label: unset until the builder decides.
        self.split_feature = self.split_value = self.label = None

        # Split scores, one slot per criterion (information gain,
        # gain ratio, Gini index).
        self.gain = self.gain_ratio = self.gini_index = 0.0

        # Child links.
        self.left = self.right = None

In [None]:
class DecisionTree:
    """Binary decision-tree classifier built from threshold splits.

    Every internal node tests ``x[split_feature] <= split_value``: samples
    that satisfy the test go to the left child, all others to the right
    child.  Leaves carry the majority class of the training samples that
    reached them.

    Args:
        max_depth: Maximum depth of the tree (the root is depth 0).
            ``None`` means the depth is not limited.
        min_samples_split: Minimum number of samples a node must hold
            before a split is attempted.
        split_criterion: ``"gini"`` or ``"gain_ratio"``; any other value
            selects information gain.
    """

    def __init__(self, max_depth=None, min_samples_split=2, split_criterion="gini"):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.split_criterion = split_criterion

    def fit(self, X, y):
        """Grow the tree from a feature matrix and its class labels.

        Args:
            X: 2-D array-like of shape (n_samples, n_features) whose values
                support ``<=`` comparison.
            y: 1-D array-like with one non-negative integer class label per
                row of ``X`` (``np.bincount`` is used for class counts).

        Raises:
            ValueError: If ``X`` is not 2-D, is empty, or ``y`` is not a
                1-D sequence with one entry per row of ``X``.
        """
        # Convert once so pandas objects are indexed positionally below.
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be 2-dimensional (n_samples, n_features)")
        if X.shape[0] == 0:
            raise ValueError("cannot fit a tree on an empty data set")
        if y.ndim != 1 or y.shape[0] != X.shape[0]:
            raise ValueError("y must be 1-dimensional with one label per row of X")

        self.num_classes = len(np.unique(y))
        self.tree = self._build_tree(X, y, depth=0)

    def _build_tree(self, X, y, depth, data_indices=None):
        """Recursively build the subtree for the samples in ``data_indices``.

        Args:
            X: Full training feature matrix.
            y: Full training label vector.
            depth: Depth of the node being created.
            data_indices: Row indices (into ``X``/``y``) of the samples that
                reached this node.  ``None`` means all rows.

        Returns:
            The root ``Node`` of the constructed subtree.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if data_indices is None:
            data_indices = np.arange(X.shape[0])

        node = Node(data_indices, depth)
        node_labels = y[data_indices]

        # Stop when the depth budget is used up, the node is too small to
        # split, or the node is already pure (no split can improve it).
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if (
            depth_exhausted
            or len(data_indices) < self.min_samples_split
            or np.unique(node_labels).size <= 1
        ):
            node.label = self._majority_label(node_labels)
            return node

        best_gain = 0.0
        for feature in range(X.shape[1]):
            column = X[data_indices, feature]
            # The largest value would send every sample left, so it can
            # never be a useful threshold.
            for value in np.unique(column)[:-1]:
                goes_left = column <= value
                left_indices = data_indices[goes_left]
                # Use the complement so that training routes samples
                # exactly like _predict_instance does.
                right_indices = data_indices[~goes_left]

                if len(left_indices) == 0 or len(right_indices) == 0:
                    continue

                gain = self._calculate_criterion(y, left_indices, right_indices)

                if gain > best_gain:
                    best_gain = gain
                    node.split_feature = feature
                    node.split_value = value
                    node.gain = gain

        # No split improved on the unsplit node: make it a leaf.
        if node.split_feature is None:
            node.label = self._majority_label(node_labels)
            return node

        # Each child is grown only from the samples routed to it.
        goes_left = X[data_indices, node.split_feature] <= node.split_value
        node.left = self._build_tree(X, y, depth + 1, data_indices[goes_left])
        node.right = self._build_tree(X, y, depth + 1, data_indices[~goes_left])
        return node

    @staticmethod
    def _majority_label(labels):
        """Return the most frequent label (smallest label wins ties)."""
        return np.argmax(np.bincount(labels))

    @staticmethod
    def _gini_impurity(labels):
        """Return the Gini impurity ``1 - sum(p_k ** 2)`` of ``labels``."""
        proportions = np.bincount(labels) / len(labels)
        return 1.0 - np.sum(proportions ** 2)

    def _calculate_criterion(self, y, left_indices, right_indices):
        """Score a candidate split; a larger score is always a better split.

        The node being split is taken to be the union of ``left_indices``
        and ``right_indices``.
        """
        if self.split_criterion == "gini":
            # _gini_index is an impurity (lower is better); turn it into an
            # impurity *decrease* so that the caller can maximise it.
            parent_labels = np.concatenate((y[left_indices], y[right_indices]))
            return self._gini_impurity(parent_labels) - self._gini_index(
                y, left_indices, right_indices
            )
        elif self.split_criterion == "gain_ratio":
            return self._gain_ratio(y, left_indices, right_indices)
        else:
            return self._information_gain(y, left_indices, right_indices)

    def _gini_index(self, y, left_indices, right_indices):
        """Return the size-weighted Gini impurity of the two children."""
        num_left = len(left_indices)
        num_right = len(right_indices)
        num_total = num_left + num_right

        gini_left = self._gini_impurity(y[left_indices])
        gini_right = self._gini_impurity(y[right_indices])

        return (num_left / num_total) * gini_left + (num_right / num_total) * gini_right

    def _entropy(self, y):
        """Return the Shannon entropy (in bits) of the label vector ``y``."""
        probs = np.bincount(y) / len(y)
        # The small offset keeps log2 finite for classes with zero count.
        return -np.sum(probs * np.log2(probs + 1e-10))

    def _split_entropies(self, y, left_indices, right_indices):
        """Return (parent entropy, weighted child entropy, left weight, right weight)."""
        left_labels = y[left_indices]
        right_labels = y[right_indices]
        num_total = len(left_labels) + len(right_labels)
        w_left = len(left_labels) / num_total
        w_right = len(right_labels) / num_total

        # The parent is the node being split, not the whole training set.
        h_parent = self._entropy(np.concatenate((left_labels, right_labels)))
        h_children = w_left * self._entropy(left_labels) + w_right * self._entropy(right_labels)
        return h_parent, h_children, w_left, w_right

    def _gain_ratio(self, y, left_indices, right_indices):
        """Return information gain divided by the split information."""
        h_parent, h_children, w_left, w_right = self._split_entropies(
            y, left_indices, right_indices
        )
        split_info = -(w_left * np.log2(w_left + 1e-10)) - (w_right * np.log2(w_right + 1e-10))
        # The offset guards against division by zero.
        return (h_parent - h_children) / (split_info + 1e-10)

    def _information_gain(self, y, left_indices, right_indices):
        """Return the entropy decrease achieved by the split."""
        h_parent, h_children, _, _ = self._split_entropies(y, left_indices, right_indices)
        return h_parent - h_children

    def predict(self, X):
        """Predict a class label for every row of ``X``.

        Args:
            X: 2-D array-like with the same feature layout used in ``fit``.

        Returns:
            1-D numpy array holding one predicted label per row.
        """
        # Iterating a DataFrame yields column names, so convert to rows first.
        X = np.asarray(X)
        return np.array([self._predict_instance(x, self.tree) for x in X])

    def _predict_instance(self, x, node):
        """Walk from ``node`` down to a leaf for sample ``x``; return its label."""
        while node.label is None:
            if x[node.split_feature] <= node.split_value:
                node = node.left
            else:
                node = node.right
        return node.label

In [None]:
def ensemble_voting(predictions):
    """Combine the predictions of several models by per-sample majority vote.

    Args:
        predictions: Sequence of label arrays, one per model, that
            ``np.column_stack`` can place side by side.  Labels must be
            non-negative integers because ``np.bincount`` counts the votes.

    Returns:
        Array with the most frequent label of each sample; on a tie the
        smallest label wins.
    """
    def most_voted(votes):
        # Count the votes per label and pick the label with the most.
        return np.bincount(votes).argmax()

    # One row per sample, one column per model.
    votes_per_sample = np.column_stack(predictions)
    return np.apply_along_axis(most_voted, 1, votes_per_sample)

In [None]:
# Hold-out split by position: the first 80% of the rows (counted on X) form
# the training part, the remaining rows form the test part.
split_idx = int(len(X) * 0.8)
X_train = X[:split_idx]
X_test = X[split_idx:]
y_train = y[:split_idx]
y_test = y[split_idx:]

In [None]:
# Decision tree that scores splits by information gain, trained on the
# training part of the data.
tree_info_gain = DecisionTree(
    split_criterion="information_gain",
    max_depth=5,
    min_samples_split=2,
)
tree_info_gain.fit(X_train, y_train)

In [None]:
# Decision tree that scores splits by gain ratio, trained on the training
# part of the data.
tree_gain_ratio = DecisionTree(
    split_criterion="gain_ratio",
    max_depth=5,
    min_samples_split=2,
)
tree_gain_ratio.fit(X_train, y_train)

In [None]:
# Decision tree that scores splits with the Gini criterion.
tree_gini = DecisionTree(max_depth=5, min_samples_split=2, split_criterion="gini")
# fit() needs both the features and the labels; train on the same training
# split as the other two trees so the three models are comparable.
tree_gini.fit(X_train, y_train)