In [3]:
data_file = "/Users/noshitha/Downloads/contraceptive+method+choice/cmc.data"
attribute_names = [
    "Wife_age", "Wife_education", "Husband_education", "Number_of_children ever born",
    "Wife_religion", "Wife_working", "Husband_occupation", "Standard-of-living_index",
    "Media_exposure", "Contraceptive_method_used"
]
data = pd.read_csv(data_file, header=None, names=attribute_names)

# Convert "Wife's age" and "Number of children ever born" to object type
data['Wife_age'] = data['Wife_age'].astype(str)
data['Number_of_children ever born'] = data['Number_of_children ever born'].astype(str)

# Separate features and target variable
X = data.drop("Contraceptive_method_used", axis=1)
y = data["Contraceptive_method_used"]

In [2]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

class DecisionTreeClassifier:
    def __init__(self, max_depth):
        self.max_depth = max_depth

    def entropy(self, labels):
        unique_labels, counts = np.unique(labels, return_counts=True)
        probabilities = counts / len(labels)
        entropy_value = -np.sum(probabilities * np.log2(probabilities))
        return entropy_value

    def information_gain(self, y, x):
        parent_entropy = self.entropy(y)
        info_a = 0
        if x.dtype == 'object':
            unique_values = x.unique()
            for value in unique_values:
                partition_indices = x[x == value].index
                partition_entropy = self.entropy(y[partition_indices])
                info_a += len(partition_indices) / len(x) * partition_entropy
        else:
            attr_mean = x.mean()
            partition_indices_left = x[x <= attr_mean].index
            partition_indices_right = x[x > attr_mean].index
            partition_entropy_left = self.entropy(y[partition_indices_left])
            partition_entropy_right = self.entropy(y[partition_indices_right])
            weight_left = len(partition_indices_left) / len(x)
            weight_right = len(partition_indices_right) / len(x)
            info_a = weight_left * partition_entropy_left + weight_right * partition_entropy_right
        gain_a = parent_entropy - info_a
        return gain_a

    def decision_tree(self, X_train, y_train, current_depth=0):
        if len(set(y_train)) == 1 or current_depth == self.max_depth or len(X_train.columns) == 0:
            class_counts = Counter(y_train)
            majority_class = class_counts.most_common(1)[0][0]
            return {"class_label": majority_class}

        gains = {}
        for attr in X_train.columns:
            gains[attr] = self.information_gain(y_train, X_train[attr])

        best_attr = max(gains, key=gains.get)
        node = {"attribute": best_attr, "leaf": {}}

        if X_train[best_attr].dtype == 'object':
            unique_values = X_train[best_attr].unique()
            for value in unique_values:
                partition_indices = X_train[X_train[best_attr] == value].index
                if len(partition_indices) == 0:
                    class_counts = Counter(y_train)
                    majority_class = class_counts.most_common(1)[0][0]
                    node["leaf"][value] = {"class_label": majority_class}
                else:
                    node["leaf"][value] = self.decision_tree(X_train.loc[partition_indices], y_train.loc[partition_indices], current_depth + 1)
        else:
            attr_mean = X_train[best_attr].mean()
            partition_indices_left = X_train[X_train[best_attr] <= attr_mean].index
            partition_indices_right = X_train[X_train[best_attr] > attr_mean].index
            if len(partition_indices_left) == 0 or len(partition_indices_right) == 0:
                class_counts = Counter(y_train)
                majority_class = class_counts.most_common(1)[0][0]
                return {"class_label": majority_class}
            else:
                node["split_value"] = attr_mean
                node["left"] = self.decision_tree(X_train.loc[partition_indices_left], y_train.loc[partition_indices_left], current_depth + 1)
                node["right"] = self.decision_tree(X_train.loc[partition_indices_right], y_train.loc[partition_indices_right], current_depth + 1)

        return node
    
    def fit(self, X_train, y_train):
        self.tree = self.decision_tree(X_train, y_train)

    def predict_instance(self, instance, tree):
        if "class_label" in tree:
            return tree["class_label"]
        else:
            attr = tree["attribute"]
            if instance[attr] in tree["leaf"]:
                return self.predict_instance(instance, tree["leaf"][instance[attr]])
            else:
                if instance[attr] <= tree["split_value"]:
                    return self.predict_instance(instance, tree["left"])
                else:
                    return self.predict_instance(instance, tree["right"])

    def predict(self, X_test):
        predictions = []
        for index, instance in X_test.iterrows():
            predictions.append(self.predict_instance(instance, self.tree))
        return predictions

class RandomForestClassifier:
    def __init__(self, num_trees, max_depth, example_subsample_rate, attr_subsample_rate):
        self.num_trees = num_trees
        self.max_depth = max_depth
        self.example_subsample_rate = example_subsample_rate
        self.attr_subsample_rate = attr_subsample_rate
    
    def entropy(self, labels):
        unique_labels, counts = np.unique(labels, return_counts=True)
        probabilities = counts / len(labels)
        entropy_value = -np.sum(probabilities * np.log2(probabilities))
        return entropy_value

    def information_gain(self, y, x):
        parent_entropy = self.entropy(y)
        info_a = 0
        if x.dtype == 'object':
            unique_values = x.unique()
            for value in unique_values:
                partition_indices = x[x == value].index
                partition_entropy = self.entropy(y[partition_indices])
                info_a += len(partition_indices) / len(x) * partition_entropy
        else:
            attr_mean = x.mean()
            partition_indices_left = x[x <= attr_mean].index
            partition_indices_right = x[x > attr_mean].index
            partition_entropy_left = self.entropy(y[partition_indices_left])
            partition_entropy_right = self.entropy(y[partition_indices_right])
            weight_left = len(partition_indices_left) / len(x)
            weight_right = len(partition_indices_right) / len(x)
            info_a = weight_left * partition_entropy_left + weight_right * partition_entropy_right
        gain_a = parent_entropy - info_a
        return gain_a
    
    def decision_tree(self, X_train, y_train, current_depth=0):
        if len(set(y_train)) == 1 or current_depth == self.max_depth or len(X_train.columns) == 0:
            class_counts = Counter(y_train)
            majority_class = class_counts.most_common(1)[0][0]
            return {"class_label": majority_class}

        gains = {}
        for attr in X_train.columns:
            gains[attr] = self.information_gain(y_train, X_train[attr])

        best_attr = max(gains, key=gains.get)
        node = {"attribute": best_attr, "leaf": {}}

        if X_train[best_attr].dtype == 'object':
            unique_values = X_train[best_attr].unique()
            for value in unique_values:
                partition_indices = X_train[X_train[best_attr] == value].index
                if len(partition_indices) == 0:
                    class_counts = Counter(y_train)
                    majority_class = class_counts.most_common(1)[0][0]
                    node["leaf"][value] = {"class_label": majority_class}
                else:
                    node["leaf"][value] = self.decision_tree(X_train.loc[partition_indices], y_train.loc[partition_indices], current_depth + 1)
        else:
            attr_mean = X_train[best_attr].mean()
            partition_indices_left = X_train[X_train[best_attr] <= attr_mean].index
            partition_indices_right = X_train[X_train[best_attr] > attr_mean].index
            if len(partition_indices_left) == 0 or len(partition_indices_right) == 0:
                class_counts = Counter(y_train)
                majority_class = class_counts.most_common(1)[0][0]
                return {"class_label": majority_class}
            else:
                node["split_value"] = attr_mean
                node["left"] = self.decision_tree(X_train.loc[partition_indices_left], y_train.loc[partition_indices_left], current_depth + 1)
                node["right"] = self.decision_tree(X_train.loc[partition_indices_right], y_train.loc[partition_indices_right], current_depth + 1)

        return node

    def bootstrap_sampling(self, X, y):
        indices = np.random.choice(len(X), size=len(X), replace=True)  # Bootstrap sampling
        return X.iloc[indices], y.iloc[indices]

    def fit(self, X_train, y_train):
        trees = []
        subsampled_attributes = []

        for i in range(self.num_trees):
            bootstrapped_X, bootstrapped_y = self.bootstrap_sampling(X_train, y_train)  # Bootstrap sampling
            subsampled_attr_indexes = np.random.choice(range(X_train.shape[1]), int(X_train.shape[1] * self.attr_subsample_rate), replace=False)
            subsampled_attributes.append(subsampled_attr_indexes.tolist())
            subsampled_X = bootstrapped_X.iloc[:, subsampled_attr_indexes]
            dt = DecisionTreeClassifier(self.max_depth)
            tree = dt.decision_tree(subsampled_X, bootstrapped_y)
            trees.append(tree)

        return trees, subsampled_attributes

    def predict_random_forest(self, trees, subsampled_attributes, X_test):
        class_labels = []
        dt = DecisionTreeClassifier(self.max_depth)  # Create instance for decision tree classification
        for _, test_row in X_test.iterrows():
            tree_votes = []
            for tree, sub_attributes in zip(trees, subsampled_attributes):
                test_features = test_row[sub_attributes]
                predicted_label = dt.predict_instance(test_features, tree)  # Use decision tree predict_instance method
                if predicted_label is None:
                    print("None value predicted for features:", test_features)
                tree_votes.append(predicted_label)
            class_labels.append(max(set(tree_votes), key=tree_votes.count))
        return class_labels


    def predict(self, X_test):
        predictions = []
        for _, test_row in X_test.iterrows():
            tree_votes = []
            for tree, sub_attributes in zip(self.trees, self.subsampled_attributes):
                test_features = test_row[sub_attributes]
                predicted_label = self.predict_instance(test_features, tree)  # Use decision tree predict_instance method
                if predicted_label is None:
                    print("None value predicted for features:", test_features)
                tree_votes.append(predicted_label)
            predictions.append(max(set(tree_votes), key=tree_votes.count))
        return predictions

    def fit_random_forest(self, X_train, y_train):
        self.trees, self.subsampled_attributes = self.fit(X_train, y_train)
    
    
# class RandomForestClassifier:
#     def __init__(self, num_trees, max_depth, example_subsample_rate, attr_subsample_rate):
#         self.num_trees = num_trees
#         self.max_depth = max_depth
#         self.example_subsample_rate = example_subsample_rate
#         self.attr_subsample_rate = attr_subsample_rate

#     def bootstrap_sampling(self, X, y):
#         indices = np.random.choice(len(X), size=len(X), replace=True)
#         return X.iloc[indices], y.iloc[indices]
    
#     def predict_random_forest(self, trees, subsampled_attributes, X_test):
#         class_labels = []
#         dt = DecisionTreeClassifier(self.max_depth)  # Create instance for decision tree classification
#         for _, test_row in X_test.iterrows():
#             tree_votes = []
#             for tree, sub_attributes in zip(trees, subsampled_attributes):
#                 test_features = test_row[sub_attributes]
#                 predicted_label = dt.predict_instance(test_features, tree)  # Use decision tree predict_instance method
#                 if predicted_label is None:
#                     print("None value predicted for features:", test_features)
#                 tree_votes.append(predicted_label)
#             class_labels.append(max(set(tree_votes), key=tree_votes.count))
#         return class_labels

#     def predict(self, X_test):
#         predictions = []
#         for _, test_row in X_test.iterrows():
#             tree_votes = []
#             for tree, sub_attributes in zip(self.trees, self.subsampled_attributes):
#                 test_features = test_row[sub_attributes]
#                 predicted_label = self.predict_instance(test_features, tree)  # Use decision tree predict_instance method
#                 if predicted_label is None:
#                     print("None value predicted for features:", test_features)
#                 tree_votes.append(predicted_label)
#             predictions.append(max(set(tree_votes), key=tree_votes.count))
#         return predictions

#     def fit_random_forest(self, X_train, y_train):
#         self.trees, self.subsampled_attributes = self.fit(X_train, y_train)
    
#     def fit(self, X_train, y_train):
#         trees = []
#         subsampled_attributes = []

#         for i in range(self.num_trees):
#             bootstrapped_X, bootstrapped_y = self.bootstrap_sampling(X_train, y_train)
#             subsampled_attr_indexes = np.random.choice(range(X_train.shape[1]), int(X_train.shape[1] * self.attr_subsample_rate), replace=False)
#             subsampled_attributes.append(subsampled_attr_indexes.tolist())
#             subsampled_X = bootstrapped_X.iloc[:, subsampled_attr_indexes]
#             dt = DecisionTreeClassifier(self.max_depth)
#             tree = dt.decision_tree(subsampled_X, bootstrapped_y)
#             trees.append(tree)

#         return trees, subsampled_attributes

#     def classify_random_forest(self, trees, subsampled_attributes, X_test):
#         class_labels = []
#         dt = DecisionTreeClassifier(self.max_depth)  # Create instance for decision tree classification
#         for _, test_row in X_test.iterrows():
#             tree_votes = []
#             for tree, sub_attributes in zip(trees, subsampled_attributes):
#                 test_features = test_row[sub_attributes]
#                 predicted_label = dt.classify(tree, test_features)  # Use decision tree classify method
#                 if predicted_label[0] is None:
#                     print("None value predicted for features:", test_features)
#                 tree_votes.append(predicted_label[0])
#             class_labels.append(max(set(tree_votes), key=tree_votes.count))
#         return class_labels



class EvaluationMetrics:
    @staticmethod
    def confusion_matrix(y_true, y_pred):
        print("y_true: ",y_true,"y_pred: ",y_pred)
        classes = np.unique(np.concatenate([y_true, y_pred]))
        n_classes = len(classes)
        conf_matrix = np.zeros((n_classes, n_classes), dtype=int)

        for i, true_label in enumerate(classes):
            for j, pred_label in enumerate(classes):
                conf_matrix[i, j] = np.sum((y_true == true_label) & (y_pred == pred_label))

        return conf_matrix
    
#     @staticmethod
#     def confusion_matrix(y_true, y_pred):
#         # Filter out None values
#         y_true_filtered = [x for x in y_true if x is not None]
#         y_pred_filtered = [x for x in y_pred if x is not None]

#         classes = np.unique(np.concatenate([y_true_filtered, y_pred_filtered]))
#         n_classes = len(classes)
#         conf_matrix = np.zeros((n_classes, n_classes), dtype=int)

#         for i, true_label in enumerate(classes):
#             for j, pred_label in enumerate(classes):
#                 conf_matrix[i, j] = np.sum((np.array(y_true_filtered) == true_label) & (np.array(y_pred_filtered) == pred_label))

#         return conf_matrix


    @staticmethod
    def calculate_metrics(conf_matrix):
        TP = np.diag(conf_matrix)
        FP = np.sum(conf_matrix, axis=0) - TP
        FN = np.sum(conf_matrix, axis=1) - TP
        TN = np.sum(conf_matrix) - (TP + FP + FN)

        accuracy = np.sum(TP) / np.sum(conf_matrix)

        precision = np.where(TP + FP == 0, 0, TP / (TP + FP))
        recall = np.where(TP + FN == 0, 0, TP / (TP + FN))

        f1_score = np.zeros_like(precision)
        non_zero_denominator = (precision + recall) != 0
        f1_score[non_zero_denominator] = 2 * (precision[non_zero_denominator] * recall[non_zero_denominator]) / (precision[non_zero_denominator] + recall[non_zero_denominator])

        return accuracy, precision, recall, f1_score


def stratified_cross_validation(X, y, n_folds, num_trees, max_depth, example_subsample_rate, attr_subsample_rate):
    fold_size = len(X) // n_folds
    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []

    for i in range(n_folds):
        start = i * fold_size
        end = (i + 1) * fold_size

        X_train_fold = pd.concat([X[:start], X[end:]])
        y_train_fold = pd.concat([y[:start], y[end:]])

        X_validation_fold = X[start:end]
        y_validation_fold = y[start:end]

        rf = RandomForestClassifier(num_trees, max_depth, example_subsample_rate, attr_subsample_rate)
        trees, subsampled_attributes = rf.fit_random_forest(X_train_fold, y_train_fold)
        predictions = rf.classify_random_forest(trees, subsampled_attributes, X_validation_fold)

        y_validation_fold_list = y_validation_fold.tolist()
        conf_matrix = EvaluationMetrics.confusion_matrix(y_validation_fold_list, predictions)
        acc, prec, rec, f1 = EvaluationMetrics.calculate_metrics(conf_matrix)

        accuracies.append(acc)
        precisions.append(prec)
        recalls.append(rec)
        f1_scores.append(f1)

    mean_accuracy = np.mean(accuracies)
    mean_precision = np.mean([np.mean(precision, axis=0) for precision in precisions], axis=0)
    mean_recall = np.mean([np.mean(recall, axis=0) for recall in recalls], axis=0)
    mean_f1_score = np.nanmean([np.nanmean(f1_score, axis=0) for f1_score in f1_scores], axis=0)

    return mean_accuracy, mean_precision, mean_recall, mean_f1_score


if __name__ == "__main__":
    # Load dataset
    data_file = "/Users/noshitha/Downloads/contraceptive+method+choice/cmc.data"

    column_names = [
        "Wife_age", "Wife_education", "Husband_education", "Number_of_children_ever_born",
        "Wife_religion", "Wife_working", "Husband_occupation", "Standard-of-living_index",
        "Media_exposure", "Contraceptive_method_used"
    ]

    data = pd.read_csv(data_file, names=column_names)

    # Convert "Wife_age" and "Number_of_children_ever_born" to object type
    data["Wife_age"] = data["Wife_age"].astype(object)
    data["Number_of_children_ever_born"] = data["Number_of_children_ever_born"].astype(object)

    # Split features and target variable
    X = data.drop("Contraceptive_method_used", axis=1)
    y = data["Contraceptive_method_used"]

    n_trees_list = [1, 5, 10, 20, 30, 40, 50]
    n_folds = 10
    max_depth = 3
    example_subsample_rate = 0.5
    attr_subsample_rate = 0.5

    for num_trees in n_trees_list:
        print("num_trees: ", num_trees)
        accuracies, precisions, recalls, f1_scores = stratified_cross_validation(X, y, n_folds, num_trees, max_depth, example_subsample_rate, attr_subsample_rate)
        print("Accuracies:", accuracies)
        print("Precisions:", precisions)
        print("Recalls:", recalls)
        print("F1-scores:", f1_scores)


num_trees:  1


TypeError: cannot unpack non-iterable NoneType object

In [26]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.utils import shuffle

class DecisionTreeClassifier:
    def __init__(self, max_depth):
        self.max_depth = max_depth

    def entropy(self, labels):
        unique_labels, counts = np.unique(labels, return_counts=True)
        probabilities = counts / len(labels)
        entropy_value = -np.sum(probabilities * np.log2(probabilities))
        return entropy_value

    def information_gain(self, y, x):
        parent_entropy = self.entropy(y)
        info_a = 0
        for value in set(x):
            partition_indices = x[x == value].index
            partition_entropy = self.entropy(y[partition_indices])
            info_a += len(partition_indices) / len(x) * partition_entropy
        gain_a = parent_entropy - info_a
        return gain_a

    def decision_tree(self, X_train, y_train, current_depth=0):
        if len(set(y_train)) == 1 or current_depth == self.max_depth or len(X_train.columns) == 0:
            class_counts = Counter(y_train)
            majority_class = class_counts.most_common(1)[0][0] if class_counts else None
            return {"class_label": majority_class}

        gains = {}
        for attr in X_train.columns:
            if X_train[attr].dtype == 'object':
                gains[attr] = self.information_gain(y_train, X_train[attr])
            else:
                attr_mean = X_train[attr].mean()
                partition_indices_left = X_train[X_train[attr] <= attr_mean].index
                partition_indices_right = X_train[X_train[attr] > attr_mean].index
                partition_entropy_left = self.entropy(y_train[partition_indices_left])
                partition_entropy_right = self.entropy(y_train[partition_indices_right])
                weight_left = len(partition_indices_left) / len(X_train)
                weight_right = len(partition_indices_right) / len(X_train)
                info_a = weight_left * partition_entropy_left + weight_right * partition_entropy_right
                gains[attr] = self.entropy(y_train) - info_a

        best_attr = max(gains, key=gains.get)
        node = {"attribute": best_attr}

        if X_train[best_attr].dtype == 'object':
            unique_values = X_train[best_attr].unique()
            node["leaf"] = {}
            for value in unique_values:
                partition_indices = X_train[X_train[best_attr] == value].index
                node["leaf"][value] = self.decision_tree(X_train.loc[partition_indices], y_train.loc[partition_indices], current_depth + 1)
        else:
            attr_mean = X_train[best_attr].mean()
            partition_indices_left = X_train[X_train[best_attr] <= attr_mean].index
            partition_indices_right = X_train[X_train[best_attr] > attr_mean].index
            node["split_value"] = attr_mean
            node["left"] = self.decision_tree(X_train.loc[partition_indices_left], y_train.loc[partition_indices_left], current_depth + 1)
            node["right"] = self.decision_tree(X_train.loc[partition_indices_right], y_train.loc[partition_indices_right], current_depth + 1)

        if "leaf" not in node:
            class_counts = Counter(y_train)
            majority_class = class_counts.most_common(1)[0][0]
            node["class_label"] = majority_class

        return node
    
    def classify_mixed(self, tree, features):
        class_labels = []

        if isinstance(features, pd.Series):
            node = tree
            while "class_label" not in node:
                if 'split_value' not in node:  # Check if the node is a leaf node
                    class_labels.append(node.get('class_label', None))  # Return None if no class label found
                    break

                attr_value = features[node['attribute']]
                if isinstance(attr_value, (int, float)):
                    if attr_value <= node['split_value']:
                        node = node['left']
                    else:
                        node = node['right']
                else:
                    # Handle categorical attributes
                    node = node['leaf'].get(attr_value, None)  # Return None if no leaf node found for the attribute value

        else:
            for _, feature in features.iterrows():
                node = tree
                while "class_label" not in node:
                    if 'split_value' not in node:  # Check if the node is a leaf node
                        class_labels.append(node.get('class_label', None))  # Return None if no class label found
                        break

                    attr_value = feature[node['attribute']]
                    if isinstance(attr_value, (int, float)):
                        if attr_value <= node['split_value']:
                            node = node['left']
                        else:
                            node = node['right']
                    else:
                        # Handle categorical attributes
                        node = node['leaf'].get(attr_value, None)  # Return None if no leaf node found for the attribute value

        return class_labels


class RandomForestClassifier:
    def __init__(self, num_trees, max_depth, example_subsample_rate, attr_subsample_rate):
        self.num_trees = num_trees
        self.max_depth = max_depth
        self.example_subsample_rate = example_subsample_rate
        self.attr_subsample_rate = attr_subsample_rate

    def bootstrap_sampling(self, X, y):
        indices = np.random.choice(len(X), size=len(X), replace=True)
        return X.iloc[indices], y.iloc[indices]

    def fit(self, X_train, y_train):
        trees = []
        subsampled_attributes = []

        for i in range(self.num_trees):
            bootstrapped_X, bootstrapped_y = self.bootstrap_sampling(X_train, y_train)
            subsampled_attr_indexes = np.random.choice(range(X_train.shape[1]), int(X_train.shape[1] * self.attr_subsample_rate), replace=False)
            subsampled_attributes.append(subsampled_attr_indexes.tolist())
            subsampled_X = bootstrapped_X.iloc[:, subsampled_attr_indexes]
            dt = DecisionTreeClassifier(self.max_depth)
            tree = dt.decision_tree(subsampled_X, bootstrapped_y)
            trees.append(tree)

        return trees, subsampled_attributes
    
    def classify_random_forest(self, trees, subsampled_attributes, X_test):
        class_labels = []
        dt = DecisionTreeClassifier(self.max_depth)  # Create instance for decision tree classification
        for i, test_row in X_test.iterrows():
            tree_votes = []
            for tree, sub_attributes in zip(trees, subsampled_attributes):
                test_features = test_row[sub_attributes]
                predicted_label = dt.classify_mixed(tree, test_features)
                if predicted_label[0] is not None:  # Check if the prediction is not None
                    tree_votes.append(predicted_label[0])
            if tree_votes:  # Check if there are any votes
                class_labels.append(max(set(tree_votes), key=tree_votes.count))
            else:
                # If all predictions are None, assign a default label (for example, the majority class)
                class_labels.append(default_label)  # Replace default_label with the desired default value
        return class_labels

    def fit_random_forest(self, X_train, y_train):
        trees, subsampled_attributes = self.fit(X_train, y_train)
        return trees, subsampled_attributes


class EvaluationMetrics:
    @staticmethod
    def confusion_matrix(y_true, y_pred):
        classes = np.unique(np.concatenate([y_true, y_pred]))
        n_classes = len(classes)
        conf_matrix = np.zeros((n_classes, n_classes), dtype=int)

        for i, true_label in enumerate(classes):
            for j, pred_label in enumerate(classes):
                conf_matrix[i, j] = np.sum((y_true == true_label) & (y_pred == pred_label))

        return conf_matrix

    @staticmethod
    def calculate_metrics(conf_matrix):
        TP = np.diag(conf_matrix)
        FP = np.sum(conf_matrix, axis=0) - TP
        FN = np.sum(conf_matrix, axis=1) - TP
        TN = np.sum(conf_matrix) - (TP + FP + FN)

        accuracy = np.sum(TP) / np.sum(conf_matrix)

        precision = np.where(TP + FP == 0, 0, TP / (TP + FP))
        recall = np.where(TP + FN == 0, 0, TP / (TP + FN))

        f1_score = np.zeros_like(precision)
        non_zero_denominator = (precision + recall) != 0
        f1_score[non_zero_denominator] = 2 * (precision[non_zero_denominator] * recall[non_zero_denominator]) / (precision[non_zero_denominator] + recall[non_zero_denominator])

        return accuracy, precision, recall, f1_score


def stratified_cross_validation(X, y, n_folds, num_trees, max_depth, example_subsample_rate, attr_subsample_rate):
    fold_size = len(X) // n_folds
    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []

    for i in range(n_folds):
        start = i * fold_size
        end = (i + 1) * fold_size

        X_train_fold = pd.concat([X[:start], X[end:]])
        y_train_fold = pd.concat([y[:start], y[end:]])

        X_validation_fold = X[start:end]
        y_validation_fold = y[start:end]

        rf = RandomForestClassifier(num_trees, max_depth, example_subsample_rate, attr_subsample_rate)
        trees, subsampled_attributes = rf.fit_random_forest(X_train_fold, y_train_fold)
        predictions = rf.classify_random_forest(trees, subsampled_attributes, X_validation_fold)

        y_validation_fold_list = y_validation_fold.tolist()
        conf_matrix = EvaluationMetrics.confusion_matrix(y_validation_fold_list, predictions)
        acc, prec, rec, f1 = EvaluationMetrics.calculate_metrics(conf_matrix)

        accuracies.append(acc)
        precisions.append(prec)
        recalls.append(rec)
        f1_scores.append(f1)

    mean_accuracy = np.mean(accuracies)
    mean_precision = np.mean([np.mean(precision, axis=0) for precision in precisions], axis=0)
    mean_recall = np.mean([np.mean(recall, axis=0) for recall in recalls], axis=0)
    mean_f1_score = np.nanmean([np.nanmean(f1_score, axis=0) for f1_score in f1_scores], axis=0)

    return mean_accuracy, mean_precision, mean_recall, mean_f1_score


if __name__ == "__main__":
    # Load dataset
    data_file = "/Users/noshitha/Downloads/contraceptive+method+choice/cmc.data"

    column_names = [
        "Wife_age", "Wife_education", "Husband_education", "Number_of_children_ever_born",
        "Wife_religion", "Wife_working", "Husband_occupation", "Standard-of-living_index",
        "Media_exposure", "Contraceptive_method_used"
    ]

    data = pd.read_csv(data_file, names=column_names)

    # Convert categorical to object type
    data["Wife_education"] = data["Wife_education"].astype(object)
    data["Husband_education"] = data["Husband_education"].astype(object)
    data["Wife_religion"] = data["Wife_religion"].astype(object)
    data["Wife_working"] = data["Wife_working"].astype(object)
    data["Standard-of-living_index"] = data["Standard-of-living_index"].astype(object)
    data["Media_exposure"] = data["Media_exposure"].astype(object)
    data["Contraceptive_method_used"] = data["Contraceptive_method_used"].astype(object)
    
    

    # Split features and target variable
    X = data.drop("Contraceptive_method_used", axis=1)
    y = data["Contraceptive_method_used"]

    n_trees_list = [1, 5, 10, 20, 30, 40, 50]
    n_folds = 10
    max_depth = 3
    example_subsample_rate = 0.5
    attr_subsample_rate = 0.5

    for num_trees in n_trees_list:
        print("num_trees: ", num_trees)
        accuracies, precisions, recalls, f1_scores = stratified_cross_validation(X, y, n_folds, num_trees, max_depth, example_subsample_rate, attr_subsample_rate)
        print("Accuracies:", accuracies)
        print("Precisions:", precisions)
        print("Recalls:", recalls)
        print("F1-scores:", f1_scores)


num_trees:  1


ZeroDivisionError: division by zero

In [4]:
data.dtypes

Wife_age                        object
Wife_education                   int64
Husband_education                int64
Number_of_children_ever_born    object
Wife_religion                    int64
Wife_working                     int64
Husband_occupation               int64
Standard-of-living_index         int64
Media_exposure                   int64
Contraceptive_method_used        int64
dtype: object

In [8]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.utils import shuffle

class DecisionTreeClassifier:
    def __init__(self, max_depth):
        self.max_depth = max_depth

    def entropy(self, labels):
        unique_labels, counts = np.unique(labels, return_counts=True)
        probabilities = counts / len(labels)
        entropy_value = -np.sum(probabilities * np.log2(probabilities))
        return entropy_value

    def information_gain(self, y, x):
        parent_entropy = self.entropy(y)
        info_a = 0
        
        if x.dtype == 'object':  # Check if the attribute is categorical
            for value in set(x):
                partition_indices = x[x == value].index
                partition_entropy = self.entropy(y[partition_indices])
                info_a += len(partition_indices) / len(x) * partition_entropy
        else:  # If the attribute is numerical
            # You can use the same numerical information gain calculation as before
            attr_mean = x.mean()
            partition_indices_left = x[x <= attr_mean].index
            partition_indices_right = x[x > attr_mean].index
            partition_entropy_left = self.entropy(y[partition_indices_left])
            partition_entropy_right = self.entropy(y[partition_indices_right])
            weight_left = len(partition_indices_left) / len(x)
            weight_right = len(partition_indices_right) / len(x)
            info_a = weight_left * partition_entropy_left + weight_right * partition_entropy_right

        gain_a = parent_entropy - info_a
        return gain_a

    def decision_tree(self, X_train, y_train, current_depth=0):
        if len(set(y_train)) == 1 or current_depth == self.max_depth or len(X_train.columns) == 0:
            class_counts = Counter(y_train)
            majority_class = class_counts.most_common(1)[0][0]
            return {"class_label": majority_class}

        gains = {}
        for attr in X_train.columns:
            gains[attr] = self.information_gain(y_train, X_train[attr])

        best_attr = max(gains, key=gains.get)
        node = {"attribute": best_attr, "leaf": {}}

        if X_train[best_attr].dtype == 'object':
            unique_values = X_train[best_attr].unique()
            for value in unique_values:
                partition_indices = X_train[X_train[best_attr] == value].index
                node["leaf"][value] = self.decision_tree(X_train.loc[partition_indices], y_train.loc[partition_indices], current_depth + 1)
        else:
            attr_mean = X_train[best_attr].mean()
            partition_indices_left = X_train[X_train[best_attr] <= attr_mean].index
            partition_indices_right = X_train[X_train[best_attr] > attr_mean].index
            node["split_value"] = attr_mean
            node["left"] = self.decision_tree(X_train.loc[partition_indices_left], y_train.loc[partition_indices_left], current_depth + 1)
            node["right"] = self.decision_tree(X_train.loc[partition_indices_right], y_train.loc[partition_indices_right], current_depth + 1)

        return node

    def classify(self, tree, features):
        class_labels = []
        if isinstance(features, pd.Series):
            node = tree
            while "class_label" not in node:
                if features[node['attribute']] in node['leaf']:
                    node = node['leaf'][features[node['attribute']]]
                else:
                    class_labels.append(max(node["leaf"].items(), key=lambda x: len(x[1]))[0])  # Return majority class label
                    break
            else:
                class_labels.append(node['class_label'])
        else:
            for _, feature in features.iterrows():
                node = tree
                while "class_label" not in node:
                    if feature[node['attribute']] in node['leaf']:
                        node = node['leaf'][feature[node['attribute']]]
                    else:
                        class_labels.append(max(node["leaf"].items(), key=lambda x: len(x[1]))[0])  # Return majority class label
                        break
                else:
                    class_labels.append(node['class_label'])
        return class_labels
    
class RandomForestClassifier:
    def __init__(self, num_trees, max_depth, example_subsample_rate, attr_subsample_rate):
        self.num_trees = num_trees
        self.max_depth = max_depth
        self.example_subsample_rate = example_subsample_rate
        self.attr_subsample_rate = attr_subsample_rate

    def bootstrap_sampling(self, X, y):
        indices = np.random.choice(len(X), size=len(X), replace=True)
        return X.iloc[indices], y.iloc[indices]

    def fit(self, X_train, y_train):
        trees = []
        subsampled_attributes = []

        for i in range(self.num_trees):
            bootstrapped_X, bootstrapped_y = self.bootstrap_sampling(X_train, y_train)
            subsampled_attr_indexes = np.random.choice(range(X_train.shape[1]), int(X_train.shape[1] * self.attr_subsample_rate), replace=False)
            subsampled_attributes.append(subsampled_attr_indexes.tolist())
            subsampled_X = bootstrapped_X.iloc[:, subsampled_attr_indexes]
            dt = DecisionTreeClassifier(self.max_depth)
            tree = dt.decision_tree(subsampled_X, bootstrapped_y)
            trees.append(tree)

        return trees, subsampled_attributes

    def classify_random_forest(self, trees, subsampled_attributes, X_test):
        class_labels = []
        dt = DecisionTreeClassifier(self.max_depth)  # Create instance for decision tree classification
        for _, test_row in X_test.iterrows():
            tree_votes = []
            for tree, sub_attributes in zip(trees, subsampled_attributes):
                if isinstance(test_row, pd.Series):
                    test_features = pd.DataFrame([test_row[sub_attributes]])  # Ensure DataFrame format
                else:
                    test_features = test_row[sub_attributes]
                predicted_label = dt.classify(tree, test_features)  # Use decision tree classify method
                tree_votes.append(predicted_label[0])
            class_labels.append(max(set(tree_votes), key=tree_votes.count))
        return class_labels

    def fit_random_forest(self, X_train, y_train):
        trees, subsampled_attributes = self.fit(X_train, y_train)
        return trees, subsampled_attributes


class EvaluationMetrics:
    @staticmethod
    def confusion_matrix(y_true, y_pred):
        classes = np.unique(np.concatenate([y_true, y_pred]))
        n_classes = len(classes)
        conf_matrix = np.zeros((n_classes, n_classes), dtype=int)

        for i, true_label in enumerate(classes):
            for j, pred_label in enumerate(classes):
                conf_matrix[i, j] = np.sum((y_true == true_label) & (y_pred == pred_label))

        return conf_matrix

    @staticmethod
    def calculate_metrics(conf_matrix):
        TP = np.diag(conf_matrix)
        FP = np.sum(conf_matrix, axis=0) - TP
        FN = np.sum(conf_matrix, axis=1) - TP
        TN = np.sum(conf_matrix) - (TP + FP + FN)

        accuracy = np.sum(TP) / np.sum(conf_matrix)

        precision = np.where(TP + FP == 0, 0, TP / (TP + FP))
        recall = np.where(TP + FN == 0, 0, TP / (TP + FN))

        f1_score = np.zeros_like(precision)
        non_zero_denominator = (precision + recall) != 0
        f1_score[non_zero_denominator] = 2 * (precision[non_zero_denominator] * recall[non_zero_denominator]) / (precision[non_zero_denominator] + recall[non_zero_denominator])

        return accuracy, precision, recall, f1_score


def stratified_cross_validation(X, y, n_folds, num_trees, max_depth, example_subsample_rate, attr_subsample_rate):
    fold_size = len(X) // n_folds
    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []

    for i in range(n_folds):
        start = i * fold_size
        end = (i + 1) * fold_size

        X_train_fold = pd.concat([X[:start], X[end:]])
        y_train_fold = pd.concat([y[:start], y[end:]])

        X_validation_fold = X[start:end]
        y_validation_fold = y[start:end]

        rf = RandomForestClassifier(num_trees, max_depth, example_subsample_rate, attr_subsample_rate)
        trees, subsampled_attributes = rf.fit_random_forest(X_train_fold, y_train_fold)
        predictions = rf.classify_random_forest(trees, subsampled_attributes, X_validation_fold)

        y_validation_fold_list = y_validation_fold.tolist()
        conf_matrix = EvaluationMetrics.confusion_matrix(y_validation_fold_list, predictions)
        acc, prec, rec, f1 = EvaluationMetrics.calculate_metrics(conf_matrix)

        accuracies.append(acc)
        precisions.append(prec)
        recalls.append(rec)
        f1_scores.append(f1)

    mean_accuracy = np.mean(accuracies)
    mean_precision = np.mean([np.mean(precision, axis=0) for precision in precisions], axis=0)
    mean_recall = np.mean([np.mean(recall, axis=0) for recall in recalls], axis=0)
    mean_f1_score = np.nanmean([np.nanmean(f1_score, axis=0) for f1_score in f1_scores], axis=0)

    return mean_accuracy, mean_precision, mean_recall, mean_f1_score


if __name__ == "__main__":
    # Load dataset
    data_file = "/Users/noshitha/Downloads/contraceptive+method+choice/cmc.data"

    column_names = [
        "Wife_age", "Wife_education", "Husband_education", "Number_of_children_ever_born",
        "Wife_religion", "Wife_working", "Husband_occupation", "Standard-of-living_index",
        "Media_exposure", "Contraceptive_method_used"
    ]

    data = pd.read_csv(data_file, names=column_names)

    # Convert categorical to object type
    data["Wife_education"] = data["Wife_education"].astype(object)
    data["Husband_education"] = data["Husband_education"].astype(object)
    data["Wife_religion"] = data["Wife_religion"].astype(object)
    data["Wife_working"] = data["Wife_working"].astype(object)
    data["Standard-of-living_index"] = data["Standard-of-living_index"].astype(object)
    data["Media_exposure"] = data["Media_exposure"].astype(object)
    data["Contraceptive_method_used"] = data["Contraceptive_method_used"].astype(object)
    
    

    # Split features and target variable
    X = data.drop("Contraceptive_method_used", axis=1)
    y = data["Contraceptive_method_used"]

    n_trees_list = [1]
    n_folds = 10
    max_depth = 3
    example_subsample_rate = 0.5
    attr_subsample_rate = 0.5

    for num_trees in n_trees_list:
        print("num_trees: ", num_trees)
        accuracies, precisions, recalls, f1_scores = stratified_cross_validation(X, y, n_folds, num_trees, max_depth, example_subsample_rate, attr_subsample_rate)
        print("Accuracies:", accuracies)
        print("Precisions:", precisions)
        print("Recalls:", recalls)
        print("F1-scores:", f1_scores)

num_trees:  1


ValueError: max() arg is an empty sequence