In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

class DecisionTreeRootFinder:
    """
    Choose the root feature of a decision tree by information gain.

    Every feature column is treated as categorical: each distinct value in a
    column defines one partition of the labels.

    Parameters:
    - X: 2-D feature array supporting ``X.shape[1]`` and ``X[:, i]``
    - y: 1-D label array supporting boolean-mask indexing, one label per row
    """

    def __init__(self, X, y):
        self.X = X
        self.y = y

    def calculate_entropy(self, y):
        """
        Calculate the Shannon entropy (base 2) of a set of labels.

        Parameters:
        - y: Array of labels

        Returns:
        - entropy: Entropy of the label distribution; 0 when ``y`` is empty
          because there are no class counts to accumulate
        """
        _, class_counts = np.unique(y, return_counts=True)
        total_samples = len(y)
        entropy = 0
        for count in class_counts:
            p = count / total_samples
            entropy -= p * np.log2(p)
        return entropy

    def calculate_information_gain(self, feature_column):
        """
        Calculate the information gain of splitting ``self.y`` on a feature.

        Parameters:
        - feature_column: Values of one feature, aligned row-by-row with
          ``self.y``

        Returns:
        - information_gain: Entropy of ``self.y`` minus the size-weighted
          entropy of the label subsets induced by each distinct value
        """
        total_entropy = self.calculate_entropy(self.y)
        total_samples = len(self.y)
        unique_values, value_counts = np.unique(feature_column, return_counts=True)
        weighted_entropy = 0
        for value, count in zip(unique_values, value_counts):
            subset_y = self.y[feature_column == value]
            weighted_entropy += (count / total_samples) * self.calculate_entropy(subset_y)
        return total_entropy - weighted_entropy

    def find_root_feature(self, start_index=1):
        """
        Find the feature with the highest information gain.

        Parameters:
        - start_index: First column index to evaluate. The default of 1
          keeps the historical behaviour of never considering column 0;
          pass 0 to evaluate every column.

        Returns:
        - best_feature: Index of the winning column (the earliest column
          wins ties), or None when no column lies at or after
          ``start_index``

        Raises:
        - ValueError: If ``start_index`` is negative
        """
        if start_index < 0:
            # A negative start would make range() yield negative indices,
            # which NumPy would silently interpret as "count from the end".
            raise ValueError("start_index must be non-negative")

        num_features = self.X.shape[1]
        best_feature = None
        max_information_gain = -np.inf
        for i in range(start_index, num_features):
            information_gain = self.calculate_information_gain(self.X[:, i])
            # Strict comparison: an equal gain never displaces an earlier column.
            if information_gain > max_information_gain:
                max_information_gain = information_gain
                best_feature = i
        return best_feature

# Read the telecom dataset from disk
df = pd.read_csv("/content/Telecom_data.csv")

# 'Churn' is taken as the label; every other column is a feature
y = df['Churn']
X = df.drop(columns=['Churn'])

# Discretise each int/float column into 5 bins, stored as integer bin codes
numeric_columns = X.select_dtypes(include=['int', 'float']).columns
for name in numeric_columns:
    X[name] = pd.cut(X[name], bins=5, labels=False)

# Replace each object-dtype column with integer codes, keeping the fitted
# encoder for every column in label_encoders
label_encoders = {}
object_columns = X.select_dtypes(include=['object']).columns
for name in object_columns:
    encoder = label_encoders[name] = LabelEncoder()
    X[name] = encoder.fit_transform(X[name])

# Hand the raw arrays to the root finder
root_finder = DecisionTreeRootFinder(X.values, y.values)

# Ask for the index of the column with the highest information gain
root_feature_index = root_finder.find_root_feature()

# Report the index together with the matching column name
print("Root feature index:", root_feature_index)
print("Root feature:", X.columns[root_feature_index])


Root feature index: 19
Root feature: TotalCharges


In [None]:
import numpy as np

class DecisionTreeRootFinder:
    """
    Choose the root feature of a decision tree by information gain.

    Every feature column is treated as categorical: each distinct value in a
    column defines one partition of the labels. A helper for binning
    continuous columns into integer codes is included.

    Parameters:
    - X: 2-D feature array supporting ``X.shape[1]`` and ``X[:, i]``
      (optional; only needed for the information-gain methods)
    - y: 1-D label array supporting boolean-mask indexing (optional, as above)
    """

    def __init__(self, X=None, y=None):
        self.X = X
        self.y = y

    def calculate_entropy(self, y):
        """
        Calculate the Shannon entropy (base 2) of a set of labels.

        Parameters:
        - y: Array of labels

        Returns:
        - entropy: Entropy of the label distribution; 0 when ``y`` is empty
          because there are no class counts to accumulate
        """
        _, class_counts = np.unique(y, return_counts=True)
        total_samples = len(y)
        entropy = 0
        for count in class_counts:
            p = count / total_samples
            entropy -= p * np.log2(p)
        return entropy

    def calculate_information_gain(self, feature_column):
        """
        Calculate the information gain of splitting ``self.y`` on a feature.

        Parameters:
        - feature_column: Values of one feature, aligned row-by-row with
          ``self.y``

        Returns:
        - information_gain: Entropy of ``self.y`` minus the size-weighted
          entropy of the label subsets induced by each distinct value
        """
        total_entropy = self.calculate_entropy(self.y)
        total_samples = len(self.y)
        unique_values, value_counts = np.unique(feature_column, return_counts=True)
        weighted_entropy = 0
        for value, count in zip(unique_values, value_counts):
            subset_y = self.y[feature_column == value]
            weighted_entropy += (count / total_samples) * self.calculate_entropy(subset_y)
        return total_entropy - weighted_entropy

    def find_root_feature(self):
        """
        Find the feature with the highest information gain.

        Returns:
        - best_feature: Index of the winning column (the earliest column
          wins ties), or None when ``self.X`` has no columns
        """
        num_features = self.X.shape[1]
        best_feature = None
        max_information_gain = -np.inf
        for i in range(num_features):
            information_gain = self.calculate_information_gain(self.X[:, i])
            # Strict comparison: an equal gain never displaces an earlier column.
            if information_gain > max_information_gain:
                max_information_gain = information_gain
                best_feature = i
        return best_feature

    def bin_continuous_feature(self, feature_column, num_bins=None, binning_type='equal_width'):
        """
        Bin a continuous feature into integer bin labels.

        Parameters:
        - feature_column: 1-D array-like of numeric values
        - num_bins: Number of bins; defaults to 10 when None
        - binning_type: 'equal_width' splits the value range into intervals
          of equal width; 'frequency' places the edges at evenly spaced
          quantiles so each bin holds roughly the same number of samples

        Returns:
        - binned_feature: NumPy integer array of 1-based bin labels, each
          in the range 1..num_bins

        Raises:
        - ValueError: If ``num_bins`` is below 1 or ``binning_type`` is not
          one of the two supported names
        """
        if num_bins is None:
            num_bins = 10  # Default number of bins
        if num_bins < 1:
            raise ValueError("num_bins must be at least 1")

        values = np.asarray(feature_column)

        if binning_type == 'equal_width':
            edges = np.linspace(np.min(values), np.max(values), num_bins + 1)
        elif binning_type == 'frequency':
            # np.histogram would produce equal-width edges again; quantiles
            # are what make the bins (approximately) equally populated.
            edges = np.quantile(values, np.linspace(0, 1, num_bins + 1))
        else:
            raise ValueError(
                f"Unknown binning_type {binning_type!r}; "
                "expected 'equal_width' or 'frequency'"
            )

        # np.digitize uses half-open intervals [edge[i-1], edge[i]), so the
        # maximum value falls one past the last bin. Fold it back so exactly
        # num_bins labels are produced.
        return np.minimum(np.digitize(values, edges), num_bins)


import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Read the telecom dataset from disk
df = pd.read_csv("/content/Telecom_data.csv")

# 'Churn' is taken as the label; every other column is a feature
y = df['Churn']
X = df.drop(columns=['Churn'])

# Replace each object-dtype column with integer codes, keeping the fitted
# encoder for every column in label_encoders
label_encoders = {}
for name in X.select_dtypes(include=['object']).columns:
    encoder = label_encoders[name] = LabelEncoder()
    X[name] = encoder.fit_transform(X[name])

# Bin every float/int column into 5 equal-width bins.
# NOTE(review): the encoding above runs first, so columns that were
# object-dtype hold integer codes by now and are presumably selected and
# binned here as well -- confirm that this is intended.
continuous_features = X.select_dtypes(include=['float', 'int']).columns
binner = DecisionTreeRootFinder(X, y)  # binning does not read the stored X/y
for feature in continuous_features:
    X[feature] = binner.bin_continuous_feature(X[feature], num_bins=5, binning_type='equal_width')

# Pick the column with the highest information gain
root_finder = DecisionTreeRootFinder(X.values, y.values)
root_feature_index = root_finder.find_root_feature()

print("Root feature index:", root_feature_index)
print("Root feature:", X.columns[root_feature_index])


Root feature index: 15
Root feature: Contract


In [None]:
import numpy as np

class TreeNode:
    def __init__(self, feature_index=None, threshold=None, value=None, left=None, right=None):
        """
        Create a single node of the decision tree.

        Parameters:
        - feature_index: Index of the feature this node splits on
        - threshold: Threshold the feature value is compared against
        - value: Value returned when this node is a leaf
        - left: Child node on the left branch
        - right: Child node on the right branch
        """
        # How this node splits
        self.feature_index, self.threshold = feature_index, threshold
        # What this node returns when it is a leaf
        self.value = value
        # Where the two branches lead
        self.left, self.right = left, right

class MyDecisionTreeClassifier:
    def __init__(self, max_depth=None):
        """
        Initialize the decision tree classifier.

        Parameters:
        - max_depth: Maximum depth of the decision tree (None means the
          depth check never stops the recursion)
        """
        self.max_depth = max_depth
        # Root node of the fitted tree; set by fit().
        self.tree = None

    def calculate_entropy(self, y):
        """
        Calculate the entropy of a set of labels.

        Parameters:
        - y: Array of labels

        Returns:
        - entropy: Base-2 entropy of the label distribution; 0 when ``y``
          is empty because there are no class counts to accumulate
        """
        _, class_counts = np.unique(y, return_counts=True)
        total_samples = len(y)
        entropy = 0
        for count in class_counts:
            p = count / total_samples
            entropy -= p * np.log2(p)
        return entropy

    def calculate_information_gain(self, X, y, feature_index, threshold):
        """
        Calculate the information gain for a given feature split.

        Samples whose feature value is <= threshold go left; the rest go
        right.

        Parameters:
        - X: Feature matrix
        - y: Array of labels
        - feature_index: Index of the feature to split on
        - threshold: Threshold value for the feature split

        Returns:
        - info_gain: Information gain achieved by the feature split
        """
        left_mask = X[:, feature_index] <= threshold
        right_mask = ~left_mask
        left_y, right_y = y[left_mask], y[right_mask]

        entropy_parent = self.calculate_entropy(y)
        entropy_left = self.calculate_entropy(left_y)
        entropy_right = self.calculate_entropy(right_y)

        total_samples = len(y)
        weighted_children = (
            len(left_y) / total_samples * entropy_left
            + len(right_y) / total_samples * entropy_right
        )
        return entropy_parent - weighted_children

    def _majority_label(self, y):
        """
        Return the most frequent label in ``y``.

        Ties go to the smallest label because np.unique returns the labels
        sorted and np.argmax picks the first maximum. Unlike np.bincount,
        this also works for labels that are not non-negative integers.
        """
        labels, counts = np.unique(y, return_counts=True)
        return labels[np.argmax(counts)]

    def find_best_split(self, X, y):
        """
        Find the best feature split based on maximum information gain.

        Only thresholds that leave at least one sample on each side are
        considered.

        Parameters:
        - X: Feature matrix
        - y: Array of labels

        Returns:
        - best_feature_index: Index of the best feature to split on, or
          None when no feature has two distinct values
        - best_threshold: Threshold value for the best feature split, or
          None when there is no valid split
        """
        num_features = X.shape[1]
        best_feature_index = None
        best_threshold = None
        max_info_gain = -np.inf

        for feature_index in range(num_features):
            unique_values = np.unique(X[:, feature_index])
            # Using the largest value as threshold would send every sample
            # left and leave the right child empty, so it is skipped.
            for threshold in unique_values[:-1]:
                info_gain = self.calculate_information_gain(X, y, feature_index, threshold)
                if info_gain > max_info_gain:
                    max_info_gain = info_gain
                    best_feature_index = feature_index
                    best_threshold = threshold

        return best_feature_index, best_threshold

    def build_tree(self, X, y, depth=0):
        """
        Recursively build the decision tree.

        Parameters:
        - X: Feature matrix
        - y: Array of labels
        - depth: Current depth of the tree

        Returns:
        - node: Root node of the (sub)tree built from X and y
        """
        # Stop when the depth limit is reached or the node is pure.
        if depth == self.max_depth or len(np.unique(y)) == 1:
            return TreeNode(value=self._majority_label(y))

        best_feature_index, best_threshold = self.find_best_split(X, y)

        # No feature can separate the remaining samples.
        if best_feature_index is None:
            return TreeNode(value=self._majority_label(y))

        left_mask = X[:, best_feature_index] <= best_threshold
        right_mask = ~left_mask

        # Both children are non-empty (see find_best_split), so each
        # recursive call works on strictly fewer samples.
        left_subtree = self.build_tree(X[left_mask], y[left_mask], depth + 1)
        right_subtree = self.build_tree(X[right_mask], y[right_mask], depth + 1)

        return TreeNode(
            feature_index=best_feature_index,
            threshold=best_threshold,
            left=left_subtree,
            right=right_subtree,
        )

    def fit(self, X, y):
        """
        Fit the decision tree classifier to the training data.

        Parameters:
        - X: Feature matrix (converted with np.asarray)
        - y: Array of labels (converted with np.asarray)

        Raises:
        - ValueError: If ``y`` is empty
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if y.size == 0:
            raise ValueError("Cannot fit a decision tree on an empty dataset")
        self.tree = self.build_tree(X, y)

    def predict_instance(self, x, node):
        """
        Predict the label for a single instance.

        Parameters:
        - x: Feature vector of a single instance
        - node: Current node in the decision tree

        Returns:
        - Prediction: Predicted label for the instance
        """
        # A node carrying a value is a leaf.
        if node.value is not None:
            return node.value
        if x[node.feature_index] <= node.threshold:
            return self.predict_instance(x, node.left)
        return self.predict_instance(x, node.right)

    def predict(self, X):
        """
        Predict the labels for multiple instances.

        Parameters:
        - X: Feature matrix of multiple instances (converted with
          np.asarray so iteration yields one row per instance)

        Returns:
        - Predictions: Array of predicted labels
        """
        return np.array([self.predict_instance(x, self.tree) for x in np.asarray(X)])

# Example usage:
if __name__ == "__main__":
    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score
    from sklearn.preprocessing import LabelEncoder

    # Read the Telecom Churn dataset from disk
    df = pd.read_csv("/content/Telecom_data.csv")

    # 'Churn' is taken as the label; every other column is a feature
    y = df['Churn']
    X = df.drop(columns=['Churn'])

    # Replace each object-dtype column with integer codes, keeping the
    # fitted encoder for every column in label_encoders
    label_encoders = {}
    for name in X.select_dtypes(include=['object']).columns:
        encoder = label_encoders[name] = LabelEncoder()
        X[name] = encoder.fit_transform(X[name])

    # Hold out 20% of the rows for testing (fixed seed for repeatability)
    X_train, X_test, y_train, y_test = train_test_split(
        X.values, y.values, test_size=0.2, random_state=42
    )

    # Encode the labels as integers: fit on the training labels, then
    # apply the same mapping to the test labels
    label_encoder_y = LabelEncoder()
    y_train = label_encoder_y.fit_transform(y_train)
    y_test = label_encoder_y.transform(y_test)

    # Train a depth-limited tree on the training split
    tree = MyDecisionTreeClassifier(max_depth=5)
    tree.fit(X_train, y_train)

    # Predict the held-out rows
    y_pred = tree.predict(X_test)

    # Score the predictions against the true test labels
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)


Accuracy: 0.7963094393186657
