# A1

In [3]:
import numpy as np
import pandas as pd

def calculate_entropy(y):
    """Return the Shannon entropy, in bits, of the labels in ``y``.

    Parameters
    ----------
    y : array-like
        Class labels (list, ndarray, Series or DataFrame). The values are
        flattened by ``np.unique`` before counting.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the observed classes; ``0.0`` for an empty
        or single-class input.
    """
    _, counts = np.unique(y, return_counts=True)
    total = counts.sum()
    if total == 0:
        # No labels at all: define the entropy as zero instead of dividing by zero.
        return 0.0
    # Normalise by the number of counted values (not len(y)) so the
    # probabilities always sum to one, even for 2-D input.
    probabilities = counts / total
    # np.unique only reports classes that occur, so every probability is
    # strictly positive and log2 needs no epsilon guard.
    entropy = -np.sum(probabilities * np.log2(probabilities))
    # Adding 0.0 turns the -0.0 produced for a pure label set into +0.0.
    return entropy + 0.0

def calculate_information_gain(X, y, feature):
    """Return the information gain obtained by splitting ``y`` on ``X[feature]``.

    The gain is the entropy of ``y`` minus the weighted average entropy of the
    label subsets selected by each distinct value of the feature, where each
    weight is the share of rows carrying that value.
    """
    column = X[feature]
    levels, level_counts = np.unique(column, return_counts=True)
    n_rows = np.sum(level_counts)

    # One weighted entropy term per distinct feature value.
    weighted_terms = []
    for level, level_count in zip(levels, level_counts):
        subset_entropy = calculate_entropy(y[column == level])
        weighted_terms.append((level_count / n_rows) * subset_entropy)

    return calculate_entropy(y) - np.sum(weighted_terms)

def find_root_node(X, y):
    """Return the name of the column of ``X`` with the highest information gain.

    When several columns tie, the one that appears first in ``X.columns`` wins.
    """
    gain_by_feature = {}
    for column in X.columns:
        gain_by_feature[column] = calculate_information_gain(X, y, column)
    # max() over a dict iterates its keys; rank them by their stored gain.
    return max(gain_by_feature, key=lambda name: gain_by_feature[name])

# Load the dataset
# NOTE(review): this absolute path is specific to one machine; point DATA_PATH
# at your local copy of ICICI_BANK.csv before running the cell.
DATA_PATH = 'C:\\Users\\srest\\Downloads\\Stock-Price-Prediction-Using-Machine-Learning-main\\Stock-Price-Prediction-Using-Machine-Learning-main\\ICICI_BANK.csv'

# Drop incomplete rows and encode the target in a single chain, so the new
# column is created by assign() on a fresh frame rather than by item
# assignment into the result of dropna().
# Target is 1 when Close is above Open for the row, otherwise 0.
data = (
    pd.read_csv(DATA_PATH)
    .dropna()
    .assign(Target=lambda frame: np.where(frame['Close'] > frame['Open'], 1, 0))
)

# Only the last expression of a cell is displayed, so inspection calls such as
# describe() belong in their own cell. The post-cleaning invariant is checked
# explicitly here instead.
assert not data.isnull().values.any(), "dropna() should have removed every missing value"

# Define features and target variable (both kept as one-column DataFrames)
X_project = data[['Open']]
y_project = data[['Target']]

# Find the root node feature
root_node_feature = find_root_node(X_project, y_project)
print("Root Node Feature:", root_node_feature)


Root Node Feature: Open


# A2

In [4]:
def bin_continuous_feature(X, feature, num_bins=5, binning_type='equal_width'):
    """Discretise ``X[feature]`` into integer bins labelled ``1..num_bins``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame holding the column to discretise. It is not modified.
    feature : str
        Name of the column to bin.
    num_bins : int, default 5
        Number of bins to produce; must be at least 1.
    binning_type : {'equal_width', 'frequency'}, default 'equal_width'
        'equal_width' splits the [min, max] range into equally wide intervals;
        'frequency' places the edges at evenly spaced percentiles.

    Returns
    -------
    numpy.ndarray
        One bin label per row, in ``1..num_bins``.

    Raises
    ------
    ValueError
        If ``num_bins`` is smaller than 1 or ``binning_type`` is not recognised.
    """
    if num_bins < 1:
        raise ValueError("num_bins must be a positive integer.")

    values = X[feature]
    if binning_type == 'equal_width':
        edges = np.linspace(values.min(), values.max(), num_bins + 1)
    elif binning_type == 'frequency':
        # linspace gives exactly num_bins + 1 percentile points; a float step
        # in np.arange can produce one edge too many or too few.
        edges = np.percentile(values, np.linspace(0, 100, num_bins + 1))
    else:
        raise ValueError("Invalid binning type. Choose 'equal_width' or 'frequency'.")

    # Digitise against the interior edges only. Including the outermost edges
    # would put the maximum value in an extra bin numbered num_bins + 1.
    return np.digitize(values, edges[1:-1]) + 1

# Example usage of binning: discretise the 'Open' column into five equal-width bins
binned_open = bin_continuous_feature(
    X_project,
    'Open',
    num_bins=5,
    binning_type='equal_width',
)
print("Binned Open Feature:\n", binned_open)

Binned Open Feature:
 [3 3 3 ... 2 2 2]


# A3

In [7]:
class DecisionTree:
    """ID3-style decision tree that chooses splits by information gain.

    At every node the numeric feature columns are discretised into
    ``num_bins`` bins before the candidate splits are scored, so the score
    describes the split that is actually made. The interior bin edges of the
    chosen feature are stored on the node, which lets ``predict`` map raw
    values onto the same bins. The chosen feature is removed from the columns
    handed to the children, so a feature is used at most once per path.

    Fitted nodes are plain dicts:

    * leaf: ``{'label': majority class, 'samples': row count}``
    * internal: ``{'feature', 'bin_edges', 'majority_label', 'sub_trees'}``
      where ``bin_edges`` is ``None`` for features that were not binned and
      ``sub_trees`` maps each observed (binned) value to a child node.

    Parameters
    ----------
    max_depth : int or None, default None
        Depth at which growth stops; ``None`` means no depth limit.
    num_bins : int, default 5
        Number of bins used for numeric features.
    binning_type : {'equal_width', 'frequency'}, default 'equal_width'
        How the bin edges of numeric features are chosen.
    """

    def __init__(self, max_depth=None, num_bins=5, binning_type='equal_width'):
        self.max_depth = max_depth
        self.num_bins = num_bins
        self.binning_type = binning_type
        self.tree = None

    def fit(self, X, y):
        """Grow the tree from feature frame ``X`` and labels ``y``.

        ``y`` may be a list, ndarray, Series or one-column DataFrame of
        non-negative integer class labels; it is matched to ``X`` by position.
        ``X`` is not modified.

        Raises
        ------
        ValueError
            If the data are empty or ``y`` does not have one label per row.
        """
        # Flatten so a one-column DataFrame behaves like a Series, and share
        # X's index so row masks line up between features and labels.
        labels = pd.Series(np.ravel(np.asarray(y)), index=X.index)
        if labels.empty:
            raise ValueError("Cannot fit a decision tree on an empty dataset.")
        self.tree = self._build_tree(X, labels, depth=0)

    @staticmethod
    def _majority_label(y):
        """Return the most frequent non-negative integer label in ``y``."""
        return np.argmax(np.bincount(np.ravel(y)))

    def _interior_bin_edges(self, values):
        """Return the ``num_bins - 1`` interior bin edges for a numeric column."""
        if self.binning_type == 'equal_width':
            edges = np.linspace(values.min(), values.max(), self.num_bins + 1)
        elif self.binning_type == 'frequency':
            edges = np.percentile(values, np.linspace(0, 100, self.num_bins + 1))
        else:
            raise ValueError("Invalid binning type. Choose 'equal_width' or 'frequency'.")
        # Without the outermost edges, out-of-range values seen at prediction
        # time fall into the first or last bin.
        return edges[1:-1]

    def _discretise(self, X):
        """Return a binned copy of ``X`` and the interior edges used per column."""
        binned = X.copy()  # never write into the caller's frame
        edges_by_feature = {}
        for column in X.columns:
            values = X[column]
            is_numeric = pd.api.types.is_numeric_dtype(values)
            if is_numeric and not pd.api.types.is_bool_dtype(values):
                edges = self._interior_bin_edges(values)
                edges_by_feature[column] = edges
                # Labels run from 1 to num_bins.
                binned[column] = np.digitize(values.to_numpy(), edges) + 1
        return binned, edges_by_feature

    def _build_tree(self, X, y, depth):
        """Recursively build the node for the rows in ``X`` / ``y``."""
        # Stop at the depth limit, on a pure node, or when no features remain.
        if depth == self.max_depth or len(np.unique(y)) == 1 or X.empty:
            return {'label': self._majority_label(y), 'samples': len(y)}

        # Score the features on their binned values.
        candidates, edges_by_feature = self._discretise(X)
        root_node_feature = find_root_node(candidates, y)
        split_values = candidates[root_node_feature].to_numpy()
        remaining = X.drop(columns=root_node_feature)

        sub_trees = {}
        for value in np.unique(split_values):
            in_branch = split_values == value
            sub_trees[value] = self._build_tree(remaining.loc[in_branch], y[in_branch], depth + 1)

        return {
            'feature': root_node_feature,
            'bin_edges': edges_by_feature.get(root_node_feature),
            # Fallback prediction for values with no matching branch.
            'majority_label': self._majority_label(y),
            'sub_trees': sub_trees,
        }

    def predict(self, X):
        """Return a list with one predicted label per row of ``X``.

        Raises
        ------
        ValueError
            If the tree has not been fitted.
        """
        if self.tree is None:
            raise ValueError("This DecisionTree has not been fitted yet; call fit() first.")
        return [self._predict_row(row, self.tree) for _, row in X.iterrows()]

    def _predict_row(self, row, tree):
        """Walk ``tree`` for a single row and return the predicted label."""
        node = tree
        while 'label' not in node:
            feature_value = row[node['feature']]
            edges = node.get('bin_edges')
            if edges is not None:
                # Map the raw value onto the bins used when this node was fitted.
                feature_value = int(np.digitize(feature_value, edges)) + 1
            child = node['sub_trees'].get(feature_value)
            if child is None:
                # Unseen value or empty bin: use the majority class of the
                # training rows that reached this node.
                return node['majority_label']
            node = child
        return node['label']
