In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Load the dataset
df = pd.read_csv("D:/Semester VII/Decision_tree/online_shoppers_intention.csv")

# Drop rows with missing values so every remaining row is fully populated
df = df.dropna()

# Columns that are encoded as indicator (dummy) variables below
categorical_columns = ['Month', 'VisitorType', 'OperatingSystems', 'Browser', 'Region', 'TrafficType']

# One-hot encode the categorical columns; drop_first=True omits one level per
# column so the indicators are not perfectly collinear
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Define feature matrix (X) and target vector (y).
# The dtypes are requested explicitly: after get_dummies the frame mixes
# indicator columns with numeric ones, and a plain `.values` on such a frame
# can come back as an object-dtype array, which makes every comparison and
# np.unique call in the tree operate on boxed Python objects.
# NOTE(review): assumes every non-target column is numeric/boolean at this
# point and that 'Revenue' is boolean or 0/1 -- confirm against the CSV.
X = df.drop(columns='Revenue').to_numpy(dtype=float)
y = df['Revenue'].to_numpy(dtype=int)

# Split the data into training and testing sets (80% training, 20% testing);
# stratify=y keeps the class ratio the same in both parts
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

class DecisionTree:
    """Binary classification tree grown greedily on Gini impurity.

    Labels are expected to be 0/1 (or booleans): impurity and leaf values are
    both derived from the mean of the labels. Every internal node tests a
    single feature against a threshold (``x[feature] <= threshold`` goes left).

    Args:
        max_depth: Maximum depth of the tree. ``None`` means unlimited;
            ``0`` yields a single leaf.
        min_samples_split: Nodes with fewer samples than this become leaves.

    The fitted tree is stored in ``self.tree`` as nested dicts. Leaves look
    like ``{'leaf': True, 'class': value}``; internal nodes carry
    ``'feature'``, ``'threshold'``, ``'left'`` and ``'right'``.
    """

    def __init__(self, max_depth=None, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None

    def fit(self, X, y):
        """Build the tree from feature matrix ``X`` and labels ``y``.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: Array-like of n_samples binary labels.

        Raises:
            ValueError: If ``X`` is not 2-D, if ``X`` and ``y`` differ in
                length, or if there are no samples.
        """
        # Accept plain lists as well as arrays; the tree code relies on
        # NumPy column slicing and boolean-mask indexing.
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same number of samples, got {len(X)} and {len(y)}"
            )
        if len(y) == 0:
            # A leaf built from no labels would have a NaN class.
            raise ValueError("cannot fit a decision tree on an empty dataset")
        self.tree = self._build_tree(X, y)

    def predict(self, X):
        """Return an array with the predicted class for each row of ``X``."""
        X = np.asarray(X)
        return np.array([self._predict_single(x, self.tree) for x in X])

    def _gini_impurity(self, y):
        """Return the Gini impurity of binary labels ``y`` (0 for empty input)."""
        m = len(y)
        if m == 0:
            return 0
        p1 = np.sum(y) / m  # fraction of positive labels
        p0 = 1 - p1
        return 1 - p1**2 - p0**2

    def _split(self, X, y, feature, threshold):
        """Partition ``X``/``y`` on ``X[:, feature] <= threshold``.

        Returns:
            Tuple ``(X_left, X_right, y_left, y_right)``.
        """
        left_indices = X[:, feature] <= threshold
        right_indices = X[:, feature] > threshold
        return X[left_indices], X[right_indices], y[left_indices], y[right_indices]

    def _find_best_split(self, X, y):
        """Find the split with the lowest weighted Gini impurity.

        Only splits that leave at least one sample on each side are
        considered. The first candidate wins ties.

        Returns:
            Dict with ``'feature'``, ``'threshold'``, ``'left'`` and
            ``'right'`` (each side an ``(X, y)`` pair), or ``None`` when no
            split separates the samples.
        """
        best_gini = float('inf')
        best_feature = None
        best_threshold = None
        m, n = X.shape

        for feature in range(n):
            column = X[:, feature]
            for threshold in np.unique(column):
                # Only the labels are needed to score a candidate, so the
                # feature matrix is not copied inside the search loop.
                y_left = y[column <= threshold]
                y_right = y[column > threshold]

                # A split with an empty side separates nothing. Accepting it
                # would make the caller recurse on the same samples again
                # and build a leaf from zero labels.
                if len(y_left) == 0 or len(y_right) == 0:
                    continue

                gini_left = self._gini_impurity(y_left)
                gini_right = self._gini_impurity(y_right)
                weighted_gini = (len(y_left) * gini_left + len(y_right) * gini_right) / m

                if weighted_gini < best_gini:
                    best_gini = weighted_gini
                    best_feature = feature
                    best_threshold = threshold

        if best_feature is None:
            return None

        X_left, X_right, y_left, y_right = self._split(X, y, best_feature, best_threshold)
        return {
            'feature': best_feature,
            'threshold': best_threshold,
            'left': (X_left, y_left),
            'right': (X_right, y_right)
        }

    def _make_leaf(self, y):
        """Return a leaf node whose class is the rounded mean of ``y``."""
        return {'leaf': True, 'class': np.round(np.mean(y))}

    def _build_tree(self, X, y, depth=0):
        """Recursively grow the subtree for the samples ``X``/``y``.

        A node becomes a leaf when it has fewer than ``min_samples_split``
        samples, when all labels agree, when ``max_depth`` is reached, or
        when no split separates the samples.
        """
        # Compare against None explicitly so that max_depth=0 is honoured
        # instead of being read as "no limit".
        depth_reached = self.max_depth is not None and depth >= self.max_depth
        if len(y) < self.min_samples_split or len(np.unique(y)) == 1 or depth_reached:
            return self._make_leaf(y)

        split = self._find_best_split(X, y)
        if split is None:
            return self._make_leaf(y)

        left_branch = self._build_tree(split['left'][0], split['left'][1], depth + 1)
        right_branch = self._build_tree(split['right'][0], split['right'][1], depth + 1)

        return {
            'leaf': False,
            'feature': split['feature'],
            'threshold': split['threshold'],
            'left': left_branch,
            'right': right_branch
        }

    def _predict_single(self, x, tree):
        """Walk ``tree`` from the given node down to a leaf for sample ``x``."""
        node = tree
        while not node['leaf']:
            if x[node['feature']] <= node['threshold']:
                node = node['left']
            else:
                node = node['right']
        return node['class']

# Fit the hand-written tree on the training split (depth capped at 5, nodes
# with fewer than 10 samples are not split further)
dt = DecisionTree(min_samples_split=10, max_depth=5)
dt.fit(X_train, y_train)

# Predict labels for the held-out test split
y_pred = dt.predict(X_test)

# Score the predictions against the true test labels
accuracy, precision, recall = (
    accuracy_score(y_test, y_pred),
    precision_score(y_test, y_pred),
    recall_score(y_test, y_pred),
)

# Report the three metrics, one per line, with four decimals
print(
    f'Accuracy: {accuracy:.4f}',
    f'Precision: {precision:.4f}',
    f'Recall: {recall:.4f}',
    sep='\n',
)


Accuracy: 0.8994
Precision: 0.7134
Recall: 0.5864
