In [1]:
import pandas as pd


In [2]:
phones = pd.read_csv('../Datasets/train.csv')


In [3]:
phones.head()


Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [5]:
phones['price_range'].value_counts()


price_range
1    500
2    500
3    500
0    500
Name: count, dtype: int64

In [6]:
from sklearn.model_selection import train_test_split

x, y = phones.drop('price_range', axis=1), phones['price_range']
x1, x2, y1, y2 = train_test_split(x, y, test_size=0.2, random_state=42)


In [18]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report as report

dtc = DecisionTreeClassifier()
pd.DataFrame(report(y2, dtc.fit(x1, y1).predict(x2),output_dict=True))


Unnamed: 0,0,1,2,3,accuracy,macro avg,weighted avg
precision,0.909091,0.735294,0.780488,0.863248,0.825,0.82203,0.827137
recall,0.857143,0.824176,0.695652,0.901786,0.825,0.819689,0.825
f1-score,0.882353,0.777202,0.735632,0.882096,0.825,0.819321,0.824613
support,105.0,91.0,92.0,112.0,0.825,400.0,400.0


In [38]:
from typing import Literal, Callable
import numpy as np
from collections import Counter

class Node:
    def __init__(self,
        predicted_value: float | int | None = None,
        feature: int | None = None,
        threshold: float | int | None = None,
        left_child = None,
        right_child = None
    ):
        self.predicted_value = predicted_value
        self.feature = feature
        self.threshold = threshold
        self.left_child = left_child
        self.right_child = right_child

def mean_squared_error(column: np.ndarray):
    return np.mean((column - np.mean(column))**2 )

def mean_absolute_error(column: np.ndarray):
    return np.sum(np.absolute(column - np.mean(column))) / column.shape[0]

def entropy(column: np.ndarray):
    probabilities = np.array(list(Counter(column).values())) / len(column)
    return -np.sum(probabilities * np.log2(probabilities))

def gini(column: np.ndarray):
    column = np.sort(column)
    n = column.shape[0]
    index = np.arange(1, n + 1)
    return ((np.sum((2 * index - n  - 1) * column)) / (n * np.sum(column)))

def null(x) -> bool:
    return x.shape[0] == 0 if type(x) is np.ndarray else not bool(x)

def mean(Y: np.ndarray):
    return Node(np.mean(Y))

def mode(Y: np.ndarray):
    return Node(Counter(Y).most_common(1)[0][0])

class CART:
    """Classification and regression tree"""
    CRITERIONS = {
        'squared_error': mean_squared_error ,
        'absolute_error': mean_absolute_error,
        'entropy': entropy,
        'gini': gini,
    }

    @staticmethod
    def __initialization(func):
        def init_wrapper(*args, **kwargs):
            func(*args, **kwargs)
            self: CART = args[0]
            if self.criterion not in CART.CRITERIONS:
                raise ValueError(f'Criterion {self.criterion} not exists')
            regression = self.criterion in ['squared_error', 'absolute_error']
            self.list = mean if regression else mode
            self.criterion = CART.CRITERIONS[self.criterion]
        return init_wrapper

    @__initialization
    def __init__(
        self,
        criterion: Literal['squared_error', 'absolute_error', 'entropy', 'gini'],
        max_depth: int | None = None,
        min_samples_split: int = 2
    ) -> None:
        self.max_depth = max_depth
        self.criterion = criterion
        self.min_samples_split = min_samples_split
        self.list: Callable[[np.ndarray], float | int] | None = None

    def __split_dataset(
        self, X: np.ndarray, y: np.ndarray, feature: int, threshold: float
    ):
        left_indexes = np.where(X[:, feature] <= threshold)[0]
        right_indexes = np.where(X[:, feature] > threshold)[0]
        return X[left_indexes], y[left_indexes], X[right_indexes], y[right_indexes]

    def __find_best_split(self, X: np.ndarray, y: np.ndarray):
        best_feature, best_threshold, best_criterion_score = None, None, np.inf
        for feature in range(X.shape[1]):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                _, y_left, _, y_right = self.__split_dataset(X, y, feature, threshold)
                if not (null(y_left) or null(y_right)):
                    criterion_score = (len(y_left) * self.criterion(y_left) +
                        len(y_right) * self.criterion(y_right)) / len(y)
                    if criterion_score < best_criterion_score:
                        best_feature, best_threshold, best_criterion_score = (
                            feature, threshold, criterion_score)
        return best_feature, best_threshold

    def __build_tree(self, X: np.ndarray, y: np.ndarray, depth: int = 0):
        if depth == self.max_depth or len(X) <= self.min_samples_split:
            return self.list(y)
        feature, threshold = self.__find_best_split(X, y)
        if not threshold:
            return self.list(y)
        x_left, y_left, x_right, y_right = self.__split_dataset(X, y, feature, threshold)
        left_child = self.__build_tree(x_left, y_left, depth + 1)
        right_child = self.__build_tree(x_right, y_right, depth + 1)
        return Node(feature=feature,
                    threshold=threshold,
                    left_child=left_child,
                    right_child=right_child)

    def fit(self, X: np.ndarray, y: np.ndarray):
        self.root = self.__build_tree(np.array(X), np.array(y))
        return self

    def __predict_single(self, X: np.ndarray, node: Node):
        while isinstance(node.feature, int):
            node = node.left_child if X[node.feature] <= node.threshold else node.right_child
        return node.predicted_value

    def predict(self, X: np.ndarray):
        return [self.__predict_single(x, self.root) for x in np.array(X)]

cart = CART(criterion='entropy')
print(report(y2, cart.fit(x1, y1).predict(x2)))

import pickle

with open('../DumpsModels/clsf_cart.pickle', 'wb') as file:
    pickle.dump(cart, file)


              precision    recall  f1-score   support

           0       0.94      0.86      0.90       105
           1       0.75      0.84      0.79        91
           2       0.78      0.78      0.78        92
           3       0.91      0.90      0.91       112

    accuracy                           0.85       400
   macro avg       0.85      0.84      0.84       400
weighted avg       0.85      0.85      0.85       400

