In [1]:
from data.dataset import Dataset
import numpy as np
import pandas as pd
from TPC3.decision_tree import DecisionTree
from TPC1.rw import *
from sklearn.metrics import accuracy_score

In [2]:
def accuracy_score(y_true, y_pred):
    """
    Classification performance metric that computes the accuracy of y_true
    and y_pred, i.e. the fraction of positions at which the two agree.

    Parameters
    ----------
    y_true: array-like (n_samples,)
        Ground truth correct labels.
    y_pred: array-like (n_samples,)
        Estimated target values.

    Returns
    -------
    accuracy (float)
        Accuracy score in the range [0, 1].

    Raises
    ------
    ValueError
        If y_true and y_pred do not have the same shape, or if they are empty.
    """
    # Coerce to arrays so that plain lists/tuples are compared element-wise
    # instead of collapsing to a single Python bool.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Reject mismatched shapes explicitly: shapes such as (n,) and (n, 1)
    # would otherwise broadcast to an (n, n) comparison and give a wrong score.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got {} and {}".format(
                y_true.shape, y_pred.shape
            )
        )

    # An empty input would otherwise yield a 0/0 NaN.
    if y_true.size == 0:
        raise ValueError("accuracy_score requires at least one sample")

    accuracy = float(np.count_nonzero(y_true == y_pred) / y_true.size)
    return accuracy

In [3]:
def train_test_split_1(dataset: "Dataset", test_size=0.3, random_state=None):
    """
    Randomly split the samples of a dataset into training and test arrays.

    Parameters
    ----------
    dataset: Dataset
        Object exposing get_X() and get_y(); both results are indexed with
        an integer index array, so they are expected to be numpy arrays.
    test_size: float
        Fraction of the rows (between 0 and 1) assigned to the test set.
        The test row count is int(test_size * n_rows), i.e. rounded down.
    random_state: int or None
        Seed for the shuffle. When None (the default) the global numpy
        random state is used, so a prior np.random.seed(...) call is
        still honoured.

    Returns
    -------
    X_train, X_test, y_train, y_test
        The row-wise partitions of X and y.

    Raises
    ------
    ValueError
        If test_size is not within [0, 1].
    """
    # A fraction outside [0, 1] would produce negative slice bounds and a
    # silently nonsensical split.
    if not 0 <= test_size <= 1:
        raise ValueError(
            "test_size must be a fraction between 0 and 1, got {!r}".format(test_size)
        )

    X = dataset.get_X()
    y = dataset.get_y()

    nrows = X.shape[0]
    n_test = int(test_size * nrows)
    n_train = nrows - n_test

    if random_state is None:
        # Keep using the global RNG so existing seeded callers are unaffected.
        idx = np.arange(nrows)
        np.random.shuffle(idx)
    else:
        idx = np.random.RandomState(random_state).permutation(nrows)

    train_idx = idx[:n_train]
    test_idx = idx[n_train:]

    # Only the raw arrays are returned; no Dataset wrappers are built here
    # because nothing is done with them.
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    return X_train, X_test, y_train, y_test

In [8]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
def test_dt():
    """
    Fit a DecisionTree on the scikit-learn breast cancer dataset and print
    its accuracy on a held-out 20% test split.
    """
    cancer = datasets.load_breast_cancer()
    features, targets = cancer.data, cancer.target

    # 80/20 split with a fixed seed passed to the splitter.
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        targets,
        test_size=0.2,
        random_state=1234,
    )

    # Tree configuration: entropy criterion, unlimited depth, with both
    # pre-pruning and post-pruning options set.
    tree_options = dict(
        criterion='entropy',
        splitter='best',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        pre_pruning='independence',
        post_pruning='reduced_error_pruning',
        class_threshold=0.01,
    )
    dt = DecisionTree(**tree_options)
    dt.fit(X_train, y_train)

    # Accuracy is the fraction of test predictions equal to the true labels.
    predictions = dt.predict(X_test)
    accuracy = (predictions == y_test).mean()

    print("Accuracy: {:.2f}%".format(accuracy * 100))

In [9]:
# Run the decision-tree evaluation defined above; it prints the test-set accuracy.
test_dt()

Accuracy: 92.11%
