In [20]:
import pandas as pd

from chefboost import Chefboost as chef
from scipy.io import arff
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score

In [21]:
def compute_kappa(model, data: pd.DataFrame, target_label: str) -> float:
    """Compute Cohen's kappa between a model's predictions and the true labels.

    Every row of ``data`` is passed to ``chef.predict`` as a list holding the
    values of all columns except ``target_label`` (in column order). The
    predictions are then compared with the ``target_label`` column through
    ``cohen_kappa_score``.

    Args:
        model: Model object accepted by ``chef.predict``.
        data: Rows to evaluate; must contain the ``target_label`` column.
        target_label: Name of the column holding the true class.

    Returns:
        The value returned by ``cohen_kappa_score(y_true, y_pred)``.

    Raises:
        KeyError: If ``target_label`` is not a column of ``data``.
    """
    # Fail early with a clear message instead of silently feeding every
    # column (target included) to the model and ending with an empty y_true.
    if target_label not in data.columns:
        raise KeyError(f"target column {target_label!r} not found in data")

    y_true = data[target_label].tolist()
    features = data.drop(columns=[target_label])

    # itertuples keeps each column's own dtype, whereas iterrows converts the
    # whole row to one dtype (e.g. ints become floats in a numeric frame).
    # NOTE(review): the feature order is assumed to match the order used when
    # the model was fitted -- confirm against the training frame.
    y_pred = [
        # A fresh list per row, in case predict modifies its argument.
        chef.predict(model, list(row))
        for row in features.itertuples(index=False, name=None)
    ]
    return cohen_kappa_score(y_true, y_pred)


### Base Titanic

In [22]:
# Load the Titanic dataset and hold out 10% of the rows for testing.
file = "datasets/titanic.csv"
data = pd.read_csv(file, delimiter=",")

target_label = "survived"

# Fixed seed: without it the split, and every metric derived from it,
# changes each time the notebook is re-run.
data_train, data_test = train_test_split(data, test_size=0.1, random_state=42)
data_train.head()

Unnamed: 0,class,age,sex,survived
1841,crew,adult,male,no
1028,3rd,adult,male,no
326,2nd,adult,male,yes
987,3rd,adult,male,no
2109,crew,adult,male,no


In [23]:
# Fit a C4.5 tree on the Titanic training split.
config = {"algorithm": "C4.5"}
model = chef.fit(data_train, config=config, target_label=target_label, silent=True)

# The first four figures are read from the model's "train" evaluation;
# the kappa coefficient is computed on the held-out test rows.
train_metrics = model["evaluation"]["train"]
print("Accuracy : ", train_metrics["Accuracy"])
print("Confusion matrix : ", train_metrics["Confusion matrix"])
print("Precision : ", train_metrics["Precision"])
print("Recall : ", train_metrics["Recall"])
print("Kappa coefficient : ", compute_kappa(model, data_test, target_label))

Accuracy :  78.83838383838383
Confusion matrix :  [[1320, 404], [15, 241]]
Precision :  76.5661
Recall :  98.8764
Kappa coefficient :  0.472972972972973


In [24]:
# Fit an ID3 tree on the Titanic training split.
config = {"algorithm": "ID3"}
model = chef.fit(data_train, config=config, target_label=target_label, silent=True)

# The first four figures are read from the model's "train" evaluation;
# the kappa coefficient is computed on the held-out test rows.
train_metrics = model["evaluation"]["train"]
print("Accuracy : ", train_metrics["Accuracy"])
print("Confusion matrix : ", train_metrics["Confusion matrix"])
print("Precision : ", train_metrics["Precision"])
print("Recall : ", train_metrics["Recall"])
print("Kappa coefficient : ", compute_kappa(model, data_test, target_label))

Accuracy :  78.83838383838383
Confusion matrix :  [[1320, 404], [15, 241]]
Precision :  76.5661
Recall :  98.8764
Kappa coefficient :  0.472972972972973


In [25]:
# Fit a CART tree on the Titanic training split.
config = {"algorithm": "CART"}
model = chef.fit(data_train, config=config, target_label=target_label, silent=True)

# The first four figures are read from the model's "train" evaluation;
# the kappa coefficient is computed on the held-out test rows.
train_metrics = model["evaluation"]["train"]
print("Accuracy : ", train_metrics["Accuracy"])
print("Confusion matrix : ", train_metrics["Confusion matrix"])
print("Precision : ", train_metrics["Precision"])
print("Recall : ", train_metrics["Recall"])
print("Kappa coefficient : ", compute_kappa(model, data_test, target_label))

Accuracy :  78.83838383838383
Confusion matrix :  [[1320, 404], [15, 241]]
Precision :  76.5661
Recall :  98.8764
Kappa coefficient :  0.472972972972973


### Base cars

Dans cette base de données, les attributs "Nportes" et "Npers" sont considérés comme des chaînes de caractères (strings), donc comme des attributs catégoriels et non numériques.

In [26]:
# Load the Cars dataset from its ARFF file and hold out 10% of the rows for testing.
file = "datasets/Cars.arff"
arff_file = arff.loadarff(file)
data = pd.DataFrame(arff_file[0])
# Decode byte strings to regular strings; other values are left unchanged.
data = data.map(lambda x: x.decode() if isinstance(x, bytes) else x)

target_label = "Verdict"

# Fixed seed: without it the split, and every metric derived from it,
# changes each time the notebook is re-run.
data_train, data_test = train_test_split(data, test_size=0.1, random_state=42)
data_train.head()

Unnamed: 0,Pachat,Pmaint,Nportes,Npers,Tcoffre,Surete,Verdict
232,thaut,moyen,2.0,4.0,grand,moyen,acc
855,haut,bas,5plus,plus,petit,bas,nacc
1481,bas,haut,4.0,plus,moyen,haut,tbon
207,thaut,haut,5plus,plus,petit,bas,nacc
8,thaut,thaut,2.0,2.0,grand,haut,nacc


In [27]:
# Fit a C4.5 tree on the Cars training split.
config = {"algorithm": "C4.5"}
model = chef.fit(data_train, config=config, target_label=target_label, silent=True)

# The first four figures are read from the model's "train" evaluation;
# the kappa coefficient is computed on the held-out test rows.
train_metrics = model["evaluation"]["train"]
print("Accuracy : ", train_metrics["Accuracy"])
print("Confusion matrix : ", train_metrics["Confusion matrix"])
print("Precision : ", train_metrics["Precision"])
print("Recall : ", train_metrics["Recall"])
print("Kappa coefficient : ", compute_kappa(model, data_test, target_label))

Accuracy :  96.39871382636656
Confusion matrix :  [[328, 22, 2, 4], [9, 1062, 0, 0], [4, 0, 58, 7], [5, 3, 0, 51]]
Precision :  86.4407
Recall :  82.2581
Kappa coefficient :  0.8697877465000753


In [28]:
# Fit an ID3 tree on the Cars training split.
config = {"algorithm": "ID3"}
model = chef.fit(data_train, config=config, target_label=target_label, silent=True)

# The first four figures are read from the model's "train" evaluation;
# the kappa coefficient is computed on the held-out test rows.
train_metrics = model["evaluation"]["train"]
print("Accuracy : ", train_metrics["Accuracy"])
print("Confusion matrix : ", train_metrics["Confusion matrix"])
print("Precision : ", train_metrics["Precision"])
print("Recall : ", train_metrics["Recall"])
print("Kappa coefficient : ", compute_kappa(model, data_test, target_label))

Accuracy :  96.39871382636656
Confusion matrix :  [[328, 22, 2, 4], [9, 1062, 0, 0], [4, 0, 58, 7], [5, 3, 0, 51]]
Precision :  86.4407
Recall :  82.2581
Kappa coefficient :  0.8697877465000753


In [29]:
# Fit a CART tree on the Cars training split.
config = {"algorithm": "CART"}
model = chef.fit(data_train, config=config, target_label=target_label, silent=True)

# The first four figures are read from the model's "train" evaluation;
# the kappa coefficient is computed on the held-out test rows.
train_metrics = model["evaluation"]["train"]
print("Accuracy : ", train_metrics["Accuracy"])
print("Confusion matrix : ", train_metrics["Confusion matrix"])
print("Precision : ", train_metrics["Precision"])
print("Recall : ", train_metrics["Recall"])
print("Kappa coefficient : ", compute_kappa(model, data_test, target_label))

Accuracy :  96.39871382636656
Confusion matrix :  [[328, 22, 2, 4], [9, 1062, 0, 0], [4, 0, 58, 7], [5, 3, 0, 51]]
Precision :  86.4407
Recall :  82.2581
Kappa coefficient :  0.8697877465000753


### Base nursery

Dans cette base de données, l'attribut "children" est considéré comme une chaîne de caractères (string), donc comme un attribut catégoriel et non numérique.

In [30]:
# Load the Nursery dataset from its ARFF file and hold out 10% of the rows for testing.
file = "datasets/nursery.arff"
arff_file = arff.loadarff(file)
data = pd.DataFrame(arff_file[0])
# Decode byte strings to regular strings; other values are left unchanged.
data = data.map(lambda x: x.decode() if isinstance(x, bytes) else x)

target_label = "Class"

# Fixed seed: without it the split, and every metric derived from it,
# changes each time the notebook is re-run.
data_train, data_test = train_test_split(data, test_size=0.1, random_state=42)
data_train.head()

Unnamed: 0,parents,has_nurs,form,children,housing,finance,social,health,Class
4819,pretentious,proper,incomplete,2,convenient,inconv,slightly_prob,priority,priority
7723,pretentious,critical,foster,more,convenient,convenient,nonprob,priority,spec_prior
5802,pretentious,less_proper,incomplete,more,less_conv,convenient,problematic,recommended,priority
4908,pretentious,proper,incomplete,3,critical,inconv,slightly_prob,recommended,priority
12709,great_pret,very_crit,incomplete,more,less_conv,convenient,nonprob,priority,spec_prior


In [31]:
# Fit a C4.5 tree on the Nursery training split.
config = {"algorithm": "C4.5"}
model = chef.fit(data_train, config=config, target_label=target_label, silent=True)

# The first four figures are read from the model's "train" evaluation;
# the kappa coefficient is computed on the held-out test rows.
train_metrics = model["evaluation"]["train"]
print("Accuracy : ", train_metrics["Accuracy"])
print("Confusion matrix : ", train_metrics["Confusion matrix"])
print("Precision : ", train_metrics["Precision"])
print("Recall : ", train_metrics["Recall"])
print("Kappa coefficient : ", compute_kappa(model, data_test, target_label))

Accuracy :  94.47873799725652
Confusion matrix :  [[3465, 150, 0, 106, 0], [287, 3474, 0, 0, 0], [0, 0, 3895, 0, 0], [100, 0, 0, 186, 1], [0, 0, 0, 0, 0]]
Precision :  0.0
Recall :  0.0
Kappa coefficient :  0.9142233853093962


In [32]:
# Fit an ID3 tree on the Nursery training split.
config = {"algorithm": "ID3"}
model = chef.fit(data_train, config=config, target_label=target_label, silent=True)

# The first four figures are read from the model's "train" evaluation;
# the kappa coefficient is computed on the held-out test rows.
train_metrics = model["evaluation"]["train"]
print("Accuracy : ", train_metrics["Accuracy"])
print("Confusion matrix : ", train_metrics["Confusion matrix"])
print("Precision : ", train_metrics["Precision"])
print("Recall : ", train_metrics["Recall"])
print("Kappa coefficient : ", compute_kappa(model, data_test, target_label))

Accuracy :  94.47873799725652
Confusion matrix :  [[3465, 150, 0, 106, 0], [287, 3474, 0, 0, 0], [0, 0, 3895, 0, 0], [100, 0, 0, 186, 1], [0, 0, 0, 0, 0]]
Precision :  0.0
Recall :  0.0
Kappa coefficient :  0.9142233853093962


In [33]:
# Fit a CART tree on the Nursery training split.
config = {"algorithm": "CART"}
model = chef.fit(data_train, config=config, target_label=target_label, silent=True)

# The first four figures are read from the model's "train" evaluation;
# the kappa coefficient is computed on the held-out test rows.
train_metrics = model["evaluation"]["train"]
print("Accuracy : ", train_metrics["Accuracy"])
print("Confusion matrix : ", train_metrics["Confusion matrix"])
print("Precision : ", train_metrics["Precision"])
print("Recall : ", train_metrics["Recall"])
print("Kappa coefficient : ", compute_kappa(model, data_test, target_label))

Accuracy :  94.64163237311385
Confusion matrix :  [[3525, 191, 0, 106, 0], [227, 3433, 0, 0, 0], [0, 0, 3895, 0, 0], [100, 0, 0, 186, 1], [0, 0, 0, 0, 0]]
Precision :  0.0
Recall :  0.0
Kappa coefficient :  0.908588306903509
