# Mini-project: Decision trees

## <u>Preprocessing</u>

In [None]:
# Before executing this notebook, install the UCI ML Repository client:
# pip install ucimlrepo

In [12]:
import numpy as np
from ucimlrepo import fetch_ucirepo
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

The decision-tree classifier used throughout this notebook is scikit-learn's `DecisionTreeClassifier`, from the `sklearn.tree` module.

In [2]:
# Short dataset nickname -> integer id handed to fetch_ucirepo by load_dataset.
dataset_ids = {
    "parkinsons": 174,
    "page-blocks": 78,
    "optical": 80,
    "musk2": 75,
    "bc-wisc-diag": 17,
    "students": 697,
    "wine": 109,
    "magic": 159,
    "balance-scale": 12,
    "glass": 42,
    "zoo": 111,
    "waveform": 107,
    "image-segmentation": 50,
    "blood": 176,
    "spect": 95,
    "yeast": 110,
    "monk": 70,
    "ecoli": 39,
    "iris": 53,
    "contraception": 30,
    "fertility": 244,
    "conn-bench-sonar": 151,
    "landsat": 146,
    "ionosphere": 52,
    "letter": 59,
}

In [3]:
def load_dataset(id):
    """Fetch one dataset through ``fetch_ucirepo`` and return it as NumPy arrays.

    Prints the dataset's ``data_url`` metadata entry as a side effect.

    Args:
        id: dataset identifier forwarded to ``fetch_ucirepo`` (see ``dataset_ids``).
            The name shadows the ``id`` builtin but is kept because callers pass
            it by keyword.

    Returns:
        A 4-tuple ``(data_vectors, features_names, data_labels, label_name)``:
        the feature table and the target table converted with ``to_numpy()``,
        each accompanied by its column names.
    """
    # fetch dataset
    dataset = fetch_ucirepo(id=id)

    # data (as pandas dataframes)
    features = dataset.data.features
    targets = dataset.data.targets

    # dictionary gathering information about the metadata (url, abstract, etc.)
    print('data url:\n', dataset.metadata['data_url'])

    # The original also converted dataset.variables to an array that was never
    # used; that dead computation has been dropped.
    data_vectors = features.to_numpy()             # instance vectors with features
    features_names = features.columns.to_numpy()   # names of each feature

    data_labels = targets.to_numpy()               # output labels for each instance
    label_name = targets.columns.to_numpy()        # name(s) of the output label column

    return data_vectors, features_names, data_labels, label_name

In [4]:
def print_useful_data(X, X_names, y, y_name, index=0):
    """Print a short textual summary of a loaded dataset.

    Args:
        X: indexable collection of feature vectors; ``len(X)`` is reported as
            the number of instances.
        X_names: feature names; ``len(X_names)`` is reported as the number of
            features.
        y: indexable collection of output labels.
        y_name: name of the output label.
        index: position of the example instance to display (default 0).
    """
    # Dataset dimensions.
    print("number of instances: ", len(X))
    print("number of features: ", len(X_names))

    # Column names for the inputs and the target.
    print("names of the features:\n", X_names)
    print("name of the output label: ", y_name)

    # One example instance, selected by `index`.
    print(f"instance {index} feature vector:\n", X[index])
    print(f"instance {index} output label: ", y[index])

In [5]:
def preprocessing(X, y, split=0.7):
    """Split a dataset into train/test partitions and standardize the features.

    The split is performed first and every transformer is fitted on the training
    rows only, so no statistic or category from the test rows leaks into the
    transformation applied to the training data.

    Args:
        X: feature matrix (array-like, one row per instance).
        y: labels aligned with the rows of ``X``.
        split: value passed as ``train_size`` to ``train_test_split``
            (default 0.7).

    Returns:
        ``(X_train, X_test, y_train, y_test)`` where the two feature matrices
        are standardized (and one-hot encoded beforehand when the raw features
        cannot be standardized directly).
    """
    # Same split as before: random_state is fixed, and the row assignment does
    # not depend on whether X has been transformed yet.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=split, random_state=42
    )

    scaler = StandardScaler()
    try:
        # standardization, with mean/variance estimated on the training rows only
        X_train_out = scaler.fit_transform(X_train)
        X_test_out = scaler.transform(X_test)
    except ValueError:
        # If non numerical data is detected, data is one-hot encoded first.
        # handle_unknown="ignore" lets categories that appear only in the test
        # rows be encoded as all zeros instead of raising.
        encoder = OneHotEncoder(handle_unknown="ignore")
        X_train_enc = encoder.fit_transform(np.array(X_train, dtype=object)).toarray()
        X_test_enc = encoder.transform(np.array(X_test, dtype=object)).toarray()

        # Use a fresh scaler so nothing from the failed attempt is reused.
        scaler = StandardScaler()
        X_train_out = scaler.fit_transform(X_train_enc)
        X_test_out = scaler.transform(X_test_enc)

    return X_train_out, X_test_out, y_train, y_test

## <u>Accuracy, F1 and confusion matrix</u>

In [10]:
def calculate_metrics(true_labels, predicted_labels):
    """Score a set of predictions against the ground-truth labels.

    Args:
        true_labels: ground-truth class labels.
        predicted_labels: labels predicted by the model for the same instances.

    Returns:
        ``(accuracy, f1, cm)``: the result of ``accuracy_score``, of
        ``f1_score`` with ``average='weighted'``, and of ``confusion_matrix``,
        all computed on the two arguments.
    """
    accuracy = accuracy_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels, average='weighted')
    # Build the matrix from the arguments. The previous version read the
    # notebook globals Y_test / Y_pred, which is wrong (or a NameError) whenever
    # the function is called with anything else.
    cm = confusion_matrix(true_labels, predicted_labels)

    return accuracy, f1, cm

## <u>Results on 25 datasets</u>

### Predict Students' Dropout and Academic Success

In [8]:
# Load the "students" dataset and build the 70/30 train/test partitions.
split_size = 0.7
X, X_names, y, y_name = load_dataset(id=dataset_ids["students"])
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(
    f'training set size: {X_train.shape}',
    f'testing set size: {X_test.shape}',
    sep="\n",
)

data url:
 https://archive.ics.uci.edu/static/public/697/data.csv
training set size: (3096, 36)
testing set size: (1328, 36)


In [None]:
# Summarize the raw "students" data (sizes, column names, first instance).
print_useful_data(X=X, X_names=X_names, y=y, y_name=y_name)

In [13]:
# Fit a depth-5 Gini decision tree on the training partition.
model = DecisionTreeClassifier(criterion="gini", max_depth=5, random_state=42)
model.fit(X_train, Y_train)

# Predict the held-out labels and score them.
Y_pred = model.predict(X_test)
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(
    f"accuracy: {accuracy * 100:.2f}%",
    f"F1 score: {f1:.2f}",
    f"confusion matrix:\n{cm}",
    sep="\n",
)

accuracy: 73.57%
F1 score: 0.72
confusion matrix:
[[300  50  91]
 [ 38  76 131]
 [ 19  22 601]]


### Wine

In [14]:
# Load the "wine" dataset and build the 70/30 train/test partitions.
split_size = 0.7
X, X_names, y, y_name = load_dataset(id=dataset_ids["wine"])
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split=split_size)
print(
    f'training set size: {X_train.shape}',
    f'testing set size: {X_test.shape}',
    sep="\n",
)

data url:
 https://archive.ics.uci.edu/static/public/109/data.csv
training set size: (124, 13)
testing set size: (54, 13)


In [None]:
# Summarize the raw "wine" data (sizes, column names, first instance).
print_useful_data(X=X, X_names=X_names, y=y, y_name=y_name)

In [15]:
# Fit a depth-5 Gini decision tree on the training partition.
model = DecisionTreeClassifier(criterion="gini", max_depth=5, random_state=42)
model.fit(X_train, Y_train)

# Predict the held-out labels and score them.
Y_pred = model.predict(X_test)
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(
    f"accuracy: {accuracy * 100:.2f}%",
    f"F1 score: {f1:.2f}",
    f"confusion matrix:\n{cm}",
    sep="\n",
)

accuracy: 96.30%
F1 score: 0.96
confusion matrix:
[[18  1  0]
 [ 0 21  0]
 [ 1  0 13]]


### MAGIC gamma telescope

In [17]:
# Load the "magic" dataset and build the 70/30 train/test partitions.
split_size = 0.7
X, X_names, y, y_name = load_dataset(id=dataset_ids["magic"])
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split=split_size)
print(
    f'training set size: {X_train.shape}',
    f'testing set size: {X_test.shape}',
    sep="\n",
)

data url:
 https://archive.ics.uci.edu/static/public/159/data.csv
training set size: (13314, 10)
testing set size: (5706, 10)


In [None]:
# Summarize the raw "magic" data (sizes, column names, first instance).
print_useful_data(X=X, X_names=X_names, y=y, y_name=y_name)

In [18]:
# Fit a depth-5 Gini decision tree on the training partition.
model = DecisionTreeClassifier(criterion="gini", max_depth=5, random_state=42)
model.fit(X_train, Y_train)

# Predict the held-out labels and score them.
Y_pred = model.predict(X_test)
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(
    f"accuracy: {accuracy * 100:.2f}%",
    f"F1 score: {f1:.2f}",
    f"confusion matrix:\n{cm}",
    sep="\n",
)

accuracy: 83.25%
F1 score: 0.83
confusion matrix:
[[3429  276]
 [ 680 1321]]


### Parkinsons

In [19]:
# Load the "parkinsons" dataset and build the 70/30 train/test partitions.
split_size = 0.7
X, X_names, y, y_name = load_dataset(id=dataset_ids["parkinsons"])
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split=split_size)
print(
    f'training set size: {X_train.shape}',
    f'testing set size: {X_test.shape}',
    sep="\n",
)

data url:
 https://archive.ics.uci.edu/static/public/174/data.csv
training set size: (136, 22)
testing set size: (59, 22)


In [None]:
# Summarize the raw "parkinsons" data (sizes, column names, first instance).
print_useful_data(X=X, X_names=X_names, y=y, y_name=y_name)

In [20]:
# Fit a depth-5 Gini decision tree on the training partition.
model = DecisionTreeClassifier(criterion="gini", max_depth=5, random_state=42)
model.fit(X_train, Y_train)

# Predict the held-out labels and score them.
Y_pred = model.predict(X_test)
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(
    f"accuracy: {accuracy * 100:.2f}%",
    f"F1 score: {f1:.2f}",
    f"confusion matrix:\n{cm}",
    sep="\n",
)

accuracy: 86.44%
F1 score: 0.86
confusion matrix:
[[10  5]
 [ 3 41]]


### Page-blocks

In [21]:
# Load the "page-blocks" dataset and build the 70/30 train/test partitions.
split_size = 0.7
X, X_names, y, y_name = load_dataset(id=dataset_ids["page-blocks"])
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split=split_size)
print(
    f'training set size: {X_train.shape}',
    f'testing set size: {X_test.shape}',
    sep="\n",
)

data url:
 https://archive.ics.uci.edu/static/public/78/data.csv
training set size: (3831, 10)
testing set size: (1642, 10)


In [None]:
# Summarize the raw "page-blocks" data (sizes, column names, first instance).
print_useful_data(X=X, X_names=X_names, y=y, y_name=y_name)

In [22]:
# Fit a depth-5 Gini decision tree on the training partition.
model = DecisionTreeClassifier(criterion="gini", max_depth=5, random_state=42)
model.fit(X_train, Y_train)

# Predict the held-out labels and score them.
Y_pred = model.predict(X_test)
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(
    f"accuracy: {accuracy * 100:.2f}%",
    f"F1 score: {f1:.2f}",
    f"confusion matrix:\n{cm}",
    sep="\n",
)

accuracy: 96.77%
F1 score: 0.96
confusion matrix:
[[1451   10    1    2    2]
 [   5   99    0    2    0]
 [   4    0    6    0    0]
 [   3    1    0   26    1]
 [  19    0    3    0    7]]


### Optical

In [23]:
# Load the "optical" dataset and build the 70/30 train/test partitions.
split_size = 0.7
X, X_names, y, y_name = load_dataset(id=dataset_ids["optical"])
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split=split_size)

# Cap the partitions: at most 2 * num_sample training rows and num_sample test rows.
num_sample = 6000
X_train, Y_train = X_train[:num_sample * 2, :], Y_train[:num_sample * 2]
X_test, Y_test = X_test[:num_sample, :], Y_test[:num_sample]

print(
    f'training set size: {X_train.shape}',
    f'testing set size: {X_test.shape}',
    sep="\n",
)

data url:
 https://archive.ics.uci.edu/static/public/80/data.csv
training set size: (3933, 64)
testing set size: (1687, 64)


In [None]:
# Summarize the raw "optical" data (sizes, column names, first instance).
print_useful_data(X=X, X_names=X_names, y=y, y_name=y_name)

In [24]:
# Fit a depth-5 Gini decision tree on the training partition.
model = DecisionTreeClassifier(criterion="gini", max_depth=5, random_state=42)
model.fit(X_train, Y_train)

# Predict the held-out labels and score them.
Y_pred = model.predict(X_test)
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(
    f"accuracy: {accuracy * 100:.2f}%",
    f"F1 score: {f1:.2f}",
    f"confusion matrix:\n{cm}",
    sep="\n",
)

accuracy: 69.12%
F1 score: 0.69
confusion matrix:
[[156   0   1   1   4   2   0   0   4   2]
 [  0  34   3   4  31   1   1   0  85  14]
 [  0   2  43   3   1   0   6   0  95   4]
 [  0   5   7 116   1   1   0   0  19  24]
 [  0   5   2   0 140   2   2   4  14  14]
 [  0   1   3   9   3 125   0   2   2   8]
 [  0   8   2   0   2   0 154   0   1   1]
 [  0   0   0   1   8   0   0 150  26   1]
 [  1   1   2  15   6   0   0   5 116   7]
 [  1   0   1  11   0   3   0  13  13 132]]


### Ionosphere

In [25]:
# Load the "ionosphere" dataset and build the 70/30 train/test partitions.
split_size = 0.7
X, X_names, y, y_name = load_dataset(id=dataset_ids["ionosphere"])
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split=split_size)
print(
    f'training set size: {X_train.shape}',
    f'testing set size: {X_test.shape}',
    sep="\n",
)

data url:
 https://archive.ics.uci.edu/static/public/52/data.csv
training set size: (245, 34)
testing set size: (106, 34)


In [None]:
# Summarize the raw "ionosphere" data (sizes, column names, first instance).
print_useful_data(X=X, X_names=X_names, y=y, y_name=y_name)

In [26]:
# Fit a depth-5 Gini decision tree on the training partition.
model = DecisionTreeClassifier(criterion="gini", max_depth=5, random_state=42)
model.fit(X_train, Y_train)

# Predict the held-out labels and score them.
Y_pred = model.predict(X_test)
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(
    f"accuracy: {accuracy * 100:.2f}%",
    f"F1 score: {f1:.2f}",
    f"confusion matrix:\n{cm}",
    sep="\n",
)

accuracy: 90.57%
F1 score: 0.90
confusion matrix:
[[31  8]
 [ 2 65]]


### Glass Identification

In [27]:
# Load the "glass" dataset and build the 70/30 train/test partitions.
split_size = 0.7
X, X_names, y, y_name = load_dataset(id=dataset_ids["glass"])
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split=split_size)
print(
    f'training set size: {X_train.shape}',
    f'testing set size: {X_test.shape}',
    sep="\n",
)

data url:
 https://archive.ics.uci.edu/static/public/42/data.csv
training set size: (149, 9)
testing set size: (65, 9)


In [None]:
# Summarize the raw "glass" data (sizes, column names, first instance).
print_useful_data(X=X, X_names=X_names, y=y, y_name=y_name)

In [28]:
# Fit a depth-5 Gini decision tree on the training partition.
model = DecisionTreeClassifier(criterion="gini", max_depth=5, random_state=42)
model.fit(X_train, Y_train)

# Predict the held-out labels and score them.
Y_pred = model.predict(X_test)
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(
    f"accuracy: {accuracy * 100:.2f}%",
    f"F1 score: {f1:.2f}",
    f"confusion matrix:\n{cm}",
    sep="\n",
)

accuracy: 66.15%
F1 score: 0.65
confusion matrix:
[[17  2  0  0  0  0]
 [ 9 10  2  2  0  0]
 [ 2  1  1  0  0  0]
 [ 0  1  0  3  2  0]
 [ 0  1  0  0  2  0]
 [ 0  0  0  0  0 10]]


### Letter Recognition

In [31]:
# Load the "letter" dataset and build the 70/30 train/test partitions.
split_size = 0.7
X, X_names, y, y_name = load_dataset(id=dataset_ids["letter"])
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split=split_size)
print(
    f'training set size: {X_train.shape}',
    f'testing set size: {X_test.shape}',
    sep="\n",
)

data url:
 https://archive.ics.uci.edu/static/public/59/data.csv
training set size: (14000, 16)
testing set size: (6000, 16)


In [None]:
# Summarize the raw "letter" data (sizes, column names, first instance).
print_useful_data(X=X, X_names=X_names, y=y, y_name=y_name)

In [32]:
# Fit a depth-5 Gini decision tree on the training partition.
model = DecisionTreeClassifier(criterion="gini", max_depth=5, random_state=42)
model.fit(X_train, Y_train)

# Predict the held-out labels and score them.
Y_pred = model.predict(X_test)
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(
    f"accuracy: {accuracy * 100:.2f}%",
    f"F1 score: {f1:.2f}",
    f"confusion matrix:\n{cm}",
    sep="\n",
)

accuracy: 37.30%
F1 score: 0.33
confusion matrix:
[[187   0   0   2   0   0   0   1   0   1   0   1   1   0   0   0  31   7
    1   0   0   0   0   0   0   0]
 [  0   0   0 189   0   1   0   0   3   0   0   0   0   0   0   0  27   0
    0   0   0   0   0   0   0   9]
 [  0   0   0   1   1   3   0   0  91  13   0   0   0   0   0   0  85   0
    0   4   2   0   0   0   0   1]
 [  0   0   0 217   0   0   0   0   0   0   0   0   0   2   0   0  12   4
    0   0  15   0   0   0   0   0]
 [  0   0   0   0  64   2   0   0   5   0   0   0   0   0   0   0 163   0
    0   0   0   0   0   0   0   4]
 [  0   0   0  46   0  23   0   0   5   0   0   0   0   2   0  26  15   0
    0  49   5  40   0   0   0   0]
 [  1   0   0  13   0   0   4   0   1   1   0   0   0   0   0   0 184   2
    0   0   0   0   0   0   0  24]
 [  0   0   0  46   0   7   0   0   0   0   0   0   0   0   0   0  96   8
    0   0  61   0   0   0   0   0]
 [  0   0   0  28   0   1   0   0 169   3   0   0   0   0   0   1  12   5
    

### Connectionist Bench (Sonar, Mines vs. Rocks)

In [33]:
# Load the "conn-bench-sonar" dataset and build the 70/30 train/test partitions.
split_size = 0.7
X, X_names, y, y_name = load_dataset(id=dataset_ids["conn-bench-sonar"])
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split=split_size)
print(
    f'training set size: {X_train.shape}',
    f'testing set size: {X_test.shape}',
    sep="\n",
)

data url:
 https://archive.ics.uci.edu/static/public/151/data.csv
training set size: (145, 60)
testing set size: (63, 60)


In [None]:
# Summarize the raw "conn-bench-sonar" data (sizes, column names, first instance).
print_useful_data(X=X, X_names=X_names, y=y, y_name=y_name)

In [34]:
# Fit a depth-5 Gini decision tree on the training partition.
model = DecisionTreeClassifier(criterion="gini", max_depth=5, random_state=42)
model.fit(X_train, Y_train)

# Predict the held-out labels and score them.
Y_pred = model.predict(X_test)
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(
    f"accuracy: {accuracy * 100:.2f}%",
    f"F1 score: {f1:.2f}",
    f"confusion matrix:\n{cm}",
    sep="\n",
)

accuracy: 73.02%
F1 score: 0.73
confusion matrix:
[[22 13]
 [ 4 24]]


### Musk (Version 2)

In [35]:
# Load the "musk2" dataset and build the 70/30 train/test partitions.
split_size = 0.7
X, X_names, y, y_name = load_dataset(id=dataset_ids["musk2"])
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split=split_size)
print(
    f'training set size: {X_train.shape}',
    f'testing set size: {X_test.shape}',
    sep="\n",
)

data url:
 https://archive.ics.uci.edu/static/public/75/data.csv
training set size: (4618, 166)
testing set size: (1980, 166)


In [None]:
# Summarize the raw "musk2" data (sizes, column names, first instance).
print_useful_data(X=X, X_names=X_names, y=y, y_name=y_name)

In [36]:
# Fit a depth-5 Gini decision tree on the training partition.
model = DecisionTreeClassifier(criterion="gini", max_depth=5, random_state=42)
model.fit(X_train, Y_train)

# Predict the held-out labels and score them.
Y_pred = model.predict(X_test)
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(
    f"accuracy: {accuracy * 100:.2f}%",
    f"F1 score: {f1:.2f}",
    f"confusion matrix:\n{cm}",
    sep="\n",
)

accuracy: 93.94%
F1 score: 0.94
confusion matrix:
[[1646   27]
 [  93  214]]


### Breast Cancer Wisconsin (Diagnostic)

In [37]:
# Load the "bc-wisc-diag" dataset and build the 70/30 train/test partitions.
split_size = 0.7
X, X_names, y, y_name = load_dataset(id=dataset_ids["bc-wisc-diag"])
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split=split_size)
print(
    f'training set size: {X_train.shape}',
    f'testing set size: {X_test.shape}',
    sep="\n",
)

data url:
 https://archive.ics.uci.edu/static/public/17/data.csv
training set size: (398, 30)
testing set size: (171, 30)


In [None]:
# Summarize the raw "bc-wisc-diag" data (sizes, column names, first instance).
print_useful_data(X=X, X_names=X_names, y=y, y_name=y_name)

In [38]:
# Fit a depth-5 Gini decision tree on the training partition.
model = DecisionTreeClassifier(criterion="gini", max_depth=5, random_state=42)
model.fit(X_train, Y_train)

# Predict the held-out labels and score them.
Y_pred = model.predict(X_test)
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(
    f"accuracy: {accuracy * 100:.2f}%",
    f"F1 score: {f1:.2f}",
    f"confusion matrix:\n{cm}",
    sep="\n",
)

accuracy: 95.32%
F1 score: 0.95
confusion matrix:
[[104   4]
 [  4  59]]


### Balance-scale

In [39]:
# Load the "balance-scale" dataset and build the 70/30 train/test partitions.
split_size = 0.7
X, X_names, y, y_name = load_dataset(id=dataset_ids["balance-scale"])
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split=split_size)
print(
    f'training set size: {X_train.shape}',
    f'testing set size: {X_test.shape}',
    sep="\n",
)

data url:
 https://archive.ics.uci.edu/static/public/12/data.csv
training set size: (437, 4)
testing set size: (188, 4)


In [None]:
# Summarize the raw "balance-scale" data (sizes, column names, first instance).
print_useful_data(X=X, X_names=X_names, y=y, y_name=y_name)

In [40]:
# Fit a depth-5 Gini decision tree on the training partition.
model = DecisionTreeClassifier(criterion="gini", max_depth=5, random_state=42)
model.fit(X_train, Y_train)

# Predict the held-out labels and score them.
Y_pred = model.predict(X_test)
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(
    f"accuracy: {accuracy * 100:.2f}%",
    f"F1 score: {f1:.2f}",
    f"confusion matrix:\n{cm}",
    sep="\n",
)

accuracy: 72.34%
F1 score: 0.71
confusion matrix:
[[ 1 13  4]
 [ 3 73  4]
 [ 7 21 62]]


### Contraception

In [41]:
# Load the "contraception" dataset and build the 70/30 train/test partitions.
split_size = 0.7
X, X_names, y, y_name = load_dataset(id=dataset_ids["contraception"])
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split=split_size)
print(
    f'training set size: {X_train.shape}',
    f'testing set size: {X_test.shape}',
    sep="\n",
)

data url:
 https://archive.ics.uci.edu/static/public/30/data.csv
training set size: (1031, 9)
testing set size: (442, 9)


In [None]:
# Summarize the raw "contraception" data (sizes, column names, first instance).
print_useful_data(X=X, X_names=X_names, y=y, y_name=y_name)

In [42]:
# Fit a depth-5 Gini decision tree on the training partition.
model = DecisionTreeClassifier(criterion="gini", max_depth=5, random_state=42)
model.fit(X_train, Y_train)

# Predict the held-out labels and score them.
Y_pred = model.predict(X_test)
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(
    f"accuracy: {accuracy * 100:.2f}%",
    f"F1 score: {f1:.2f}",
    f"confusion matrix:\n{cm}",
    sep="\n",
)

accuracy: 56.11%
F1 score: 0.56
confusion matrix:
[[112  30  52]
 [ 21  42  38]
 [ 26  27  94]]


### Fertility

In [44]:
# Load the "fertility" dataset and build the 70/30 train/test partitions.
split_size = 0.7
X, X_names, y, y_name = load_dataset(id=dataset_ids["fertility"])
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split=split_size)
print(
    f'training set size: {X_train.shape}',
    f'testing set size: {X_test.shape}',
    sep="\n",
)

data url:
 https://archive.ics.uci.edu/static/public/244/data.csv
training set size: (70, 9)
testing set size: (30, 9)


In [None]:
# Summarize the raw "fertility" data (sizes, column names, first instance).
print_useful_data(X=X, X_names=X_names, y=y, y_name=y_name)

In [45]:
# Fit a depth-5 Gini decision tree on the training partition.
model = DecisionTreeClassifier(criterion="gini", max_depth=5, random_state=42)
model.fit(X_train, Y_train)

# Predict the held-out labels and score them.
Y_pred = model.predict(X_test)
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(
    f"accuracy: {accuracy * 100:.2f}%",
    f"F1 score: {f1:.2f}",
    f"confusion matrix:\n{cm}",
    sep="\n",
)

accuracy: 86.67%
F1 score: 0.87
confusion matrix:
[[25  2]
 [ 2  1]]


### Waveform (Version 1)

In [46]:
# Load the "waveform" dataset and build the 70/30 train/test partitions.
split_size = 0.7
X, X_names, y, y_name = load_dataset(id=dataset_ids["waveform"])
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split=split_size)
print(
    f'training set size: {X_train.shape}',
    f'testing set size: {X_test.shape}',
    sep="\n",
)

data url:
 https://archive.ics.uci.edu/static/public/107/data.csv
training set size: (3500, 21)
testing set size: (1500, 21)


In [None]:
# Summarize the raw "waveform" data (sizes, column names, first instance).
print_useful_data(X=X, X_names=X_names, y=y, y_name=y_name)

In [47]:
# Fit a depth-5 Gini decision tree on the training partition.
model = DecisionTreeClassifier(criterion="gini", max_depth=5, random_state=42)
model.fit(X_train, Y_train)

# Predict the held-out labels and score them.
Y_pred = model.predict(X_test)
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(
    f"accuracy: {accuracy * 100:.2f}%",
    f"F1 score: {f1:.2f}",
    f"confusion matrix:\n{cm}",
    sep="\n",
)

accuracy: 77.00%
F1 score: 0.77
confusion matrix:
[[359  61  66]
 [ 70 396  53]
 [ 50  45 400]]


### Image Segmentation

In [48]:
# Load the "image-segmentation" dataset and build the 70/30 train/test partitions.
split_size = 0.7
X, X_names, y, y_name = load_dataset(id=dataset_ids["image-segmentation"])
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split=split_size)
print(
    f'training set size: {X_train.shape}',
    f'testing set size: {X_test.shape}',
    sep="\n",
)

data url:
 https://archive.ics.uci.edu/static/public/50/data.csv
training set size: (147, 19)
testing set size: (63, 19)


In [None]:
# Summarize the raw "image-segmentation" data (sizes, column names, first instance).
print_useful_data(X=X, X_names=X_names, y=y, y_name=y_name)

In [49]:
# Fit a depth-5 Gini decision tree on the training partition.
model = DecisionTreeClassifier(criterion="gini", max_depth=5, random_state=42)
model.fit(X_train, Y_train)

# Predict the held-out labels and score them.
Y_pred = model.predict(X_test)
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(
    f"accuracy: {accuracy * 100:.2f}%",
    f"F1 score: {f1:.2f}",
    f"confusion matrix:\n{cm}",
    sep="\n",
)

accuracy: 77.78%
F1 score: 0.76
confusion matrix:
[[ 8  0  0  0  0  0  1]
 [ 0  5  1  2  1  0  1]
 [ 1  0 10  0  0  0  0]
 [ 0  0  0  9  0  0  1]
 [ 0  0  0  0  9  0  0]
 [ 0  0  0  0  0  5  0]
 [ 1  0  5  0  0  0  3]]


### Blood Transfusion Service

In [50]:
# Load the "blood" dataset and build the 70/30 train/test partitions.
split_size = 0.7
X, X_names, y, y_name = load_dataset(id=dataset_ids["blood"])
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split=split_size)
print(
    f'training set size: {X_train.shape}',
    f'testing set size: {X_test.shape}',
    sep="\n",
)

data url:
 https://archive.ics.uci.edu/static/public/176/data.csv
training set size: (523, 4)
testing set size: (225, 4)


In [None]:
# Summarize the raw "blood" data (sizes, column names, first instance).
print_useful_data(X=X, X_names=X_names, y=y, y_name=y_name)

In [51]:
# Fit a depth-5 Gini decision tree on the training partition.
model = DecisionTreeClassifier(criterion="gini", max_depth=5, random_state=42)
model.fit(X_train, Y_train)

# Predict the held-out labels and score them.
Y_pred = model.predict(X_test)
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(
    f"accuracy: {accuracy * 100:.2f}%",
    f"F1 score: {f1:.2f}",
    f"confusion matrix:\n{cm}",
    sep="\n",
)

accuracy: 75.11%
F1 score: 0.73
confusion matrix:
[[149  16]
 [ 40  20]]


### SPECT Heart

In [52]:
# Load the "spect" dataset and build the 70/30 train/test partitions.
split_size = 0.7
X, X_names, y, y_name = load_dataset(id=dataset_ids["spect"])
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split=split_size)
print(
    f'training set size: {X_train.shape}',
    f'testing set size: {X_test.shape}',
    sep="\n",
)

data url:
 https://archive.ics.uci.edu/static/public/95/data.csv
training set size: (186, 22)
testing set size: (81, 22)


In [None]:
# Summarize the raw "spect" data (sizes, column names, first instance).
print_useful_data(X=X, X_names=X_names, y=y, y_name=y_name)

In [53]:
# Fit a depth-5 Gini decision tree on the training partition.
model = DecisionTreeClassifier(criterion="gini", max_depth=5, random_state=42)
model.fit(X_train, Y_train)

# Predict the held-out labels and score them.
Y_pred = model.predict(X_test)
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(
    f"accuracy: {accuracy * 100:.2f}%",
    f"F1 score: {f1:.2f}",
    f"confusion matrix:\n{cm}",
    sep="\n",
)

accuracy: 60.49%
F1 score: 0.64
confusion matrix:
[[ 4 11]
 [21 45]]


### Statlog (Landsat Satellite)

In [54]:
# Load the "landsat" dataset and build the 70/30 train/test partitions.
split_size = 0.7
X, X_names, y, y_name = load_dataset(id=dataset_ids["landsat"])
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split=split_size)
print(
    f'training set size: {X_train.shape}',
    f'testing set size: {X_test.shape}',
    sep="\n",
)

data url:
 https://archive.ics.uci.edu/static/public/146/data.csv
training set size: (4504, 36)
testing set size: (1931, 36)


In [None]:
# Summarize the raw "landsat" data (sizes, column names, first instance).
print_useful_data(X=X, X_names=X_names, y=y, y_name=y_name)

In [55]:
# Fit a depth-5 Gini decision tree on the training partition.
model = DecisionTreeClassifier(criterion="gini", max_depth=5, random_state=42)
model.fit(X_train, Y_train)

# Predict the held-out labels and score them.
Y_pred = model.predict(X_test)
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(
    f"accuracy: {accuracy * 100:.2f}%",
    f"F1 score: {f1:.2f}",
    f"confusion matrix:\n{cm}",
    sep="\n",
)

accuracy: 80.74%
F1 score: 0.80
confusion matrix:
[[435   0   3   2   8   2]
 [  4 165   0   0  13   4]
 [ 11   0 345  55   2   3]
 [  6   0  31  56   2 106]
 [ 24   3   0   1 159  32]
 [  4   0   3  38  15 399]]


### Yeast

In [56]:
# Load the "yeast" dataset and build the 70/30 train/test partitions.
split_size = 0.7
X, X_names, y, y_name = load_dataset(id=dataset_ids["yeast"])
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split=split_size)
print(
    f'training set size: {X_train.shape}',
    f'testing set size: {X_test.shape}',
    sep="\n",
)

data url:
 https://archive.ics.uci.edu/static/public/110/data.csv
training set size: (1038, 8)
testing set size: (446, 8)


In [None]:
# Summarize the raw "yeast" data (sizes, column names, first instance).
print_useful_data(X=X, X_names=X_names, y=y, y_name=y_name)

In [57]:
# Fit a depth-5 Gini decision tree on the training partition.
model = DecisionTreeClassifier(criterion="gini", max_depth=5, random_state=42)
model.fit(X_train, Y_train)

# Predict the held-out labels and score them.
Y_pred = model.predict(X_test)
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(
    f"accuracy: {accuracy * 100:.2f}%",
    f"F1 score: {f1:.2f}",
    f"confusion matrix:\n{cm}",
    sep="\n",
)

accuracy: 56.05%
F1 score: 0.54
confusion matrix:
[[107   0   0   2   0   3   4  29   0   0]
 [  0   0   0   0   0   0   0   0   0   0]
 [  4   0   4   2   0   0   0   0   0   0]
 [  0   0   2  10   3   0   0   0   0   0]
 [  2   1   0   0   6   1   1   0   0   0]
 [  2   0   0   0   1  44   1   3   0   0]
 [ 32   0   0   1   1   4  32   4   0   0]
 [ 62   0   1   0   0   9   7  47   0   0]
 [  2   0   0   0   0   1   1   0   0   0]
 [  5   1   0   1   0   2   1   0   0   0]]


### Monk

In [58]:
# Load the "monk" dataset and build the 70/30 train/test partitions.
split_size = 0.7
X, X_names, y, y_name = load_dataset(id=dataset_ids["monk"])
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split=split_size)
print(
    f'training set size: {X_train.shape}',
    f'testing set size: {X_test.shape}',
    sep="\n",
)

data url:
 https://archive.ics.uci.edu/static/public/70/data.csv
training set size: (302, 6)
testing set size: (130, 6)


In [None]:
# Summarize the raw "monk" data (sizes, column names, first instance).
print_useful_data(X=X, X_names=X_names, y=y, y_name=y_name)

In [59]:
# Fit a depth-5 Gini decision tree on the training partition.
model = DecisionTreeClassifier(criterion="gini", max_depth=5, random_state=42)
model.fit(X_train, Y_train)

# Predict the held-out labels and score them.
Y_pred = model.predict(X_test)
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(
    f"accuracy: {accuracy * 100:.2f}%",
    f"F1 score: {f1:.2f}",
    f"confusion matrix:\n{cm}",
    sep="\n",
)

accuracy: 80.00%
F1 score: 0.80
confusion matrix:
[[48 13]
 [13 56]]


### Ecoli

In [60]:
# Load the "ecoli" dataset and build the 70/30 train/test partitions.
split_size = 0.7
X, X_names, y, y_name = load_dataset(id=dataset_ids["ecoli"])
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split=split_size)
print(
    f'training set size: {X_train.shape}',
    f'testing set size: {X_test.shape}',
    sep="\n",
)

data url:
 https://archive.ics.uci.edu/static/public/39/data.csv
training set size: (235, 7)
testing set size: (101, 7)


In [None]:
# Summarize the raw "ecoli" data (sizes, column names, first instance).
print_useful_data(X=X, X_names=X_names, y=y, y_name=y_name)

In [61]:
# Fit a depth-5 Gini decision tree on the training partition.
model = DecisionTreeClassifier(criterion="gini", max_depth=5, random_state=42)
model.fit(X_train, Y_train)

# Predict the held-out labels and score them.
Y_pred = model.predict(X_test)
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(
    f"accuracy: {accuracy * 100:.2f}%",
    f"F1 score: {f1:.2f}",
    f"confusion matrix:\n{cm}",
    sep="\n",
)

accuracy: 80.20%
F1 score: 0.80
confusion matrix:
[[44  0  0  0  0  0  2]
 [ 2 13  0  5  0  0  0]
 [ 0  0  0  0  0  0  1]
 [ 0  4  0  7  0  0  0]
 [ 0  0  0  0  6  1  0]
 [ 0  0  0  0  0  1  0]
 [ 2  2  0  0  1  0 10]]


### Iris

In [62]:
# Load the "iris" dataset and build the 70/30 train/test partitions.
split_size = 0.7
X, X_names, y, y_name = load_dataset(id=dataset_ids["iris"])
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split=split_size)
print(
    f'training set size: {X_train.shape}',
    f'testing set size: {X_test.shape}',
    sep="\n",
)

data url:
 https://archive.ics.uci.edu/static/public/53/data.csv
training set size: (105, 4)
testing set size: (45, 4)


In [None]:
# Summarize the raw "iris" data (sizes, column names, first instance).
print_useful_data(X=X, X_names=X_names, y=y, y_name=y_name)

In [63]:
# Fit a depth-5 Gini decision tree on the training partition.
model = DecisionTreeClassifier(criterion="gini", max_depth=5, random_state=42)
model.fit(X_train, Y_train)

# Predict the held-out labels and score them.
Y_pred = model.predict(X_test)
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(
    f"accuracy: {accuracy * 100:.2f}%",
    f"F1 score: {f1:.2f}",
    f"confusion matrix:\n{cm}",
    sep="\n",
)

accuracy: 95.56%
F1 score: 0.96
confusion matrix:
[[19  0  0]
 [ 0 11  2]
 [ 0  0 13]]


### Zoo

In [64]:
# Load the "zoo" dataset and build the 70/30 train/test partitions.
split_size = 0.7
X, X_names, y, y_name = load_dataset(id=dataset_ids["zoo"])
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split=split_size)
print(
    f'training set size: {X_train.shape}',
    f'testing set size: {X_test.shape}',
    sep="\n",
)

data url:
 https://archive.ics.uci.edu/static/public/111/data.csv
training set size: (70, 16)
testing set size: (31, 16)


In [None]:
# Summarize the raw "zoo" data (sizes, column names, first instance).
print_useful_data(X=X, X_names=X_names, y=y, y_name=y_name)

In [65]:
# Fit a depth-5 Gini decision tree on the training partition.
model = DecisionTreeClassifier(criterion="gini", max_depth=5, random_state=42)
model.fit(X_train, Y_train)

# Predict the held-out labels and score them.
Y_pred = model.predict(X_test)
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(
    f"accuracy: {accuracy * 100:.2f}%",
    f"F1 score: {f1:.2f}",
    f"confusion matrix:\n{cm}",
    sep="\n",
)

accuracy: 93.55%
F1 score: 0.92
confusion matrix:
[[15  0  0  0  0  0  0]
 [ 0  3  0  0  0  0  0]
 [ 0  0  0  0  1  0  0]
 [ 0  0  0  2  0  0  0]
 [ 0  0  0  0  2  0  0]
 [ 0  0  0  0  0  4  1]
 [ 0  0  0  0  0  0  3]]
