# Mini-project: Decision trees

## <u>Preprocessing</u>

In [1]:
# before executing:
# pip install ucimlrepo

In [2]:
import numpy as np
from ucimlrepo import fetch_ucirepo
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

The decision-tree classifier used throughout this notebook is scikit-learn's implementation (`sklearn.tree.DecisionTreeClassifier`).

In [3]:
# UCI Machine Learning Repository ids, keyed by the short dataset name used in this notebook.
dataset_ids = {
    "parkinsons": 174,
    "page-blocks": 78,
    "optical": 80,
    "musk2": 75,
    "bc-wisc-diag": 17,
    "students": 697,
    "wine": 109,
    "magic": 159,
    "balance-scale": 12,
    "glass": 42,
    "zoo": 111,
    "waveform": 107,
    "image-segmentation": 50,
    "blood": 176,
    "spect": 95,
    "yeast": 110,
    "monk": 70,
    "ecoli": 39,
    "iris": 53,
    "contraception": 30,
    "fertility": 244,
    "conn-bench-sonar": 151,
    "landsat": 146,
    "ionosphere": 52,
    "letter": 59,
}

In [4]:
def load_dataset(id):
    """Fetch a dataset from the UCI repository and return it as NumPy arrays.

    Parameters
    ----------
    id : int
        UCI repository id of the dataset (see ``dataset_ids``).

    Returns
    -------
    data_vectors : numpy.ndarray
        Feature matrix, one row per instance.
    features_names : numpy.ndarray
        Column names of the feature matrix.
    data_labels : numpy.ndarray
        Target values, converted from the targets dataframe.
    label_name : numpy.ndarray
        Column name(s) of the targets dataframe.

    Side effects
    ------------
    Prints the dataset's ``data_url`` taken from its metadata.
    """
    # fetch dataset
    dataset = fetch_ucirepo(id=id)

    # data (as pandas dataframes)
    features_df = dataset.data.features
    targets_df = dataset.data.targets

    # the metadata gathers information about the dataset (url, abstract, ... etc.)
    print('data url:\n', dataset.metadata['data_url'])

    data_vectors = features_df.to_numpy()  # instance vectors with features
    features_names = features_df.columns.to_numpy()  # names of each feature

    data_labels = targets_df.to_numpy()  # output labels for each instance
    label_name = targets_df.columns.to_numpy()  # name of the output label

    return data_vectors, features_names, data_labels, label_name

In [5]:
def print_useful_data(X, X_names, y, y_name, index = 0):
    """Print a short textual summary of a loaded dataset.

    Shows the number of instances and features, the feature and label
    names, and the feature vector and label of the instance at ``index``.
    """
    overview = (
        ("number of instances: ", len(X)),
        ("number of features: ", len(X_names)),
        ("names of the features:\n", X_names),
        ("name of the output label: ", y_name),
    )
    for caption, value in overview:
        print(caption, value)

    # The selected instance is looked up only after the overview is printed.
    print(f"instance {index} feature vector:\n", X[index])
    print(f"instance {index} output label: ", y[index])

In [6]:
def preprocessing(X, y, split=0.7):
    """Split the data into train/test sets and standardize the features.

    The split is performed first, and the scaler (and the one-hot encoder,
    when needed) is fit on the training rows only, so that no statistic of
    the test set leaks into the training data.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per instance.
    y : array-like
        Labels aligned with the rows of ``X``.
    split : float, default 0.7
        Passed to ``train_test_split`` as ``train_size``.

    Returns
    -------
    list
        ``[X_train, X_test, y_train, y_test]``, with the feature matrices
        standardized (and one-hot encoded first if standardizing the raw
        values raised ``ValueError``).
    """
    # Same seed as before, so the row assignment to train/test is unchanged.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=split, random_state=42
    )

    scaler = StandardScaler()
    try:
        # standardization, with statistics learned from the training rows only
        X_train_out = scaler.fit_transform(X_train)
        X_test_out = scaler.transform(X_test)
    except ValueError:
        # If non numerical data is detected, data is one-hot encoded first.
        # Categories that only occur in the test rows are encoded as all zeros.
        encoder = OneHotEncoder(handle_unknown="ignore")
        X_train_encoded = encoder.fit_transform(np.array(X_train, dtype=object)).toarray()
        X_test_encoded = encoder.transform(np.array(X_test, dtype=object)).toarray()

        # Fresh scaler: the one above may have been partially fitted before failing.
        scaler = StandardScaler()
        X_train_out = scaler.fit_transform(X_train_encoded)
        X_test_out = scaler.transform(X_test_encoded)

    return [X_train_out, X_test_out, y_train, y_test]

## <u>Accuracy, F1 and confusion matrix</u>

In [7]:
def calculate_metrics(true_labels, predicted_labels):
    """Compute the evaluation metrics for a set of predictions.

    Parameters
    ----------
    true_labels : array-like
        Ground-truth labels.
    predicted_labels : array-like
        Labels predicted by the model, aligned with ``true_labels``.

    Returns
    -------
    accuracy : float
        Result of ``accuracy_score``.
    f1 : float
        Result of ``f1_score`` with ``average='weighted'``.
    cm : numpy.ndarray
        Result of ``confusion_matrix``.
    """
    accuracy = accuracy_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels, average='weighted')
    # Use the function's own arguments rather than the notebook globals
    # Y_test / Y_pred, so the matrix always matches the other two metrics.
    cm = confusion_matrix(true_labels, predicted_labels)

    return accuracy, f1, cm

## <u>Results on 25 datasets</u>

### Predict Students' Dropout and Academic Success

In [8]:
# Fetch the "students" dataset, then standardize it and split it 70% train / 30% test.
X, X_names, y, y_name = load_dataset(id = dataset_ids["students"])
split_size = 0.7
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split=split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/697/data.csv
training set size: (3096, 36)
testing set size: (1328, 36)


In [9]:
# Summarize the raw dataset: sizes, feature/label names and the first instance.
print_useful_data(X, X_names, y, y_name)

number of instances:  4424
number of features:  36
names of the features:
 ['Marital Status' 'Application mode' 'Application order' 'Course'
 'Daytime/evening attendance' 'Previous qualification'
 'Previous qualification (grade)' 'Nacionality' "Mother's qualification"
 "Father's qualification" "Mother's occupation" "Father's occupation"
 'Admission grade' 'Displaced' 'Educational special needs' 'Debtor'
 'Tuition fees up to date' 'Gender' 'Scholarship holder'
 'Age at enrollment' 'International' 'Curricular units 1st sem (credited)'
 'Curricular units 1st sem (enrolled)'
 'Curricular units 1st sem (evaluations)'
 'Curricular units 1st sem (approved)' 'Curricular units 1st sem (grade)'
 'Curricular units 1st sem (without evaluations)'
 'Curricular units 2nd sem (credited)'
 'Curricular units 2nd sem (enrolled)'
 'Curricular units 2nd sem (evaluations)'
 'Curricular units 2nd sem (approved)' 'Curricular units 2nd sem (grade)'
 'Curricular units 2nd sem (without evaluations)' 'Unemploymen

In [10]:
# Model training: decision tree with the Gini criterion, depth capped at 5, fixed seed.
model = DecisionTreeClassifier(max_depth=5, criterion="gini", random_state=42)
model.fit(X_train, Y_train)

# Label prediction on the test split.
Y_pred = model.predict(X_test)

# Accuracy, F1 score and confusion matrix of the predictions against the test labels.
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 73.57%
F1 score: 0.72
confusion matrix:
[[300  50  91]
 [ 38  76 131]
 [ 19  22 601]]


### Wine

In [11]:
# Fetch the "wine" dataset, then standardize it and split it 70% train / 30% test.
X, X_names, y, y_name = load_dataset(id = dataset_ids["wine"])
split_size = 0.7
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/109/data.csv
training set size: (124, 13)
testing set size: (54, 13)


In [12]:
# Summarize the raw dataset: sizes, feature/label names and the first instance.
print_useful_data(X, X_names, y, y_name)

number of instances:  178
number of features:  13
names of the features:
 ['Alcohol' 'Malicacid' 'Ash' 'Alcalinity_of_ash' 'Magnesium'
 'Total_phenols' 'Flavanoids' 'Nonflavanoid_phenols' 'Proanthocyanins'
 'Color_intensity' 'Hue' '0D280_0D315_of_diluted_wines' 'Proline']
name of the output label:  ['class']
instance 0 feature vector:
 [1.423e+01 1.710e+00 2.430e+00 1.560e+01 1.270e+02 2.800e+00 3.060e+00
 2.800e-01 2.290e+00 5.640e+00 1.040e+00 3.920e+00 1.065e+03]
instance 0 output label:  [1]


In [13]:
# Model training: decision tree with the Gini criterion, depth capped at 5, fixed seed.
model = DecisionTreeClassifier(max_depth=5, criterion="gini", random_state=42)
model.fit(X_train, Y_train)

# Label prediction on the test split.
Y_pred = model.predict(X_test)

# Accuracy, F1 score and confusion matrix of the predictions against the test labels.
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 96.30%
F1 score: 0.96
confusion matrix:
[[18  1  0]
 [ 0 21  0]
 [ 1  0 13]]


### MAGIC gamma telescope

In [14]:
# Fetch the "magic" dataset, then standardize it and split it 70% train / 30% test.
X, X_names, y, y_name = load_dataset(id = dataset_ids["magic"])
split_size = 0.7
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/159/data.csv
training set size: (13314, 10)
testing set size: (5706, 10)


In [15]:
# Summarize the raw dataset: sizes, feature/label names and the first instance.
print_useful_data(X, X_names, y, y_name)

number of instances:  19020
number of features:  10
names of the features:
 ['fLength' 'fWidth' 'fSize' 'fConc' 'fConc1' 'fAsym' 'fM3Long' 'fM3Trans'
 'fAlpha' 'fDist']
name of the output label:  ['class']
instance 0 feature vector:
 [28.7967 16.0021  2.6449  0.3918  0.1982 27.7004 22.011  -8.2027 40.092
 81.8828]
instance 0 output label:  ['g']


In [16]:
# Model training: decision tree with the Gini criterion, depth capped at 5, fixed seed.
model = DecisionTreeClassifier(max_depth=5, criterion="gini", random_state=42)
model.fit(X_train, Y_train)

# Label prediction on the test split.
Y_pred = model.predict(X_test)

# Accuracy, F1 score and confusion matrix of the predictions against the test labels.
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 83.25%
F1 score: 0.83
confusion matrix:
[[3429  276]
 [ 680 1321]]


### Parkinsons

In [17]:
# Fetch the "parkinsons" dataset, then standardize it and split it 70% train / 30% test.
X, X_names, y, y_name = load_dataset(id = dataset_ids["parkinsons"])
split_size = 0.7
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/174/data.csv
training set size: (136, 22)
testing set size: (59, 22)


In [18]:
# Summarize the raw dataset: sizes, feature/label names and the first instance.
print_useful_data(X, X_names, y, y_name)

number of instances:  195
number of features:  22
names of the features:
 ['MDVP:Fo' 'MDVP:Fhi' 'MDVP:Flo' 'MDVP:Jitter' 'MDVP:Jitter' 'MDVP:RAP'
 'MDVP:PPQ' 'Jitter:DDP' 'MDVP:Shimmer' 'MDVP:Shimmer' 'Shimmer:APQ3'
 'Shimmer:APQ5' 'MDVP:APQ' 'Shimmer:DDA' 'NHR' 'HNR' 'RPDE' 'DFA'
 'spread1' 'spread2' 'D2' 'PPE']
name of the output label:  ['status']
instance 0 feature vector:
 [ 1.199920e+02  1.573020e+02  7.499700e+01  7.840000e-03  7.840000e-03
  3.700000e-03  5.540000e-03  1.109000e-02  4.374000e-02  4.374000e-02
  2.182000e-02  3.130000e-02  2.971000e-02  6.545000e-02  2.211000e-02
  2.103300e+01  4.147830e-01  8.152850e-01 -4.813031e+00  2.664820e-01
  2.301442e+00  2.846540e-01]
instance 0 output label:  [1]


In [19]:
# Model training: decision tree with the Gini criterion, depth capped at 5, fixed seed.
model = DecisionTreeClassifier(max_depth=5, criterion="gini", random_state=42)
model.fit(X_train, Y_train)

# Label prediction on the test split.
Y_pred = model.predict(X_test)

# Accuracy, F1 score and confusion matrix of the predictions against the test labels.
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 86.44%
F1 score: 0.86
confusion matrix:
[[10  5]
 [ 3 41]]


### Page-blocks

In [20]:
# Fetch the "page-blocks" dataset, then standardize it and split it 70% train / 30% test.
X, X_names, y, y_name = load_dataset(id = dataset_ids["page-blocks"])
split_size = 0.7
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/78/data.csv
training set size: (3831, 10)
testing set size: (1642, 10)


In [21]:
# Summarize the raw dataset: sizes, feature/label names and the first instance.
print_useful_data(X, X_names, y, y_name)

number of instances:  5473
number of features:  10
names of the features:
 ['height' 'length' 'area' 'eccen' 'p_black' 'p_and' 'mean_tr' 'blackpix'
 'blackand' 'wb_trans']
name of the output label:  ['class']
instance 0 feature vector:
 [ 5.     7.    35.     1.4    0.4    0.657  2.33  14.    23.     6.   ]
instance 0 output label:  [1]


In [22]:
# Model training: decision tree with the Gini criterion, depth capped at 5, fixed seed.
model = DecisionTreeClassifier(max_depth=5, criterion="gini", random_state=42)
model.fit(X_train, Y_train)

# Label prediction on the test split.
Y_pred = model.predict(X_test)

# Accuracy, F1 score and confusion matrix of the predictions against the test labels.
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 96.77%
F1 score: 0.96
confusion matrix:
[[1451   10    1    2    2]
 [   5   99    0    2    0]
 [   4    0    6    0    0]
 [   3    1    0   26    1]
 [  19    0    3    0    7]]


### Optical

In [23]:
# Fetch the "optical" dataset, then standardize it and split it 70% train / 30% test.
X, X_names, y, y_name = load_dataset(id = dataset_ids["optical"])
split_size = 0.7
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)

# Cap the training split at 2 * num_sample rows and the test split at num_sample rows.
# NOTE(review): the recorded output shows 3933 training and 1687 testing rows, so this
# cap does not appear to drop anything for this dataset; confirm whether it is still needed.
num_sample = 6000
X_train, X_test = X_train[:num_sample*2,:], X_test[:num_sample,:]
Y_train, Y_test = Y_train[:num_sample*2], Y_test[:num_sample]

print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/80/data.csv
training set size: (3933, 64)
testing set size: (1687, 64)


In [24]:
# Summarize the raw dataset: sizes, feature/label names and the first instance.
print_useful_data(X, X_names, y, y_name)

number of instances:  5620
number of features:  64
names of the features:
 ['Attribute1' 'Attribute2' 'Attribute3' 'Attribute4' 'Attribute5'
 'Attribute6' 'Attribute7' 'Attribute8' 'Attribute9' 'Attribute10'
 'Attribute11' 'Attribute12' 'Attribute13' 'Attribute14' 'Attribute15'
 'Attribute16' 'Attribute17' 'Attribute18' 'Attribute19' 'Attribute20'
 'Attribute21' 'Attribute22' 'Attribute23' 'Attribute24' 'Attribute25'
 'Attribute26' 'Attribute27' 'Attribute28' 'Attribute29' 'Attribute30'
 'Attribute31' 'Attribute32' 'Attribute33' 'Attribute34' 'Attribute35'
 'Attribute36' 'Attribute37' 'Attribute38' 'Attribute39' 'Attribute40'
 'Attribute41' 'Attribute42' 'Attribute43' 'Attribute44' 'Attribute45'
 'Attribute46' 'Attribute47' 'Attribute48' 'Attribute49' 'Attribute50'
 'Attribute51' 'Attribute52' 'Attribute53' 'Attribute54' 'Attribute55'
 'Attribute56' 'Attribute57' 'Attribute58' 'Attribute59' 'Attribute60'
 'Attribute61' 'Attribute62' 'Attribute63' 'Attribute64']
name of the output label

In [25]:
# Model training: decision tree with the Gini criterion, depth capped at 5, fixed seed.
model = DecisionTreeClassifier(max_depth=5, criterion="gini", random_state=42)
model.fit(X_train, Y_train)

# Label prediction on the test split.
Y_pred = model.predict(X_test)

# Accuracy, F1 score and confusion matrix of the predictions against the test labels.
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 69.12%
F1 score: 0.69
confusion matrix:
[[156   0   1   1   4   2   0   0   4   2]
 [  0  34   3   4  31   1   1   0  85  14]
 [  0   2  43   3   1   0   6   0  95   4]
 [  0   5   7 116   1   1   0   0  19  24]
 [  0   5   2   0 140   2   2   4  14  14]
 [  0   1   3   9   3 125   0   2   2   8]
 [  0   8   2   0   2   0 154   0   1   1]
 [  0   0   0   1   8   0   0 150  26   1]
 [  1   1   2  15   6   0   0   5 116   7]
 [  1   0   1  11   0   3   0  13  13 132]]


### Ionosphere

In [26]:
# Fetch the "ionosphere" dataset, then standardize it and split it 70% train / 30% test.
X, X_names, y, y_name = load_dataset(id = dataset_ids["ionosphere"])
split_size = 0.7
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/52/data.csv
training set size: (245, 34)
testing set size: (106, 34)


In [27]:
# Summarize the raw dataset: sizes, feature/label names and the first instance.
print_useful_data(X, X_names, y, y_name)

number of instances:  351
number of features:  34
names of the features:
 ['Attribute1' 'Attribute2' 'Attribute3' 'Attribute4' 'Attribute5'
 'Attribute6' 'Attribute7' 'Attribute8' 'Attribute9' 'Attribute10'
 'Attribute11' 'Attribute12' 'Attribute13' 'Attribute14' 'Attribute15'
 'Attribute16' 'Attribute17' 'Attribute18' 'Attribute19' 'Attribute20'
 'Attribute21' 'Attribute22' 'Attribute23' 'Attribute24' 'Attribute25'
 'Attribute26' 'Attribute27' 'Attribute28' 'Attribute29' 'Attribute30'
 'Attribute31' 'Attribute32' 'Attribute33' 'Attribute34']
name of the output label:  ['Class']
instance 0 feature vector:
 [ 1.       0.       0.99539 -0.05889  0.85243  0.02306  0.83398 -0.37708
  1.       0.0376   0.85243 -0.17755  0.59755 -0.44945  0.60536 -0.38223
  0.84356 -0.38542  0.58212 -0.32192  0.56971 -0.29674  0.36946 -0.47357
  0.56811 -0.51171  0.41078 -0.46168  0.21266 -0.3409   0.42267 -0.54487
  0.18641 -0.453  ]
instance 0 output label:  ['g']


In [28]:
# Model training: decision tree with the Gini criterion, depth capped at 5, fixed seed.
model = DecisionTreeClassifier(max_depth=5, criterion="gini", random_state=42)
model.fit(X_train, Y_train)

# Label prediction on the test split.
Y_pred = model.predict(X_test)

# Accuracy, F1 score and confusion matrix of the predictions against the test labels.
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 90.57%
F1 score: 0.90
confusion matrix:
[[31  8]
 [ 2 65]]


### Glass Identification

In [29]:
# Fetch the "glass" dataset, then standardize it and split it 70% train / 30% test.
X, X_names, y, y_name = load_dataset(id = dataset_ids["glass"])
split_size = 0.7
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/42/data.csv
training set size: (149, 9)
testing set size: (65, 9)


In [30]:
# Summarize the raw dataset: sizes, feature/label names and the first instance.
print_useful_data(X, X_names, y, y_name)

number of instances:  214
number of features:  9
names of the features:
 ['RI' 'Na' 'Mg' 'Al' 'Si' 'K' 'Ca' 'Ba' 'Fe']
name of the output label:  ['Type_of_glass']
instance 0 feature vector:
 [1.52101e+00 1.36400e+01 4.49000e+00 1.10000e+00 7.17800e+01 6.00000e-02
 8.75000e+00 0.00000e+00 0.00000e+00]
instance 0 output label:  [1]


In [31]:
# Model training: decision tree with the Gini criterion, depth capped at 5, fixed seed.
model = DecisionTreeClassifier(max_depth=5, criterion="gini", random_state=42)
model.fit(X_train, Y_train)

# Label prediction on the test split.
Y_pred = model.predict(X_test)

# Accuracy, F1 score and confusion matrix of the predictions against the test labels.
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 66.15%
F1 score: 0.65
confusion matrix:
[[17  2  0  0  0  0]
 [ 9 10  2  2  0  0]
 [ 2  1  1  0  0  0]
 [ 0  1  0  3  2  0]
 [ 0  1  0  0  2  0]
 [ 0  0  0  0  0 10]]


### Letter Recognition

In [32]:
# Fetch the "letter" dataset, then standardize it and split it 70% train / 30% test.
X, X_names, y, y_name = load_dataset(id = dataset_ids["letter"])
split_size = 0.7
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/59/data.csv
training set size: (14000, 16)
testing set size: (6000, 16)


In [33]:
# Summarize the raw dataset: sizes, feature/label names and the first instance.
print_useful_data(X, X_names, y, y_name)

number of instances:  20000
number of features:  16
names of the features:
 ['x-box' 'y-box' 'width' 'high' 'onpix' 'x-bar' 'y-bar' 'x2bar' 'y2bar'
 'xybar' 'x2ybr' 'xy2br' 'x-ege' 'xegvy' 'y-ege' 'yegvx']
name of the output label:  ['lettr']
instance 0 feature vector:
 [ 2  8  3  5  1  8 13  0  6  6 10  8  0  8  0  8]
instance 0 output label:  ['T']


In [34]:
# Model training: decision tree with the Gini criterion, depth capped at 5, fixed seed.
model = DecisionTreeClassifier(max_depth=5, criterion="gini", random_state=42)
model.fit(X_train, Y_train)

# Label prediction on the test split.
Y_pred = model.predict(X_test)

# Accuracy, F1 score and confusion matrix of the predictions against the test labels.
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 37.30%
F1 score: 0.33
confusion matrix:
[[187   0   0   2   0   0   0   1   0   1   0   1   1   0   0   0  31   7
    1   0   0   0   0   0   0   0]
 [  0   0   0 189   0   1   0   0   3   0   0   0   0   0   0   0  27   0
    0   0   0   0   0   0   0   9]
 [  0   0   0   1   1   3   0   0  91  13   0   0   0   0   0   0  85   0
    0   4   2   0   0   0   0   1]
 [  0   0   0 217   0   0   0   0   0   0   0   0   0   2   0   0  12   4
    0   0  15   0   0   0   0   0]
 [  0   0   0   0  64   2   0   0   5   0   0   0   0   0   0   0 163   0
    0   0   0   0   0   0   0   4]
 [  0   0   0  46   0  23   0   0   5   0   0   0   0   2   0  26  15   0
    0  49   5  40   0   0   0   0]
 [  1   0   0  13   0   0   4   0   1   1   0   0   0   0   0   0 184   2
    0   0   0   0   0   0   0  24]
 [  0   0   0  46   0   7   0   0   0   0   0   0   0   0   0   0  96   8
    0   0  61   0   0   0   0   0]
 [  0   0   0  28   0   1   0   0 169   3   0   0   0   0   0   1  12   5
    

### Connectionist Bench (Sonar, Mines vs. Rocks)

In [35]:
# Fetch the "conn-bench-sonar" dataset, then standardize it and split it 70% train / 30% test.
X, X_names, y, y_name = load_dataset(id = dataset_ids["conn-bench-sonar"])
split_size = 0.7
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/151/data.csv
training set size: (145, 60)
testing set size: (63, 60)


In [36]:
# Summarize the raw dataset: sizes, feature/label names and the first instance.
print_useful_data(X, X_names, y, y_name)

number of instances:  208
number of features:  60
names of the features:
 ['Attribute1' 'Attribute2' 'Attribute3' 'Attribute4' 'Attribute5'
 'Attribute6' 'Attribute7' 'Attribute8' 'Attribute9' 'Attribute10'
 'Attribute11' 'Attribute12' 'Attribute13' 'Attribute14' 'Attribute15'
 'Attribute16' 'Attribute17' 'Attribute18' 'Attribute19' 'Attribute20'
 'Attribute21' 'Attribute22' 'Attribute23' 'Attribute24' 'Attribute25'
 'Attribute26' 'Attribute27' 'Attribute28' 'Attribute29' 'Attribute30'
 'Attribute31' 'Attribute32' 'Attribute33' 'Attribute34' 'Attribute35'
 'Attribute36' 'Attribute37' 'Attribute38' 'Attribute39' 'Attribute40'
 'Attribute41' 'Attribute42' 'Attribute43' 'Attribute44' 'Attribute45'
 'Attribute46' 'Attribute47' 'Attribute48' 'Attribute49' 'Attribute50'
 'Attribute51' 'Attribute52' 'Attribute53' 'Attribute54' 'Attribute55'
 'Attribute56' 'Attribute57' 'Attribute58' 'Attribute59' 'Attribute60']
name of the output label:  ['class']
instance 0 feature vector:
 [0.02   0.0371 0.

In [37]:
# Model training: decision tree with the Gini criterion, depth capped at 5, fixed seed.
model = DecisionTreeClassifier(max_depth=5, criterion="gini", random_state=42)
model.fit(X_train, Y_train)

# Label prediction on the test split.
Y_pred = model.predict(X_test)

# Accuracy, F1 score and confusion matrix of the predictions against the test labels.
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 73.02%
F1 score: 0.73
confusion matrix:
[[22 13]
 [ 4 24]]


### Musk (Version 2)

In [38]:
# Fetch the "musk2" dataset, then standardize it and split it 70% train / 30% test.
X, X_names, y, y_name = load_dataset(id = dataset_ids["musk2"])
split_size = 0.7
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/75/data.csv
training set size: (4618, 166)
testing set size: (1980, 166)


In [39]:
# Summarize the raw dataset: sizes, feature/label names and the first instance.
print_useful_data(X, X_names, y, y_name)

number of instances:  6598
number of features:  166
names of the features:
 ['f1' 'f2' 'f3' 'f4' 'f5' 'f6' 'f7' 'f8' 'f9' 'f10' 'f11' 'f12' 'f13'
 'f14' 'f15' 'f16' 'f17' 'f18' 'f19' 'f20' 'f21' 'f22' 'f23' 'f24' 'f25'
 'f26' 'f27' 'f28' 'f29' 'f30' 'f31' 'f32' 'f33' 'f34' 'f35' 'f36' 'f37'
 'f38' 'f39' 'f40' 'f41' 'f42' 'f43' 'f44' 'f45' 'f46' 'f47' 'f48' 'f49'
 'f50' 'f51' 'f52' 'f53' 'f54' 'f55' 'f56' 'f57' 'f58' 'f59' 'f60' 'f61'
 'f62' 'f63' 'f64' 'f65' 'f66' 'f67' 'f68' 'f69' 'f70' 'f71' 'f72' 'f73'
 'f74' 'f75' 'f76' 'f77' 'f78' 'f79' 'f80' 'f81' 'f82' 'f83' 'f84' 'f85'
 'f86' 'f87' 'f88' 'f89' 'f90' 'f91' 'f92' 'f93' 'f94' 'f95' 'f96' 'f97'
 'f98' 'f99' 'f100' 'f101' 'f102' 'f103' 'f104' 'f105' 'f106' 'f107'
 'f108' 'f109' 'f110' 'f111' 'f112' 'f113' 'f114' 'f115' 'f116' 'f117'
 'f118' 'f119' 'f120' 'f121' 'f122' 'f123' 'f124' 'f125' 'f126' 'f127'
 'f128' 'f129' 'f130' 'f131' 'f132' 'f133' 'f134' 'f135' 'f136' 'f137'
 'f138' 'f139' 'f140' 'f141' 'f142' 'f143' 'f144' 'f145' 'f14

In [40]:
# Model training: decision tree with the Gini criterion, depth capped at 5, fixed seed.
model = DecisionTreeClassifier(max_depth=5, criterion="gini", random_state=42)
model.fit(X_train, Y_train)

# Label prediction on the test split.
Y_pred = model.predict(X_test)

# Accuracy, F1 score and confusion matrix of the predictions against the test labels.
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 93.94%
F1 score: 0.94
confusion matrix:
[[1646   27]
 [  93  214]]


### Breast Cancer Wisconsin (Diagnostic)

In [41]:
# Fetch the "bc-wisc-diag" dataset, then standardize it and split it 70% train / 30% test.
X, X_names, y, y_name = load_dataset(id = dataset_ids["bc-wisc-diag"])
split_size = 0.7
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/17/data.csv
training set size: (398, 30)
testing set size: (171, 30)


In [42]:
# Summarize the raw dataset: sizes, feature/label names and the first instance.
print_useful_data(X, X_names, y, y_name)

number of instances:  569
number of features:  30
names of the features:
 ['radius1' 'texture1' 'perimeter1' 'area1' 'smoothness1' 'compactness1'
 'concavity1' 'concave_points1' 'symmetry1' 'fractal_dimension1' 'radius2'
 'texture2' 'perimeter2' 'area2' 'smoothness2' 'compactness2' 'concavity2'
 'concave_points2' 'symmetry2' 'fractal_dimension2' 'radius3' 'texture3'
 'perimeter3' 'area3' 'smoothness3' 'compactness3' 'concavity3'
 'concave_points3' 'symmetry3' 'fractal_dimension3']
name of the output label:  ['Diagnosis']
instance 0 feature vector:
 [1.799e+01 1.038e+01 1.228e+02 1.001e+03 1.184e-01 2.776e-01 3.001e-01
 1.471e-01 2.419e-01 7.871e-02 1.095e+00 9.053e-01 8.589e+00 1.534e+02
 6.399e-03 4.904e-02 5.373e-02 1.587e-02 3.003e-02 6.193e-03 2.538e+01
 1.733e+01 1.846e+02 2.019e+03 1.622e-01 6.656e-01 7.119e-01 2.654e-01
 4.601e-01 1.189e-01]
instance 0 output label:  ['M']


In [43]:
# Model training: decision tree with the Gini criterion, depth capped at 5, fixed seed.
model = DecisionTreeClassifier(max_depth=5, criterion="gini", random_state=42)
model.fit(X_train, Y_train)

# Label prediction on the test split.
Y_pred = model.predict(X_test)

# Accuracy, F1 score and confusion matrix of the predictions against the test labels.
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 95.32%
F1 score: 0.95
confusion matrix:
[[104   4]
 [  4  59]]


### Balance-scale

In [44]:
# Fetch the "balance-scale" dataset, then standardize it and split it 70% train / 30% test.
X, X_names, y, y_name = load_dataset(id = dataset_ids["balance-scale"])
split_size = 0.7
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/12/data.csv
training set size: (437, 4)
testing set size: (188, 4)


In [45]:
# Summarize the raw dataset: sizes, feature/label names and the first instance.
print_useful_data(X, X_names, y, y_name)

number of instances:  625
number of features:  4
names of the features:
 ['right-distance' 'right-weight' 'left-distance' 'left-weight']
name of the output label:  ['class']
instance 0 feature vector:
 [1 1 1 1]
instance 0 output label:  ['B']


In [46]:
# Model training: decision tree with the Gini criterion, depth capped at 5, fixed seed.
model = DecisionTreeClassifier(max_depth=5, criterion="gini", random_state=42)
model.fit(X_train, Y_train)

# Label prediction on the test split.
Y_pred = model.predict(X_test)

# Accuracy, F1 score and confusion matrix of the predictions against the test labels.
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 72.34%
F1 score: 0.71
confusion matrix:
[[ 1 13  4]
 [ 3 73  4]
 [ 7 21 62]]


### Contraception

In [47]:
# Fetch the "contraception" dataset, then standardize it and split it 70% train / 30% test.
X, X_names, y, y_name = load_dataset(id = dataset_ids["contraception"])
split_size = 0.7
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/30/data.csv
training set size: (1031, 9)
testing set size: (442, 9)


In [48]:
# Summarize the raw dataset: sizes, feature/label names and the first instance.
print_useful_data(X, X_names, y, y_name)

number of instances:  1473
number of features:  9
names of the features:
 ['wife_age' 'wife_edu' 'husband_edu' 'num_children' 'wife_religion'
 'wife_working' 'husband_occupation' 'standard_of_living_index'
 'media_exposure']
name of the output label:  ['contraceptive_method']
instance 0 feature vector:
 [24  2  3  3  1  1  2  3  0]
instance 0 output label:  [1]


In [49]:
# Model training: decision tree with the Gini criterion, depth capped at 5, fixed seed.
model = DecisionTreeClassifier(max_depth=5, criterion="gini", random_state=42)
model.fit(X_train, Y_train)

# Label prediction on the test split.
Y_pred = model.predict(X_test)

# Accuracy, F1 score and confusion matrix of the predictions against the test labels.
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 56.11%
F1 score: 0.56
confusion matrix:
[[112  30  52]
 [ 21  42  38]
 [ 26  27  94]]


### Fertility

In [50]:
# Fetch the "fertility" dataset, then standardize it and split it 70% train / 30% test.
X, X_names, y, y_name = load_dataset(id = dataset_ids["fertility"])
split_size = 0.7
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/244/data.csv
training set size: (70, 9)
testing set size: (30, 9)


In [51]:
# Summarize the raw dataset: sizes, feature/label names and the first instance.
print_useful_data(X, X_names, y, y_name)

number of instances:  100
number of features:  9
names of the features:
 ['season' 'age' 'child_diseases' 'accident' 'surgical_intervention'
 'high_fevers' 'alcohol' 'smoking' 'hrs_sitting']
name of the output label:  ['diagnosis']
instance 0 feature vector:
 [-0.33  0.69  0.    1.    1.    0.    0.8   0.    0.88]
instance 0 output label:  ['N']


In [52]:
# Model training: decision tree with the Gini criterion, depth capped at 5, fixed seed.
model = DecisionTreeClassifier(max_depth=5, criterion="gini", random_state=42)
model.fit(X_train, Y_train)

# Label prediction on the test split.
Y_pred = model.predict(X_test)

# Accuracy, F1 score and confusion matrix of the predictions against the test labels.
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 86.67%
F1 score: 0.87
confusion matrix:
[[25  2]
 [ 2  1]]


### Waveform (Version 1)

In [53]:
# Fetch the "waveform" dataset, then standardize it and split it 70% train / 30% test.
X, X_names, y, y_name = load_dataset(id = dataset_ids["waveform"])
split_size = 0.7
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/107/data.csv
training set size: (3500, 21)
testing set size: (1500, 21)


In [54]:
# Summarize the raw dataset: sizes, feature/label names and the first instance.
print_useful_data(X, X_names, y, y_name)

number of instances:  5000
number of features:  21
names of the features:
 ['Attribute1' 'Attribute2' 'Attribute3' 'Attribute4' 'Attribute5'
 'Attribute6' 'Attribute7' 'Attribute8' 'Attribute9' 'Attribute10'
 'Attribute11' 'Attribute12' 'Attribute13' 'Attribute14' 'Attribute15'
 'Attribute16' 'Attribute17' 'Attribute18' 'Attribute19' 'Attribute20'
 'Attribute21']
name of the output label:  ['class']
instance 0 feature vector:
 [-1.23 -1.56 -1.75 -0.28  0.6   2.22  0.85  0.21 -0.2   0.89  1.08  4.2
  2.89  7.75  4.59  3.15  5.12  3.32  1.2   0.24 -0.56]
instance 0 output label:  [2]


In [55]:
# Model training: decision tree with the Gini criterion, depth capped at 5, fixed seed.
model = DecisionTreeClassifier(max_depth=5, criterion="gini", random_state=42)
model.fit(X_train, Y_train)

# Label prediction on the test split.
Y_pred = model.predict(X_test)

# Accuracy, F1 score and confusion matrix of the predictions against the test labels.
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 77.00%
F1 score: 0.77
confusion matrix:
[[359  61  66]
 [ 70 396  53]
 [ 50  45 400]]


### Image Segmentation

In [56]:
# Fetch the "image-segmentation" dataset, then standardize it and split it 70% train / 30% test.
X, X_names, y, y_name = load_dataset(id = dataset_ids["image-segmentation"])
split_size = 0.7
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/50/data.csv
training set size: (147, 19)
testing set size: (63, 19)


In [57]:
# Summarize the raw dataset: sizes, feature/label names and the first instance.
print_useful_data(X, X_names, y, y_name)

number of instances:  210
number of features:  19
names of the features:
 ['region-centroid-col' 'region-centroid-row' 'region-pixel-count'
 'short-line-density-5' 'short-line-density-2' 'vedge-mean' 'vedge-sd'
 'hedge-mean' 'hedge-sd' 'intensity-mean' 'rawred-mean' 'rawblue-mean'
 'rawgreen-mean' 'exred-mean' 'exblue-mean' 'exgreen-mean' 'value-mean'
 'saturation-mean' 'hue-mean']
name of the output label:  ['class']
instance 0 feature vector:
 [ 1.4000000e+02  1.2500000e+02  9.0000000e+00  0.0000000e+00
  0.0000000e+00  2.7777790e-01  6.2963010e-02  6.6666675e-01
  3.1111118e-01  6.1851850e+00  7.3333335e+00  7.6666665e+00
  3.5555556e+00  3.4444444e+00  4.4444447e+00 -7.8888890e+00
  7.7777777e+00  5.4563490e-01 -1.1218182e+00]
instance 0 output label:  ['BRICKFACE']


In [58]:
# Model training: decision tree with the Gini criterion, depth capped at 5, fixed seed.
model = DecisionTreeClassifier(max_depth=5, criterion="gini", random_state=42)
model.fit(X_train, Y_train)

# Label prediction on the test split.
Y_pred = model.predict(X_test)

# Accuracy, F1 score and confusion matrix of the predictions against the test labels.
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 77.78%
F1 score: 0.76
confusion matrix:
[[ 8  0  0  0  0  0  1]
 [ 0  5  1  2  1  0  1]
 [ 1  0 10  0  0  0  0]
 [ 0  0  0  9  0  0  1]
 [ 0  0  0  0  9  0  0]
 [ 0  0  0  0  0  5  0]
 [ 1  0  5  0  0  0  3]]


### Blood Transfusion Service

In [59]:
# Fetch the "blood" dataset, then standardize it and split it 70% train / 30% test.
X, X_names, y, y_name = load_dataset(id = dataset_ids["blood"])
split_size = 0.7
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/176/data.csv
training set size: (523, 4)
testing set size: (225, 4)


In [60]:
# Summarize the raw dataset: sizes, feature/label names and the first instance.
print_useful_data(X, X_names, y, y_name)

number of instances:  748
number of features:  4
names of the features:
 ['Recency' 'Frequency' 'Monetary' 'Time']
name of the output label:  ['Donated_Blood']
instance 0 feature vector:
 [    2    50 12500    98]
instance 0 output label:  [1]


In [61]:
# Model training: decision tree with the Gini criterion, depth capped at 5, fixed seed.
model = DecisionTreeClassifier(max_depth=5, criterion="gini", random_state=42)
model.fit(X_train, Y_train)

# Label prediction on the test split.
Y_pred = model.predict(X_test)

# Accuracy, F1 score and confusion matrix of the predictions against the test labels.
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 75.11%
F1 score: 0.73
confusion matrix:
[[149  16]
 [ 40  20]]


### SPECT Heart

In [62]:
# Fetch the SPECT Heart dataset and build the train/test split.
split_size = 0.7  # forwarded to preprocessing(); presumably the training fraction (~70% of rows end up in X_train)
X, X_names, y, y_name = load_dataset(id=dataset_ids["spect"])
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)

# Report the shape of each split.
print(
    f'training set size: {X_train.shape}',
    f'testing set size: {X_test.shape}',
    sep="\n",
)

data url:
 https://archive.ics.uci.edu/static/public/95/data.csv
training set size: (186, 22)
testing set size: (81, 22)


In [63]:
# Summarise the SPECT Heart data: instance/feature counts, feature and label names, and the first instance.
print_useful_data(X, X_names, y, y_name)

number of instances:  267
number of features:  22
names of the features:
 ['F1' 'F2' 'F3' 'F4' 'F5' 'F6' 'F7' 'F8' 'F9' 'F10' 'F11' 'F12' 'F13'
 'F14' 'F15' 'F16' 'F17' 'F18' 'F19' 'F20' 'F21' 'F22']
name of the output label:  ['OVERALL_DIAGNOSIS']
instance 0 feature vector:
 [0 0 0 1 0 0 0 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0]
instance 0 output label:  [1]


In [64]:
# Fit a depth-limited (max_depth=5) Gini decision tree on the SPECT Heart training split.
# random_state is pinned so repeated runs build the same tree; fit() returns the estimator.
model = DecisionTreeClassifier(
    criterion="gini",
    max_depth=5,
    random_state=42,
).fit(X_train, Y_train)

# Predict labels for the held-out test split.
Y_pred = model.predict(X_test)

# Score the predictions and report the three metrics, one per line.
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(
    f"accuracy: {accuracy * 100:.2f}%",
    f"F1 score: {f1:.2f}",
    f"confusion matrix:\n{cm}",
    sep="\n",
)

accuracy: 60.49%
F1 score: 0.64
confusion matrix:
[[ 4 11]
 [21 45]]


### Statlog (Landsat Satellite)

In [65]:
# Fetch the Statlog (Landsat Satellite) dataset and build the train/test split.
split_size = 0.7  # forwarded to preprocessing(); presumably the training fraction (~70% of rows end up in X_train)
X, X_names, y, y_name = load_dataset(id=dataset_ids["landsat"])
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)

# Report the shape of each split.
print(
    f'training set size: {X_train.shape}',
    f'testing set size: {X_test.shape}',
    sep="\n",
)

data url:
 https://archive.ics.uci.edu/static/public/146/data.csv
training set size: (4504, 36)
testing set size: (1931, 36)


In [66]:
# Summarise the Landsat data: instance/feature counts, feature and label names, and the first instance.
print_useful_data(X, X_names, y, y_name)

number of instances:  6435
number of features:  36
names of the features:
 ['Attribute1' 'Attribute2' 'Attribute3' 'Attribute4' 'Attribute5'
 'Attribute6' 'Attribute7' 'Attribute8' 'Attribute9' 'Attribute10'
 'Attribute11' 'Attribute12' 'Attribute13' 'Attribute14' 'Attribute15'
 'Attribute16' 'Attribute17' 'Attribute18' 'Attribute19' 'Attribute20'
 'Attribute21' 'Attribute22' 'Attribute23' 'Attribute24' 'Attribute25'
 'Attribute26' 'Attribute27' 'Attribute28' 'Attribute29' 'Attribute30'
 'Attribute31' 'Attribute32' 'Attribute33' 'Attribute34' 'Attribute35'
 'Attribute36']
name of the output label:  ['class']
instance 0 feature vector:
 [ 92 115 120  94  84 102 106  79  84 102 102  83 101 126 133 103  92 112
 118  85  84 103 104  81 102 126 134 104  88 121 128 100  84 107 113  87]
instance 0 output label:  [3]


In [67]:
# Fit a depth-limited (max_depth=5) Gini decision tree on the Landsat training split.
# random_state is pinned so repeated runs build the same tree; fit() returns the estimator.
model = DecisionTreeClassifier(
    criterion="gini",
    max_depth=5,
    random_state=42,
).fit(X_train, Y_train)

# Predict labels for the held-out test split.
Y_pred = model.predict(X_test)

# Score the predictions and report the three metrics, one per line.
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(
    f"accuracy: {accuracy * 100:.2f}%",
    f"F1 score: {f1:.2f}",
    f"confusion matrix:\n{cm}",
    sep="\n",
)

accuracy: 80.74%
F1 score: 0.80
confusion matrix:
[[435   0   3   2   8   2]
 [  4 165   0   0  13   4]
 [ 11   0 345  55   2   3]
 [  6   0  31  56   2 106]
 [ 24   3   0   1 159  32]
 [  4   0   3  38  15 399]]


### Yeast

In [68]:
# Fetch the Yeast dataset and build the train/test split.
split_size = 0.7  # forwarded to preprocessing(); presumably the training fraction (~70% of rows end up in X_train)
X, X_names, y, y_name = load_dataset(id=dataset_ids["yeast"])
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)

# Report the shape of each split.
print(
    f'training set size: {X_train.shape}',
    f'testing set size: {X_test.shape}',
    sep="\n",
)

data url:
 https://archive.ics.uci.edu/static/public/110/data.csv
training set size: (1038, 8)
testing set size: (446, 8)


In [69]:
# Summarise the Yeast data: instance/feature counts, feature and label names, and the first instance.
print_useful_data(X, X_names, y, y_name)

number of instances:  1484
number of features:  8
names of the features:
 ['mcg' 'gvh' 'alm' 'mit' 'erl' 'pox' 'vac' 'nuc']
name of the output label:  ['localization_site']
instance 0 feature vector:
 [0.58 0.61 0.47 0.13 0.5  0.   0.48 0.22]
instance 0 output label:  ['MIT']


In [70]:
# Fit a depth-limited (max_depth=5) Gini decision tree on the Yeast training split.
# random_state is pinned so repeated runs build the same tree; fit() returns the estimator.
model = DecisionTreeClassifier(
    criterion="gini",
    max_depth=5,
    random_state=42,
).fit(X_train, Y_train)

# Predict labels for the held-out test split.
Y_pred = model.predict(X_test)

# Score the predictions and report the three metrics, one per line.
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(
    f"accuracy: {accuracy * 100:.2f}%",
    f"F1 score: {f1:.2f}",
    f"confusion matrix:\n{cm}",
    sep="\n",
)

accuracy: 56.05%
F1 score: 0.54
confusion matrix:
[[107   0   0   2   0   3   4  29   0   0]
 [  0   0   0   0   0   0   0   0   0   0]
 [  4   0   4   2   0   0   0   0   0   0]
 [  0   0   2  10   3   0   0   0   0   0]
 [  2   1   0   0   6   1   1   0   0   0]
 [  2   0   0   0   1  44   1   3   0   0]
 [ 32   0   0   1   1   4  32   4   0   0]
 [ 62   0   1   0   0   9   7  47   0   0]
 [  2   0   0   0   0   1   1   0   0   0]
 [  5   1   0   1   0   2   1   0   0   0]]


### Monk

In [71]:
# Fetch the Monk dataset and build the train/test split.
split_size = 0.7  # forwarded to preprocessing(); presumably the training fraction (~70% of rows end up in X_train)
X, X_names, y, y_name = load_dataset(id=dataset_ids["monk"])
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)

# Report the shape of each split.
print(
    f'training set size: {X_train.shape}',
    f'testing set size: {X_test.shape}',
    sep="\n",
)

data url:
 https://archive.ics.uci.edu/static/public/70/data.csv
training set size: (302, 6)
testing set size: (130, 6)


In [72]:
# Summarise the Monk data: instance/feature counts, feature and label names, and the first instance.
print_useful_data(X, X_names, y, y_name)

number of instances:  432
number of features:  6
names of the features:
 ['a1' 'a2' 'a3' 'a4' 'a5' 'a6']
name of the output label:  ['class']
instance 0 feature vector:
 [1 1 1 1 1 1]
instance 0 output label:  [1]


In [73]:
# Fit a depth-limited (max_depth=5) Gini decision tree on the Monk training split.
# random_state is pinned so repeated runs build the same tree; fit() returns the estimator.
model = DecisionTreeClassifier(
    criterion="gini",
    max_depth=5,
    random_state=42,
).fit(X_train, Y_train)

# Predict labels for the held-out test split.
Y_pred = model.predict(X_test)

# Score the predictions and report the three metrics, one per line.
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(
    f"accuracy: {accuracy * 100:.2f}%",
    f"F1 score: {f1:.2f}",
    f"confusion matrix:\n{cm}",
    sep="\n",
)

accuracy: 80.00%
F1 score: 0.80
confusion matrix:
[[48 13]
 [13 56]]


### Ecoli

In [74]:
# Fetch the Ecoli dataset and build the train/test split.
split_size = 0.7  # forwarded to preprocessing(); presumably the training fraction (~70% of rows end up in X_train)
X, X_names, y, y_name = load_dataset(id=dataset_ids["ecoli"])
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)

# Report the shape of each split.
print(
    f'training set size: {X_train.shape}',
    f'testing set size: {X_test.shape}',
    sep="\n",
)

data url:
 https://archive.ics.uci.edu/static/public/39/data.csv
training set size: (235, 7)
testing set size: (101, 7)


In [75]:
# Summarise the Ecoli data: instance/feature counts, feature and label names, and the first instance.
print_useful_data(X, X_names, y, y_name)

number of instances:  336
number of features:  7
names of the features:
 ['mcg' 'gvh' 'lip' 'chg' 'aac' 'alm1' 'alm2']
name of the output label:  ['class']
instance 0 feature vector:
 [0.49 0.29 0.48 0.5  0.56 0.24 0.35]
instance 0 output label:  ['cp']


In [76]:
# Fit a depth-limited (max_depth=5) Gini decision tree on the Ecoli training split.
# random_state is pinned so repeated runs build the same tree; fit() returns the estimator.
model = DecisionTreeClassifier(
    criterion="gini",
    max_depth=5,
    random_state=42,
).fit(X_train, Y_train)

# Predict labels for the held-out test split.
Y_pred = model.predict(X_test)

# Score the predictions and report the three metrics, one per line.
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(
    f"accuracy: {accuracy * 100:.2f}%",
    f"F1 score: {f1:.2f}",
    f"confusion matrix:\n{cm}",
    sep="\n",
)

accuracy: 80.20%
F1 score: 0.80
confusion matrix:
[[44  0  0  0  0  0  2]
 [ 2 13  0  5  0  0  0]
 [ 0  0  0  0  0  0  1]
 [ 0  4  0  7  0  0  0]
 [ 0  0  0  0  6  1  0]
 [ 0  0  0  0  0  1  0]
 [ 2  2  0  0  1  0 10]]


### Iris

In [77]:
# Fetch the Iris dataset and build the train/test split.
split_size = 0.7  # forwarded to preprocessing(); presumably the training fraction (~70% of rows end up in X_train)
X, X_names, y, y_name = load_dataset(id=dataset_ids["iris"])
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)

# Report the shape of each split.
print(
    f'training set size: {X_train.shape}',
    f'testing set size: {X_test.shape}',
    sep="\n",
)

data url:
 https://archive.ics.uci.edu/static/public/53/data.csv
training set size: (105, 4)
testing set size: (45, 4)


In [78]:
# Summarise the Iris data: instance/feature counts, feature and label names, and the first instance.
print_useful_data(X, X_names, y, y_name)

number of instances:  150
number of features:  4
names of the features:
 ['sepal length' 'sepal width' 'petal length' 'petal width']
name of the output label:  ['class']
instance 0 feature vector:
 [5.1 3.5 1.4 0.2]
instance 0 output label:  ['Iris-setosa']


In [79]:
# Fit a depth-limited (max_depth=5) Gini decision tree on the Iris training split.
# random_state is pinned so repeated runs build the same tree; fit() returns the estimator.
model = DecisionTreeClassifier(
    criterion="gini",
    max_depth=5,
    random_state=42,
).fit(X_train, Y_train)

# Predict labels for the held-out test split.
Y_pred = model.predict(X_test)

# Score the predictions and report the three metrics, one per line.
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(
    f"accuracy: {accuracy * 100:.2f}%",
    f"F1 score: {f1:.2f}",
    f"confusion matrix:\n{cm}",
    sep="\n",
)

accuracy: 95.56%
F1 score: 0.96
confusion matrix:
[[19  0  0]
 [ 0 11  2]
 [ 0  0 13]]


### Zoo

In [80]:
# Fetch the Zoo dataset and build the train/test split.
split_size = 0.7  # forwarded to preprocessing(); presumably the training fraction (~70% of rows end up in X_train)
X, X_names, y, y_name = load_dataset(id=dataset_ids["zoo"])
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)

# Report the shape of each split.
print(
    f'training set size: {X_train.shape}',
    f'testing set size: {X_test.shape}',
    sep="\n",
)

data url:
 https://archive.ics.uci.edu/static/public/111/data.csv
training set size: (70, 16)
testing set size: (31, 16)


In [81]:
# Summarise the Zoo data: instance/feature counts, feature and label names, and the first instance.
print_useful_data(X, X_names, y, y_name)

number of instances:  101
number of features:  16
names of the features:
 ['hair' 'feathers' 'eggs' 'milk' 'airborne' 'aquatic' 'predator' 'toothed'
 'backbone' 'breathes' 'venomous' 'fins' 'legs' 'tail' 'domestic'
 'catsize']
name of the output label:  ['type']
instance 0 feature vector:
 [1 0 0 1 0 0 1 1 1 1 0 0 4 0 0 1]
instance 0 output label:  [1]


In [82]:
# Fit a depth-limited (max_depth=5) Gini decision tree on the Zoo training split.
# random_state is pinned so repeated runs build the same tree; fit() returns the estimator.
model = DecisionTreeClassifier(
    criterion="gini",
    max_depth=5,
    random_state=42,
).fit(X_train, Y_train)

# Predict labels for the held-out test split.
Y_pred = model.predict(X_test)

# Score the predictions and report the three metrics, one per line.
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(
    f"accuracy: {accuracy * 100:.2f}%",
    f"F1 score: {f1:.2f}",
    f"confusion matrix:\n{cm}",
    sep="\n",
)

accuracy: 93.55%
F1 score: 0.92
confusion matrix:
[[15  0  0  0  0  0  0]
 [ 0  3  0  0  0  0  0]
 [ 0  0  0  0  1  0  0]
 [ 0  0  0  2  0  0  0]
 [ 0  0  0  0  2  0  0]
 [ 0  0  0  0  0  4  1]
 [ 0  0  0  0  0  0  3]]
