# Mini-project: Gaussian Naive Bayes

## <u>Preprocessing</u>

In [None]:
# Run once before executing this notebook:
# %pip install ucimlrepo

In [23]:
import numpy as np
from ucimlrepo import fetch_ucirepo
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

In [2]:
# Short dataset alias -> numeric id passed to `fetch_ucirepo(id=...)`.
dataset_ids = {
    "parkinsons": 174,
    "page-blocks": 78,
    "optical": 80,
    "musk2": 75,
    "bc-wisc-diag": 17,
    "students": 697,
    "wine": 109,
    "magic": 159,
    "balance-scale": 12,
    "glass": 42,
    "zoo": 111,
    "waveform": 107,
    "image-segmentation": 50,
    "blood": 176,
    "spect": 95,
    "yeast": 110,
    "monk": 70,
    "ecoli": 39,
    "iris": 53,
    "contraception": 30,
    "fertility": 244,
    "conn-bench-sonar": 151,
    "landsat": 146,
    "ionosphere": 52,
    "letter": 59,
}

In [11]:
def load_dataset(id):
    """Fetch a dataset with `fetch_ucirepo` and return it as NumPy arrays.

    The dataset's `data_url` metadata entry is printed as a side effect.

    Parameters
    ----------
    id : int
        Repository id of the dataset (one of the values of `dataset_ids`).
        The name shadows the `id` builtin but is kept because callers pass
        it by keyword.

    Returns
    -------
    data_vectors : numpy.ndarray
        Feature matrix, one row per instance.
    features_names : numpy.ndarray
        Names of the feature columns.
    data_labels : numpy.ndarray
        Target values as produced by `DataFrame.to_numpy()`, i.e. a
        two-dimensional array with one column per target variable.
    label_name : numpy.ndarray
        Names of the target column(s).
    """
    dataset = fetch_ucirepo(id=id)

    # features and targets come back as pandas dataframes
    features_df = dataset.data.features
    targets_df = dataset.data.targets

    # dictionary gathering the metadata (url, abstract, ... etc.)
    print('data url:\n', dataset.metadata['data_url'])

    return (
        features_df.to_numpy(),          # instance vectors with features
        features_df.columns.to_numpy(),  # names of each feature
        targets_df.to_numpy(),           # output labels for each instance
        targets_df.columns.to_numpy(),   # name of the output label
    )

In [50]:
def print_useful_data(X, X_names, y, y_name, index = 0):
    """Print a short textual summary of a dataset.

    Shows the number of instances and features, the feature and label
    names, and the feature vector / output label of instance `index`.
    """
    summary = (
        ("number of instances: ", len(X)),
        ("number of features: ", len(X_names)),
        ("names of the features:\n", X_names),
        ("name of the output label: ", y_name),
    )
    for caption, value in summary:
        print(caption, value)

    # the selected instance is looked up only after the summary is printed
    print(f"instance {index} feature vector:\n", X[index])
    print(f"instance {index} output label: ", y[index])

In [17]:
def preprocessing(X, y, split=0.7):
    """Split the data into train/test sets and standardize the features.

    The split is done first and the scaler (and, for non-numerical data,
    the one-hot encoder) is fitted on the training rows only, so that no
    statistic of the test set leaks into the training features. The split
    itself is unchanged: same `train_size` and `random_state`.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per instance.
    y : array-like
        Labels aligned with the rows of `X`.
    split : float, default 0.7
        Passed to `train_test_split` as `train_size`.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Standardized train/test features and the matching labels.
    """
    #data is split among training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=split, random_state=42
    )

    scaler = StandardScaler()
    try:
        #standardization (statistics estimated on the training rows only)
        X_train = scaler.fit_transform(X_train)
    except ValueError:
        #If non numerical data is detected, data is encoded
        # handle_unknown="ignore": a category seen only in the test rows is
        # encoded as all zeros instead of raising.
        encoder = OneHotEncoder(handle_unknown="ignore")
        X_train = encoder.fit_transform(np.array(X_train, dtype=object)).toarray()
        X_test = encoder.transform(np.array(X_test, dtype=object)).toarray()
        X_train = scaler.fit_transform(X_train)

    # the test rows reuse the scaling fitted on the training rows
    X_test = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test

## <u>Gaussian Naive Bayes</u>

In [51]:
class GaussianBayes:
    """Gaussian Naive Bayes classifier.

    For every class the model stores the per-feature mean and variance of
    the training rows and the class prior (fraction of training rows).
    A sample is assigned to the class maximizing
    ``log(prior) + sum_j log N(x_j; mean_j, variance_j)``.

    Parameters
    ----------
    density_floor : float or None, default 1e-6
        Lower bound applied to each per-feature density before it enters
        the log-likelihood sum. The default reproduces the historical
        behaviour; note that with a floor, every feature whose density is
        below it contributes the same penalty whatever its distance to the
        class mean. Pass ``None`` to use the exact Gaussian
        log-likelihood. Must be positive when given.
    """

    def __init__(self, density_floor=1e-6):
        self.density_floor = density_floor
        self.classes = None   # sorted unique labels, set by fit()
        self.mean = {}        # class -> per-feature mean
        self.variance = {}    # class -> per-feature variance
        self.prior = {}       # class -> fraction of training rows

    def fit(self, X_train, Y_train):
        """Estimate per-class means, variances and priors.

        `Y_train` may be any array-like holding one label per row of
        `X_train` (a flat sequence or a single-column 2-D array).
        """
        X_train = np.asarray(X_train)
        labels = np.asarray(Y_train).ravel()
        if labels.shape[0] != X_train.shape[0]:
            raise ValueError(
                f"X_train has {X_train.shape[0]} rows but Y_train holds "
                f"{labels.shape[0]} labels"
            )

        # start from a clean state so refitting never keeps stale classes
        self.mean, self.variance, self.prior = {}, {}, {}
        self.classes = np.unique(labels)

        for singular_class in self.classes:
            X_c = X_train[labels == singular_class]
            self.mean[singular_class] = np.mean(X_c, axis=0)
            self.variance[singular_class] = np.var(X_c, axis=0)
            self.prior[singular_class] = X_c.shape[0] / X_train.shape[0]

    def gaussian_density(self, x, mean, var, epsilon=1e-9):
        """Return the Gaussian density of `x`, element-wise."""
        # Stabilize variance to avoid division by zero
        var = var + epsilon
        return (1 / np.sqrt(2*np.pi*var)) * np.exp(-((x-mean)**2) / (2*var))

    def log_gaussian_density(self, x, mean, var, epsilon=1e-9):
        """Return the log of the Gaussian density of `x`, element-wise.

        Computed directly in log space, so it stays finite where
        `gaussian_density` would underflow to 0.
        """
        # Stabilize variance to avoid division by zero
        var = var + epsilon
        return -0.5 * np.log(2*np.pi*var) - ((x-mean)**2) / (2*var)

    def predict_sample(self, X):
        """Return the predicted class for a single feature vector `X`.

        Raises
        ------
        RuntimeError
            If the model has not been fitted.
        """
        if self.classes is None:
            raise RuntimeError("GaussianBayes must be fitted before predicting")

        # hoisted: the floor is the same for every class and feature
        log_floor = None if self.density_floor is None else np.log(self.density_floor)

        scores = []
        for singular_class in self.classes:
            log_density = self.log_gaussian_density(
                X, self.mean[singular_class], self.variance[singular_class]
            )
            if log_floor is not None:
                # same as log(max(density, floor)) since log is monotonic
                log_density = np.maximum(log_density, log_floor)
            log_prior = np.log(self.prior[singular_class])
            scores.append(log_prior + np.sum(log_density))

        # ties are resolved in favour of the first class (argmax)
        return self.classes[np.argmax(np.array(scores))]

    def predict(self, X_test):
        """Return the predicted class of every row of `X_test`."""
        return np.array([self.predict_sample(x) for x in X_test])


In [7]:
#function to visualize clusters using PCA
def visualize_clusters(X, clusters, centroids, iteration, nb_cluster, cluster_labels=None):
    """Scatter-plot clusters and their centroids in a 2-D PCA projection.

    Parameters
    ----------
    X : array-like
        Data used to fit the 2-component PCA.
    clusters : iterable
        One collection of points per cluster; only used when
        `cluster_labels` is None.
    centroids : array-like
        Centroids, projected with the PCA fitted on `X`.
    iteration : any
        Value shown in the figure title.
    nb_cluster : int
        Number of clusters; only used when `cluster_labels` is given.
    cluster_labels : array-like or None
        Optional cluster index of every row of `X`.
    """
    # Imported here because the notebook's import cell brings in neither
    # matplotlib nor PCA; without these the function raises NameError.
    import matplotlib.pyplot as plt
    from sklearn.decomposition import PCA

    plt.figure(figsize=(8, 6))
    colors = ['r', 'g', 'b', 'c', 'm', 'y', 'k']  #define colors for clusters

    #reduce the data to 2D using PCA for visualization
    pca = PCA(n_components=2)
    X_2D = pca.fit_transform(X)

    #if cluster assignments are provided, color by cluster membership
    if cluster_labels is not None:
        # asarray so that a plain list also yields an element-wise mask
        cluster_labels = np.asarray(cluster_labels)
        for i in range(nb_cluster):
            cluster_points_2D = X_2D[cluster_labels == i]
            plt.scatter(cluster_points_2D[:, 0], cluster_points_2D[:, 1],
                        color=colors[i % len(colors)], label=f'Cluster {i+1}')
    else:
        for i, cluster in enumerate(clusters):
            cluster_points = np.array(cluster)
            if cluster_points.size == 0:
                # an empty cluster has nothing to project or draw
                continue
            cluster_points_2D = pca.transform(cluster_points)  #transform current cluster into 2D
            plt.scatter(cluster_points_2D[:, 0], cluster_points_2D[:, 1],
                        color=colors[i % len(colors)], label=f'Cluster {i+1}')

    #plot centroids in 2D space
    centroids_2D = pca.transform(centroids)  #transform centroids into 2D
    plt.scatter(centroids_2D[:, 0], centroids_2D[:, 1],
                color='black', marker='x', s=200, label='Centroids')

    plt.title(f"clusters and centroids - Iteration {iteration}")
    plt.legend()
    plt.show()

## <u>Accuracy, F1 and confusion matrix</u>

In [8]:
def calculate_metrics(true_labels, predicted_labels):
    """Compute accuracy, weighted F1 score and confusion matrix.

    All three metrics are computed from the function's own arguments, so
    the result does not depend on any notebook-level variable.

    Parameters
    ----------
    true_labels : array-like
        Ground-truth labels.
    predicted_labels : array-like
        Labels predicted by the model, aligned with `true_labels`.

    Returns
    -------
    accuracy, f1, cm
        Accuracy, F1 score averaged with `average='weighted'`, and the
        confusion matrix.
    """
    accuracy = accuracy_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels, average='weighted')
    cm = confusion_matrix(true_labels, predicted_labels)

    return accuracy, f1, cm

## <u>Results on 25 datasets</u>

### Predict Students' Dropout and Academic Success

In [27]:
# Load "students" (UCI id 697) and build the standardized 70/30 train/test split.
X, X_names, y, y_name = load_dataset(id = dataset_ids["students"])
split_size = 0.7  # fraction of the instances used for training
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split=split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/697/data.csv
training set size: (3096, 36)
testing set size: (1328, 36)


In [None]:
# Inspection: prints instance/feature counts, feature and label names, and instance 0.
print_useful_data(X, X_names, y, y_name)

In [28]:
# model training: estimate per-class feature means, variances and priors
model = GaussianBayes()
model.fit(X_train, Y_train)

# labels prediction: highest-scoring class for each test instance
Y_pred = model.predict(X_test)

# evaluation on the test split: accuracy, weighted F1 score and confusion matrix
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 69.28%
F1 score: 0.68
confusion matrix:
[[309  55  77]
 [ 44  68 133]
 [ 38  61 543]]


### Wine

In [34]:
# Load "wine" (UCI id 109) and build the standardized 70/30 train/test split.
X, X_names, y, y_name = load_dataset(id = dataset_ids["wine"])
split_size = 0.7  # fraction of the instances used for training
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/109/data.csv
training set size: (124, 13)
testing set size: (54, 13)


In [None]:
# Inspection: prints instance/feature counts, feature and label names, and instance 0.
print_useful_data(X, X_names, y, y_name)

In [35]:
# model training: estimate per-class feature means, variances and priors
model = GaussianBayes()
model.fit(X_train, Y_train)

# labels prediction: highest-scoring class for each test instance
Y_pred = model.predict(X_test)

# evaluation on the test split: accuracy, weighted F1 score and confusion matrix
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 100.00%
F1 score: 1.00
confusion matrix:
[[19  0  0]
 [ 0 21  0]
 [ 0  0 14]]


### MAGIC gamma telescope

In [36]:
# Load "magic" (UCI id 159) and build the standardized 70/30 train/test split.
X, X_names, y, y_name = load_dataset(id = dataset_ids["magic"])
split_size = 0.7  # fraction of the instances used for training
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/159/data.csv
training set size: (13314, 10)
testing set size: (5706, 10)


In [None]:
# Inspection: prints instance/feature counts, feature and label names, and instance 0.
print_useful_data(X, X_names, y, y_name)

In [37]:
# model training: estimate per-class feature means, variances and priors
model = GaussianBayes()
model.fit(X_train, Y_train)

# labels prediction: highest-scoring class for each test instance
Y_pred = model.predict(X_test)

# evaluation on the test split: accuracy, weighted F1 score and confusion matrix
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 72.75%
F1 score: 0.70
confusion matrix:
[[3410  295]
 [1260  741]]


### Parkinsons

In [54]:
# Load "parkinsons" (UCI id 174) and build the standardized 70/30 train/test split.
X, X_names, y, y_name = load_dataset(id = dataset_ids["parkinsons"])
split_size = 0.7  # fraction of the instances used for training
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/174/data.csv
training set size: (136, 22)
testing set size: (59, 22)


In [None]:
# Inspection: prints instance/feature counts, feature and label names, and instance 0.
print_useful_data(X, X_names, y, y_name)

In [55]:
# model training: estimate per-class feature means, variances and priors
model = GaussianBayes()
model.fit(X_train, Y_train)

# labels prediction: highest-scoring class for each test instance
Y_pred = model.predict(X_test)

# evaluation on the test split: accuracy, weighted F1 score and confusion matrix
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 76.27%
F1 score: 0.78
confusion matrix:
[[12  3]
 [11 33]]


### Page-blocks

In [52]:
# Load "page-blocks" (UCI id 78) and build the standardized 70/30 train/test split.
X, X_names, y, y_name = load_dataset(id = dataset_ids["page-blocks"])
split_size = 0.7  # fraction of the instances used for training
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/78/data.csv
training set size: (3831, 10)
testing set size: (1642, 10)


In [None]:
# Inspection: prints instance/feature counts, feature and label names, and instance 0.
print_useful_data(X, X_names, y, y_name)

In [53]:
# model training: estimate per-class feature means, variances and priors
model = GaussianBayes()
model.fit(X_train, Y_train)

# labels prediction: highest-scoring class for each test instance
Y_pred = model.predict(X_test)

# evaluation on the test split: accuracy, weighted F1 score and confusion matrix
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 87.27%
F1 score: 0.90
confusion matrix:
[[1311    7    8  118   22]
 [  24   79    1    2    0]
 [   2    0    7    0    1]
 [   0    2    0   28    1]
 [  12    0    5    4    8]]


### Optical

In [56]:
# Load "optical" (UCI id 80) and build the standardized 70/30 train/test split.
X, X_names, y, y_name = load_dataset(id = dataset_ids["optical"])
split_size = 0.7  # fraction of the instances used for training
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)

# Cap the split sizes: at most 2*num_sample training rows and num_sample test rows.
# NOTE(review): the recorded sizes (3933 / 1687) are below the cap, so this
# slicing appears to be a no-op for this dataset.
num_sample = 6000
X_train, X_test = X_train[:num_sample*2,:], X_test[:num_sample,:]
Y_train, Y_test = Y_train[:num_sample*2], Y_test[:num_sample]

print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/80/data.csv
training set size: (3933, 64)
testing set size: (1687, 64)


In [None]:
# Inspection: prints instance/feature counts, feature and label names, and instance 0.
print_useful_data(X, X_names, y, y_name)

In [57]:
# model training: estimate per-class feature means, variances and priors
model = GaussianBayes()
model.fit(X_train, Y_train)

# labels prediction: highest-scoring class for each test instance
Y_pred = model.predict(X_test)

# evaluation on the test split: accuracy, weighted F1 score and confusion matrix
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 61.94%
F1 score: 0.60
confusion matrix:
[[170   0   0   0   0   0   0   0   0   0]
 [  0 116   0   0   0   0  14   0  38   5]
 [  2  12  66   0   0   0   0   0  74   0]
 [  7   0   0  50   0   0   0   8 104   4]
 [ 21  24   0   0  43   0  37  49   6   3]
 [ 21   1   0   0   0  46   1   8  74   2]
 [  1   1   0   0   0   0 166   0   0   0]
 [  0   3   0   0   0   0   0 180   3   0]
 [  3   2   0   0   0   0   0   1 147   0]
 [ 38   6   0   0   0   0   2  21  46  61]]


### Ionosphere

In [58]:
# Load "ionosphere" (UCI id 52) and build the standardized 70/30 train/test split.
X, X_names, y, y_name = load_dataset(id = dataset_ids["ionosphere"])
split_size = 0.7  # fraction of the instances used for training
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/52/data.csv
training set size: (245, 34)
testing set size: (106, 34)


In [None]:
# Inspection: prints instance/feature counts, feature and label names, and instance 0.
print_useful_data(X, X_names, y, y_name)

In [59]:
# model training: estimate per-class feature means, variances and priors
model = GaussianBayes()
model.fit(X_train, Y_train)

# labels prediction: highest-scoring class for each test instance
Y_pred = model.predict(X_test)

# evaluation on the test split: accuracy, weighted F1 score and confusion matrix
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 88.68%
F1 score: 0.88
confusion matrix:
[[28 11]
 [ 1 66]]


### Glass Identification

In [62]:
# Load "glass" (UCI id 42) and build the standardized 70/30 train/test split.
X, X_names, y, y_name = load_dataset(id = dataset_ids["glass"])
split_size = 0.7  # fraction of the instances used for training
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/42/data.csv
training set size: (149, 9)
testing set size: (65, 9)


In [None]:
# Inspection: prints instance/feature counts, feature and label names, and instance 0.
print_useful_data(X, X_names, y, y_name)

In [63]:
# model training: estimate per-class feature means, variances and priors
model = GaussianBayes()
model.fit(X_train, Y_train)

# labels prediction: highest-scoring class for each test instance
Y_pred = model.predict(X_test)

# evaluation on the test split: accuracy, weighted F1 score and confusion matrix
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 21.54%
F1 score: 0.21
confusion matrix:
[[ 0  1 18  0  0  0]
 [ 1  3 15  0  4  0]
 [ 1  0  3  0  0  0]
 [ 0  3  0  1  2  0]
 [ 0  0  0  0  3  0]
 [ 0  0  0  1  5  4]]


### Letter Recognition

In [64]:
# Load "letter" (UCI id 59) and build the standardized 70/30 train/test split.
X, X_names, y, y_name = load_dataset(id = dataset_ids["letter"])
split_size = 0.7  # fraction of the instances used for training
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/59/data.csv
training set size: (14000, 16)
testing set size: (6000, 16)


In [None]:
# Inspection: prints instance/feature counts, feature and label names, and instance 0.
print_useful_data(X, X_names, y, y_name)

In [65]:
# model training: estimate per-class feature means, variances and priors
model = GaussianBayes()
model.fit(X_train, Y_train)

# labels prediction: highest-scoring class for each test instance
Y_pred = model.predict(X_test)

# evaluation on the test split: accuracy, weighted F1 score and confusion matrix
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 64.30%
F1 score: 0.64
confusion matrix:
[[213   0   0   0   0   0   0   1   0   0   1   0   6   1   0   0   1   1
    7   0   0   0   0   0   1   0]
 [  0 155   0   6   0   0   2   5  20   0   5   0   4   0   2   0   4  16
    5   0   0   0   4   1   0   0]
 [  0   0 144   0   6   1  16   1   0   0  11   0   2   0   6   0   4   0
    5   2   2   1   0   0   0   0]
 [  2  13   0 193   0   0   0   5   3   5   5   0   3   0   3   1   0  11
    4   0   0   0   1   1   0   0]
 [  0   5   1   0  84   0  43   0  22   0   9   0   0   0   0   0  12   0
   19   4   0   0   0  33   2   4]
 [  0   9   0   9   0 152   9   1   1   0   0   0   0   4   0   9   5   1
    4   4   0   0   2   0   1   0]
 [  5   6  50   1   1   0 127   0   4   0   2   0   4   0   2   0  12   4
    4   0   0   0   6   2   0   0]
 [  1   6   0  19   0   0   3  67   4   0   9   0   5   2  35   0   1  20
    1   0   8   0   0  32   5   0]
 [  0   4   0  22   2   0   0   0 164   7   0   2   0   0   0   1   3   0
   1

### Connectionist Bench (Sonar, Mines vs. Rocks)

In [66]:
# Load "conn-bench-sonar" (UCI id 151) and build the standardized 70/30 train/test split.
X, X_names, y, y_name = load_dataset(id = dataset_ids["conn-bench-sonar"])
split_size = 0.7  # fraction of the instances used for training
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/151/data.csv
training set size: (145, 60)
testing set size: (63, 60)


In [None]:
# Inspection: prints instance/feature counts, feature and label names, and instance 0.
print_useful_data(X, X_names, y, y_name)

In [67]:
# model training: estimate per-class feature means, variances and priors
model = GaussianBayes()
model.fit(X_train, Y_train)

# labels prediction: highest-scoring class for each test instance
Y_pred = model.predict(X_test)

# evaluation on the test split: accuracy, weighted F1 score and confusion matrix
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 77.78%
F1 score: 0.78
confusion matrix:
[[23 12]
 [ 2 26]]


### Musk (Version 2)

In [68]:
# Load "musk2" (UCI id 75) and build the standardized 70/30 train/test split.
X, X_names, y, y_name = load_dataset(id = dataset_ids["musk2"])
split_size = 0.7  # fraction of the instances used for training
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/75/data.csv
training set size: (4618, 166)
testing set size: (1980, 166)


In [None]:
# Inspection: prints instance/feature counts, feature and label names, and instance 0.
print_useful_data(X, X_names, y, y_name)

In [69]:
# model training: estimate per-class feature means, variances and priors
model = GaussianBayes()
model.fit(X_train, Y_train)

# labels prediction: highest-scoring class for each test instance
Y_pred = model.predict(X_test)

# evaluation on the test split: accuracy, weighted F1 score and confusion matrix
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 75.15%
F1 score: 0.78
confusion matrix:
[[1240  433]
 [  59  248]]


### Breast Cancer Wisconsin (Diagnostic)

In [70]:
# Load "bc-wisc-diag" (UCI id 17) and build the standardized 70/30 train/test split.
X, X_names, y, y_name = load_dataset(id = dataset_ids["bc-wisc-diag"])
split_size = 0.7  # fraction of the instances used for training
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/17/data.csv
training set size: (398, 30)
testing set size: (171, 30)


In [None]:
# Inspection: prints instance/feature counts, feature and label names, and instance 0.
print_useful_data(X, X_names, y, y_name)

In [71]:
# model training: estimate per-class feature means, variances and priors
model = GaussianBayes()
model.fit(X_train, Y_train)

# labels prediction: highest-scoring class for each test instance
Y_pred = model.predict(X_test)

# evaluation on the test split: accuracy, weighted F1 score and confusion matrix
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 93.57%
F1 score: 0.94
confusion matrix:
[[103   5]
 [  6  57]]


### Balance-scale

In [72]:
# Load "balance-scale" (UCI id 12) and build the standardized 70/30 train/test split.
X, X_names, y, y_name = load_dataset(id = dataset_ids["balance-scale"])
split_size = 0.7  # fraction of the instances used for training
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/12/data.csv
training set size: (437, 4)
testing set size: (188, 4)


In [None]:
# Inspection: prints instance/feature counts, feature and label names, and instance 0.
print_useful_data(X, X_names, y, y_name)

In [73]:
# model training: estimate per-class feature means, variances and priors
model = GaussianBayes()
model.fit(X_train, Y_train)

# labels prediction: highest-scoring class for each test instance
Y_pred = model.predict(X_test)

# evaluation on the test split: accuracy, weighted F1 score and confusion matrix
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 88.83%
F1 score: 0.84
confusion matrix:
[[ 0 13  5]
 [ 0 80  0]
 [ 0  3 87]]


### Contraception

In [74]:
# Load "contraception" (UCI id 30) and build the standardized 70/30 train/test split.
X, X_names, y, y_name = load_dataset(id = dataset_ids["contraception"])
split_size = 0.7  # fraction of the instances used for training
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/30/data.csv
training set size: (1031, 9)
testing set size: (442, 9)


In [None]:
# Inspection: prints instance/feature counts, feature and label names, and instance 0.
print_useful_data(X, X_names, y, y_name)

In [75]:
# model training: estimate per-class feature means, variances and priors
model = GaussianBayes()
model.fit(X_train, Y_train)

# labels prediction: highest-scoring class for each test instance
Y_pred = model.predict(X_test)

# evaluation on the test split: accuracy, weighted F1 score and confusion matrix
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 45.25%
F1 score: 0.45
confusion matrix:
[[77 53 64]
 [13 55 33]
 [42 37 68]]


### Fertility

In [76]:
# Load "fertility" (UCI id 244) and build the standardized 70/30 train/test split.
X, X_names, y, y_name = load_dataset(id = dataset_ids["fertility"])
split_size = 0.7  # fraction of the instances used for training
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/244/data.csv
training set size: (70, 9)
testing set size: (30, 9)


In [None]:
# Inspection: prints instance/feature counts, feature and label names, and instance 0.
print_useful_data(X, X_names, y, y_name)

In [77]:
# model training: estimate per-class feature means, variances and priors
model = GaussianBayes()
model.fit(X_train, Y_train)

# labels prediction: highest-scoring class for each test instance
Y_pred = model.predict(X_test)

# evaluation on the test split: accuracy, weighted F1 score and confusion matrix
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 93.33%
F1 score: 0.92
confusion matrix:
[[27  0]
 [ 2  1]]


### Waveform (Version 1)

In [78]:
# Load "waveform" (UCI id 107) and build the standardized 70/30 train/test split.
X, X_names, y, y_name = load_dataset(id = dataset_ids["waveform"])
split_size = 0.7  # fraction of the instances used for training
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/107/data.csv
training set size: (3500, 21)
testing set size: (1500, 21)


In [None]:
# Inspection: prints instance/feature counts, feature and label names, and instance 0.
print_useful_data(X, X_names, y, y_name)

In [79]:
# model training: estimate per-class feature means, variances and priors
model = GaussianBayes()
model.fit(X_train, Y_train)

# labels prediction: highest-scoring class for each test instance
Y_pred = model.predict(X_test)

# evaluation on the test split: accuracy, weighted F1 score and confusion matrix
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 80.87%
F1 score: 0.80
confusion matrix:
[[262 117 107]
 [  5 490  24]
 [  3  31 461]]


### Image Segmentation

In [80]:
# Load "image-segmentation" (UCI id 50) and build the standardized 70/30 train/test split.
X, X_names, y, y_name = load_dataset(id = dataset_ids["image-segmentation"])
split_size = 0.7  # fraction of the instances used for training
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/50/data.csv
training set size: (147, 19)
testing set size: (63, 19)


In [None]:
# Inspection: prints instance/feature counts, feature and label names, and instance 0.
print_useful_data(X, X_names, y, y_name)

In [81]:
# model training: estimate per-class feature means, variances and priors
model = GaussianBayes()
model.fit(X_train, Y_train)

# labels prediction: highest-scoring class for each test instance
Y_pred = model.predict(X_test)

# evaluation on the test split: accuracy, weighted F1 score and confusion matrix
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 66.67%
F1 score: 0.65
confusion matrix:
[[ 3  0  0  0  0  0  6]
 [ 1  6  0  0  0  0  3]
 [ 0  1  1  0  0  0  9]
 [ 0  0  0 10  0  0  0]
 [ 0  0  0  0  9  0  0]
 [ 0  0  0  0  0  5  0]
 [ 0  0  1  0  0  0  8]]


### Blood Transfusion Service

In [82]:
# Load "blood" (UCI id 176) and build the standardized 70/30 train/test split.
X, X_names, y, y_name = load_dataset(id = dataset_ids["blood"])
split_size = 0.7  # fraction of the instances used for training
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/176/data.csv
training set size: (523, 4)
testing set size: (225, 4)


In [None]:
# Inspection: prints instance/feature counts, feature and label names, and instance 0.
print_useful_data(X, X_names, y, y_name)

In [83]:
# model training: estimate per-class feature means, variances and priors
model = GaussianBayes()
model.fit(X_train, Y_train)

# labels prediction: highest-scoring class for each test instance
Y_pred = model.predict(X_test)

# evaluation on the test split: accuracy, weighted F1 score and confusion matrix
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 74.67%
F1 score: 0.68
confusion matrix:
[[160   5]
 [ 52   8]]


### SPECT Heart

In [84]:
# Load "spect" (UCI id 95) and build the standardized 70/30 train/test split.
X, X_names, y, y_name = load_dataset(id = dataset_ids["spect"])
split_size = 0.7  # fraction of the instances used for training
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/95/data.csv
training set size: (186, 22)
testing set size: (81, 22)


In [None]:
# Inspection: prints instance/feature counts, feature and label names, and instance 0.
print_useful_data(X, X_names, y, y_name)

In [85]:
# model training: estimate per-class feature means, variances and priors
model = GaussianBayes()
model.fit(X_train, Y_train)

# labels prediction: highest-scoring class for each test instance
Y_pred = model.predict(X_test)

# evaluation on the test split: accuracy, weighted F1 score and confusion matrix
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 55.56%
F1 score: 0.59
confusion matrix:
[[15  0]
 [36 30]]


### Statlog (Landsat Satellite)

In [86]:
# Load "landsat" (UCI id 146) and build the standardized 70/30 train/test split.
X, X_names, y, y_name = load_dataset(id = dataset_ids["landsat"])
split_size = 0.7  # fraction of the instances used for training
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/146/data.csv
training set size: (4504, 36)
testing set size: (1931, 36)


In [None]:
# Inspection: prints instance/feature counts, feature and label names, and instance 0.
print_useful_data(X, X_names, y, y_name)

In [87]:
# model training: estimate per-class feature means, variances and priors
model = GaussianBayes()
model.fit(X_train, Y_train)

# labels prediction: highest-scoring class for each test instance
Y_pred = model.predict(X_test)

# evaluation on the test split: accuracy, weighted F1 score and confusion matrix
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 79.13%
F1 score: 0.80
confusion matrix:
[[351   0  14   0  85   0]
 [  5 165   0   0  15   1]
 [  6   0 367  40   2   1]
 [  2   0  27 140   3  29]
 [ 21   1   0  11 161  25]
 [  0   0   2  94  19 344]]


### Yeast

In [88]:
# Load "yeast" (UCI id 110) and build the standardized 70/30 train/test split.
X, X_names, y, y_name = load_dataset(id = dataset_ids["yeast"])
split_size = 0.7  # fraction of the instances used for training
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/110/data.csv
training set size: (1038, 8)
testing set size: (446, 8)


In [None]:
# Inspection: prints instance/feature counts, feature and label names, and instance 0.
print_useful_data(X, X_names, y, y_name)

In [90]:
# model training: estimate per-class feature means, variances and priors
model = GaussianBayes()
model.fit(X_train, Y_train)

# labels prediction: highest-scoring class for each test instance
Y_pred = model.predict(X_test)

# evaluation on the test split: accuracy, weighted F1 score and confusion matrix
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 15.25%
F1 score: 0.18
confusion matrix:
[[  0   1   7   1   0   0   2  18   0 116]
 [  0   0   0   0   0   0   0   0   0   0]
 [  0   0   8   0   0   0   0   0   0   2]
 [  0   0   6   9   0   0   0   0   0   0]
 [  0   0   3   4   0   0   0   0   0   4]
 [  0   0   0   0   0   1   0   4   0  46]
 [  0   0  10   3   0   0  16   2   0  43]
 [  0   0   3   1   0   0   3  26   0  93]
 [  0   0   1   0   0   0   0   0   3   0]
 [  0   0   3   2   0   0   0   0   0   5]]


### Monk

In [91]:
# Load "monk" (UCI id 70) and build the standardized 70/30 train/test split.
X, X_names, y, y_name = load_dataset(id = dataset_ids["monk"])
split_size = 0.7  # fraction of the instances used for training
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/70/data.csv
training set size: (302, 6)
testing set size: (130, 6)


In [None]:
# Inspection: prints instance/feature counts, feature and label names, and instance 0.
print_useful_data(X, X_names, y, y_name)

In [92]:
# model training: estimate per-class feature means, variances and priors
model = GaussianBayes()
model.fit(X_train, Y_train)

# labels prediction: highest-scoring class for each test instance
Y_pred = model.predict(X_test)

# evaluation on the test split: accuracy, weighted F1 score and confusion matrix
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 71.54%
F1 score: 0.72
confusion matrix:
[[47 14]
 [23 46]]


### Ecoli

In [93]:
# Load "ecoli" (UCI id 39) and build the standardized 70/30 train/test split.
X, X_names, y, y_name = load_dataset(id = dataset_ids["ecoli"])
split_size = 0.7  # fraction of the instances used for training
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/39/data.csv
training set size: (235, 7)
testing set size: (101, 7)


In [None]:
# Inspection: prints instance/feature counts, feature and label names, and instance 0.
print_useful_data(X, X_names, y, y_name)

In [94]:
# model training: estimate per-class feature means, variances and priors
model = GaussianBayes()
model.fit(X_train, Y_train)

# labels prediction: highest-scoring class for each test instance
Y_pred = model.predict(X_test)

# evaluation on the test split: accuracy, weighted F1 score and confusion matrix
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 73.27%
F1 score: 0.67
confusion matrix:
[[44  1  0  0  0  0  1]
 [ 2 17  0  1  0  0  0]
 [ 0  0  0  0  0  0  1]
 [ 0 11  0  0  0  0  0]
 [ 0  0  0  0  0  0  7]
 [ 0  0  0  0  0  1  0]
 [ 1  2  0  0  0  0 12]]


### Iris

In [95]:
# Load "iris" (UCI id 53) and build the standardized 70/30 train/test split.
X, X_names, y, y_name = load_dataset(id = dataset_ids["iris"])
split_size = 0.7  # fraction of the instances used for training
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/53/data.csv
training set size: (105, 4)
testing set size: (45, 4)


In [None]:
# Inspection: prints instance/feature counts, feature and label names, and instance 0.
print_useful_data(X, X_names, y, y_name)

In [96]:
# model training: estimate per-class feature means, variances and priors
model = GaussianBayes()
model.fit(X_train, Y_train)

# labels prediction: highest-scoring class for each test instance
Y_pred = model.predict(X_test)

# evaluation on the test split: accuracy, weighted F1 score and confusion matrix
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 97.78%
F1 score: 0.98
confusion matrix:
[[19  0  0]
 [ 0 12  1]
 [ 0  0 13]]


### Zoo

In [97]:
# Load "zoo" (UCI id 111) and build the standardized 70/30 train/test split.
X, X_names, y, y_name = load_dataset(id = dataset_ids["zoo"])
split_size = 0.7  # fraction of the instances used for training
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/111/data.csv
training set size: (70, 16)
testing set size: (31, 16)


In [None]:
# Inspection: prints instance/feature counts, feature and label names, and instance 0.
print_useful_data(X, X_names, y, y_name)

In [98]:
# model training: estimate per-class feature means, variances and priors
model = GaussianBayes()
model.fit(X_train, Y_train)

# labels prediction: highest-scoring class for each test instance
Y_pred = model.predict(X_test)

# evaluation on the test split: accuracy, weighted F1 score and confusion matrix
accuracy, f1, cm = calculate_metrics(Y_test, Y_pred)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 90.32%
F1 score: 0.91
confusion matrix:
[[14  0  0  1  0  0  0]
 [ 0  3  0  0  0  0  0]
 [ 0  0  0  1  0  0  0]
 [ 0  0  0  2  0  0  0]
 [ 0  0  0  0  2  0  0]
 [ 0  0  0  0  0  5  0]
 [ 0  0  1  0  0  0  2]]
