# The University of Melbourne, School of Computing and Information Systems
# COMP30027 Machine Learning, 2019 Semester 1
-----
## Project 1: Gaining Information about Naive Bayes
-----
###### Student Name(s): Xiuge Chen 961392
###### Python version: python3
###### Submission deadline: 1pm, Fri 5 Apr 2019

This iPython notebook is a template which you may use for your Project 1 submission. (You are not required to use it; in particular, there is no need to use iPython if you do not like it.)

Marking will be applied on the five functions that are defined in this notebook, and to your responses to the questions at the end of this notebook.

You may change the prototypes of these functions, and you may write other functions, according to your requirements. We would appreciate it if the required functions were prominent/easy to find. 

In [None]:
# To switch the evaluation strategy, change NUM_PARTITION in SYSTEM SETTINGS: 1 means testing on the training data,
# any other value k means k-fold cross validation

In [1]:
# import
import os
import sys
import math
import numpy as np

In [2]:
##### SYSTEM SETTINGS #####

# data folder path; file names are appended to it by plain string concatenation, so keep the trailing slash
FOLDER_PATH = "./2019S1-proj1-data/"
# name of the file (inside FOLDER_PATH) that contains header information for every data file
HEADER_FILE = "headers.txt"
# epsilon smoothing value: the probability used by predict() for an attribute value never seen with a class
EPSILON = sys.float_info.epsilon
# number of partitions passed to partition() for cross validation, 1 if testing on training data
NUM_PARTITION = 1

##### END SYSTEM SETTINGS #####

In [3]:
#### DATA PROCESSING ####
def preprocess(fileName):
    """Open a data file and convert it into the dataset format used by this program.

    The first element of the result is built from the header returned by
    getHeader(fileName); every following element comes from one non-blank line
    of the comma-separated data file. The last column is taken as the class label.

    INPUT: fileName, name of the data file inside FOLDER_PATH
    OUTPUT: dataSet, a list of instances, where an instance is a 2-tuple
    (a list of attribute values, a class label)
    """
    # header goes first so that every consumer can map positions to attribute names
    headerArray = getHeader(fileName)
    dataSet = [(headerArray[:-1], headerArray[-1])]

    # the context manager closes the file even if parsing fails
    with open(FOLDER_PATH + fileName, 'r') as dataFile:
        for line in dataFile:
            line = line.rstrip('\n')

            # a blank line (typically a trailing newline at the end of the file) is not an
            # instance; keeping it would add a bogus instance with no attributes and class ''
            if not line.strip():
                continue

            dataArray = line.split(',')
            dataSet.append((dataArray[:-1], dataArray[-1]))

    return dataSet

def getHeader(fileName):
    """Return the header that describes the given data file.

    The header file is searched for a line equal to fileName; the header is the
    comma-separated line found two lines below it.

    INPUT: fileName, name of the data file
    OUTPUT: list of the elements in the header
    RAISES: ValueError if fileName is not listed in the header file,
    IndexError if the file ends before the header line
    """
    # the context manager closes the file; newlines are stripped up front so that a
    # name on a last line without a trailing newline is still matched
    with open(FOLDER_PATH + HEADER_FILE, 'r') as headerFile:
        lines = [line.rstrip('\n') for line in headerFile]

    index = lines.index(fileName)

    return lines[index + 2].split(',')

def getFileNames(folderPath):
    """Collect the names of the csv files stored directly in a folder.

    Entries whose name starts with '.' are ignored, as is anything that does
    not end with ".csv". Names are returned in the order os.listdir yields them.

    INPUT: folderPath, path of the folder to scan
    OUTPUT: list of csv file names (names only, without the folder path)
    """
    return [
        entry
        for entry in os.listdir(folderPath)
        if entry.endswith(".csv") and not entry.startswith('.')
    ]

def partition(dataSet, numPartition):
    """Partition a dataset into training and testing sets for cross validation.

    With numPartition == 1, or when there are not more instances than
    partitions, the whole dataset is used for both training and testing.
    Otherwise the instances are shuffled (np.random.shuffle) and cut into
    numPartition contiguous folds whose sizes differ by at most one; fold i is
    the i-th test set and all remaining instances form the i-th training set.
    Every returned set starts with the header row. The input list is not modified.

    INPUT: dataSet with the header as first row, number of partitions (1 if test on training data)
    OUTPUT: list of training datasets, list of testing datasets, the length of these lists
    RAISES: ValueError if numPartition is smaller than 1
    """
    # reject 0 (previously a ZeroDivisionError) and negative values (previously
    # returned empty lists together with a negative length)
    if numPartition < 1:
        raise ValueError("numPartition must be at least 1, got " + str(numPartition))

    dataLen = len(dataSet[1:])
    if numPartition == 1 or numPartition >= dataLen:
        return [dataSet.copy()], [dataSet.copy()], 1

    # slicing creates a new list, so shuffling leaves the caller's dataset untouched
    dataHeader, data = dataSet[0], dataSet[1:]
    np.random.shuffle(data)

    # the first extraInstance folds take one extra instance each, so that the
    # instances that cannot be divided equally are still all tested exactly once
    partitionLen, extraInstance = divmod(dataLen, numPartition)

    trains, tests = [], []
    start = 0
    for i in range(numPartition):
        end = start + partitionLen + (1 if i < extraInstance else 0)

        tests.append([dataHeader] + data[start:end])
        trains.append([dataHeader] + data[:start] + data[end:])

        start = end

    return trains, tests, numPartition
    
#### END DATA PROCESSING ####

In [4]:
#### TRAINING STAGE ####
def train(dataSet):
    """Build a Naive Bayes model by counting and normalising the training data.

    Missing values ('?') do not contribute to the probability estimates of the
    other values: for each class and attribute, the counts are divided by the
    number of instances of that class whose value is not '?'. The '?' entry
    itself is kept in the model (normalised by the same denominator) because
    info_gain uses it to recover how many values were missing. If every value
    of an attribute is '?' within a class, that entry becomes infinity.

    INPUT: dataSet in usable format with the header as first row
    OUTPUT: model, a 2-tuple (pc, pac) where pc maps each class to P(c) and pac
    maps class -> attribute name -> attribute value -> P(a|c)
    """
    priors, conditionals = {}, {}
    header = dataSet[0][0]
    instances = dataSet[1:]

    # first pass: raw counts per class and per (class, attribute, value)
    for instance in instances:
        attributes, classTag = instance[0], instance[1]

        if classTag not in priors:
            priors[classTag] = 0
            conditionals[classTag] = {}
        priors[classTag] += 1

        perAttribute = conditionals[classTag]
        for position, attributeValue in enumerate(attributes):
            valueCounts = perAttribute.setdefault(header[position], {})
            valueCounts[attributeValue] = valueCounts.get(attributeValue, 0) + 1

    # second pass: turn the value counts into conditional probabilities
    for classTag, perAttribute in conditionals.items():
        for valueCounts in perAttribute.values():
            # instances of this class that actually have a value for the attribute
            observed = priors[classTag] - valueCounts.get('?', 0)

            for value in valueCounts:
                if observed == 0:
                    # only '?' was seen for this attribute within this class
                    valueCounts[value] = float("inf")
                else:
                    valueCounts[value] = valueCounts[value] / observed

    # finally turn the class counts into prior probabilities
    total = len(instances)
    for classTag in priors:
        priors[classTag] = priors[classTag] / total

    return (priors, conditionals)
    
#### END TRAINING STAGE ####

In [5]:
#### PREDICTING STAGE ####

def predict(model, dataSet):
    """Predict a class label for every instance of a dataset.

    For each class the score is log2 P(c) plus the sum of log2 P(a=v|c) over
    the attributes; logarithms are used to prevent underflow. A missing value
    ('?') adds nothing to the score, and a value never seen with the class
    during training is smoothed with EPSILON. The class with the highest
    score wins; on a tie the class that comes first in the model is kept.

    INPUT: the Naive Bayes model, dataSet in usable format with the header as first row
    OUTPUT: a list of predicted class labels, one per instance
    """
    pc, pac = model
    header = dataSet[0][0]
    predictions = []

    for instance in dataSet[1:]:
        attributes = instance[0]
        bestClass, bestScore = "", float('-inf')

        for candidate in pac:
            likelihoods = pac[candidate]
            score = math.log(pc[candidate], 2)

            for position, attributeValue in enumerate(attributes):
                attributeName = header[position]

                # a missing value gives no evidence for or against the class
                if attributeValue == '?':
                    continue

                seenValues = likelihoods[attributeName]
                if attributeValue in seenValues:
                    score += math.log(seenValues[attributeValue], 2)
                else:
                    # unseen value: replace the zero probability with epsilon
                    score += math.log(EPSILON, 2)

            # strict comparison keeps the earliest class when scores are equal
            if score > bestScore:
                bestClass, bestScore = candidate, score

        predictions.append(bestClass)

    return predictions
    
#### END PREDICTING STAGE ####

In [6]:
#### EVALUATION STAGE ####

def evaluate(model, testSet, num):
    """Print evaluation metrics for a model on a test set.

    Two tables are printed. The first is the confusion matrix, rows being the
    correct class and columns the predicted class, for example:
      A B
    A 2 1
    B 0 3
    meaning: 2 instances of class A are correctly identified as A, 1 is
    mistakenly identified as B, all 3 instances of B are correctly identified
    as B, none of them is mistakenly identified as A.
    The second lists precision, recall and f1-score for every class (treating
    each class in turn as the class of interest), followed by the macro,
    weighted and micro averages.

    The set of classes is the union of the correct and the predicted labels:
    correct labels first, then labels that only occur as predictions. Without
    the latter, an instance predicted as a class that has no instance in the
    test set would vanish from the matrix and from the false negative counts.
    The macro average is taken over this whole set of classes.

    The weighted average weights each class by its prior in the model
    (model[0]); classes unknown to the model are left out of it. The f1-score
    of an average row is computed from the averaged precision and recall.

    INPUT: trained model, usable dataset with header for testing, num (not used by this function)
    OUTPUT: nothing is returned, the metrics are printed
    """
    correctLabels = [instance[1] for instance in testSet[1:]]
    resultLabels = predict(model, testSet)

    # metricMap[correct][predicted] = number of instances
    # ex. {'A': {'A': 2, 'B': 1}} means for total 3 test instances of A, 2 of them are
    # correctly identified as A itself, 1 is mistakenly identified as B
    metricMap = {}
    for correctLabel, resultLabel in zip(correctLabels, resultLabels):
        row = metricMap.setdefault(correctLabel, {})
        row[resultLabel] = row.get(resultLabel, 0) + 1

    # every class that has to appear in the tables, in first-seen order
    labels = list(metricMap)
    known = set(labels)
    for resultLabel in resultLabels:
        if resultLabel not in known:
            known.add(resultLabel)
            labels.append(resultLabel)

    # transform the result map to output matrix format and output results
    print("\nClass Idendification Initial Number Matrix:")
    print("\t" + "".join(label + "\t" for label in labels))

    for priKey in labels:
        row = metricMap.get(priKey, {})
        countList = [priKey] + [row.get(secKey, 0) for secKey in labels]
        print("".join(str(count) + "\t" for count in countList))

    # transform the result map to precision, recall, f1-score by assuming each class is the interested one separately
    print("\nClass Idendification Advanced Data Matrix:")
    print("\t" + "".join(key + "\t" for key in ["precision", "recall", "f1-score"]))

    priors = model[0]
    macroavg, weightedavg, microavg = ["macro avg", 0, 0], ["weighted avg", 0, 0], ["micro avg"]
    tps, fps, fns = 0, 0, 0
    for priKey in labels:
        row = metricMap.get(priKey, {})

        # tp: on the diagonal, fn: rest of the row, fp: rest of the column
        tp = row.get(priKey, 0)
        fn = sum(row.values()) - tp
        fp = sum(otherRow.get(priKey, 0)
                 for otherKey, otherRow in metricMap.items() if otherKey != priKey)

        # an undefined ratio (empty denominator) is reported as 0
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        if (precision + recall) > 0:
            f1 = (2 * precision * recall) / (precision + recall)
        else:
            f1 = 0

        countList = [priKey, precision, recall, f1]
        print("".join(str(count) + "\t" for count in countList))

        tps += tp
        fps += fp
        fns += fn
        macroavg[1] += precision / len(labels)
        macroavg[2] += recall / len(labels)
        if priKey in priors:
            weightedavg[1] += precision * priors[priKey]
            weightedavg[2] += recall * priors[priKey]

    # micro average: pool the counts of all classes before taking the ratios
    precisionu = tps / (tps + fps) if tps + fps != 0 else 0.0
    recallu = tps / (tps + fns) if tps + fns != 0 else 0.0

    # f1 score of the macro and weighted rows, from their averaged precision and recall
    for average in (macroavg, weightedavg):
        if average[1] + average[2] != 0:
            average.append((2 * average[1] * average[2]) / (average[1] + average[2]))
        else:
            average.append(0.0)

    if precisionu + recallu != 0:
        microavg.extend([precisionu, recallu, (2 * precisionu * recallu) / (precisionu + recallu)])
    else:
        microavg.extend([0.0, 0.0, 0.0])

    print("".join(str(count) + "\t" for count in macroavg))
    print("".join(str(count) + "\t" for count in weightedavg))
    print("".join(str(count) + "\t" for count in microavg))
    
#### END EVALUATION ####

In [7]:
#### INFORMATION GAIN ####

def info_gain(model):
    """Calculate the Information Gain (IG) of each attribute, relative to the class distribution.

    IG(a) = H(class) - sum_v( P(a=v) * H(class | a=v) ), computed from the
    probabilities stored in the model only.

    Missing values ('?') are excluded without dropping whole instances: for an
    attribute that has missing values, the class distribution is re-estimated
    over the instances that do have a value for this attribute, and '?' is not
    treated as a value to split on. An attribute whose value is missing in
    every instance carries no information and gets an IG of 0.

    Attributes and values are processed in the order in which they appear in
    the model, so the result (including the order of the returned dictionary)
    is the same on every run.

    INPUT: a trained model (pc, pac) as returned by train
    OUTPUT: a dictionary that maps each attribute to its Information Gain
    """
    pc, pac = model

    # attribute -> values seen in any class ('?' included); dictionaries are used
    # as ordered sets so that the order does not depend on string hashing
    attributes_values = {}
    for eachclass in pac:
        for attribute, valueProbs in pac[eachclass].items():
            attributes_values.setdefault(attribute, {}).update(dict.fromkeys(valueProbs))

    output = {}
    for attribute, seenValues in attributes_values.items():
        values = [value for value in seenValues if value != '?']

        # nothing but '?': there is no split to evaluate (and the re-estimation
        # below would divide zero by zero)
        if not values:
            output[attribute] = 0.0
            continue

        if '?' in seenValues:
            # within a class the stored probabilities (the '?' entry included) sum to
            # n / (n - missing), which allows the share of missing values to be recovered
            class_prob, missValueProb = {}, 0
            for eachclass in pc:
                classProb = 0
                for prob in pac[eachclass][attribute].values():
                    classProb += prob

                class_prob[eachclass] = classProb
                missValueProb += pc[eachclass] - pc[eachclass] / classProb

            # class distribution over the instances that have a value for this attribute
            priors = {eachclass: pc[eachclass] / class_prob[eachclass] / (1 - missValueProb)
                      for eachclass in pc}
        else:
            priors = dict(pc)

        # entropy of the root, before splitting on the attribute; a class whose
        # instances all miss this attribute has probability 0 and adds nothing
        ig = 0
        for prior in priors.values():
            if prior != 0:
                ig -= prior * math.log(prior, 2)

        # the weighted average of the entropy over the children after the split (Mean Information)
        # Mean Information (attribute a) = sum_v( P(value v) * H(value v) )
        meanInfo = 0
        for value in values:
            # P(a=v, c) = P(a=v | c) * P(c), for the classes in which v occurs
            joint = {eachclass: pac[eachclass][attribute][value] * priors[eachclass]
                     for eachclass in pac
                     if value in pac[eachclass].get(attribute, {})}

            # P(a=v)
            prob_av = 0
            for prob in joint.values():
                prob_av += prob

            # H(a=v) = - sum_c( P(c | a=v) * log(P(c | a=v)) ), with P(c | a=v) = P(a=v, c) / P(a=v)
            h_av = 0
            for prob in joint.values():
                prob_c_av = prob / prob_av
                if prob_c_av > 0:
                    h_av -= prob_c_av * math.log(prob_c_av, 2)

            meanInfo += prob_av * h_av

        output[attribute] = ig - meanInfo

    return output
    
#### END INFORMATION GAIN ####

In [8]:
#### MAIN FUNCTION ####
def main():
    """Run the whole pipeline on every csv file found in FOLDER_PATH.

    Each file is loaded and partitioned according to NUM_PARTITION; for every
    partition a model is trained, evaluated on the matching test set, and the
    information gain of its attributes is printed from highest to lowest.
    """
    for fileName in getFileNames(FOLDER_PATH):
        print("\nFile: " + fileName)

        trainSets, testSets, num = partition(preprocess(fileName), NUM_PARTITION)

        for i in range(num):
            print("\nNo.{} model: ".format(i + 1))
            model = train(trainSets[i])

            evaluate(model, testSets[i], num)

            # attributes ranked by information gain, most informative first
            ranked = sorted(info_gain(model).items(), key=lambda pair: pair[1], reverse=True)
            print("\ninformation gain list: ", ranked)


# run the pipeline only when executed as a script / notebook cell
if __name__ == "__main__":
    main()
    
#### END MAIN FUNCTION ####


File: hypothyroid.csv

No.1 model: 

Class Idendification Initial Number Matrix:
	hypothyroid	negative	
hypothyroid	0	151	
negative	0	3012	

Class Idendification Advanced Data Matrix:
	precision	recall	f1-score	
hypothyroid	0	0.0	0	
negative	0.9522605121719886	1.0	0.9755465587044534	
macro avg	0.4761302560859943	0.5	0.4877732793522267	
weighted avg	0.9068000830420581	0.9522605121719886	0.9289744656395238	
micro avg	0.9522605121719886	0.9522605121719886	0.9522605121719886	

information gain list:  [('TSH', 0.009353710215580346), ('TT4', 0.0057925537058469145), ('T4U', 0.005768288201614458), ('FTI', 0.005744031245602799), ('T3', 0.004075493419623877), ('TBG', 0.002580427555574416), ('query-hypothyroid', 0.0013683791752742147), ('query-on-thyroxine', 0.0012382074503017315), ('surgery', 0.0009985293906336068), ('on-thyroxine', 0.0009139351160850073), ('tumor', 0.0008983004044028076), ('query-hyperthyroid', 0.0005423006444424394), ('sick', 0.0004888757691284829), ('pregnant', 0.00043509384


Class Idendification Initial Number Matrix:
	unacc	acc	vgood	good	
unacc	1161	47	0	2	
acc	85	289	0	10	
vgood	0	26	39	0	
good	0	46	2	21	

Class Idendification Advanced Data Matrix:
	precision	recall	f1-score	
unacc	0.9317817014446228	0.959504132231405	0.9454397394136809	
acc	0.7083333333333334	0.7526041666666666	0.7297979797979798	
vgood	0.9512195121951219	0.6	0.7358490566037735	
good	0.6363636363636364	0.30434782608695654	0.411764705882353	
macro avg	0.8069245458341786	0.654114031246257	0.72252803706509	
weighted avg	0.8710614687209302	0.8738425925925926	0.8724498143028194	
micro avg	0.8738425925925926	0.8738425925925926	0.8738425925925926	

information gain list:  [('safety', 0.26218435655426386), ('persons', 0.2196629633399081), ('buying', 0.09644896916961399), ('maint', 0.07370394692148596), ('lug_boot', 0.030008141247605202), ('doors', 0.004485716626631886)]

File: breast-cancer.csv

No.1 model: 

Class Idendification Initial Number Matrix:
	recurrence-events	no-recurrence-events	

Questions (you may respond in a cell or cells below):

1. The Naive Bayes classifiers can be seen to vary, in terms of their effectiveness on the given datasets (e.g. in terms of Accuracy). Consider the Information Gain of each attribute, relative to the class distribution — does this help to explain the classifiers’ behaviour? Identify any results that are particularly surprising, and explain why they occur.
2. The Information Gain can be seen as a kind of correlation coefficient between a pair of attributes: when the gain is low, the attribute values are uncorrelated; when the gain is high, the attribute values are correlated. In supervised ML, we typically calculate the Infomation Gain between a single attribute and the class, but it can be calculated for any pair of attributes. Using the pair-wise IG as a proxy for attribute interdependence, in which cases are our NB assumptions violated? Describe any evidence (or indeed, lack of evidence) that this is has some effect on the effectiveness of the NB classifier.
3. Since we have gone to all of the effort of calculating Infomation Gain, we might as well use that as a criterion for building a “Decision Stump” (1-R classifier). How does the effectiveness of this classifier compare to Naive Bayes? Identify one or more cases where the effectiveness is notably different, and explain why.
4. Evaluating the model on the same data that we use to train the model is considered to be a major mistake in Machine Learning. Implement a hold–out or cross–validation evaluation strategy. How does your estimate of effectiveness change, compared to testing on the training data? Explain why. (The result might surprise you!)
5. Implement one of the advanced smoothing regimes (add-k, Good-Turing). Does changing the smoothing regime (or indeed, not smoothing at all) affect the effectiveness of the Naive Bayes classifier? Explain why, or why not.
6. Naive Bayes is said to elegantly handle missing attribute values. For the datasets with missing values, is there any evidence that the performance is different on the instances with missing values, compared to the instances where all of the values are present? Does it matter which, or how many values are missing? Would a imputation strategy have any effect on this?

Don't forget that groups of 1 student should respond to question (1), and one other question of your choosing. Groups of 2 students should respond to question (1) and question (2), and two other questions of your choosing. Your responses should be about 150-250 words each.

#### Answer for Q1: 
Theoretically, for a given attribute, lower mean information leads to higher IG, which means that instances of the same class tend to fall under the same values of that attribute. Each value therefore covers a more homogeneous set of instances than the training data as a whole, so the data become more "predictable" after splitting on this attribute. 

Naive Bayes uses prior and posterior probabilities to predict the most likely class. The prior probability can be seen as a "fixed" term across attributes, since it only describes the class distribution over the whole training set. The posterior probability (P(a=v|c)), however, can vary: for an attribute with higher IG, each value covers more homogeneous instances, so P(c|a=v) is higher for one class and lower for the others. Therefore, according to Bayes' theorem, the posterior and the final prediction probability are higher for one specific class and lower for the others, which leads to a more “confident” prediction of that class.

Therefore, an NB classifier usually performs better when many high-IG attributes are present, since they provide "powerful" evidence that can “divide” the classes, give a higher posterior probability to the correct class, and make predictions less ambiguous and less biased. For instance, for anneal.csv, nursery.csv and mushroom.csv, high accuracy (> 90%) is achieved with many high-IG attributes (IG > 0.2), whereas for others such as cmc.csv and breast-cancer.csv, which have no high-IG attributes, accuracy is low (about what a dummy classifier would obtain).

There are exceptions: hypothyroid.csv has high accuracy with no high-IG attributes, and primary-tumor.csv has low accuracy even though high-IG attributes are present. For hypothyroid.csv, the classes are very unequally distributed (3012 vs 151), so choosing the majority class is already enough to achieve high accuracy (95%). In addition, most attributes have only two values and are not very informative, so they are unlikely to distinguish the minority class from such a large majority, which results in uniformly low IG. But since choosing the majority is good enough, high accuracy is achieved even without informative attributes. For primary-tumor.csv, IG favours highly-branching attributes, the file contains many missing values (which my algorithm excludes from the counts), and some attribute values and classes have only a few instances; highly-branching attributes that separate these small classes therefore get high IG, because they appear to divide the classes better. However, they are not representative of the large classes and so do not help to predict those classes correctly, which finally leads to low accuracy despite the presence of high-IG attributes.

#### Answer for Q4: 
As shown in Appendix 1, for most files, such as anneal, hepatitis and car, precision, recall and F1 score under the different averaging schemes all went down a little (0.1 - 0.6) after implementing 10-fold cross validation. This is actually a more realistic proxy for accuracy, since testing on the training data usually suffers from overfitting. Testing on the training data is useful in some cases, as a descriptive measure, when we just want to know how well the model describes the dataset. Generally, however, we want a more generic model that can predict unseen instances. When testing on the training data, all of the test instances have already been fed into the NB model and have adjusted both the prior and posterior probabilities, which raises the predicted probability of their own (training) class. Since the model fits the data so "perfectly" that it even fits outliers, noise and other variance, it is expected to report higher accuracy. Cross validation still trains and tests on the whole dataset eventually, but never on the same instances in the same run: it partitions the data so that every test instance is new to the model, which better reflects the general case. The probabilities inside the NB model are not adjusted by the test cases to better predict them, so the model is more realistic and the measured accuracy is closer to the general accuracy. Therefore, cross validation reduces the overfitting problem and results in a slight decrease in the accuracy measurements.

However, there are two exceptions. The first is files like hypothyroid, nursery and mushroom, whose accuracy barely decreased (<0.001). This is because they either have a highly unbalanced class distribution (hypothyroid: 3012 vs 151) or have attributes that are very informative (with IG > 0.9). These features remain almost the same in both the training and the testing data after partitioning, which results in nearly the same probability estimates in the NB models. The test instances do not introduce new variance or unusual cases that violate these features (high-IG attributes or an unequal distribution). Therefore, the accuracy stays almost the same after using cross validation. The second is primary-tumor, whose accuracy decreased a lot after implementing cross validation. One possible reason is that there are too many classes and attribute values with only a small number of instances, together with many missing values, so that the training data after partitioning is not "representative" enough of the whole dataset. Also, the testing data introduces many new instances with missing values for which there is little or no evidence in the training data to classify them correctly, as the prediction probabilities are all very small (they include epsilon). Consequently, the accuracy degrades a lot after implementing cross validation.

#### Appendix 1 Result Summary for testing on the training data and 10-fold cross validation: 
|   File Name   |                           Test on                            |                            Train                             | Result                                                       | 10-cross                                                    | Validation                                                   | Result                                                      |
| :-----------: | :----------------------------------------------------------: | :----------------------------------------------------------: | ------------------------------------------------------------ | ----------------------------------------------------------- | ------------------------------------------------------------ | ----------------------------------------------------------- |
|  avg types  |                          macro avg                           |                         weighted avg                         | micro avg                                                    | macro avg                                                   | weighted avg                                                 | micro avg                                                   |
|  hypothyroid  |    Precision: 0.476130<br />Recall: 0.5<br />F1: 0.487773    | Precision: 0.906800 <br />Recall: 0.952261<br />F1: 0.928974 | Precision: 0.952261 <br />Recall: 0.952261<br />F1: 0.952261 | Precision: 0.476128<br />Recall: 0.5<br />F1: 0.487749      | Precision: 0.906776 <br />Recall: 0.952260<br />F1: 0.928917 | Precision: 0.952255<br />Recall: 0.952255<br />F1: 0.952255 |
| primary-tumor | Precision: 0.653160<br />Recall: 0.624474<br />F1: 0.638495  | Precision: 0.607216<br />Recall: 0.601770<br />F1: 0.604480  | Precision: 0.601770<br />Recall: 0.601770<br />F1: 0.601770  | Precision: 0.303401<br />Recall: 0.365846<br />F1: 0.330506 | Precision: 0.384844<br />Recall: 0.452285<br />F1: 0.414059  | Precision: 0.501003<br />Recall: 0.501003<br />F1: 0.501003 |
|   hepatitis   | Precision: 0.756079<br />Recall: 0.794334<br />F1: 0.774735  | Precision: 0.853733<br />Recall: 0.838710<br />F1: 0.846155  | Precision: 0.838710<br />Recall: 0.838710<br />F1: 0.838710  | Precision: 0.713810<br />Recall: 0.760638<br />F1: 0.734440 | Precision: 0.828765<br />Recall: 0.823984<br />F1: 0.825496  | Precision: 0.830000<br />Recall: 0.830000<br />F1: 0.830000 |
|    anneal     | Precision: 0.980472 <br />Recall: 0.995933<br />F1: 0.988142 | Precision: 0.991404<br />Recall: 0.991091<br />F1: 0.991248  | Precision: 0.991091 <br />Recall: 0.991091<br />F1: 0.991091 | Precision: 0.960640<br />Recall: 0.974651<br />F1: 0.967565 | Precision: 0.987445<br />Recall: 0.986822<br />F1: 0.987129  | Precision: 0.989975<br />Recall: 0.989975<br />F1: 0.989975 |
|      cmc      | Precision: 0.494251 <br />Recall: 0.506929<br />F1: 0.500510 | Precision: 0.515743<br />Recall: 0.505771<br />F1: 0.510708  | Precision: 0.505771<br />Recall: 0.505771<br />F1: 0.505771  | Precision: 0.479879<br />Recall: 0.494153<br />F1: 0.486852 | Precision: 0.499999<br />Recall: 0.493428<br />F1: 0.496606  | Precision: 0.490830<br />Recall: 0.490830<br />F1: 0.490830 |
|      car      | Precision: 0.806925<br />Recall: 0.654114<br />F1: 0.722528  | Precision: 0.871061<br />Recall: 0.873843<br />F1: 0.872450  | Precision: 0.873843<br />Recall: 0.873843<br />F1: 0.873843  | Precision: 0.782721<br />Recall: 0.609555<br />F1: 0.684041 | Precision: 0.852299<br />Recall: 0.857805<br />F1: 0.855002  | Precision: 0.857652<br />Recall: 0.857652<br />F1: 0.857652 |
| breast-cancer | Precision: 0.709936<br />Recall: 0.699356<br />F1: 0.704606  | Precision: 0.753485<br />Recall:  0.758741<br />F1: 0.756104 | Precision: 0.758741<br />Recall: 0.758741<br />F1: 0.758741  | Precision: 0.647610<br />Recall: 0.636138<br />F1: 0.641159 | Precision: 0.697813<br />Recall: 0.716202<br />F1: 0.706702  | Precision: 0.713424<br />Recall: 0.713424<br />F1: 0.713424 |
|    nursery    | Precision: 0.724922 <br />Recall: 0.566475<br />F1: 0.635979 | Precision: 0.905710<br />Recall: 0.903086<br />F1: 0.904396  | Precision: 0.903086<br />Recall: 0.903086<br />F1: 0.903086  | Precision: 0.860415<br />Recall: 0.695194<br />F1: 0.767559 | Precision: 0.902317<br />Recall: 0.902713<br />F1: 0.902494  | Precision: 0.902701<br />Recall: 0.902701<br />F1: 0.902701 |
|   mushroom    | Precision: 0.995711<br />Recall: 0.995933<br />F1: 0.995822  | Precision: 0.995839 <br />Recall: 0.995815<br />F1: 0.995827 | Precision: 0.995815<br />Recall: 0.995815<br />F1: 0.995815  | Precision: 0.995393<br />Recall: 0.995757<br />F1: 0.995575 | Precision: 0.995523<br />Recall: 0.995638<br />F1: 0.995580  | Precision: 0.995568<br />Recall: 0.995568<br />F1: 0.995568 |