# The University of Melbourne, School of Computing and Information Systems
# COMP30027 Machine Learning, 2019 Semester 1
-----
## Project 1: Gaining Information about Naive Bayes
-----
###### Student Name(s): Akira and Callum
###### Python version: 3.7.1 from Anaconda 
###### Submission deadline: 1pm, Fri 5 Apr 2019

This iPython notebook is a template which you may use for your Project 1 submission. (You are not required to use it; in particular, there is no need to use iPython if you do not like it.)

Marking will be applied on the five functions that are defined in this notebook, and to your responses to the questions at the end of this notebook.

You may change the prototypes of these functions, and you may write other functions, according to your requirements. We would appreciate it if the required functions were prominent/easy to find. 

**The cell below supresses forced output scrolling so you can see view the script output easier**

In [1]:
%%javascript
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

**We have specifically imported all functions below to ensure we implemented iteratively AND without the use of external functions wherever possible**

In [2]:
from pandas import read_csv, DataFrame, Series
from collections import defaultdict, Counter
from numpy import NaN
from math import log

########## POSSIBLE CSVs ##########
d1 =  'anneal.csv'
h1 = 'family,product-type,steel,carbon,hardness,temper_rolling,condition,formability,strength,non-ageing,surface-finish,surface-quality,enamelability,bc,bf,bt,bw-me,bl,m,chrom,phos,cbond,marvi,exptl,ferro,corr,bbvc,lustre,jurofm,s,p,shape,oil,bore,packing,class'.split(',')

d2 =  'breast-cancer.csv'
h2 = 'age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,class'.split(',')

d3 =  'car.csv'
h3 = 'buying,maint,doors,persons,lug_boot,safety,class'.split(',')

d4 =  'cmc.csv'
h4 = 'w-education,h-education,n-child,w-relation,w-work,h-occupation,standard-of-living,media-exposure,class'.split(',')

d5 =  'hepatitis.csv'
h5 = 'sex,steroid,antivirals,fatigue,malaise,anorexia,liver-big,liver-firm,spleen-palpable,spiders,ascites,varices,histology,class'.split(',')

d6 =  'hypothyroid.csv'
h6 = 'sex,on-thyroxine,query-on-thyroxine,on_antithyroid,surgery,query-hypothyroid,query-hyperthyroid,pregnant,sick,tumor,lithium,goitre,TSH,T3,TT4,T4U,FTI,TBG,class'.split(',')

d7 =  'mushroom.csv'
h7 = 'cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,class'.split(',')

d8 =  'nursery.csv'
h8 = 'parents,has_nurs,form,children,housing,finance,social,health,class'.split(',')

d9 = 'primary-tumor.csv'
h9 = 'age,sex,histologic-type,degree-of-diffe,bone,bone-marrow,lung,pleura,peritoneum,liver,brain,skin,neck,supraclavicular,axillar,mediastinum,abdominal,class'.split(',')

datasets = [d1,d2,d3,d4,d5,d6,d7,d8,d9]
dataset_headers = [h1,h2,h3,h4,h5,h6,h7,h8,h9]

dictionary = {datasets[i] : dataset_headers[i] for i in range(len(datasets))}

"""Sets the column of each dataset"""
def set_column(filename):
    return dictionary[filename]

In [3]:
"""Has been adjusted so that it works with testing on the train data, and partitioning for cross_val"""
def preprocess(filename, testing_on_train = True, k = 10, drop = 'no', impute = 'no'):
    # Add column headers, drop columns with only one unique value (all same value)
    df = read_csv(filename, header = None, names = set_column(filename))
    
    if impute in 'yesYesYES':
        df.replace('?', NaN, inplace=True)
        mode = df.mode().iloc[0]
        df = df.fillna(mode)
    
    """If we are not using the cross validation method"""
    # Return the whole dataset as a dataframe
    if testing_on_train:
        return df
    else:
        """Drop no gain attributes given the input in the script"""
        if drop in 'yesYesYES':
            non_unique = df.apply(Series.nunique)
            df.drop(non_unique[non_unique == 1].index, axis=1, inplace=True)
            
        temp = df.copy()
        partitions = list()
        
        # k-fold Cross Validation
        divisor = k
        
        for i in range(k):
            partitions.append(temp.sample(frac=1/divisor))
            divisor -= 1
            temp.drop(partitions[-1].index, axis=0, inplace=True)
        
        del temp
        
        # Dictionary of train/test pairs
        cross_validation_pairs = defaultdict(list)
        models = list()
        
        for i in range(k):
            test = partitions[i]
            train = df.iloc[df.index.drop(test.index.values)]
            cross_validation_pairs["train"].append(train)
            cross_validation_pairs["test"].append(test)
            
        return cross_validation_pairs

In [4]:
"""Trains a model given a training dataset"""
def train(train_set):
    N = len(train_set)
    priors = {}
    posteriors = defaultdict(lambda: defaultdict(lambda: defaultdict(float)))
    """Accessable using posteriors[class j][attribute x][value i]"""
    
    for label in train_set['class'].unique():
        priors[label] = len(train_set.loc[train_set['class'] == label]) / N
        for attribute in train_set.columns[:-1]:
            temp = train_set.loc[train_set['class'] == label, [attribute,'class']]
            n = len(temp)
            count = Counter(temp[attribute])
            for i in count:
                posteriors[label][attribute][i] = count[i] / n
                
    trained_model = {"priors": priors, "posteriors": posteriors}
    
    return trained_model

"""Trains M partitions for cross_val"""
def cross_validation_train(cross_validation_pairs):
    trained_models = list()
    train_set = cross_validation_pairs["train"]
    test_set = cross_validation_pairs["test"]
    N = len(train_set)
    
    for i in range(N):
        trained_models.append(train(train_set[i]))
        
    return trained_models

In [5]:
"""Predicts a test set"""
def predict(trained_model, test_set):
    priors = trained_model["priors"]
    posteriors = trained_model["posteriors"]
    
    """Drop the class labels of the test set"""
    test_labels = test_set['class']
    test = test_set.drop('class', axis=1)
    cols = test_set.columns
    
    """Probabilistic Smoothing with epsilon -> 0"""
    n = len(test_labels)
    epsilon = 1e-100
    
    """Model Prediction"""
    prediction = {}
    
    """The Predicted Labels to be Returned"""
    """(Key, Value) = (Test Instance Row, Predicted Label)"""
    predicted_labels = {}
    
    for i in range(n):
        instance = test.iloc[i]
        for label in priors.keys():       
            prob = log(priors[label])/log(2)
            for attribute in cols:
                try:
                    """If the valyue is non missing"""
                    if instance[attribute] != '?':
                        prob += log(posteriors[label][attribute][instance[attribute]])/log(2)
                    else:
                        """Otherwise we have chosen to simply ignore it"""
                        pass
                except:
                    """If the value does not exist in our model, we use epsilon"""
                    prob += log(epsilon)/log(2)
                    
            prediction[label] = prob
        
        """Choose the predicted class with the highest probability"""
        predicted_labels[i] = max(prediction, key=prediction.get)
        
    return predicted_labels

"""Tests partitions"""
def cross_validation_predict(trained_models, cross_validation_pairs):
    test_set = cross_validation_pairs["test"]
    N = len(test_set)
    predictions = list()
    
    for i in range(N):
        predictions.append(predict(trained_models[i], test_set[i]))
    
    return predictions

In [6]:
"""Evaluates the accuracy"""
def evaluate(predicted_labels, test_set):
    test_labels = test_set['class']
    n = len(test_labels)
    
    return [1 if predicted_labels[i] == test_labels.iloc[i] else 0 for i in range(n)]

"""Evaluates cross_val accuracy"""
def cross_validation_evaluate(predictions, cross_pairs):
    test = cross_pairs["test"]
    N = len(test)
    results = list()
    for i in range(N):
        results.append(evaluate(predictions[i], test[i]))
        
    return results

In [7]:
"""Calculates the entropy given a series"""
def entropy(attribute_value):
    # Calculates the probability of class given that it has been loc'd on a said value
    N = len(attribute_value)
    # Normalise and count values iteratively
    event = [i/N for i in Counter(attribute_value).values()]
    
    # Will lose marks for this since it is an aggregate function according to forums
    # The iterative approach is off by 0.000000000000000xx compared to the .value_counts() method
    # event = pd.Series(attribute_value).value_counts(normalize=True, sort=False)
    
    return (-1*sum([i*log(i)/log(2) for i in event]))

"""Calculates the mean information given a dataset"""
def mean_info(dataset):
    mean_info_per_attribute = defaultdict(float)
    
    for attribute in dataset.columns[:-1]:
        # Calculates the probability of said value to happen (number of values / total number of instances)
        value_probabilities = Counter(dataset[attribute])
        N = len(dataset[attribute])
        for value in dataset[attribute].unique():
            # dataframe loc on said value and return the corresponding class column
            corresponding_values = dataset.loc[dataset[attribute] == value, 'class']
            # add probability of said value * entropy of said value to the attribute mean info
            mean_info_per_attribute[attribute] += value_probabilities[value]/N * entropy(corresponding_values)
            
    return mean_info_per_attribute

"""Calculates the information gain given a dataset. Adjusted so that it can drop 0 info_gain columns"""
def info_gain(dataset, drop_no_gain = False):
    mean_info_per_attribute = mean_info(dataset)
    class_entropy = entropy(dataset['class'])
    info_gain_given_class = defaultdict(float)
    
    for attribute in mean_info_per_attribute:

        info_gain_given_class[attribute] = class_entropy - mean_info_per_attribute[attribute]
    
    """If we want to drop the columns with absolutely 0 information gain"""
    if drop_no_gain:
        non_unique = dataset.apply(Series.nunique)
        dataset.drop(non_unique[non_unique == 1].index, axis=1, inplace=True)
        return dataset
    else:
        return info_gain_given_class

**Here is a script that runs every dataset and tests on its training, as well as k-fold cross-validation for a given _k_. It will also ask (y/n) for printing relevant information gain, and to drop attributes with 0 information gain**

In [8]:
""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
print("*"*40)
k = int(input("Enter k value for k-Fold Cross Validation: "))
"""DO WE WANT TO DROP 0 INFO GAIN ATTRIBUTES?"""
drop = input("Drop all columns with absolutely no information gain? (y/n): ").lower()
"""DO WE WANT TO PRINT THE INFO GAIN?"""
to_print = input("Print the information gain? (y/n): ").lower()
"""DO WE WANT TO IMPUTE MISSING VALUES FOR THE TRAINING SET"""
to_impute = input("Impute missing values? (y/n): ").lower()
print("*"*40)
""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
for data in datasets:
    print(f"Processing {data} ...")
    print("TESTING ON THE TRAIN DATA")
    
    df = preprocess(data, impute = to_impute)
    if drop in 'yesYesYES':
        df = info_gain(df, drop_no_gain = True)
        info_gain_given_class = info_gain(df)
    else:
        info_gain_given_class = info_gain(df)
    if to_print in 'yesYesYES':
        for attribute in info_gain_given_class:
            print(f'InfoGain({attribute} | class) = {info_gain_given_class[attribute]:.4f}')
        print('...')
            
    """TRAIN / TEST"""
    model = train(df)
    prediction = predict(model, df)
    results = evaluate(prediction, df)
    print(f"Accuracy for Testing on the Training Data: {100*sum(results)/len(results):.2f}%")
    
    """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
    
    print(f"\n{k}-FOLD CROSS VALIDATION")
    
    cross_validation_pairs = preprocess(data, testing_on_train = False, k = k, drop = drop)
    trained_models = cross_validation_train(cross_validation_pairs)
    print("...")
    predictions = cross_validation_predict(trained_models, cross_validation_pairs)
    cross_validation_results = cross_validation_evaluate(predictions, cross_validation_pairs)
    print("Accuracy using k-Fold Cross Validation: " + " ".join([f"{100*sum(i) / len(i):.2f}%" for i in cross_validation_results]))
    print(f"Average {k}-Fold Cross Validation Accuracy: {sum([100*sum(i) / len(i) for i in cross_validation_results]) / len(cross_validation_results):.2f}%")
    print("*"*40)

****************************************
Enter k value for k-Fold Cross Validation: 10
Drop all columns with absolutely no information gain? (y/n): y
Print the information gain? (y/n): y
Impute missing values? (y/n): y
****************************************
Processing anneal.csv ...
TESTING ON THE TRAIN DATA
InfoGain(family | class) = 0.4091
InfoGain(steel | class) = 0.3061
InfoGain(carbon | class) = 0.0513
InfoGain(hardness | class) = 0.2911
InfoGain(temper_rolling | class) = 0.1471
InfoGain(condition | class) = 0.2137
InfoGain(formability | class) = 0.2922
InfoGain(strength | class) = 0.1262
InfoGain(non-ageing | class) = 0.1411
InfoGain(surface-finish | class) = 0.0325
InfoGain(surface-quality | class) = 0.4352
InfoGain(enamelability | class) = 0.0387
InfoGain(bc | class) = 0.0004
InfoGain(bf | class) = 0.0394
InfoGain(bt | class) = 0.0218
InfoGain(bw-me | class) = 0.0380
InfoGain(bl | class) = 0.0367
InfoGain(chrom | class) = 0.1172
InfoGain(phos | class) = 0.0298
InfoGain(cbond 

# Questions 1, 2, 4 and 6 (150 - 200 words for each response):

#### 1. The Naive Bayes classifiers can be seen to vary, in terms of their effectiveness on the given datasets (e.g. in terms of Accuracy). Consider the Information Gain of each attribute, relative to the class distribution — does this help to explain the classifiers’ behaviour? Identify any results that are particularly surprising, and explain why they occur.
    - According to our Information Gain for each attribute, we find that if there is ONE attribute with SIGNIFICANTLY MORE information gain COMPARED to the other attributes, we result in a MUCH HIGHER ACCURACY
    - If all the information gains have SIMILAR values, then the model does not seem to perform as well
        - Can be seen with the `primary-tumor.csv` where all the information gain is roughly between 0.1 - 0.2
        - Compare that to `mushroom.csv` where there is one attribute with 0.9 information gain, and the rest between 0.1 - 0.4 and some less than 0.1
    

#### 2. The Information Gain can be seen as a kind of correlation coefficient between a pair of attributes: when the gain is low, the attribute values are uncorrelated; when the gain is high, the attribute values are correlated. In supervised ML, we typically calculate the Infomation Gain between a single attribute and the class, but it can be calculated for any pair of attributes. Using the pair-wise IG as a proxy for attribute interdependence, in which cases are our NB assumptions violated? Describe any evidence (or indeed, lack of evidence) that this is has some effect on the effectiveness of the NB classifier.
    - Using the results below (its pretty messy yikes)
    - See that the information gain between a pair of attributes is consistent
    - NB assumptions violated in a sense that we have assumed that each class is independent of eachother. Yet if we look at the information gain between pairs of attributes, we see that some are not just dependent, but significantly depndent on another attribute
        - See InfoGain(another_attribute | attribute = steel) = 1.9 and compare it to InfoGain(another_attribute | attribute = product-type) = 0
 
    

#### 4. Evaluating the model on the same data that we use to train the model is considered to be a major mistake in Machine Learning. Implement a hold–out or cross–validation evaluation strategy. How does your estimate of effectiveness change, compared to testing on the training data? Explain why. (The result might surprise you!)
    - Implemented a k-fold Cross-Validation strategy
    - Estimate of effectiveness does not change much.
    - Although each instance will become a training instance and testing instance at one stage, there are better partitions to train on compared to others (limited to the fact that we are taking random samples of the dataset each time)
    

#### 6. Naive Bayes is said to elegantly handle missing attribute values. For the datasets with missing values, is there any evidence that the performance is different on the instances with missing values, compared to the instances where all of the values are present? Does it matter which, or how many values are missing? Would a imputation strategy have any effect on this?
    - To an extent, it does matter how many values are missing, but it is also dependent if the attribute has a high information gain or not
    - Missing values in itself could be siginficiant (i.e. Since there is a missing value, it could lead to a specific label class)
    - See table below for results
        - Can see that training has no effect whether we impute values or not
        - However, we can see that the overall accuracy increases slightly when testing using the cross-validation method
    

##### Question 6 results

In [9]:
from IPython.display import display
display(read_csv('results.csv'))

Unnamed: 0,No Imputations and No dropping 0 info gain,10 fold Cross Validation result 1,No Imputations and Dropping 0 info gain,10 fold Cross Validation result 2,Mode Imputations and No dropping 0 info gain,10 fold Cross Validation result 3,Mode Imputations and Dropping 0 info gain,10 fold Cross Validation result 4
0,99.11,99.0,99.11,98.99,99.11,98.78,99.11,99.0
1,75.52,71.29,75.52,72.77,75.85,72.01,75.87,71.59
2,87.38,85.42,87.38,85.71,87.38,85.82,87.38,85.07
3,50.58,49.08,50.58,49.7,50.58,49.15,50.58,49.15
4,85.16,83.71,85.16,83.17,84.52,84.5,84.52,84.0
5,95.23,95.23,95.23,95.23,95.23,95.23,95.23,95.23
6,99.72,99.69,99.72,99.68,99.58,99.68,99.58,99.68
7,90.31,90.32,90.31,90.25,90.31,90.33,90.31,90.37
8,60.47,46.02,60.47,46.31,57.52,46.9,57.52,48.4


##### Question 2 script

In [17]:
def entropy(attribute_value):
    N = len(attribute_value)
    event = [i/N for i in Counter(attribute_value).values()]
    return (-1*sum([i*log(i)/log(2) for i in event]))
def mean_info(dataset, column_name):
    mean_info_per_attribute = defaultdict(float)
    for attribute in dataset.columns[:-1]:
        value_probabilities = Counter(dataset[attribute])
        N = len(dataset[attribute])
        for value in dataset[attribute].unique():
            corresponding_values = dataset.loc[dataset[attribute] == value, column_name]
            mean_info_per_attribute[attribute] += value_probabilities[value]/N * entropy(corresponding_values)
    return mean_info_per_attribute
def info_gain_attributes(dataset, to_print = 'no'):
    dataset.drop('class', axis=1, inplace=True)
    entropy_per_attribute = defaultdict(float)
    for attribute in dataset.columns:
        entropy_per_attribute[attribute] = entropy(dataset[attribute])
        mean_info_per_attribute = mean_info(dataset, attribute)
    for attribute in entropy_per_attribute:
        print(f"Entropy({attribute}) = {entropy_per_attribute[attribute]:.4f}\n")
        average = list()
        for every_other_attribute in mean_info_per_attribute:
            info = entropy_per_attribute[attribute] - mean_info_per_attribute[every_other_attribute]
            average.append(info)
            if to_print in 'yesYesYES':
                print(f"InfoGain({every_other_attribute} | {attribute}) = {info:.4f}")
        
        print(f"\nAverage Info Gain for {attribute} to every other attribute is {sum(average)/len(mean_info_per_attribute):.4f}")
        print("*"*40)

for data in datasets:
    to_print = input("Print InfoGain(every other attribute | attribute=A)? (y/n): ")
    df = preprocess(data)
    info_gain_attributes(df, to_print = to_print)
    break

Print InfoGain(every other attribute | attribute=A)? (y/n): n
Entropy(family) = 0.7250


Average Info Gain for family to every other attribute is 0.6414
****************************************
Entropy(product-type) = -0.0000


Average Info Gain for product-type to every other attribute is -0.0836
****************************************
Entropy(steel) = 1.9953


Average Info Gain for steel to every other attribute is 1.9117
****************************************
Entropy(carbon) = 0.6552


Average Info Gain for carbon to every other attribute is 0.5716
****************************************
Entropy(hardness) = 1.1782


Average Info Gain for hardness to every other attribute is 1.0946
****************************************
Entropy(temper_rolling) = 0.6162


Average Info Gain for temper_rolling to every other attribute is 0.5326
****************************************
Entropy(condition) = 1.1579


Average Info Gain for condition to every other attribute is 1.0743
*****************