# The University of Melbourne, School of Computing and Information Systems
# COMP90049 Introduction to Machine Learning, 2020 Semester 1
-----
## Project 1: Understanding Student Success with Naive Bayes
-----
###### Student Name(s): CHUAH YEE HEAN
###### Python version: 3.0
###### Submission deadline: 11am, Wed 22 Apr 2020

This iPython notebook is a template which you will use for your Project 1 submission. 

Marking will be applied on the five functions that are defined in this notebook, and to your responses to the questions at the end of this notebook.

You may change the prototypes of these functions, and you may write other functions, according to your requirements. We would appreciate it if the required functions were prominent/easy to find. 

In [1]:
#import packages / libraries
import pandas as pd
import os
import numpy as np

# This function should open a data file in csv, and transform it into a usable format 
def load_data(csv_file):
    """Read the CSV given by ``csv_file`` and return it as a pandas DataFrame."""
    return pd.read_csv(csv_file)

In [1]:
from sklearn.model_selection import train_test_split

# This function should split a data set into a training set and hold-out test set
def split_data(dataset, testset_Ratio):
    """Split ``dataset`` into training and hold-out test partitions.

    The 'Grade' column is the label; every other column is a feature.
    ``testset_Ratio`` is forwarded as ``test_size`` and the split uses a fixed
    ``random_state`` of 0.

    Returns the tuple (X_train, X_test, y_train, y_test), where the y parts
    are single-column DataFrames.
    """
    features = dataset.drop(columns='Grade')
    labels = dataset[['Grade']]
    partitions = train_test_split(
        features, labels, test_size=testset_Ratio, random_state=0
    )
    return tuple(partitions)

In [3]:
# This function should build a supervised NB model
def train(X_train, y_train, alpha=0.0):
    """Fit a categorical Naive Bayes model by counting.

    Args:
        X_train: DataFrame of categorical features, one row per instance.
        y_train: single-column DataFrame holding the 'Grade' label; it is
            joined to X_train on the index.
        alpha: additive (Laplace) smoothing constant. With the default of 0
            the estimates are the plain relative frequencies
            count(value, grade) / count(grade).

    Returns:
        (probabilities, prior_list). probabilities[grade][feature][value] is
        the estimate of P(feature = value | grade) for every value observed in
        X_train; prior_list holds P(grade) in the order A+, A, B, C, D, F.

    A grade that never occurs in the training labels (and alpha == 0) gets a
    likelihood of 0.0 for every value instead of raising ZeroDivisionError.
    """
    grades = ['A+', 'A', 'B', 'C', 'D', 'F']
    train_set = X_train.join(y_train)

    # Prior probabilities: relative frequency of each grade among the labels.
    label_counts = y_train['Grade'].value_counts().to_dict()
    grade_dict = {grade: label_counts.get(grade, 0) for grade in grades}
    prior_list = [grade_dict[grade] / len(y_train) for grade in grades]

    # Distinct values per feature, in order of first appearance. Every value
    # gets an entry under every grade, even when its count there is zero.
    categories = {col: list(X_train[col].unique()) for col in X_train.columns}

    # Likelihoods P(X = value | Y = grade), counted once per (grade, feature).
    probabilities = {}
    for grade in grades:
        rows_with_grade = train_set[train_set['Grade'] == grade]
        probabilities[grade] = {}
        for col, values in categories.items():
            value_counts = rows_with_grade[col].value_counts().to_dict()
            denominator = grade_dict[grade] + alpha * len(values)
            probabilities[grade][col] = {
                value: ((value_counts.get(value, 0) + alpha) / denominator
                        if denominator else 0.0)
                for value in values
            }

    return probabilities, prior_list

In [4]:
# This function should predict the class for an instance or a set of instances, based on a trained model 

def predict(X_test, prior_list, probabilities):
    """Predict a grade for every row of ``X_test`` with a trained NB model.

    Args:
        X_test: DataFrame of categorical features, one row per instance.
        prior_list: P(grade) in the order A+, A, B, C, D, F.
        probabilities: nested dict, probabilities[grade][feature][value] being
            the estimate of P(feature = value | grade).

    Returns:
        A list with one predicted grade label per row, in row order.

    For each row the score of a grade is P(grade) multiplied by the
    likelihoods of the row's feature values; the highest-scoring grade wins
    and ties go to the grade listed first. A feature whose value has no entry
    in the model is skipped for that row rather than raising KeyError.
    """
    grades = ['A+', 'A', 'B', 'C', 'D', 'F']
    features = list(X_test.columns)
    predicted = []

    # Plain tuples avoid building a Series per cell as iloc[row][feature] does.
    for row in X_test.itertuples(index=False, name=None):
        posterior_probabilities = {
            grade: prior_list[position] for position, grade in enumerate(grades)
        }
        for feature, value in zip(features, row):
            for grade in grades:
                likelihood = probabilities[grade][feature].get(value)
                if likelihood is not None:
                    # P(Y = grade) * P(feature = value | Y = grade)
                    posterior_probabilities[grade] *= likelihood

        predicted.append(max(posterior_probabilities, key=posterior_probabilities.get))

    return predicted

In [5]:
# This function should evaluate a set of predictions in terms of accuracy
from sklearn import metrics


def evaluate(y_test, predicted):
    """Score ``predicted`` against the true labels ``y_test`` in terms of accuracy.

    Delegates to ``metrics.accuracy_score`` and returns its result unchanged.
    """
    accuracy = metrics.accuracy_score(y_test, predicted)
    return accuracy

In [6]:
# Additional part for Q3b using Leave one out validation

#function to shuffle rows in dataframe to ensure our LOO validation works correctly
def shuffle(df):
    """Return ``df`` reindexed by a random permutation of its index labels."""
    permuted_labels = np.random.permutation(df.index)
    return df.reindex(permuted_labels)

#helper function that counts how many rows hold a given categorical value
#in a column together with a given class label (grade)

def count(data, colname, value, target):
    """Return how many rows of ``data`` have ``colname == value`` and 'Grade' == ``target``."""
    has_value = data[colname] == value
    has_grade = data['Grade'] == target
    matching_rows = data[has_value & has_grade]
    return len(matching_rows)
    

#leave one out evaluation 
def loo(dataset, verbose=True):
    """Run leave-one-out validation of the Naive Bayes classifier on ``dataset``.

    Every row is held out once; a model is estimated from the remaining rows
    and used to predict the held-out row's grade. The share of correct
    predictions is printed and returned.

    Args:
        dataset: DataFrame with a 'Grade' label column; all other columns are
            treated as categorical features.
        verbose: when True (default) print the number of rows left after each
            fold. The final accuracy line is always printed.

    Returns:
        The leave-one-out accuracy as a float.

    Raises:
        ValueError: if ``dataset`` has fewer than two rows, because no
            training set would remain.

    Counts are taken once over the whole dataset and the held-out row's own
    contribution is subtracted per fold, which yields the same estimates as
    retraining on the other rows without rescanning the data each time.
    A grade with no instances in a fold scores 0, and a feature value that
    occurs only in the held-out row is skipped for that fold.
    """
    from collections import Counter  # local import keeps this cell self-contained

    grades = ['A+', 'A', 'B', 'C', 'D', 'F']
    n_rows = len(dataset)
    if n_rows < 2:
        raise ValueError('leave-one-out validation needs at least two rows')

    features = [col for col in dataset.columns if col != 'Grade']
    labels = dataset['Grade'].tolist()
    columns = {col: dataset[col].tolist() for col in features}

    # Training stage: counts over the full dataset.
    grade_totals = Counter(labels)
    value_totals = {col: Counter(columns[col]) for col in features}
    joint_totals = {col: Counter(zip(columns[col], labels)) for col in features}

    accuracy_list = []
    for r in range(n_rows):
        held_out_grade = labels[r]

        # Testing stage: P(Y | X1, X2, ...) is proportional to P(Y)P(X1|Y)P(X2|Y)...
        posterior = {}
        for grade in grades:
            # The held-out row only affects the counts of its own grade.
            own = 1 if grade == held_out_grade else 0
            grade_count = grade_totals[grade] - own
            if grade_count == 0:
                posterior[grade] = 0.0  # grade absent from this fold's training rows
                continue

            score = grade_count / (n_rows - 1)
            for col in features:
                value = columns[col][r]
                if value_totals[col][value] == 1:
                    continue  # value appears only in the held-out row
                score *= (joint_totals[col][(value, grade)] - own) / grade_count
            posterior[grade] = score

        # Predict the grade with the largest score; ties go to the first grade.
        predicted = max(posterior, key=posterior.get)
        accuracy_list.append(1 if predicted == held_out_grade else 0)
        if verbose:
            print("number of rows left to evaluate: ", len(dataset)-len(accuracy_list))

    # Evaluation stage
    average_accuracy = sum(accuracy_list)/len(dataset)
    print('Accuracy for LOO Method: ', average_accuracy)
    return average_accuracy

## Questions (you may respond in a cell or cells below):

You should respond to Question 1 and two additional questions of your choice. A response to a question should take about 100–250 words, and make reference to the data wherever possible.

### Question 1: Naive Bayes Concepts and Implementation

- a Explain the ‘naive’ assumption underlying Naive Bayes. (1) Why is it necessary? (2) Why can it be problematic? Link your discussion to the features of the students data set. [no programming required]
- b Implement the required functions to load the student dataset, and estimate a Naive Bayes model. Evaluate the resulting classifier using the hold-out strategy, and measure its performance using accuracy.
- c What accuracy does your classifier achieve? Manually inspect a few instances for which your classifier made correct predictions, and some for which it predicted incorrectly, and discuss any patterns you can find.

### Question 2: A Closer Look at Evaluation

- a You learnt in the lectures that precision, recall and f-1 measure can provide a more holistic and realistic picture of the classifier performance. (i) Explain the intuition behind accuracy, precision, recall, and F1-measure, (ii) contrast their utility, and (iii) discuss the difference between micro and macro averaging in the context of the data set. [no programming required]
- b Compute precision, recall and f-1 measure of your model’s predictions on the test data set (1) separately for each class, and (2) as a single number using macro-averaging. Compare the results against your accuracy scores from Question 1. In the context of the student dataset, and your response to question 2a analyze the additional knowledge you gained about your classifier performance.

### Question 3: Training Strategies 

There are other evaluation strategies, which tend to be preferred over the hold-out strategy you implemented in Question 1.
- a Select one such strategy, (i) describe how it works, and (ii) explain why it is preferable over hold-out evaluation. [no programming required]
- b Implement your chosen strategy from Question 3a, and report the accuracy score(s) of your classifier under this strategy. Compare your outcomes against your accuracy score in Question 1, and explain your observations in the context of your response to question 3a.

### Question 4: Model Comparison

In order to understand whether a machine learning model is performing satisfactorily we typically compare its performance against alternative models. 
- a Choose one (simple) comparison model, explain (i) the workings of your chosen model, and (ii) why you chose this particular model. 
- b Implement your model of choice. How does the performance of the Naive Bayes classifier compare against your additional model? Explain your observations.

### Question 5: Bias and Fairness in Student Success Prediction

As machine learning practitioners, we should be aware of possible ethical considerations around the
applications we develop. The classifier you developed in this assignment could for example be used
to classify college applicants into admitted vs not-admitted – depending on their predicted
grade.
- a Discuss ethical problems which might arise in this application and lead to unfair treatment of the applicants. Link your discussion to the set of features provided in the students data set. [no programming required]
- b Select ethically problematic features from the data set and remove them from the data set. Use your own judgment (there is no right or wrong), and document your decisions. Train your Naive Bayes classifier on the resulting data set containing only ‘unproblematic’ features. How does the performance change in comparison to the full classifier?
- c The approach to fairness we have adopted is called “fairness through unawareness” – we simply deleted any questionable features from our data. Removing all problematic features does not guarantee a fair classifier. Can you think of reasons why removing problematic features is not enough? [no programming required]


In [7]:
#Q1a
#---------------
#Naïve Bayes assumes conditional independence, which states that features are independent of each other given the class. 
#The resulting model is simple, easy to build and can compute predictions in a short amount of time. 
#It performs well in small data sets, and scales well with additional features, classes and data sizes. 
#It is necessary to make our computations cheap and to make predictions in real time. 
#Without the underlying assumption, we will need to account for covariance between variables, 
#which requires many intensive calculations such as matrix inverses and determinants, making the training process computationally expensive. 
#However, in real life features are not completely independent. 
#For example, the number of school absences can be dependent on the health status of students; 
#quality of family relationships may be dependent on parents’ cohabitation status and the students’ guardian. 
#Failure to account for the dependencies makes Naïve Bayes a bad estimator. 
#Due to the assumption of independence, if there are no occurrences of a class label and a feature value together, 
#or a categorical variable has a category that was not observed in training process, 
#then the likelihood P(feature | class) estimated will be zero, and our model will be unable to make a prediction. 
#In summary, the assumption of independence makes Naïve Bayes a fast but inaccurate model to make predictions. 





In [8]:
#Q1b
#-----------------
# Load the student data (absolute path on the author's machine).
dataset = load_data(r"C:\Users\cyeeh\OneDrive\Masters\2020 Sem 1\Intro to Machine Learning\Assignment 1\student.csv")

# Hold out 20% of the rows for testing, fit the model on the rest,
# predict the held-out rows and report the accuracy.
X_train, X_test, y_train, y_test = split_data(dataset, 0.2)
probabilities, prior_list = train(X_train, y_train)
predictions = predict(X_test, prior_list, probabilities)
accuracy = evaluate(y_test, predictions)
print('Accuracy using holdout validation: ', accuracy)



Accuracy using holdout validation:  0.3230769230769231


In [9]:
#Q1c
#------------------
#My classifier achieves an accuracy of 32%. Looking at instances where classifier correctly predicted a D or F final grade, 
#a majority of the students’ parents possess low to medium level of education, and almost none at high. 
#Most of them seem to spend low to medium hours studying and have their mothers as the guardian. 
#There appears to be a lack of A+ grade prediction, which could be due to scarcity of A+ label in dataset. 
#For students with a correct grade prediction of A or B, the parents’ education seems to range mostly from medium to high, 
#and their study hours are mostly medium to high. 
#This could flag parents’ education as a good indicator in predicting grades. 
#This is further confirmed when we look at the incorrect predictions, 
#with students who have highly educated parents receiving high grade prediction despite scoring poor grade. 
#The students address also seem to play an important role in predicting grades, 
#with students coming from urban background getting a good grade prediction regardless of their actual grade. 
#In both correct and incorrect predictions, it is observed that rural students are predicted a lower grade. 
#The sex of the student doesn’t seem to affect grades, and neither do travel time and family size. 
#The model seems to view students from GS school more favourably as they receive an optimistically incorrect prediction, 
#with many C students predicted A and even a case of F predicted B.

# Put the true grade and the predicted grade side by side with the test features.
test_set = X_test.join(y_test)
merge_predict_actual = test_set.assign(Predicted = predictions)

#Select the rows where the prediction matches the actual grade
selected_correct = merge_predict_actual[merge_predict_actual['Grade'] == merge_predict_actual['Predicted']]
# Bare expression: displayed as the cell output in the notebook.
selected_correct

Unnamed: 0,school,sex,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,...,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,Grade,Predicted
532,GP,F,U,LE3,T,mid,mid,other,other,home,...,yes,4,3,3,1,1,2,none,C,C
470,MS,F,R,GT3,T,mid,mid,at_home,other,course,...,no,5,4,4,2,3,5,four_to_six,D,D
240,MS,M,R,GT3,T,mid,low,other,other,other,...,yes,5,5,5,5,5,3,more_than_ten,F,F
249,GP,F,R,GT3,T,mid,low,other,other,reputation,...,yes,4,3,5,1,2,3,more_than_ten,D,D
531,MS,F,R,GT3,A,low,low,at_home,at_home,course,...,yes,3,5,4,1,4,1,none,F,F
251,GP,F,R,GT3,T,low,low,other,other,course,...,yes,5,5,5,1,1,1,one_to_three,F,F
603,GP,F,U,LE3,A,mid,mid,other,other,home,...,no,3,1,2,1,1,1,more_than_ten,D,D
268,MS,M,R,GT3,T,mid,mid,services,other,course,...,no,2,3,1,2,2,5,none,F,F
593,GP,F,U,LE3,A,mid,low,at_home,other,home,...,yes,4,4,2,1,1,5,none,D,D
646,MS,F,R,GT3,T,low,mid,other,other,course,...,no,3,5,5,1,3,1,four_to_six,F,F


In [10]:
#Select the rows where the prediction differs from the actual grade
selected_incorrect = merge_predict_actual[merge_predict_actual['Grade'] != merge_predict_actual['Predicted']]
# Bare expression: displayed as the cell output in the notebook.
selected_incorrect

Unnamed: 0,school,sex,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,...,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,Grade,Predicted
375,GP,M,U,GT3,T,low,none,other,other,reputation,...,yes,4,3,2,1,1,3,none,A+,C
306,GP,M,U,GT3,T,high,mid,other,other,course,...,yes,5,2,3,1,1,2,four_to_six,C,B
625,GP,M,U,GT3,T,mid,mid,other,other,home,...,no,5,3,2,1,2,5,one_to_three,F,C
480,GP,F,U,GT3,T,low,low,at_home,other,reputation,...,yes,4,3,4,1,1,5,more_than_ten,C,B
303,GP,M,U,LE3,T,mid,mid,other,other,course,...,no,4,5,5,2,4,5,none,C,D
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
585,MS,F,U,GT3,T,high,mid,other,other,course,...,no,4,3,3,1,2,4,none,A+,C
419,MS,M,U,GT3,T,mid,mid,at_home,at_home,course,...,no,4,4,5,1,3,3,more_than_ten,F,D
236,GP,F,U,GT3,A,high,mid,services,services,reputation,...,no,4,3,2,1,1,1,none,B,A
641,GP,F,R,GT3,T,mid,mid,services,other,reputation,...,no,4,1,2,1,1,2,four_to_six,D,C


In [11]:
#Q3a
#------------------
#(i)
#Strategy used is the leave one out (LOO) strategy. 
#It is an extreme case of k-fold cross validation strategy, with k = 649 which is the number of instances in this assignment.
#The dataset is first separated into 649 partitions. 
#One partition is assigned as the testing set while the remainder is used to train the model. 
#This process is repeated until each unique partition has been used as the testing set. 
#The evaluation metric (in this case, the accuracy) of the LOO model is calculated by averaging across the accuracy of each fold. 

#(ii)
#In comparison, the holdout method splits our dataset into training and testing set once, usually at a ratio of 80% to 20%. 
#The training set is used to train our model, while the testing set is used to measure the performance of our model on unseen data. 
#This results in a smaller training set which, in turn, introduces a bias and variance to our estimated error 
#as the training data might not be an accurate representation of the underlying distribution of the dataset. 
#Thus, the estimates produced by the holdout method is highly dependent on how the train-test split is done. 
#To that end, the LOO method is highly preferred over the hold-out strategy as it allows evaluation to be done across multiple train-test splits. 
#it minimises the bias of our performance estimates because we can maximise our training data. 
#The size of the training data using the LOO method is only one instance smaller than the dataset, 
#making our performance estimates approximately unbiased, as the training set would have a similar distribution to our dataset.


In [12]:
#Q3b (PROGRAMMING PART)
#------------------
#Estimate the classifier's accuracy with leave-one-out validation over the full dataset
#NOTE: this might take a while to run
loo(dataset)




number of rows left to evaluate:  648
number of rows left to evaluate:  647
number of rows left to evaluate:  646
number of rows left to evaluate:  645
number of rows left to evaluate:  644
number of rows left to evaluate:  643
number of rows left to evaluate:  642
number of rows left to evaluate:  641
number of rows left to evaluate:  640
number of rows left to evaluate:  639
number of rows left to evaluate:  638
number of rows left to evaluate:  637
number of rows left to evaluate:  636
number of rows left to evaluate:  635
number of rows left to evaluate:  634
number of rows left to evaluate:  633
number of rows left to evaluate:  632
number of rows left to evaluate:  631
number of rows left to evaluate:  630
number of rows left to evaluate:  629
number of rows left to evaluate:  628
number of rows left to evaluate:  627
number of rows left to evaluate:  626
number of rows left to evaluate:  625
number of rows left to evaluate:  624
number of rows left to evaluate:  623
number of ro

KeyboardInterrupt: 

In [None]:
#repeat the validation on a row-shuffled copy of the dataset;
#the expected value is approx. 35.75%, the same as the unshuffled run above
shuffled = shuffle(dataset)
loo(shuffled)

In [None]:
#Q3b
#------------------
#The accuracy score obtained in this classifier is approximately 35.75%, 
#which is slightly higher than the accuracy obtained in Q1, this disparity is due to overfitting. 
#In real life, all data often contains some level of noise (outliers). 
#The implication is that the data split under holdout validation may be biased as it may not guarantee complete randomness in training and testing set, 
#causing noise to be present in the training set. 
#When the model is trained using the flawed training set, this can lead to our model memorising the training examples, 
#rather than trying to capture the underlying pattern / relationship between features and classes from the training examples.
#The model may incorporate the noise in the training process, 
#which causes our model to fit the data too well that it does not generalise to account for other samples that are not included in the training stage. 
#LOO validation helps to assess how much of an overfit our model has. 
#By iteratively holding one observation and training the model from scratch with the remaining observations, 
#LOO validation minimises bias by maximising the training set, giving a better approximation of the true model accuracy, 
#in line with the response in 3a. 

In [None]:
#Q5a
#-------------------
#Using machine learning in the admissions process can create “self-fulfilling prophecies”, 
#stereotyping certain demographic of people with certain opportunities. 
#It encourages normalcy and deters non-conforming cases. 
#For example, our trained model may deduce a high performing student (A+, A) may be typically characterised by a specific set of feature values. 
#From our dataset, it may conclude that a stellar student would have long study hours, 
#low number of school absences and failure rates among other features. 
#As such, the model observes these features are common in good students and incorporates them in its predictions. 
#However, these features are not reliable all the time. 
#In real life, dataset often contains outliers, noises which may undermine the distribution. 
#One example would be students that score bad grades even after studying for long hours, and vice versa. 
#Their capabilities of producing good results are unobserved because of how predictions are used to allocate grades. 
#This serves to reinforce the patterns, or “recipe” for predicting good grades that is in place. 
#The black box nature of models makes it difficult to interpret predictions. 
#Unlike humans we can justify decisions with reasons, but with models, 
#the contribution and significant attributes to the final predicted label is not immediately visible, 
#which can be made further obscure by the inclusion of arbitrary features. 
#Take our dataset for instance, features that are seemingly redundant (such as being in a romantic relationship, going out with friends) 
#are used to predict student grades, which can affect the prediction result of our model.


In [None]:
#Q5b
#---------------------
#The questionable features removed are: sex, reason to choose this school, 
#travel time, family size, and the students' guardian
#The model performance using the tweaked dataset has improved compared to the full classifier, with an accuracy of 35.54%

# Remove the questionable columns: 'sex', 'reason', 'famsize', 'guardian', 'traveltime'
small_dataset = dataset.drop(['sex', 'reason', 'famsize', 'guardian', 'traveltime'], axis = 1) 

# Same hold-out pipeline as Q1b, run on the reduced feature set.
# NOTE: this rebinds X_train, X_test, y_train, y_test, probabilities,
# prior_list, predictions and accuracy from the Q1b cell.
X_train, X_test, y_train, y_test = split_data(small_dataset, 0.2)
probabilities, prior_list = train(X_train, y_train)
predictions = predict(X_test, prior_list, probabilities)
accuracy = evaluate(y_test, predictions)
print('Accuracy using holdout validation: ', accuracy)


In [None]:
#Q5c
#-----------------------
#One possible potential cause is a skewed dataset. To obtain our dataset, collation of data must first be conducted. 
#If any bias happens at this part, this can lead to a dataset that may not be an accurate representation of the underlying distribution. 
#Future observations will confirm predictions made by the skewed trained model more often than contradict them, 
#compounding the bias. Taking the ‘school’ feature in our dataset as an example, 
#MS may have a stricter marking policy than GP, making it harder to pass subjects. 
#Our model may fail to account for this grade inflation, and thus record that MS has a higher failure rate. 
#The model trained using the data collected hence makes predictions that are positively biased towards GP. 
#Another cause is there may be proxy data present. 
#For example, even if problematic features (such as sex, reason to choose this school, household income) are removed, 
#there may exist other features that are proxies for the former (e.g. address may be related to household income). 
#The inclusion of such features will allow bias to persist in our models, 
#and this makes feature selection a tricky process as it is difficult to determine the relationship between features in dataset 
#and if we should include it in training.
#Unfairness in a model can also stem from human bias in existing older data. 
#For example, if a teacher emphasizes the study time of students when assigning grades, 
#rather than taking a holistic view of a students’ capability, 
#a model that is trained on such grading scheme will replicate the grader's bias existing in the decisions. 
