# CS-513 Midterm 1

## Question 4
___
Wyatt Blair

3/26/24
___


#4 (25 Points)

Load the CANVAS “Admission_v2.CSV” dataset into R/Python. Remove the missing values if necessary. Discretize the “GRE” scores into “up to 500 inclusive”, “above 500 and up to 600”, “above 600 and up to 700” and “above 700”.  Also discretize the GPAs into “up to 2.5 inclusive”, “above 2.5 and up to 3”, “above 3 and up to 3.5”, and “over 3.5”.   Construct a Naïve Bayes model to classify admission (admission=’yes’) based on the other variables. Predict admission for a random sample (30%) of the data (test dataset). Measure the accuracy of the model. use (30%) of the data as test dataset


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the admissions table, using the Applicant column as the row index.
dataset = pd.read_csv("../data/Admission_v2.csv", index_col='Applicant')

# Encode the target as an integer: 1 when ADMIT is 'YES', 0 otherwise.
dataset['ADMIT'] = dataset['ADMIT'].apply(lambda label: np.int_(label == 'YES'))

# Single-space strings are treated as missing scores: convert them to NaN,
# cast the column to float, then fill the gaps with that column's own mean.
for score_col in ('GPA', 'GRE'):
    as_float = dataset[score_col].replace(' ', np.nan).astype(np.float64)
    dataset[score_col] = as_float.fillna(as_float.mean())

# Sanity check: no column should report remaining nulls.
dataset.isnull().any()

ADMIT    False
GRE      False
GPA      False
RANK     False
dtype: bool

In [3]:
def bin_GRE(gre):
    """Return the label of the GRE bin that contains ``gre``.

    Bins are right-inclusive: (-inf, 500], (500, 600], (600, 700], (700, inf).

    Raises:
        ValueError: if ``gre`` falls in no bin, e.g. NaN, which fails every
            comparison.
    """
    # Upper bounds in ascending order; the first bound that is not exceeded
    # decides the bin.
    bounded_bins = (
        (500, "up to 500 inclusive"),
        (600, "above 500 and up to 600"),
        (700, "above 600 up to 700"),
    )
    for upper, label in bounded_bins:
        if gre <= upper:
            return label

    if gre > 700:
        return "above 700"

    raise ValueError(f"Unrecognized GRE value: {gre}")

def bin_GPA(gpa):
    """Return the label of the GPA bin that contains ``gpa``.

    Bins are right-inclusive: (-inf, 2.5], (2.5, 3], (3, 3.5], (3.5, inf).

    Raises:
        ValueError: if ``gpa`` falls in no bin, e.g. NaN, which fails every
            comparison.
    """
    # Upper bounds in ascending order; the first bound that is not exceeded
    # decides the bin.
    bounded_bins = (
        (2.5, "up to 2.5 inclusive"),
        (3, "above 2.5 and up to 3"),
        (3.5, "above 3 and up to 3.5"),
    )
    for upper, label in bounded_bins:
        if gpa <= upper:
            return label

    if gpa > 3.5:
        return "over 3.5"

    raise ValueError(f"Unrecognized GPA value: {gpa}")

In [4]:
# Build a new frame in which the numeric GRE / GPA columns are replaced by
# their bin labels; `dataset` itself is left untouched.
discretized_dataset = dataset.assign(
    GRE=dataset['GRE'].apply(bin_GRE),
    GPA=dataset['GPA'].apply(bin_GPA),
)

discretized_dataset.head(5)

Unnamed: 0_level_0,ADMIT,GRE,GPA,RANK
Applicant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1001,0,up to 500 inclusive,over 3.5,3
1002,1,above 600 up to 700,over 3.5,3
1003,1,above 700,over 3.5,1
1004,1,above 600 up to 700,above 3 and up to 3.5,4
1005,0,above 500 and up to 600,above 2.5 and up to 3,4


In [5]:
# Random 70% / 30% train / test split. The fixed seed makes the split, and
# with it the fitted tables and the accuracy, identical on every re-run.
train = discretized_dataset.sample(frac=0.7, random_state=42)
# Everything that was not sampled into the training set becomes the test set.
test = discretized_dataset.drop(train.index)

### Implement Naive Bayes

$P(y | a) = \frac{P(a | y)P(y)}{P(a)}$

$P(y | x_1, ..., x_j) = \frac{P(x_1, ..., x_j | y)P(y)}{P(x_1, ..., x_j)}$

___

$a = \{x_1, ..., x_j\}$ : Data-Point

$y$ : Class

$P(y | a)$ : Posterior Probability

$P(a | y)$ : Likelihood of Features

$P(y)$ : Prior Probability

$P(a)$ : Marginal Probability

___



In [19]:
# I am adapting some of the code I wrote for HW-2-Naive Bayes here

class NaiveBayes:
    """Naive Bayes classifier for categorical features, built from frequency tables.

    Every column of ``labelled_data`` other than ``target`` is treated as a
    categorical feature. Fitting happens in the constructor; the resulting
    tables are stored in ``self.probabilities`` under the keys ``'priors'``,
    ``'marginal'`` and ``'likelihood'``.

    No smoothing is applied: a feature value that never occurs together with
    a class has likelihood 0 for that class.
    """

    def __init__(self, labelled_data: pd.DataFrame, target: str):
        """Store the training data and compute all probability tables.

        Args:
            labelled_data: training rows; one column must be ``target``.
            target: name of the class column.

        Raises:
            ValueError: if ``target`` is not a column of ``labelled_data``.
        """
        self.data = labelled_data
        self.target = target
        self.features = self.data.columns.to_list()
        self.features.remove(self.target)

        # Distinct values across all feature columns, in first-seen order.
        # De-duplicating matters when two features share a value (e.g. two
        # integer-coded columns): the tables are indexed by these values and
        # duplicate labels would make scalar lookups return Series.
        self.feature_values = list(dict.fromkeys(
            val
            for feat in self.features
            for val in self.data[feat].unique()
        ))

        self.classes = self.data[self.target].unique()
        self.probabilities = self.calculate_probabilities()

    def calculate_probabilities(self) -> dict:
        """Estimate priors, feature marginals and class-conditional likelihoods.

        Returns:
            dict with three entries:

            * ``'priors'``: Series of P(y), indexed by class.
            * ``'marginal'``: DataFrame of P(x_i = v), rows are feature
              values, columns are features.
            * ``'likelihood'``: DataFrame of P(x_i = v | y), rows are a
              sorted (class, feature value) MultiIndex, columns are features.

            Cells for (value, feature) pairs that never occur hold 0.
        """
        data = self.data
        features = self.features
        classes = self.classes
        value_index = pd.Index(self.feature_values)
        n_values = len(value_index)
        num_data_points = len(data.index)

        # P(y)
        priors = data[self.target].value_counts() / num_data_points

        # P(x_i = v). The tables are filled through plain numpy arrays rather
        # than `df[col].update(...)`: that chained in-place update does not
        # reach the parent frame when pandas Copy-on-Write is active.
        marginal_vals = np.zeros((n_values, len(features)))
        for col, feature in enumerate(features):
            probs = data[feature].value_counts() / num_data_points
            marginal_vals[:, col] = probs.reindex(value_index, fill_value=0.0).to_numpy()
        marginal_table = pd.DataFrame(marginal_vals, index=value_index, columns=features)

        # P(x_i = v | y): one contiguous band of rows per class.
        likelihood_vals = np.zeros((len(classes) * n_values, len(features)))
        for band, y in enumerate(classes):
            # Restrict the data to the rows of this class (i.e. "given y").
            class_rows = data[data[self.target] == y]
            num_class_data_points = len(class_rows.index)
            rows = slice(band * n_values, (band + 1) * n_values)

            for col, feature in enumerate(features):
                probs = class_rows[feature].value_counts() / num_class_data_points
                likelihood_vals[rows, col] = probs.reindex(value_index, fill_value=0.0).to_numpy()

        # Tuples follow the same class-major order as the value bands above.
        likelihood_index = pd.MultiIndex.from_tuples(
            [(y, val) for y in classes for val in self.feature_values],
            names=[self.target, 'Feature Values'],
        )
        likelihood_table = pd.DataFrame(likelihood_vals, index=likelihood_index, columns=features)
        likelihood_table.sort_index(inplace=True)

        return {
            'priors': priors,
            'marginal': marginal_table,
            'likelihood': likelihood_table,
        }

    def get_prior(self, y):
        """Return P(y) for class ``y``."""
        return self.probabilities['priors'].loc[y]

    def get_marginal(self, x):
        """Return the product over features of P(x_i) for data point ``x``.

        Args:
            x: mapping-like row (e.g. a DataFrame row) holding a value for
                every feature; extra entries are ignored.

        Raises:
            KeyError: if ``x`` holds a value that is absent from the
                training data.
        """
        marginal_table = self.probabilities['marginal']
        features = self.features

        marginals = [
            marginal_table.loc[feat_vec_val, feat]
            for feat, feat_vec_val in zip(features, x[features])
        ]
        return np.prod(marginals)

    def get_likelihood(self, x, y):
        """Return the product over features of P(x_i | y) for data point ``x``.

        Args:
            x: mapping-like row holding a value for every feature; extra
                entries are ignored.
            y: class to condition on.

        Raises:
            KeyError: if ``x`` holds a value that is absent from the
                training data.
        """
        likelihood_table = self.probabilities['likelihood']
        features = self.features

        likelihoods = [
            likelihood_table.loc[(y, feat_vec_val), feat]
            for feat, feat_vec_val in zip(features, x[features])
        ]
        return np.prod(likelihoods)

    def predict(self, x):
        """Return the class with the highest posterior probability for ``x``.

        Ties go to the class that comes first in the priors table.

        Raises:
            KeyError: if ``x`` holds a value that is absent from the
                training data.
        """
        # The marginal does not depend on the class, so compute it once.
        marginal = self.get_marginal(x)

        posterior_probabilities = {}
        for y in self.probabilities['priors'].index:
            prior = self.get_prior(y)
            likelihood = self.get_likelihood(x, y)

            # P(y | x1, x2, ...) = P(x1, x2, ... | y) * P(y) / P(x1, x2, ...)
            posterior_probabilities[y] = (likelihood * prior) / marginal

        return max(posterior_probabilities, key=posterior_probabilities.get)

    def __call__(self, x):
        """Alias for :meth:`predict`, so the model can be passed to ``DataFrame.apply``."""
        return self.predict(x)

In [20]:
# Fit the classifier: the constructor computes every probability table from `train`.
nb_model = NaiveBayes(labelled_data=train, target='ADMIT')

In [27]:
# Class priors P(y), estimated from the training split.
nb_model.probabilities['priors']

0    0.689286
1    0.310714
Name: ADMIT, dtype: float64

In [28]:
# Marginal probability of each feature value (rows) within each feature (columns).
nb_model.probabilities['marginal']

Unnamed: 0,GRE,GPA,RANK
above 500 and up to 600,0.353571,0.0,0.0
up to 500 inclusive,0.225,0.0,0.0
above 600 up to 700,0.289286,0.0,0.0
above 700,0.132143,0.0,0.0
over 3.5,0.0,0.378571,0.0
up to 2.5 inclusive,0.0,0.014286,0.0
above 3 and up to 3.5,0.0,0.446429,0.0
above 2.5 and up to 3,0.0,0.160714,0.0
2,0.0,0.0,0.396429
4,0.0,0.0,0.153571


In [29]:
# Class-conditional probabilities P(x_i | y), indexed by (ADMIT, feature value).
nb_model.probabilities['likelihood']

Unnamed: 0_level_0,Unnamed: 1_level_0,GRE,GPA,RANK
ADMIT,Feature Values,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1,0.0,0.0,0.098446
0,2,0.0,0.0,0.378238
0,3,0.0,0.0,0.331606
0,4,0.0,0.0,0.19171
0,above 2.5 and up to 3,0.0,0.170984,0.0
0,above 3 and up to 3.5,0.0,0.492228,0.0
0,above 500 and up to 600,0.362694,0.0,0.0
0,above 600 up to 700,0.26943,0.0,0.0
0,above 700,0.108808,0.0,0.0
0,over 3.5,0.0,0.321244,0.0


In [8]:
%%timeit

# Classify every test row: `apply(..., axis=1)` hands each row to the model's
# __call__, which returns the predicted class.
test['prediction'] = test.apply(nb_model, axis=1)
# Flag the rows where the prediction matches the true label.
test['correct'] = test['ADMIT'] == test['prediction']

214 ms ± 13.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
# Spot-check ten randomly chosen test rows: true label, prediction, and whether they agree.
test[['ADMIT', 'prediction', 'correct']].sample(10)

Unnamed: 0_level_0,ADMIT,prediction,correct
Applicant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1339,0,0,True
1089,0,1,False
1355,1,0,False
1318,1,0,False
1171,0,0,True
1331,0,0,True
1013,1,1,True
1120,0,0,True
1027,1,1,True
1243,1,0,False


In [10]:
# Tally of correct (True) vs. incorrect (False) predictions.
counts = test['correct'].value_counts()
# The mean of a boolean column is the fraction of True entries. Unlike
# `counts[True]`, this does not raise KeyError when no prediction is correct.
accuracy = test['correct'].mean()

print('Model Accuracy: %1.2f%%' % (100 * accuracy))

Model Accuracy: 70.00%
