In [45]:
import numpy as np
import pandas as pd


# Data Processing

Below we upload all the data from our bank notes collected in the csv file

In [46]:
bank_df=pd.read_csv("banknote_authentication.csv",sep=";")
bank_df=bank_df.sample(frac=1, random_state=42)  # Randomise
bank_df

Unnamed: 0,variance,skewness,curtosis,entropy,class
83,0.50813,0.47799,-1.980400,0.57714,1
53,-3.84830,-12.80470,15.682400,-1.28100,1
70,-0.34810,-0.38696,-0.478410,0.62627,1
45,-0.78690,9.56630,-3.786700,-7.50340,0
44,2.85610,6.91760,-0.793720,0.48403,0
...,...,...,...,...,...
60,-2.41150,-9.13590,9.344400,-0.65259,1
71,0.47368,3.36050,-4.506400,-4.04310,1
14,3.40400,8.72610,-2.991500,-0.57242,0
92,-0.28015,3.07290,-3.385700,-2.91550,1


# Test-train data split
We split it into 20-80

In [47]:
split_index = int(bank_df.shape[0] * 0.8)
print("#########")
print("# TRAIN #")
print("#########")
s2_train_features=bank_df.iloc[:split_index, :-1].to_numpy()
s2_train_labels=bank_df.iloc[:split_index, -1].to_numpy()

print(f"first ten rows of s2_train_features = \n {s2_train_features[:10]}")
print(f"\nfirst ten elements of s2_train_labels = \n {s2_train_labels[:10]}")

print("\n########")
print("# TEST #")
print("########")
s2_test_features=bank_df.iloc[split_index:, :-1].to_numpy()
s2_test_labels=bank_df.iloc[split_index:, -1].to_numpy()

print(f"first ten rows of s2_test_features = \n {s2_test_features[:10]}")
print(f"\nfirst ten elements of s2_test_labels = \n {s2_test_labels[:10]}")

#########
# TRAIN #
#########
first ten rows of s2_train_features = 
 [[  0.50813   0.47799  -1.9804    0.57714]
 [ -3.8483  -12.8047   15.6824   -1.281  ]
 [ -0.3481   -0.38696  -0.47841   0.62627]
 [ -0.7869    9.5663   -3.7867   -7.5034 ]
 [  2.8561    6.9176   -0.79372   0.48403]
 [  3.4805    9.7008   -3.7541   -3.4379 ]
 [  3.9362   10.1622   -3.8235   -4.0172 ]
 [ -2.7338    0.45523   2.4391    0.21766]
 [  1.2247    8.7779   -2.2135   -0.80647]
 [  3.6216    8.6661   -2.8073   -0.44699]]

first ten elements of s2_train_labels = 
 [1 1 1 0 0 0 0 1 0 0]

########
# TEST #
########
first ten rows of s2_test_features = 
 [[-0.36506  2.8928  -3.6461  -3.0603 ]
 [ 1.6408   4.2503  -4.9023  -2.6621 ]
 [ 3.6289   0.81322  1.6277   0.77627]
 [ 4.8906  -3.3584   3.4202   1.0905 ]
 [ 4.5459   8.1674  -2.4586  -1.4621 ]
 [-1.6677  -7.1535   7.8929   0.96765]
 [ 0.3292  -4.4552   4.5718  -0.9888 ]
 [ 3.866   -2.6383   1.9242   0.10645]
 [ 0.93584  8.8855  -1.6831  -1.6599 ]
 [-3.2238   2.79

# Prior probabilities

In [48]:
s2_priors=np.zeros(2)
class_zero_count=0
class_one_count=0
for label in s2_train_labels:
    if (label==0):
        class_zero_count+=1
    else:
        class_one_count+=1
s2_priors[0]=class_zero_count/ len(s2_train_labels)
s2_priors[1]=class_one_count/ len(s2_train_labels)

s2_priors

array([0.525, 0.475])



# Class-Conditional Gaussian Parameters (Manual Implementation)

For each feature \(x_i\) and class \(c\), the mean \(\mu_{x_i,c}\) and variance \(\sigma^2_{x_i,c}\) are computed **explicitly using the analytical formulas** from *textbook formulas*, and **not** using any built-in NumPy statistics functions.

The formulas used are:

\[
\mu_{x_i,c} = \frac{1}{N_c} \sum_{n=1}^{N_c} x_i^{(n)}
\]

\[
\sigma^2_{x_i,c} = \frac{1}{N_c} \sum_{n=1}^{N_c} \left(x_i^{(n)} - \mu_{x_i,c}\right)^2
\]

These equations are implemented directly using loops and basic arithmetic.

**Note:** Functions such as `np.mean`, `np.var`, `np.std`, or any other statistical library routines were deliberately not used in this implementation.


In [49]:
s2_cc_mean=np.zeros((2, 4))

for i in range(2):
    for j in range(4):
        sum_feature_class_zero=0
        sum_feature_class_one=0
        for k in range(len(s2_train_labels)):
            if s2_train_labels[k]==0:
                sum_feature_class_zero+=s2_train_features[k][j]
            else:
                sum_feature_class_one+=s2_train_features[k][j]
        if i==0:
            s2_cc_mean[i][j]=sum_feature_class_zero/class_zero_count
        else:
            s2_cc_mean[i][j]=sum_feature_class_one/class_one_count

s2_cc_mean

array([[ 2.11768924,  5.20368119,  0.34544714, -1.74017955],
       [-1.82843   , -0.68951058,  1.54479248, -1.21049189]])

In [50]:
s2_cc_var=np.zeros((2,4))

for i in range(2):
    for j in range(4):
        sum_feature_class_zero=0
        sum_feature_class_zero=0
        for k in range(len(s2_train_labels)):
            if s2_train_labels[k]==0:
                sum_feature_class_zero+=(s2_train_features[k][j] - s2_cc_mean[i][j])**2
            else:
                sum_feature_class_one+=(s2_train_features[k][j] - s2_cc_mean[i][j])**2
        if i==0:
            s2_cc_var[i][j]=sum_feature_class_zero/class_zero_count
        else:
            s2_cc_var[i][j]=sum_feature_class_one/class_one_count
s2_cc_var

array([[  4.02932591,  26.38644283,  15.08515814,   5.05878121],
       [112.39413608, 140.53866419, 162.80956804, 167.65640758]])

**b)** Implemented below is  the function `s2_class_conditional_fn` which will compute $P(x_i | c)$. This function takes in the feature, class (class_label), mean and variance (var).
- `feature`: $x_i$
- `class_label`: $c$
- `mean`: mean ($\mu_{x_i, c}$) of associated gaussian distribution for $(x_i, c)$
- `var`: variance ($\sigma^2_{x_i, c}$) of associated gaussian distribution for $(x_i, c)$

In [51]:
def s2_class_conditional_fn(feature, class_label, mean, var):
    cond_prob=...  # i.e. P(x_i | c)
    cond_prob=(1/np.sqrt(2*np.pi*var))*np.exp(-(1/2)*(feature - mean)**2/var)
    return cond_prob

tmp_feature=s2_train_features[0, 0]
# tmp_class = 0
print(f"P(x_0={tmp_feature}|c={0}) = {s2_class_conditional_fn(tmp_feature, 0, s2_cc_mean[0, 0], s2_cc_var[0, 0])}")
print(f"P(x_0={tmp_feature}|c={1}) = {s2_class_conditional_fn(tmp_feature, 1, s2_cc_mean[0, 1], s2_cc_var[0, 1])}")

P(x_0=0.50813|c=0) = 0.14410453742602167
P(x_0=0.50813|c=1) = 0.051141559173017714


# Posterior Probability
Below is the function `s2_calc_posterior` that calculates the posterior probability of a given class based off given data. I.e. it should compute $P(c|x)$.
- `feature`: $x$
- `class_label`: $c$

In [52]:
def s2_calc_posterior(class_label, feature):
    post_prob=s2_priors[class_label]

    for f in feature:
        for i in range(4):
            post_prob*=s2_class_conditional_fn(f, class_label, s2_cc_mean[class_label][i], s2_cc_var[class_label][i])
    denominator=0

    for i in range(2):
        for f in feature:
            for j in range(4):
                denominator+=s2_class_conditional_fn(f, i, s2_cc_mean[i][j], s2_cc_var[i][j])*s2_priors[i]
    post_prob/= denominator

    return post_prob


print(f"P(c=0 | x={s2_test_features[0]}) = {s2_calc_posterior(0, s2_test_features[0])}")
print(f"P(c=1 | x={s2_test_features[0]}) = {s2_calc_posterior(1, s2_test_features[0])}")


P(c=0 | x=[-0.36506  2.8928  -3.6461  -3.0603 ]) = 5.484612613732366e-22
P(c=1 | x=[-0.36506  2.8928  -3.6461  -3.0603 ]) = 7.508286941437842e-25


# Inference


In [53]:
def s2_infer_class(feature):
    c = ...
    class_zero_prob=s2_calc_posterior(0, feature)
    class_one_prob=s2_calc_posterior(1, feature)
    if class_zero_prob>class_one_prob:
        c=0
    else:
        c=1
    return c

print(f"Inferred class for x={s2_test_features[0]} = {s2_infer_class(s2_test_features[0])}")

Inferred class for x=[-0.36506  2.8928  -3.6461  -3.0603 ] = 0


# Confusion matrix and Accuracy

In [None]:
from sklearn.metrics import confusion_matrix

s2_confusion_matrix=np.zeros((2, 2))
actual_class=s2_test_labels
predicted_class=[]
for feature in s2_test_features:
    predicted_class.append(s2_infer_class(feature))

s2_confusion_matrix=confusion_matrix(actual_class, predicted_class)

print(s2_confusion_matrix)


#COMPUTE ACCURACY
s2_acc = ...
s2_acc = (s2_confusion_matrix[0][0] + s2_confusion_matrix[1][1])
denominator = 0
for i in range(2):
    for j in range(2):
        denominator += s2_confusion_matrix[i][j]
s2_acc = s2_acc/denominator

s2_acc