In [1]:
import pandas as pd
import numpy as np
from scipy.io import loadmat

train_data_row = loadmat('data_train.mat')
train_label_row = loadmat('label_train.mat')
test_data_row = loadmat('data_test.mat')

In [2]:
train_data = pd.DataFrame(train_data_row['data_train'])
train_label = pd.DataFrame(train_label_row['label_train'])
test_data = pd.DataFrame(test_data_row['data_test'])

In [3]:
train_label_1d = train_label.copy()
train_label_1d = train_label_1d.values.ravel()

##  Bayes Decision Rule

In [4]:
# priors probability
class_priors = pd.DataFrame(train_label.value_counts() / len(train_label))
class_priors_1 = class_priors.reset_index().transpose()
class_priors_1 = class_priors_1[1:]
class_priors_1 = pd.DataFrame(data= np.tile(class_priors_1.values, (26, 1)), columns=class_priors_1.columns)

In [5]:
# use Gaussian Mixture Model to estimate the data
from sklearn.mixture import GaussianMixture

gmm = GaussianMixture(n_components=2)  # 2 guassian parameters for binary classification
gmm.fit(train_data)

class_conditional_1 = gmm.predict_proba(test_data) # compute conditional probability

# Bayes decision rule: find the max
predictions = np.argmax(class_priors_1  * class_conditional_1, axis=1)
predictions_1 = np.where(predictions == 0, 1, -1) # map labels

In [7]:
result_df_1= test_data.copy()
result_df_1['label_pred'] = predictions_1
result_df_1

Unnamed: 0,0,1,2,3,4,label_pred
0,1.532322,2.505856,-0.117636,2.700224,0.589047,-1
1,0.791766,3.180243,0.769253,0.087467,1.178948,-1
2,1.232938,2.060771,0.063956,1.921177,0.090001,-1
3,0.109037,2.080132,-0.540951,-0.264707,0.401471,-1
4,0.90186,1.924153,2.430004,1.736052,0.979496,-1
5,1.692486,1.820443,0.59375,1.324076,0.89545,-1
6,1.863935,2.655364,1.947236,0.044897,1.097547,-1
7,-0.930126,2.7537,1.272196,0.149966,-0.421402,-1
8,1.060742,3.056132,1.229324,0.527029,0.496247,-1
9,2.168908,1.713703,1.102125,1.63811,0.507569,-1


## Naive Bayes

In [8]:
class_priors_2 = train_label.value_counts(normalize=True)


train_label_1d = train_label.copy()
train_label_1d = train_label_1d.values.ravel() 
train_data_labeled = train_data.copy()
train_data_labeled['label'] = train_label_1d
class_conditionals_2 = train_data_labeled.groupby('label').agg(['mean', 'std'])

In [9]:
class_priors_2

-1    0.5
 1    0.5
Name: proportion, dtype: float64

In [10]:
class_conditionals_2

Unnamed: 0_level_0,0,0,1,1,2,2,3,3,4,4
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
-1,1.051708,1.023597,1.799575,0.915979,0.859565,1.047478,1.064208,1.01871,1.008555,1.010169
1,0.013795,1.0585,-0.087557,0.96873,0.084613,0.92315,-0.154829,0.972878,0.02275,1.081467


In [11]:
# Naive Bayes
def naive_bayes(sample, class_priors, class_conditionals):
    posteriors = []
    for (class_label,) in class_priors.index:
        prior = class_priors[class_label]
        likelihoods = []
        for feature in sample.index:
            cond_mean = class_conditionals.loc[class_label, (feature, 'mean')]
            cond_std = class_conditionals.loc[class_label, (feature, 'std')]
            likelihood = (1 / (np.sqrt(2 * np.pi) * cond_std)) * np.exp(-0.5 * ((sample[feature] - cond_mean) / cond_std) ** 2)
            likelihoods.append(likelihood)
        posterior = prior * np.prod(likelihoods)
        posteriors.append(posterior)
    predicted_class = class_priors.index[np.argmax(posteriors)]
    return predicted_class

In [12]:
predictions = test_data.apply(lambda row: naive_bayes(row, class_priors_2, class_conditionals_2), axis=1)
predictions_2 = predictions.to_numpy()

In [13]:
result_df_2= test_data.copy()
result_df_2['label_pred'] = predictions_2
result_df_2

Unnamed: 0,0,1,2,3,4,label_pred
0,1.532322,2.505856,-0.117636,2.700224,0.589047,"(-1,)"
1,0.791766,3.180243,0.769253,0.087467,1.178948,"(-1,)"
2,1.232938,2.060771,0.063956,1.921177,0.090001,"(-1,)"
3,0.109037,2.080132,-0.540951,-0.264707,0.401471,"(-1,)"
4,0.90186,1.924153,2.430004,1.736052,0.979496,"(-1,)"
5,1.692486,1.820443,0.59375,1.324076,0.89545,"(-1,)"
6,1.863935,2.655364,1.947236,0.044897,1.097547,"(-1,)"
7,-0.930126,2.7537,1.272196,0.149966,-0.421402,"(-1,)"
8,1.060742,3.056132,1.229324,0.527029,0.496247,"(-1,)"
9,2.168908,1.713703,1.102125,1.63811,0.507569,"(-1,)"


In [14]:
from sklearn.naive_bayes import GaussianNB

# Naive Bayes
classifier = GaussianNB()

classifier.fit(train_data, train_label_1d)

predictions_3 = classifier.predict(test_data)

In [15]:
result_df_3= test_data.copy()
result_df_3['label_pred'] = predictions_3
result_df_3

Unnamed: 0,0,1,2,3,4,label_pred
0,1.532322,2.505856,-0.117636,2.700224,0.589047,-1
1,0.791766,3.180243,0.769253,0.087467,1.178948,-1
2,1.232938,2.060771,0.063956,1.921177,0.090001,-1
3,0.109037,2.080132,-0.540951,-0.264707,0.401471,-1
4,0.90186,1.924153,2.430004,1.736052,0.979496,-1
5,1.692486,1.820443,0.59375,1.324076,0.89545,-1
6,1.863935,2.655364,1.947236,0.044897,1.097547,-1
7,-0.930126,2.7537,1.272196,0.149966,-0.421402,-1
8,1.060742,3.056132,1.229324,0.527029,0.496247,-1
9,2.168908,1.713703,1.102125,1.63811,0.507569,-1


## Linear Discriminant Analysis

In [16]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda = LinearDiscriminantAnalysis()

train_label_1d = train_label.copy()
train_label_1d = train_label_1d.values.ravel() 

lda.fit(train_data, train_label_1d)

predictions_4 = lda.predict(test_data)

In [17]:
result_df_4= test_data.copy()
result_df_4['label_pred'] = predictions_4
result_df_4

Unnamed: 0,0,1,2,3,4,label_pred
0,1.532322,2.505856,-0.117636,2.700224,0.589047,-1
1,0.791766,3.180243,0.769253,0.087467,1.178948,-1
2,1.232938,2.060771,0.063956,1.921177,0.090001,-1
3,0.109037,2.080132,-0.540951,-0.264707,0.401471,-1
4,0.90186,1.924153,2.430004,1.736052,0.979496,-1
5,1.692486,1.820443,0.59375,1.324076,0.89545,-1
6,1.863935,2.655364,1.947236,0.044897,1.097547,-1
7,-0.930126,2.7537,1.272196,0.149966,-0.421402,-1
8,1.060742,3.056132,1.229324,0.527029,0.496247,-1
9,2.168908,1.713703,1.102125,1.63811,0.507569,-1
