In [154]:
import pandas as pd
import numpy as np
import csv


In [155]:
# Read file to get data
file1 = open("train.csv")
file2 = open("test.csv")

csvreader1 = csv.reader(file1)
csvreader2 = csv.reader(file2)

header_train = []
header_train = next(csvreader1)

next(csvreader2)

data_train = []
for row in csvreader1:
    data_train.append(row)

data_test = []
for row in csvreader2:
    data_test.append(row)

file1.close()
file2.close()

    # Turn our data in np.array and remove the first attribue which is just numeration
data_train = np.array(data_train, dtype=float)
data_train = np.delete(data_train, 0, axis=1)

data_test = np.array(data_test, dtype=float)
data_test = np.delete(data_test, 0, axis=1)


In [156]:
class GaussianMaxLikelihood:
    def __init__(self, n_dims, cov_type='isotropic'):
        self.cov_type = cov_type
        self.n_dims = n_dims
        self.mu = np.zeros(n_dims)
        # Nous avons un scalaire comme écart-type car notre modèle est une loi gaussienne isotropique
        self.sigma_sq = 1.0

    # Pour un jeu d'entraînement, la fonction devrait calculer les estimateur ML de l'espérance et de la variance
    def train(self, train_data):
        # Ici, nous devons trouver la moyenne et la variance dans train_data et les définir dans self.mu and self.

        self.mu = np.mean(train_data, axis=0)

        # here we will create the covariance matrix
        if self.cov_type == 'isotropic':
            # Identity times sigma square
            self.covariance = np.eye(
                self.n_dims) * np.sum((train_data - self.mu) ** 2.0) / (self.n_dims * train_data.shape[0])
        elif self.cov_type == 'diagonal':
            # put the variance on the diagonal
            self.covariance = np.diag(np.var(train_data, axis=0))
        else:
            # Calculate the full covariance matrix
            self.covariance = np.cov(train_data, rowvar=False)

    # Retourne un vecteur de dimension égale au nombre d'ex. test qui contient les log probabilité de chaque
    # exemple test

    def loglikelihood(self, test_data):
        # Calculer la constante de normalisation de la façon standard, sans raccourci
        c = -(np.log(np.sqrt(np.linalg.det(self.covariance))) +
              (self.n_dims / 2) * np.log(2 * np.pi))
        # Ensuite la log prob
        # Notez l'absence d'un second np.dot. Pouvez-vous deviner pourquoi?
        log_prob = c - (np.dot((test_data - self.mu), np.linalg.inv(self.covariance))
                        * (test_data - self.mu)).sum(axis=1) / 2
        return log_prob


In [157]:
class BayesClassifier:
    def __init__(self, maximum_likelihood_models, priors):
        self.maximum_likelihood_models = maximum_likelihood_models
        self.priors = priors
        if len(self.maximum_likelihood_models) != len(self.priors):
            print('The number of ML models must be equal to the number of priors!')
        self.n_classes = len(self.maximum_likelihood_models)

    # Retourne une matrice de dimension [nb d'ex. test, nb de classes] contenant les log
    # probabilités de chaque ex. test sous le modèle entrainé par le MV.
    def loglikelihood(self, test_data):

        log_pred = np.zeros((test_data.shape[0], self.n_classes))

        for i in range(self.n_classes):
            # Ici, nous devrons utiliser maximum_likelihood_models[i] et priors pour remplir
            # chaque colonne de log_pred (c'est plus efficace de remplir une colonne à la fois)

            log_pred[:, i] = self.maximum_likelihood_models[i].loglikelihood(
                test_data) + np.log(self.priors[i])

        return log_pred


In [158]:
X_train=data_train[:,0:19]
Y_train=data_train[:,-1]

X_test = data_test


In [159]:
print(len(X_test[0]))
print(len(data_test[0]))
print(len(X_train[0]))
print(len(data_train[0]))

print(len(X_test))
print(len(data_test))
print(len(X_train))
print(len(data_train))

19
19
19
20
10320
10320
44760
44760


In [160]:
#print(len(data_train[0]))
#print(data_train[0])

In [161]:
Data_trainClass0=X_train[data_train[:,-1]==0]
Data_trainClass1 = X_train[data_train[:, -1] == 1]
Data_trainClass2 = X_train[data_train[:, -1] == 2]

cov_type = 'full'

model_class1 = GaussianMaxLikelihood(19, cov_type)
model_class2 = GaussianMaxLikelihood(19, cov_type)
model_class3 = GaussianMaxLikelihood(19, cov_type)
model_class1.train(Data_trainClass0)
model_class2.train(Data_trainClass0)
model_class3.train(Data_trainClass0)


In [162]:
print(Data_trainClass1)

[[2.17079531e+01 2.75000000e+02 7.89094315e+01 ... 1.24002275e+04
  6.62415848e+01 2.00310240e+07]
 [2.17079531e+01 2.75000000e+02 7.89094315e+01 ... 1.24002275e+04
  6.62415848e+01 2.00310240e+07]
 [2.14732725e+01 2.75000000e+02 8.06413574e+01 ... 1.23713320e+04
  6.61652679e+01 2.00310240e+07]
 ...
 [1.34941330e+01 2.53750000e+02 5.84715576e+01 ... 1.23409697e+04
  6.64385223e+01 2.00112070e+07]
 [1.34941330e+01 2.53750000e+02 5.84715576e+01 ... 1.23409697e+04
  6.64385223e+01 2.00112070e+07]
 [1.34941330e+01 2.53750000e+02 5.84715576e+01 ... 1.23409697e+04
  6.64385223e+01 2.00112070e+07]]


In [163]:
model_ml = [model_class1, model_class2, model_class3]


In [164]:
# Calculate the class priors
total_samples = data_train.shape[0]

# Count the number of samples in each class
num_samples_class0 = len(Data_trainClass0)
num_samples_class1 = len(Data_trainClass1)
num_samples_class2 = len(Data_trainClass2)

# Calculate the class priors
prior_class0 = num_samples_class0 / total_samples
prior_class1 = num_samples_class1 / total_samples
prior_class2 = num_samples_class2 / total_samples

print("Class Priors:")
print("Class 0 Prior:", prior_class0)
print("Class 1 Prior:", prior_class1)
print("Class 2 Prior:", prior_class2)


Class Priors:
Class 0 Prior: 0.7859472743521001
Class 1 Prior: 0.04077301161751564
Class 2 Prior: 0.17327971403038428


In [165]:
priors=np.array([prior_class0,prior_class1,prior_class2])
classifier = BayesClassifier(model_ml, priors)


In [166]:
def get_accuracy(data, labels):
    # Nous pouvons calculez les log-probabilités selon notre modèle
    log_prob = classifier.loglikelihood(data)
    # Il reste à calculer les classes prédites
    classes_pred = log_prob.argmax(1)
    # Retournez l'exactitude en comparant les classes prédites aux vraies étiquettes
    acc = np.mean(classes_pred == labels)
    return acc


In [167]:
print("The training accuracy is : {:.1f} % ".format(
    100 * get_accuracy(X_train, Y_train)))



The training accuracy is : 78.6 % 


In [168]:
log_prob = classifier.loglikelihood(X_test)
# Il reste à calculer les classes prédites
classes_pred = log_prob.argmax(1)

print(classes_pred)

[0 0 0 ... 0 0 0]
