# Préparation des données

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the reference dataset (feature columns plus the "target" label column)
data = pd.read_csv(filepath_or_buffer='../data/ref_data.csv')

In [3]:
# Split the frame into the feature matrix X and the label vector y.
# A later cell of this notebook writes a "prediction" column back into
# ref_data.csv; if that column is present (i.e. the notebook is being re-run),
# it must not be used as a feature, otherwise the model is trained on its own
# previous predictions (label leakage). errors="ignore" keeps the first run,
# where the column does not exist yet, working unchanged.
non_feature_columns = ["target", "prediction"]
X = data.drop(columns=non_feature_columns, errors="ignore").values  # every remaining column is a feature
y = data["target"].values  # label column

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Régression logistique avec validation croisée

In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, make_scorer, log_loss

In [5]:
# Hyperparameter grid explored for LogisticRegression
param_grid = {
    "C": [0.1, 1, 10],       # inverse regularization strength
    "penalty": ["l2"],       # L2 regularization only
    "solver": ["lbfgs"],     # single solver, kept fixed across candidates
}

# Select candidates on the (negated) log loss of their predicted probabilities.
# The built-in "neg_log_loss" scorer is equivalent to
# make_scorer(log_loss, greater_is_better=False, needs_proba=True), but it does
# not rely on the `needs_proba` argument, which was deprecated in
# scikit-learn 1.4 and removed in 1.6.
scorer = "neg_log_loss"

# Cross-validated search over the grid
grid_search = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000, random_state=0),
    param_grid=param_grid,
    scoring=scorer,
    cv=5,        # number of cross-validation folds
    verbose=1,
    n_jobs=-1,   # use every available core
)

# Fit every candidate on the training split
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits




In [6]:
# Best candidate found by the search, with its hyperparameters and its
# cross-validated loss (best_score_ is the negated loss, hence the sign flip)
best_model = grid_search.best_estimator_
print(
    "\n=== Best Model ===",
    grid_search.best_params_,
    f"Best Validation Loss: {-grid_search.best_score_:.4f}",
    sep="\n",
)

# Score the selected model on the held-out test split
log_reg_predictions = best_model.predict(X_test)
log_reg_accuracy = accuracy_score(y_test, log_reg_predictions)

# Per-class report followed by the overall accuracy
print(
    "\n=== Logistic Regression Performance ===",
    classification_report(y_test, log_reg_predictions),
    f"Logistic Regression Accuracy: {log_reg_accuracy:.4f}",
    sep="\n",
)


=== Best Model ===
{'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Validation Loss: 0.3566

=== Logistic Regression Performance ===
              precision    recall  f1-score   support

           0       0.93      0.92      0.92      1158
           1       0.96      0.95      0.95      1185
           2       0.88      0.89      0.89      1189
           3       0.81      0.81      0.81      1208
           4       0.89      0.88      0.88      1240
           5       0.87      0.84      0.85      1185
           6       0.91      0.94      0.92      1167
           7       0.93      0.92      0.92      1238
           8       0.96      0.96      0.96      1219
           9       0.94      0.94      0.94      1211

    accuracy                           0.91     12000
   macro avg       0.91      0.91      0.91     12000
weighted avg       0.91      0.91      0.91     12000

Logistic Regression Accuracy: 0.9055


In [7]:
import pickle

# Serialize the selected model into the artifacts folder
with open('../artifacts/reglog_model_resnet50.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)

# Predict on every row (training and test rows alike) and keep the result
# alongside the original columns
all_predictions = best_model.predict(X)
data["prediction"] = all_predictions

# Write the frame, now including "prediction", over the reference CSV.
# NOTE(review): this is the same file the notebook reads at the top, so any
# code that re-reads it must exclude "prediction" from the feature columns.
data.to_csv('../data/ref_data.csv', index=False)

# Test

In [76]:
import base64

# Sample image used for the manual end-to-end check
img_path = '../data/images_test/bird.jpg'

# Read the file as raw bytes, then base64-encode them into a UTF-8 string
with open(img_path, "rb") as image_file:
    raw_image_bytes = image_file.read()
encoded_string = base64.b64encode(raw_image_bytes).decode('utf-8')

/9j/4AAQSkZJRgABAQEAYABgAAD//gA7Q1JFQVRPUjogZ2QtanBlZyB2MS4wICh1c2luZyBJSkcgSlBFRyB2ODApLCBxdWFsaXR5ID0gOTAK/9sAQwADAgIDAgIDAwMDBAMDBAUIBQUEBAUKBwcGCAwKDAwLCgsLDQ4SEA0OEQ4LCxAWEBETFBUVFQwPFxgWFBgSFBUU/9sAQwEDBAQFBAUJBQUJFA0LDRQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQU/8IAEQgBLAEsAwERAAIRAQMRAf/EABwAAQACAwEBAQAAAAAAAAAAAAAFBgMEBwIBCP/EABkBAQADAQEAAAAAAAAAAAAAAAABAgMEBf/aAAwDAQACEAMQAAAB/VIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABzHC3KcrwkTJ2jp969Z6KZAAAAAAAAAAAAAAAfIfn/i35TpW19PPvRNT4uzcyW3fP9H9mPqQAAAAAAAAAAAAAiqW/NnF0Ubn26P6/mc88zvgrRZI00Nea0ZzZsWfvz6/00uIAAAAAAAAAABhrPLufblGWnPuff7nbB183X/S8+seR6MVydEv0ZdV9DjnLRcpbmldyQAAAAAAAAAAwVnjnH08Aw2gU+6z8ifsx9hfOjC7+jw+qpGZueevRbUy2AAAAAAAAAACOpPMubb898vVXItr2jNW2Ga/JeSZ59d6Ysfbx3X0OGySv+etutAAAAAAAAAAFewtybh6OccfRUN6VLsy3ctd6s4bRs1t6ic1Vg5tZBW1+55W5vlYonrFdJaQ

In [80]:
import pickle
import torch
from PIL import Image
import io
import base64
import torchvision.transforms as transforms
import numpy as np

# Load the pickled embedding network from the artifacts folder.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load artifacts that come from a trusted source.
with open('../artifacts/resnet18_embedding.pkl', 'rb') as f:
    resnet18 = pickle.load(f)

# Load the pickled feature scaler
with open('../artifacts/scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)

# `encoded_string` is the base64 text produced by the previous cell.
# Decode it back into raw bytes and open them as a PIL image.
# convert("RGB") guarantees exactly three channels: a grayscale, palette or
# RGBA image would otherwise not match the 3-value mean/std given to
# Normalize below and the transform would fail.
img_data = base64.b64decode(encoded_string)
img = Image.open(io.BytesIO(img_data)).convert("RGB")

# Preprocessing pipeline: resize, convert to tensor, normalize each channel
transform = transforms.Compose([
    transforms.Resize((32, 32)),  # 32x32, the CIFAR-10 size according to the original note
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Apply the pipeline and add a leading batch dimension -> (1, C, H, W)
img_tensor = transform(img).unsqueeze(0)

# Forward pass in evaluation mode, without tracking gradients
resnet18.eval()
with torch.no_grad():
    embedding = resnet18(img_tensor)
    embedding = embedding.view(embedding.size(0), -1)  # flatten to (batch, features)

# NumPy copy of the embedding, as expected by the scikit-learn scaler
embedding_np = embedding.numpy()

# Scale the embedding with the scaler loaded above
embedding_scaled = scaler.transform(embedding_np)

In [84]:
import pickle

# Load the logistic-regression classifier from the artifacts folder.
# NOTE(review): another cell of this notebook writes this same path with
# joblib.dump while it is read here with pickle.load — confirm that the file
# on disk was produced by a compatible writer.
with open('../artifacts/reglog_model.pkl', 'rb') as model_file:
    log_reg_model = pickle.load(model_file)

# Classify the scaled embedding and show the predicted label
prediction = log_reg_model.predict(embedding_scaled)
print(prediction)

[8]


# Régression logistique

In [5]:
# Fit a logistic regression on the training split (fit returns the estimator itself)
log_reg_model = LogisticRegression(max_iter=1000, random_state=0, verbose=1).fit(X_train, y_train)
log_reg_predictions = log_reg_model.predict(X_test)
log_reg_accuracy = accuracy_score(y_test, log_reg_predictions)

# Per-class report followed by the overall accuracy on the test split
print(
    "\n=== Logistic Regression Performance ===",
    classification_report(y_test, log_reg_predictions),
    f"Logistic Regression Accuracy: {log_reg_accuracy:.4f}",
    sep="\n",
)


=== Logistic Regression Performance ===
              precision    recall  f1-score   support

           0       0.63      0.59      0.61      1158
           1       0.69      0.67      0.68      1185
           2       0.50      0.49      0.50      1189
           3       0.43      0.36      0.39      1208
           4       0.50      0.54      0.52      1240
           5       0.53      0.53      0.53      1185
           6       0.63      0.69      0.66      1167
           7       0.61      0.60      0.61      1238
           8       0.67      0.70      0.69      1219
           9       0.66      0.69      0.67      1211

    accuracy                           0.59     12000
   macro avg       0.59      0.59      0.59     12000
weighted avg       0.58      0.59      0.59     12000

Logistic Regression Accuracy: 0.5874


In [None]:
import joblib

# Persist the fitted logistic-regression model into the artifacts folder
joblib.dump(value=log_reg_model, filename='../artifacts/reglog_model.pkl')

In [5]:
import joblib
# Reload the logistic-regression model from the artifacts folder
log_reg_model = joblib.load(filename='../artifacts/reglog_model.pkl')

In [6]:
# Evaluate the reloaded logistic regression on the held-out test split
print("\n=== Logistic Regression Performance ===")
log_reg_predictions = log_reg_model.predict(X_test)
log_reg_accuracy = accuracy_score(y_test, log_reg_predictions)
print(
    classification_report(y_test, log_reg_predictions),
    f"Logistic Regression Accuracy: {log_reg_accuracy:.4f}",
    sep="\n",
)


=== Logistic Regression Performance ===
              precision    recall  f1-score   support

           0       0.60      0.57      0.59      1158
           1       0.67      0.64      0.66      1185
           2       0.47      0.46      0.47      1189
           3       0.43      0.34      0.38      1208
           4       0.48      0.51      0.50      1240
           5       0.51      0.54      0.52      1185
           6       0.61      0.69      0.64      1167
           7       0.61      0.60      0.60      1238
           8       0.65      0.68      0.66      1219
           9       0.63      0.66      0.65      1211

    accuracy                           0.57     12000
   macro avg       0.57      0.57      0.57     12000
weighted avg       0.57      0.57      0.57     12000

Logistic Regression Accuracy: 0.5689


# SGD classifier

In [9]:
from sklearn.linear_model import SGDClassifier

# Linear classifier trained by stochastic gradient descent on the log loss
# (fit returns the estimator itself)
sgd_model = SGDClassifier(
    loss="log_loss",
    random_state=42,
    max_iter=1000,
    tol=1e-3,
).fit(X_train, y_train)

# Predict the test split and compute the overall accuracy
sgd_predictions = sgd_model.predict(X_test)
sgd_accuracy = accuracy_score(y_test, sgd_predictions)

# Per-class report followed by the overall accuracy
print(
    "\n=== SGDClassifier Performance ===",
    classification_report(y_test, sgd_predictions),
    f"SGDClassifier Accuracy: {sgd_accuracy:.4f}",
    sep="\n",
)



=== SGDClassifier Performance ===
              precision    recall  f1-score   support

           0       0.61      0.57      0.59      1158
           1       0.61      0.69      0.64      1185
           2       0.50      0.43      0.46      1189
           3       0.38      0.36      0.37      1208
           4       0.48      0.45      0.46      1240
           5       0.49      0.53      0.51      1185
           6       0.61      0.70      0.65      1167
           7       0.60      0.57      0.58      1238
           8       0.66      0.64      0.65      1219
           9       0.60      0.65      0.63      1211

    accuracy                           0.56     12000
   macro avg       0.55      0.56      0.56     12000
weighted avg       0.55      0.56      0.55     12000

SGDClassifier Accuracy: 0.5570


# Random Forest

In [4]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees, fitted on the training split
# (fit returns the estimator itself)
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    verbose=1,
).fit(X_train, y_train)

# Predict the test split and compute the overall accuracy
rf_predictions = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)

# Per-class report followed by the overall accuracy
print(
    "=== Random Forest Performance ===",
    classification_report(y_test, rf_predictions),
    f"Random Forest Accuracy: {rf_accuracy:.4f}",
    sep="\n",
)


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:  1.4min
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.3s


=== Random Forest Performance ===
              precision    recall  f1-score   support

           0       0.54      0.49      0.51      1158
           1       0.52      0.58      0.55      1185
           2       0.44      0.35      0.39      1189
           3       0.31      0.22      0.26      1208
           4       0.44      0.43      0.43      1240
           5       0.43      0.49      0.46      1185
           6       0.54      0.65      0.59      1167
           7       0.48      0.50      0.49      1238
           8       0.58      0.59      0.58      1219
           9       0.52      0.57      0.55      1211

    accuracy                           0.49     12000
   macro avg       0.48      0.49      0.48     12000
weighted avg       0.48      0.49      0.48     12000

Random Forest Accuracy: 0.4865


# KNN

In [6]:
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbours classifier using every available core
# (fit returns the estimator itself)
knn_model = KNeighborsClassifier(n_neighbors=5, n_jobs=-1).fit(X_train, y_train)

# Predict the test split and compute the overall accuracy
knn_predictions = knn_model.predict(X_test)
knn_accuracy = accuracy_score(y_test, knn_predictions)

# Per-class report followed by the overall accuracy
print(
    "=== K-Nearest Neighbors Performance ===",
    classification_report(y_test, knn_predictions),
    f"KNN Accuracy: {knn_accuracy:.4f}",
    sep="\n",
)


=== K-Nearest Neighbors Performance ===
              precision    recall  f1-score   support

           0       0.48      0.50      0.49      1158
           1       0.51      0.61      0.55      1185
           2       0.38      0.39      0.38      1189
           3       0.30      0.24      0.27      1208
           4       0.35      0.47      0.40      1240
           5       0.38      0.42      0.40      1185
           6       0.54      0.56      0.55      1167
           7       0.56      0.42      0.48      1238
           8       0.63      0.52      0.57      1219
           9       0.56      0.49      0.52      1211

    accuracy                           0.46     12000
   macro avg       0.47      0.46      0.46     12000
weighted avg       0.47      0.46      0.46     12000

KNN Accuracy: 0.4622


# MLP

In [7]:
from sklearn.neural_network import MLPClassifier

# scikit-learn multilayer perceptron with two hidden layers (512 then 256 units),
# fitted on the training split (fit returns the estimator itself)
mlp_model = MLPClassifier(
    hidden_layer_sizes=(512, 256),
    max_iter=1000,
    random_state=42,
    verbose=True,
).fit(X_train, y_train)

# Predict the test split and compute the overall accuracy
mlp_predictions = mlp_model.predict(X_test)
mlp_accuracy = accuracy_score(y_test, mlp_predictions)

# Per-class report followed by the overall accuracy
print(
    "=== MLP Performance ===",
    classification_report(y_test, mlp_predictions),
    f"MLP Accuracy: {mlp_accuracy:.4f}",
    sep="\n",
)


Iteration 1, loss = 1.39815581
Iteration 2, loss = 1.13064344
Iteration 3, loss = 1.01674308
Iteration 4, loss = 0.92828984
Iteration 5, loss = 0.85992009
Iteration 6, loss = 0.78034533
Iteration 7, loss = 0.70745436
Iteration 8, loss = 0.64558809
Iteration 9, loss = 0.58083006
Iteration 10, loss = 0.52981683
Iteration 11, loss = 0.47508068
Iteration 12, loss = 0.42300204
Iteration 13, loss = 0.37734343
Iteration 14, loss = 0.33634761
Iteration 15, loss = 0.30456746
Iteration 16, loss = 0.27812151
Iteration 17, loss = 0.25080483
Iteration 18, loss = 0.21733462
Iteration 19, loss = 0.19320532
Iteration 20, loss = 0.19763006
Iteration 21, loss = 0.18454186
Iteration 22, loss = 0.16925759
Iteration 23, loss = 0.14204800
Iteration 24, loss = 0.12476642
Iteration 25, loss = 0.09231683
Iteration 26, loss = 0.08160380
Iteration 27, loss = 0.13893932
Iteration 28, loss = 0.14060887
Iteration 29, loss = 0.10447464
Iteration 30, loss = 0.08829560
Iteration 31, loss = 0.11886017
Iteration 32, los

# MLP pytorch

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import classification_report, accuracy_score

# MLP model definition
class MLP(nn.Module):
    """Feed-forward classifier with a single hidden layer.

    The network is Linear -> ReLU -> Linear and returns the raw output of the
    last linear layer (no softmax is applied here).

    Args:
        input_size: number of input features per sample.
        hidden_size: number of units in the hidden layer.
        output_size: number of output units.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # Keep the attribute name and layer order: they determine the
        # state_dict keys ("layers.0.*", "layers.2.*").
        stack = [
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, output_size),
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Pass ``x`` through the layer stack and return the result."""
        scores = self.layers(x)
        return scores

# Seed PyTorch's global RNG so that weight initialization and the shuffling
# done by the training DataLoader are repeatable from one run to the next,
# in line with the random_state used for the scikit-learn models.
torch.manual_seed(42)

# Hyperparameters
input_size = X_train.shape[1]  # number of feature columns (512 according to the original note)
hidden_size = 128  # width of the hidden layer
output_size = len(set(y_train))  # number of distinct labels in the training split
batch_size = 128
epochs = 10
learning_rate = 0.001

# Convert the NumPy splits to tensors (float32 features, int64 labels)
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# Wrap the tensors in DataLoaders; only the training data is shuffled
train_dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = torch.utils.data.TensorDataset(X_test_tensor, y_test_tensor)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Model, loss and optimizer; use the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
mlp_model = MLP(input_size, hidden_size, output_size).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(mlp_model.parameters(), lr=learning_rate)

# Training loop: one pass over the shuffled training batches per epoch
print("\n=== Training MLP ===")
for epoch in range(epochs):
    mlp_model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        # Standard step: reset gradients, forward, backward, update
        optimizer.zero_grad()
        outputs = mlp_model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        # Accumulate the per-batch loss as a Python float
        total_loss += loss.item()

    # Report the mean of the per-batch losses for this epoch
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader):.4f}")

# Evaluation on the test split, in eval mode and without gradient tracking
mlp_model.eval()
y_pred, y_true = [], []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        outputs = mlp_model(X_batch.to(device))
        # Predicted class = index of the largest output along the class axis
        predicted = outputs.max(dim=1).indices
        y_pred.extend(predicted.cpu().numpy())
        y_true.extend(y_batch.numpy())

# Per-class report followed by the overall accuracy
mlp_accuracy = accuracy_score(y_true, y_pred)
print(
    "\n=== MLP Performance ===",
    classification_report(y_true, y_pred),
    f"MLP Accuracy: {mlp_accuracy:.4f}",
    sep="\n",
)

NameError: name 'X_train' is not defined

In [5]:
import joblib
# Save the model: its class object together with its learned parameters.
# NOTE(review): pickle stores a class by reference, so loading this file
# presumably requires the same class to be importable under the same module
# path — confirm against the loading code.
save_path = "../artifacts/mlp_model.pkl"
checkpoint = {
    'model_class': mlp_model.__class__,
    'model_state_dict': mlp_model.state_dict(),
}
joblib.dump(checkpoint, save_path)

['../artifacts/mlp_model.pkl']

# XG Boost

In [8]:
import xgboost as xgb
from sklearn.metrics import classification_report, accuracy_score

# Gradient-boosted trees (100 estimators), fitted on the training split
# (fit returns the estimator itself)
xgb_model = xgb.XGBClassifier(
    n_estimators=100,
    random_state=42,
    verbosity=1,
).fit(X_train, y_train)

# Predict the test split and compute the overall accuracy
xgb_predictions = xgb_model.predict(X_test)
xgb_accuracy = accuracy_score(y_test, xgb_predictions)

# Per-class report followed by the overall accuracy
print(
    "=== XGBoost Performance ===",
    classification_report(y_test, xgb_predictions),
    f"XGBoost Accuracy: {xgb_accuracy:.4f}",
    sep="\n",
)


=== XGBoost Performance ===
              precision    recall  f1-score   support

           0       0.61      0.58      0.59      1158
           1       0.65      0.66      0.66      1185
           2       0.50      0.46      0.48      1189
           3       0.38      0.35      0.36      1208
           4       0.49      0.48      0.48      1240
           5       0.48      0.50      0.49      1185
           6       0.62      0.68      0.65      1167
           7       0.58      0.60      0.59      1238
           8       0.68      0.68      0.68      1219
           9       0.62      0.65      0.63      1211

    accuracy                           0.56     12000
   macro avg       0.56      0.56      0.56     12000
weighted avg       0.56      0.56      0.56     12000

XGBoost Accuracy: 0.5636


# Conclusion: choix du modèle

On a sélectionné comme modèle pour la classification finale un Multi Layer Perceptron (MLP) pour deux raisons :
- meilleurs résultats sur le dataset de test parmi les modèles testés
- possibilité de mettre le modèle à jour de manière incrémentale avec les nouvelles données une fois mis en production