# Separate Semi-Supervised Training Algorithms

## 1. Pseudo-Labeling Algorithm

In [1]:
import os
import numpy as np
import pandas as pd
from pathlib import Path
import pickle
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score
import warnings
warnings.filterwarnings('ignore')

class PseudoLabelingTrainer:
    """Entra√Æneur pour pseudo-labeling"""
    
    def __init__(self, data_dir='preprocessed_ml_data'):
        self.data_dir = data_dir
        self.classes = ['dark', 'light', 'mid-dark', 'mid-light']
        self.load_data()
        self.results = []
    
    def load_data(self):
        """Charge les donn√©es pr√©trait√©es"""
        print("üìÇ Chargement des donn√©es pr√©trait√©es...")
        
        # Charger les datasets
        datasets = {}
        for split in ['train', 'val', 'test', 'unlabelled']:
            data = np.load(os.path.join(self.data_dir, f'{split}_data.npz'))
            datasets[split] = {
                'X': data['X'],
                'y': data['y'],
                'filenames': data['filenames']
            }
        
        self.X_train = datasets['train']['X']
        self.y_train = datasets['train']['y']
        self.X_val = datasets['val']['X'] 
        self.y_val = datasets['val']['y']
        self.X_test = datasets['test']['X']
        self.y_test = datasets['test']['y']
        self.X_unlabelled = datasets['unlabelled']['X']
        
        print(f"  ‚úì Donn√©es charg√©es: {len(self.X_train)} train, {len(self.X_val)} val, {len(self.X_test)} test, {len(self.X_unlabelled)} unlabelled")
    
    def train_pseudo_labeling(self, base_classifier='rf', confidence_threshold=0.8, max_iterations=10):
        """Impl√©mente le pseudo-labeling"""
        print(f"\nüîÑ PSEUDO-LABELING avec {base_classifier}, seuil={confidence_threshold}")
        
        # S√©lectionner le classifieur de base
        if base_classifier == 'rf':
            clf = RandomForestClassifier(n_estimators=100, random_state=42)
        elif base_classifier == 'svm':
            clf = SVC(probability=True, random_state=42)
        elif base_classifier == 'lr':
            clf = LogisticRegression(random_state=42, max_iter=1000)
        
        # Donn√©es initiales
        X_labeled = self.X_train.copy()
        y_labeled = self.y_train.copy()
        X_unlabeled = self.X_unlabelled.copy()
        
        best_accuracy = 0
        iteration_results = []
        
        for iteration in range(max_iterations):
            print(f"  It√©ration {iteration + 1}/{max_iterations}")
            
            # Entra√Æner sur donn√©es labellis√©es actuelles
            clf.fit(X_labeled, y_labeled)
            
            # Pr√©dire sur donn√©es non-labellis√©es
            if hasattr(clf, 'predict_proba'):
                proba_predictions = clf.predict_proba(X_unlabeled)
                predictions = np.argmax(proba_predictions, axis=1)
                confidences = np.max(proba_predictions, axis=1)
            else:
                predictions = clf.predict(X_unlabeled)
                confidences = np.ones(len(predictions))  # Pas de probabilit√©s
            
            # S√©lectionner les pr√©dictions confiantes
            confident_mask = confidences >= confidence_threshold
            X_confident = X_unlabeled[confident_mask]
            y_confident = predictions[confident_mask]
            
            if len(X_confident) == 0:
                print(f"    Aucune pr√©diction confiante trouv√©e, arr√™t √† l'it√©ration {iteration + 1}")
                break
            
            # Ajouter les donn√©es pseudo-labellis√©es
            X_labeled = np.vstack([X_labeled, X_confident])
            y_labeled = np.hstack([y_labeled, y_confident])
            
            # Retirer des donn√©es non-labellis√©es
            X_unlabeled = X_unlabeled[~confident_mask]
            
            print(f"    Ajout√© {len(X_confident)} pseudo-labels, {len(X_unlabeled)} restants")
            
            # √âvaluer sur validation
            val_accuracy = clf.score(self.X_val, self.y_val)
            iteration_results.append({
                'iteration': iteration + 1,
                'val_accuracy': val_accuracy,
                'pseudo_labels_added': len(X_confident),
                'remaining_unlabeled': len(X_unlabeled)
            })
            
            if val_accuracy > best_accuracy:
                best_accuracy = val_accuracy
        
        # √âvaluation finale sur test
        test_accuracy = clf.score(self.X_test, self.y_test)
        test_f1 = f1_score(self.y_test, clf.predict(self.X_test), average='weighted')
        
        result = {
            'method': 'pseudo_labeling',
            'base_classifier': base_classifier,
            'confidence_threshold': confidence_threshold,
            'max_iterations': max_iterations,
            'final_test_accuracy': test_accuracy,
            'final_test_f1': test_f1,
            'best_val_accuracy': best_accuracy,
            'iterations': iteration_results,
            'total_pseudo_labels': len(X_labeled) - len(self.X_train)
        }
        
        self.results.append(result)
        print(f"  ‚úÖ Test accuracy: {test_accuracy:.4f}, F1: {test_f1:.4f}")
        
        return result
    
    def run_multiple_sessions(self):
        """Ex√©cute plusieurs sessions avec diff√©rents param√®tres"""
        print("üöÄ PSEUDO-LABELING: SESSIONS MULTIPLES")
        print("=" * 50)
        
        for clf in ['rf', 'svm', 'lr']:
            for threshold in [0.7, 0.8, 0.9]:
                self.train_pseudo_labeling(base_classifier=clf, confidence_threshold=threshold)
    
    def save_results(self, output_file='pseudo_labeling_results.pkl'):
        """Sauvegarde les r√©sultats"""
        with open(output_file, 'wb') as f:
            pickle.dump(self.results, f)
        print(f"\nüíæ R√©sultats sauvegard√©s dans {output_file}")
    
    def display_summary(self):
        """Affiche un r√©sum√©"""
        print("\nüìä R√âSUM√â PSEUDO-LABELING")
        print("=" * 30)
        
        if not self.results:
            print("Aucun r√©sultat")
            return
        
        df = pd.DataFrame(self.results)
        print(f"  Exp√©riences: {len(df)}")
        print(f"  Accuracy moyenne: {df['final_test_accuracy'].mean():.4f}")
        print(f"  Meilleur r√©sultat: {df['final_test_accuracy'].max():.4f}")
        
        # Meilleur r√©sultat
        best_idx = df['final_test_accuracy'].idxmax()
        best = df.loc[best_idx]
        print(f"  Meilleure config: {best['base_classifier']} + seuil {best['confidence_threshold']}")

def main():
    """Fonction principale pour pseudo-labeling"""
    print("üéØ PSEUDO-LABELING POUR CLASSIFICATION DE TEINT DE PEAU")
    print("=" * 60)
    
    trainer = PseudoLabelingTrainer(data_dir='preprocessed_ml_data')
    trainer.run_multiple_sessions()
    trainer.save_results('pseudo_labeling_results.pkl')
    trainer.display_summary()
    
    print("\n‚úÖ Pseudo-labeling termin√©!")

if __name__ == "__main__":
    main()

üéØ PSEUDO-LABELING POUR CLASSIFICATION DE TEINT DE PEAU
üìÇ Chargement des donn√©es pr√©trait√©es...
  ‚úì Donn√©es charg√©es: 5748 train, 1228 val, 1236 test, 19164 unlabelled
üöÄ PSEUDO-LABELING: SESSIONS MULTIPLES

üîÑ PSEUDO-LABELING avec rf, seuil=0.7
  It√©ration 1/10
    Ajout√© 2402 pseudo-labels, 16762 restants
  It√©ration 2/10
    Ajout√© 1172 pseudo-labels, 15590 restants
  It√©ration 3/10
    Ajout√© 594 pseudo-labels, 14996 restants
  It√©ration 4/10
    Ajout√© 358 pseudo-labels, 14638 restants
  It√©ration 5/10
    Ajout√© 277 pseudo-labels, 14361 restants
  It√©ration 6/10
    Ajout√© 201 pseudo-labels, 14160 restants
  It√©ration 7/10
    Ajout√© 150 pseudo-labels, 14010 restants
  It√©ration 8/10
    Ajout√© 112 pseudo-labels, 13898 restants
  It√©ration 9/10
    Ajout√© 106 pseudo-labels, 13792 restants
  It√©ration 10/10
    Ajout√© 83 pseudo-labels, 13709 restants
  ‚úÖ Test accuracy: 0.6675, F1: 0.6569

üîÑ PSEUDO-LABELING avec rf, seuil=0.8
  It√©ration 1/

## 2. Graph-Based Methods Algorithm


In [4]:
import os
import numpy as np
import pandas as pd
from pathlib import Path
import pickle
import time
from sklearn.semi_supervised import LabelPropagation, LabelSpreading
from sklearn.metrics import classification_report, accuracy_score, f1_score
import warnings
warnings.filterwarnings('ignore')

class GraphBasedTrainer:
    """Entra√Æneur pour m√©thodes graph-based"""
    
    def __init__(self, data_dir='preprocessed_ml_data'):
        self.data_dir = data_dir
        self.classes = ['dark', 'light', 'mid-dark', 'mid-light']
        self.load_data()
        self.results = []
    
    def load_data(self):
        """Charge les donn√©es pr√©trait√©es"""
        print("üìÇ Chargement des donn√©es pr√©trait√©es...")
        
        datasets = {}
        for split in ['train', 'val', 'test', 'unlabelled']:
            data = np.load(os.path.join(self.data_dir, f'{split}_data.npz'))
            datasets[split] = {
                'X': data['X'],
                'y': data['y'],
                'filenames': data['filenames']
            }
        
        self.X_train = datasets['train']['X']
        self.y_train = datasets['train']['y']
        self.X_val = datasets['val']['X'] 
        self.y_val = datasets['val']['y']
        self.X_test = datasets['test']['X']
        self.y_test = datasets['test']['y']
        self.X_unlabelled = datasets['unlabelled']['X']
        
        print(f"  ‚úì Donn√©es charg√©es: {len(self.X_train)} train, {len(self.X_val)} val, {len(self.X_test)} test, {len(self.X_unlabelled)} unlabelled")
    
    def train_graph_method(self, method='propagation', kernel='rbf', gamma=20, alpha=0.2):
        """Impl√©mente les m√©thodes graph-based"""
        print(f"\nüï∏Ô∏è  {method.upper()}: kernel={kernel}, gamma={gamma}", end="")
        if method == 'spreading':
            print(f", alpha={alpha}")
        else:
            print("")
        
        # Combiner donn√©es labellis√©es et non-labellis√©es
        X_combined = np.vstack([self.X_train, self.X_unlabelled])
        y_combined = np.hstack([self.y_train, np.full(len(self.X_unlabelled), -1)])  # -1 pour non-labellis√©
        
        # S√©lectionner la m√©thode
        if method == 'propagation':
            clf = LabelPropagation(kernel=kernel, gamma=gamma, max_iter=1000)
        elif method == 'spreading':
            clf = LabelSpreading(kernel=kernel, gamma=gamma, alpha=alpha, max_iter=1000)
        
        # Entra√Æner
        start_time = time.time()
        clf.fit(X_combined, y_combined)
        training_time = time.time() - start_time
        
        # Pr√©dire sur test
        y_pred = clf.predict(self.X_test)
        
        # √âvaluer
        test_accuracy = accuracy_score(self.y_test, y_pred)
        test_f1 = f1_score(self.y_test, y_pred, average='weighted')
        
        result = {
            'method': f'graph_{method}',
            'kernel': kernel,
            'gamma': gamma,
            'alpha': alpha if method == 'spreading' else None,
            'test_accuracy': test_accuracy,
            'test_f1': test_f1,
            'training_time': training_time
        }
        
        self.results.append(result)
        print(f"  ‚úÖ Test accuracy: {test_accuracy:.4f}, F1: {test_f1:.4f}, Time: {training_time:.2f}s")
        
        return result
    
    def run_multiple_sessions(self):
        """Ex√©cute plusieurs sessions avec diff√©rents param√®tres"""
        print("üöÄ GRAPH-BASED METHODS: SESSIONS MULTIPLES")
        print("=" * 50)
        
        for method in ['propagation', 'spreading']:
            for gamma in [10, 20, 50]:
                if method == 'propagation':
                    # LabelPropagation n'a pas de param√®tre alpha
                    self.train_graph_method(method=method, gamma=gamma)
                else:
                    # LabelSpreading a un param√®tre alpha
                    for alpha in [0.1, 0.2, 0.5]:
                        self.train_graph_method(method=method, gamma=gamma, alpha=alpha)
    
    def save_results(self, output_file='graph_based_results.pkl'):
        """Sauvegarde les r√©sultats"""
        with open(output_file, 'wb') as f:
            pickle.dump(self.results, f)
        print(f"\nüíæ R√©sultats sauvegard√©s dans {output_file}")
    
    def display_summary(self):
        """Affiche un r√©sum√©"""
        print("\nüìä R√âSUM√â GRAPH-BASED METHODS")
        print("=" * 30)
        
        if not self.results:
            print("Aucun r√©sultat")
            return
        
        df = pd.DataFrame(self.results)
        print(f"  Exp√©riences: {len(df)}")
        print(f"  Accuracy moyenne: {df['test_accuracy'].mean():.4f}")
        print(f"  Meilleur r√©sultat: {df['test_accuracy'].max():.4f}")
        
        # Meilleur r√©sultat
        best_idx = df['test_accuracy'].idxmax()
        best = df.loc[best_idx]
        config_str = f"{best['method']} + gamma {best['gamma']}"
        if best['alpha'] is not None:
            config_str += f" + alpha {best['alpha']}"
        print(f"  Meilleure config: {config_str}")

def main():
    """Fonction principale pour graph-based methods"""
    print("üéØ GRAPH-BASED METHODS POUR CLASSIFICATION DE TEINT DE PEAU")
    print("=" * 60)
    
    trainer = GraphBasedTrainer(data_dir='preprocessed_ml_data')
    trainer.run_multiple_sessions()
    trainer.save_results('graph_based_results.pkl')
    trainer.display_summary()
    
    print("\n‚úÖ Graph-based methods termin√©!")

if __name__ == "__main__":
    main()

üéØ GRAPH-BASED METHODS POUR CLASSIFICATION DE TEINT DE PEAU
üìÇ Chargement des donn√©es pr√©trait√©es...
  ‚úì Donn√©es charg√©es: 5748 train, 1228 val, 1236 test, 19164 unlabelled
üöÄ GRAPH-BASED METHODS: SESSIONS MULTIPLES

üï∏Ô∏è  PROPAGATION: kernel=rbf, gamma=10
  ‚úÖ Test accuracy: 0.2702, F1: 0.1393, Time: 34.67s

üï∏Ô∏è  PROPAGATION: kernel=rbf, gamma=20
  ‚úÖ Test accuracy: 0.2702, F1: 0.1393, Time: 23.45s

üï∏Ô∏è  PROPAGATION: kernel=rbf, gamma=50
  ‚úÖ Test accuracy: 0.2702, F1: 0.1393, Time: 22.56s

üï∏Ô∏è  SPREADING: kernel=rbf, gamma=10, alpha=0.1
  ‚úÖ Test accuracy: 0.2702, F1: 0.1393, Time: 70.25s

üï∏Ô∏è  SPREADING: kernel=rbf, gamma=10, alpha=0.2
  ‚úÖ Test accuracy: 0.2702, F1: 0.1393, Time: 92.23s

üï∏Ô∏è  SPREADING: kernel=rbf, gamma=10, alpha=0.5
  ‚úÖ Test accuracy: 0.2702, F1: 0.1393, Time: 218.44s

üï∏Ô∏è  SPREADING: kernel=rbf, gamma=20, alpha=0.1
  ‚úÖ Test accuracy: 0.2702, F1: 0.1393, Time: 70.38s

üï∏Ô∏è  SPREADING: kernel=rbf, gamma=20, alpha

## 3. Consistency Regularization Algorithm

In [3]:
import os
import numpy as np
import pandas as pd
from pathlib import Path
import pickle
import time
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score
import warnings
warnings.filterwarnings('ignore')

class ConsistencyTrainer:
    """Entra√Æneur pour r√©gularisation de coh√©rence"""
    
    def __init__(self, data_dir='preprocessed_ml_data'):
        self.data_dir = data_dir
        self.classes = ['dark', 'light', 'mid-dark', 'mid-light']
        self.load_data()
        self.results = []
    
    def load_data(self):
        """Charge les donn√©es pr√©trait√©es"""
        print("üìÇ Chargement des donn√©es pr√©trait√©es...")
        
        datasets = {}
        for split in ['train', 'val', 'test', 'unlabelled']:
            data = np.load(os.path.join(self.data_dir, f'{split}_data.npz'))
            datasets[split] = {
                'X': data['X'],
                'y': data['y'],
                'filenames': data['filenames']
            }
        
        self.X_train = datasets['train']['X']
        self.y_train = datasets['train']['y']
        self.X_val = datasets['val']['X'] 
        self.y_val = datasets['val']['y']
        self.X_test = datasets['test']['X']
        self.y_test = datasets['test']['y']
        self.X_unlabelled = datasets['unlabelled']['X']
        
        print(f"  ‚úì Donn√©es charg√©es: {len(self.X_train)} train, {len(self.X_val)} val, {len(self.X_test)} test, {len(self.X_unlabelled)} unlabelled")
    
    def train_consistency_regularization(self, n_estimators=10, perturbation_strength=0.1):
        """Impl√©mente la r√©gularisation de coh√©rence avec ensemble"""
        print(f"\nüîÑ CONSISTENCY REGULARIZATION: {n_estimators} estimators, perturbation={perturbation_strength}")
        
        # Cr√©er des perturbations des donn√©es d'entra√Ænement
        X_perturbed = []
        y_perturbed = []
        
        for _ in range(n_estimators):
            # Ajouter du bruit gaussien
            noise = np.random.normal(0, perturbation_strength, self.X_train.shape)
            X_pert = self.X_train + noise
            X_perturbed.append(X_pert)
            y_perturbed.append(self.y_train)
        
        X_perturbed = np.vstack(X_perturbed)
        y_perturbed = np.hstack(y_perturbed)
        
        # Entra√Æner un classifieur d'ensemble
        clf = ExtraTreesClassifier(n_estimators=n_estimators, random_state=42)
        
        start_time = time.time()
        clf.fit(X_perturbed, y_perturbed)
        training_time = time.time() - start_time
        
        # √âvaluer
        y_pred = clf.predict(self.X_test)
        test_accuracy = accuracy_score(self.y_test, y_pred)
        test_f1 = f1_score(self.y_test, y_pred, average='weighted')
        
        result = {
            'method': 'consistency_regularization',
            'n_estimators': n_estimators,
            'perturbation_strength': perturbation_strength,
            'test_accuracy': test_accuracy,
            'test_f1': test_f1,
            'training_time': training_time
        }
        
        self.results.append(result)
        print(f"  ‚úÖ Test accuracy: {test_accuracy:.4f}, F1: {test_f1:.4f}, Time: {training_time:.2f}s")
        
        return result
    
    def run_multiple_sessions(self):
        """Ex√©cute plusieurs sessions avec diff√©rents param√®tres"""
        print("üöÄ CONSISTENCY REGULARIZATION: SESSIONS MULTIPLES")
        print("=" * 50)
        
        for n_est in [5, 10, 20]:
            for pert in [0.05, 0.1, 0.2]:
                self.train_consistency_regularization(n_estimators=n_est, perturbation_strength=pert)
    
    def save_results(self, output_file='consistency_results.pkl'):
        """Sauvegarde les r√©sultats"""
        with open(output_file, 'wb') as f:
            pickle.dump(self.results, f)
        print(f"\nüíæ R√©sultats sauvegard√©s dans {output_file}")
    
    def display_summary(self):
        """Affiche un r√©sum√©"""
        print("\nüìä R√âSUM√â CONSISTENCY REGULARIZATION")
        print("=" * 35)
        
        if not self.results:
            print("Aucun r√©sultat")
            return
        
        df = pd.DataFrame(self.results)
        print(f"  Exp√©riences: {len(df)}")
        print(f"  Accuracy moyenne: {df['test_accuracy'].mean():.4f}")
        print(f"  Meilleur r√©sultat: {df['test_accuracy'].max():.4f}")
        
        # Meilleur r√©sultat
        best_idx = df['test_accuracy'].idxmax()
        best = df.loc[best_idx]
        print(f"  Meilleure config: {best['n_estimators']} estimators + perturbation {best['perturbation_strength']}")

def main():
    """Fonction principale pour consistency regularization"""
    print("üéØ CONSISTENCY REGULARIZATION POUR CLASSIFICATION DE TEINT DE PEAU")
    print("=" * 60)
    
    trainer = ConsistencyTrainer(data_dir='preprocessed_ml_data')
    trainer.run_multiple_sessions()
    trainer.save_results('consistency_results.pkl')
    trainer.display_summary()
    
    print("\n‚úÖ Consistency regularization termin√©!")

if __name__ == "__main__":
    main()

üéØ CONSISTENCY REGULARIZATION POUR CLASSIFICATION DE TEINT DE PEAU
üìÇ Chargement des donn√©es pr√©trait√©es...
  ‚úì Donn√©es charg√©es: 5748 train, 1228 val, 1236 test, 19164 unlabelled
üöÄ CONSISTENCY REGULARIZATION: SESSIONS MULTIPLES

üîÑ CONSISTENCY REGULARIZATION: 5 estimators, perturbation=0.05
  ‚úÖ Test accuracy: 0.5631, F1: 0.5494, Time: 0.31s

üîÑ CONSISTENCY REGULARIZATION: 5 estimators, perturbation=0.1
  ‚úÖ Test accuracy: 0.5761, F1: 0.5614, Time: 0.33s

üîÑ CONSISTENCY REGULARIZATION: 5 estimators, perturbation=0.2
  ‚úÖ Test accuracy: 0.5655, F1: 0.5477, Time: 0.34s

üîÑ CONSISTENCY REGULARIZATION: 10 estimators, perturbation=0.05
  ‚úÖ Test accuracy: 0.5979, F1: 0.5865, Time: 1.41s

üîÑ CONSISTENCY REGULARIZATION: 10 estimators, perturbation=0.1
  ‚úÖ Test accuracy: 0.5906, F1: 0.5776, Time: 1.33s

üîÑ CONSISTENCY REGULARIZATION: 10 estimators, perturbation=0.2
  ‚úÖ Test accuracy: 0.6303, F1: 0.6187, Time: 1.33s

üîÑ CONSISTENCY REGULARIZATION: 20 estimat

# save the best model 

In [6]:
import os
import numpy as np
import pandas as pd
from pathlib import Path
import pickle
import time
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.semi_supervised import LabelPropagation, LabelSpreading
from sklearn.metrics import classification_report, accuracy_score, f1_score
import warnings
warnings.filterwarnings('ignore')

class ModelSelector:
    """S√©lecteur et sauvegardeur du meilleur mod√®le"""
    
    def __init__(self, data_dir='preprocessed_ml_data'):
        self.data_dir = data_dir
        self.load_data()
    
    def load_data(self):
        """Charge les donn√©es pr√©trait√©es"""
        print("üìÇ Chargement des donn√©es pr√©trait√©es...")
        
        datasets = {}
        for split in ['train', 'val', 'test', 'unlabelled']:
            data = np.load(os.path.join(self.data_dir, f'{split}_data.npz'))
            datasets[split] = {
                'X': data['X'],
                'y': data['y'],
                'filenames': data['filenames']
            }
        
        self.X_train = datasets['train']['X']
        self.y_train = datasets['train']['y']
        self.X_val = datasets['val']['X'] 
        self.y_val = datasets['val']['y']
        self.X_test = datasets['test']['X']
        self.y_test = datasets['test']['y']
        self.X_unlabelled = datasets['unlabelled']['X']
        
        print(f"  ‚úì Donn√©es charg√©es: {len(self.X_train)} train, {len(self.X_val)} val, {len(self.X_test)} test, {len(self.X_unlabelled)} unlabelled")
    
    def load_and_compare_results(self):
        """Charge et compare tous les r√©sultats d'entra√Ænement"""
        result_files = [
            'pseudo_labeling_results.pkl',
            'graph_based_results.pkl', 
            'consistency_results.pkl'
        ]
        
        all_results = []
        
        for file in result_files:
            if os.path.exists(file):
                with open(file, 'rb') as f:
                    results = pickle.load(f)
                    all_results.extend(results)
                    print(f"‚úì Charg√© {len(results)} r√©sultats depuis {file}")
            else:
                print(f"‚ö†Ô∏è Fichier {file} non trouv√©")
        
        if not all_results:
            print("‚ùå Aucun r√©sultat trouv√©!")
            return None
        
        return all_results
    
    def select_best_model(self, all_results):
        """S√©lectionne le meilleur mod√®le bas√© sur l'accuracy de test"""
        # Fonction pour obtenir l'accuracy (g√®re les cl√©s diff√©rentes)
        def get_accuracy(result):
            if 'test_accuracy' in result:
                return result['test_accuracy']
            elif 'final_test_accuracy' in result:
                return result['final_test_accuracy']
            else:
                return 0  # Valeur par d√©faut si pas trouv√©
        
        best_result = max(all_results, key=get_accuracy)
        
        # Obtenir l'accuracy pour l'affichage
        accuracy = get_accuracy(best_result)
        f1_score_val = best_result.get('test_f1', best_result.get('final_test_f1', 0))
        
        print("\nüèÜ MEILLEUR MOD√àLE TROUV√â:")
        print(f"  M√©thode: {best_result['method']}")
        print(f"  Accuracy de test: {accuracy:.4f}")
        print(f"  F1-score: {f1_score_val:.4f}")
        
        # Afficher les param√®tres sp√©cifiques
        params = {k: v for k, v in best_result.items() 
                 if k not in ['method', 'test_accuracy', 'final_test_accuracy', 'test_f1', 'final_test_f1', 'training_time', 'iterations']}
        print(f"  Param√®tres: {params}")
        
        return best_result
    
    def retrain_best_model(self, best_result):
        """R√©entra√Æne le meilleur mod√®le avec tous les param√®tres optimaux"""
        method = best_result['method']
        print(f"\nüîÑ R√©entra√Ænement du mod√®le: {method}")
        
        if method.startswith('pseudo_labeling'):
            # Pseudo-labeling
            base_classifier = best_result['base_classifier']
            confidence_threshold = best_result['confidence_threshold']
            
            if base_classifier == 'rf':
                clf = RandomForestClassifier(n_estimators=100, random_state=42)
            elif base_classifier == 'svm':
                clf = SVC(probability=True, random_state=42)
            elif base_classifier == 'lr':
                clf = LogisticRegression(random_state=42, max_iter=1000)
            
            # Entra√Æner sur les donn√©es labellis√©es
            clf.fit(self.X_train, self.y_train)
            
        elif method.startswith('graph_'):
            # Graph-based
            method_type = method.split('_')[1]  # 'propagation' or 'spreading'
            kernel = best_result['kernel']
            gamma = best_result['gamma']
            
            X_combined = np.vstack([self.X_train, self.X_unlabelled])
            y_combined = np.hstack([self.y_train, np.full(len(self.X_unlabelled), -1)])
            
            if method_type == 'propagation':
                clf = LabelPropagation(kernel=kernel, gamma=gamma, max_iter=1000)
            elif method_type == 'spreading':
                alpha = best_result.get('alpha', 0.2)
                clf = LabelSpreading(kernel=kernel, gamma=gamma, alpha=alpha, max_iter=1000)
            
            clf.fit(X_combined, y_combined)
            
        elif method == 'consistency_regularization':
            # Consistency regularization
            n_estimators = best_result['n_estimators']
            perturbation_strength = best_result['perturbation_strength']
            
            # Cr√©er des perturbations
            X_perturbed = []
            y_perturbed = []
            
            for _ in range(n_estimators):
                noise = np.random.normal(0, perturbation_strength, self.X_train.shape)
                X_pert = self.X_train + noise
                X_perturbed.append(X_pert)
                y_perturbed.append(self.y_train)
            
            X_perturbed = np.vstack(X_perturbed)
            y_perturbed = np.hstack(y_perturbed)
            
            clf = ExtraTreesClassifier(n_estimators=n_estimators, random_state=42)
            clf.fit(X_perturbed, y_perturbed)
        
        # Validation finale
        test_accuracy = clf.score(self.X_test, self.y_test)
        test_f1 = f1_score(self.y_test, clf.predict(self.X_test), average='weighted')
        
        print(f"  ‚úÖ Mod√®le r√©entra√Æn√© - Test accuracy: {test_accuracy:.4f}, F1: {test_f1:.4f}")
        
        return clf
    
    def save_best_model(self, model, filename='best_model.pkl'):
        """Sauvegarde le meilleur mod√®le"""
        with open(filename, 'wb') as f:
            pickle.dump(model, f)
        print(f"üíæ Mod√®le sauvegard√© dans {filename}")

def main():
    """Fonction principale pour s√©lection et sauvegarde du meilleur mod√®le"""
    print("üéØ S√âLECTION ET SAUVEGARDE DU MEILLEUR MOD√àLE")
    print("=" * 60)
    
    # Initialiser le s√©lecteur
    selector = ModelSelector(data_dir='preprocessed_ml_data')
    
    # Charger et comparer les r√©sultats
    all_results = selector.load_and_compare_results()
    
    if all_results:
        # S√©lectionner le meilleur
        best_result = selector.select_best_model(all_results)
        
        # R√©entra√Æner le meilleur mod√®le
        best_model = selector.retrain_best_model(best_result)
        
        # Sauvegarder
        selector.save_best_model(best_model, 'best_model.pkl')
        
        print("\n‚úÖ Meilleur mod√®le pr√™t pour le d√©ploiement!")
        print("Vous pouvez maintenant lancer l'application Streamlit.")
    
    else:
        print("‚ùå Impossible de trouver des r√©sultats d'entra√Ænement.")
        print("Assurez-vous d'avoir ex√©cut√© les scripts d'entra√Ænement.")

if __name__ == "__main__":
    main()

üéØ S√âLECTION ET SAUVEGARDE DU MEILLEUR MOD√àLE
üìÇ Chargement des donn√©es pr√©trait√©es...
  ‚úì Donn√©es charg√©es: 5748 train, 1228 val, 1236 test, 19164 unlabelled
‚úì Charg√© 9 r√©sultats depuis pseudo_labeling_results.pkl
‚úì Charg√© 12 r√©sultats depuis graph_based_results.pkl
‚úì Charg√© 9 r√©sultats depuis consistency_results.pkl

üèÜ MEILLEUR MOD√àLE TROUV√â:
  M√©thode: pseudo_labeling
  Accuracy de test: 0.7362
  F1-score: 0.7341
  Param√®tres: {'base_classifier': 'svm', 'confidence_threshold': 0.7, 'max_iterations': 10, 'best_val_accuracy': 0.738599348534202, 'total_pseudo_labels': 17088}

üîÑ R√©entra√Ænement du mod√®le: pseudo_labeling
  ‚úÖ Mod√®le r√©entra√Æn√© - Test accuracy: 0.7257, F1: 0.7250
üíæ Mod√®le sauvegard√© dans best_model.pkl

‚úÖ Meilleur mod√®le pr√™t pour le d√©ploiement!
Vous pouvez maintenant lancer l'application Streamlit.
