In [31]:
# Setup et imports
import pandas as pd
import re
import xml.etree.ElementTree as ET
from xml.dom import minidom
import os
from datetime import datetime

print("🔧 Pipeline Vekta Amélioré - Prêt pour démonstration")
print("=" * 50)


🔧 Pipeline Vekta Amélioré - Prêt pour démonstration


In [32]:
# 1. Configuration de validation avec seuils de confiance RENFORCÉS
class ValidationConfig:
    CONFIDENCE_THRESHOLD_HIGH = 0.95    # Génération directe - EXCELLENCE REQUISE
    CONFIDENCE_THRESHOLD_MEDIUM = 0.75  # Génération avec avertissement - TRÈS BONNE CORRESPONDANCE  
    CONFIDENCE_THRESHOLD_LOW = 0.75     # Rejet en dessous - QUALITÉ MAXIMALE EXIGÉE
    
    REQUIRED_ENTITIES = {
        'duration': r'\d+\s*(min|h|s|sec|secondes?|minutes?|heures?)',
        'repetitions': r'\d+\s*[x*×]\s*',
        'intensity': r'(VO2|seuil|tempo|aerobic|sprint|endurance|threshold|anaerobic|neuromuscular)',
        'recovery': r'(recup|rec|recovery|repos)\s*\d+'
    }

print("✅ Configuration de validation chargée")
print(f"   Seuils: Élevé ≥{ValidationConfig.CONFIDENCE_THRESHOLD_HIGH}, Modéré ≥{ValidationConfig.CONFIDENCE_THRESHOLD_MEDIUM}, Faible ≥{ValidationConfig.CONFIDENCE_THRESHOLD_LOW}")


✅ Configuration de validation chargée
   Seuils: Élevé ≥0.95, Modéré ≥0.75, Faible ≥0.75


In [33]:
# 2. Corpus d'entraînements complet et réaliste
corpus_data = [
    # === ZONE AEROBIC / ENDURANCE ===
    {
        'id': 0, 'description': '1h endurance allure aerobic tranquille', 'zone': 'Aerobic',
        'structure': {'duration': 60, 'continuous': True}, 'power_zone': 2, 'intensity': 65,
        'variations': ['1h endur aerobic', '60min endurance facile', '1 heure aerobic tranquil', 'endurance 1h Z2']
    },
    {
        'id': 1, 'description': '2h endurance longue aerobic base', 'zone': 'Aerobic',
        'structure': {'duration': 120, 'continuous': True}, 'power_zone': 2, 'intensity': 62,
        'variations': ['2h endur longue', '120min aerobic', '2 heures endurance', 'sortie longue 2h']
    },
    {
        'id': 2, 'description': '90min endurance aerobic avec 3x5min tempo', 'zone': 'Aerobic',
        'structure': {'duration': 90, 'tempo_inserts': 3, 'tempo_duration': 5}, 'power_zone': 2, 'intensity': 68,
        'variations': ['90min endur + tempo', '1h30 aerobic tempo', 'endurance 90min tempo']
    },
    {
        'id': 3, 'description': '45min recuperation active facile', 'zone': 'Recovery',
        'structure': {'duration': 45, 'continuous': True}, 'power_zone': 1, 'intensity': 55,
        'variations': ['45min recup active', 'recuperation 45min', 'recup facile 45min', '45min Z1']
    },
    
    # === ZONE TEMPO / SWEET SPOT ===
    {
        'id': 4, 'description': '3x 10 min tempo sweet spot avec 5min recup', 'zone': 'Tempo',
        'structure': {'reps': 3, 'duration': 10, 'recovery': 5}, 'power_zone': 3, 'intensity': 85,
        'variations': ['3x10min tempo 5minrec', '3*10min sweet spot', '3 fois 10 minutes tempo', '3x10 tempo']
    },
    {
        'id': 5, 'description': '4x 12 min tempo 88% CP avec 4min recup', 'zone': 'Tempo',
        'structure': {'reps': 4, 'duration': 12, 'recovery': 4}, 'power_zone': 3, 'intensity': 88,
        'variations': ['4x12min tempo 88%', '4*12min 88% CP', '4 fois 12min tempo', '4x12 88%CP']
    },
    {
        'id': 6, 'description': '2x 20 min tempo sweet spot avec 10min recup', 'zone': 'Tempo',
        'structure': {'reps': 2, 'duration': 20, 'recovery': 10}, 'power_zone': 3, 'intensity': 86,
        'variations': ['2x20min tempo', '2*20min sweet spot', '2 fois 20 minutes tempo', '2x20 tempo']
    },
    {
        'id': 7, 'description': '45min tempo continu 85% CP', 'zone': 'Tempo',
        'structure': {'duration': 45, 'continuous': True}, 'power_zone': 3, 'intensity': 85,
        'variations': ['45min tempo continu', 'tempo 45min', '45 minutes tempo', 'tempo continu 45min']
    },
    {
        'id': 8, 'description': '6x 8 min tempo 90% CP avec 3min recup', 'zone': 'Tempo',
        'structure': {'reps': 6, 'duration': 8, 'recovery': 3}, 'power_zone': 3, 'intensity': 90,
        'variations': ['6x8min tempo 90%', '6*8min 90% CP', '6 fois 8min tempo', '6x8 tempo']
    },
    
    # === ZONE THRESHOLD / SEUIL ===
    {
        'id': 9, 'description': '4x 8 min seuil threshold 95% CP avec 4min recup', 'zone': 'Threshold',
        'structure': {'reps': 4, 'duration': 8, 'recovery': 4}, 'power_zone': 4, 'intensity': 95,
        'variations': ['4x8min seuil 95%', '4*8min threshold', '4 fois 8min seuil', '4x8 threshold 95%CP']
    },
    {
        'id': 10, 'description': '3x 12 min threshold FTP avec 6min recup', 'zone': 'Threshold',
        'structure': {'reps': 3, 'duration': 12, 'recovery': 6}, 'power_zone': 4, 'intensity': 100,
        'variations': ['3x12min FTP', '3*12min threshold', '3 fois 12min seuil', '3x12 seuil FTP']
    },
    {
        'id': 11, 'description': '2x 15 min seuil 102% FTP avec 8min recup', 'zone': 'Threshold',
        'structure': {'reps': 2, 'duration': 15, 'recovery': 8}, 'power_zone': 4, 'intensity': 102,
        'variations': ['2x15min seuil 102%', '2*15min 102% FTP', '2 fois 15min threshold', '2x15 seuil']
    },
    {
        'id': 12, 'description': '30min test FTP seuil continu', 'zone': 'Threshold',
        'structure': {'duration': 30, 'continuous': True}, 'power_zone': 4, 'intensity': 100,
        'variations': ['30min FTP test', 'test seuil 30min', '30 minutes FTP', 'test 30min threshold']
    },
    
    # === ZONE VO2MAX ===
    {
        'id': 13, 'description': '5x 5 min VO2max 110% CP avec 5min recup', 'zone': 'VO2max',
        'structure': {'reps': 5, 'duration': 5, 'recovery': 5}, 'power_zone': 5, 'intensity': 110,
        'variations': ['5x5min VO2 110%', '5*5min VO2max', '5 fois 5min VO2', '5x5 VO2max 110%CP']
    },
    {
        'id': 14, 'description': '8x 3 min VO2max 115% CP avec 3min recup', 'zone': 'VO2max',
        'structure': {'reps': 8, 'duration': 3, 'recovery': 3}, 'power_zone': 5, 'intensity': 115,
        'variations': ['8x3min VO2 115%', '8*3min VO2max', '8 fois 3min VO2', '8x3 VO2max']
    },
    {
        'id': 15, 'description': '6x 4 min VO2max 112% CP avec 4min recup', 'zone': 'VO2max',
        'structure': {'reps': 6, 'duration': 4, 'recovery': 4}, 'power_zone': 5, 'intensity': 112,
        'variations': ['6x4min VO2 112%', '6*4min VO2max', '6 fois 4min VO2', '6x4 VO2max 112%CP']
    },
    {
        'id': 16, 'description': '2x (3x 5 min VO2max avec 90s recup) recup 5min entre series', 'zone': 'VO2max',
        'structure': {'sets': 2, 'reps': 3, 'duration': 5, 'recovery_intra': 1.5, 'recovery_inter': 5}, 'power_zone': 5, 'intensity': 112,
        'variations': ['2x(3x5min VO2 90s)', '2*(3*5min VO2max)', '2 series 3x5min VO2', '2x3x5min VO2']
    },
    
    # === ZONE ANAEROBIC ===
    {
        'id': 17, 'description': '6x 1 min sprint anaerobic avec 2min recup', 'zone': 'Anaerobic',
        'structure': {'reps': 6, 'duration': 1, 'recovery': 2}, 'power_zone': 5, 'intensity': 130,
        'variations': ['6x1min sprint 2minrec', '6*1min anaerobic', '6 fois 1min sprint', '6x1 sprint']
    },
    {
        'id': 18, 'description': '8x 45s anaerobic 140% CP avec 3min recup', 'zone': 'Anaerobic',
        'structure': {'reps': 8, 'duration': 0.75, 'recovery': 3}, 'power_zone': 5, 'intensity': 140,
        'variations': ['8x45s anaerobic 140%', '8*45sec 140% CP', '8 fois 45s anaerobic', '8x45s 140%CP']
    },
    {
        'id': 19, 'description': '10x 30s anaerobic avec 90s recup', 'zone': 'Anaerobic',
        'structure': {'reps': 10, 'duration': 0.5, 'recovery': 1.5}, 'power_zone': 5, 'intensity': 135,
        'variations': ['10x30s anaerobic 90s', '10*30sec sprint', '10 fois 30s anaerobic', '10x30 sprint']
    },
    
    # === ZONE NEUROMUSCULAR ===
    {
        'id': 20, 'description': '5x 30s neuromuscular power max avec 4min30 recup complete', 'zone': 'Neuromuscular',
        'structure': {'reps': 5, 'duration': 0.5, 'recovery': 4.5}, 'power_zone': 6, 'intensity': 150,
        'variations': ['5x30s neuro max', '5*30sec sprint max', '5 fois 30s neuromuscular', '5x30 sprint max']
    },
    {
        'id': 21, 'description': '8x 15s sprint neuromuscular avec 3min recup', 'zone': 'Neuromuscular',
        'structure': {'reps': 8, 'duration': 0.25, 'recovery': 3}, 'power_zone': 6, 'intensity': 160,
        'variations': ['8x15s neuro sprint', '8*15sec neuromuscular', '8 fois 15s sprint', '8x15 neuro']
    },
    {
        'id': 22, 'description': '6x 20s sprint neuromuscular max avec 4min recup', 'zone': 'Neuromuscular',
        'structure': {'reps': 6, 'duration': 0.33, 'recovery': 4}, 'power_zone': 6, 'intensity': 155,
        'variations': ['6x20s neuro max', '6*20sec sprint', '6 fois 20s neuromuscular', '6x20 sprint max']
    },
    
    # === ENTRAÎNEMENTS COMPLEXES / PYRAMIDES ===
    {
        'id': 23, 'description': 'pyramide 1-2-3-4-3-2-1min VO2max avec 1min recup', 'zone': 'VO2max',
        'structure': {'pyramid': [1,2,3,4,3,2,1], 'recovery': 1}, 'power_zone': 5, 'intensity': 112,
        'variations': ['pyramide 1234321 VO2', 'pyramid 1-2-3-4-3-2-1min', 'pyramide VO2max', 'pyramid VO2']
    },
    {
        'id': 24, 'description': 'pyramide tempo 5-10-15-10-5min avec 5min recup', 'zone': 'Tempo',
        'structure': {'pyramid': [5,10,15,10,5], 'recovery': 5}, 'power_zone': 3, 'intensity': 87,
        'variations': ['pyramide tempo 5-10-15', 'pyramid tempo', 'pyramide sweet spot', 'pyramid 5-10-15-10-5']
    },
    {
        'id': 25, 'description': 'seance mixte 20min tempo + 5x3min VO2max', 'zone': 'Mixed',
        'structure': {'part1': {'duration': 20, 'intensity': 85}, 'part2': {'reps': 5, 'duration': 3, 'intensity': 112}}, 'power_zone': 4, 'intensity': 95,
        'variations': ['mixte tempo + VO2', '20min tempo + VO2max', 'seance combinee', 'tempo VO2 mixte']
    },
    
    # === TESTS ET ÉVALUATIONS ===
    {
        'id': 26, 'description': 'test 20min FTP maximal', 'zone': 'Test',
        'structure': {'duration': 20, 'test': True}, 'power_zone': 4, 'intensity': 105,
        'variations': ['test FTP 20min', 'test 20min maximal', 'évaluation FTP', 'test seuil 20min']
    },
    {
        'id': 27, 'description': 'test progressif par paliers 3min', 'zone': 'Test',
        'structure': {'progressive': True, 'step_duration': 3, 'steps': 8}, 'power_zone': 3, 'intensity': 80,
        'variations': ['test progressif paliers', 'test par paliers', 'évaluation progressive', 'test escalier']
    },
    
    # === ENTRAÎNEMENTS SPÉCIALISÉS ===
    {
        'id': 28, 'description': 'over-under 8x3min 95-105% FTP', 'zone': 'Threshold',
        'structure': {'reps': 8, 'duration': 3, 'over_under': True, 'low': 95, 'high': 105}, 'power_zone': 4, 'intensity': 100,
        'variations': ['over under 95-105%', '8x3min over-under', 'sur-sous seuil', 'over under FTP']
    },
    {
        'id': 29, 'description': 'micro-intervalles 15x1min ON/OFF VO2max', 'zone': 'VO2max',
        'structure': {'reps': 15, 'duration': 1, 'on_off': True}, 'power_zone': 5, 'intensity': 115,
        'variations': ['15x1min ON OFF', 'micro intervalles VO2', '15*1min on off', 'intervalles courts VO2']
    }
]

# Création du DataFrame avec toutes les variations
all_variations = []
for workout in corpus_data:
    # Entraînement principal
    main_entry = workout.copy()
    all_variations.append(main_entry)
    
    # Variations linguistiques
    if 'variations' in workout:
        for i, variation in enumerate(workout['variations']):
            var_entry = workout.copy()
            var_entry['id'] = f"{workout['id']}_v{i+1}"
            var_entry['description'] = variation
            var_entry['is_variation'] = True
            all_variations.append(var_entry)

corpus_df = pd.DataFrame(all_variations)

print(f"📚 Corpus complet chargé: {len(corpus_df)} entraînements")
print(f"   Entraînements principaux: {len(corpus_data)}")
print(f"   Variations linguistiques: {len(corpus_df) - len(corpus_data)}")
print(f"   Zones couvertes: {', '.join(sorted(corpus_df['zone'].unique()))}")

# Statistiques détaillées
zone_counts = corpus_df['zone'].value_counts()
print(f"\n📊 Répartition par zone:")
for zone, count in zone_counts.items():
    print(f"   {zone}: {count} entraînements")


📚 Corpus complet chargé: 149 entraînements
   Entraînements principaux: 30
   Variations linguistiques: 119
   Zones couvertes: Aerobic, Anaerobic, Mixed, Neuromuscular, Recovery, Tempo, Test, Threshold, VO2max

📊 Répartition par zone:
   Tempo: 30 entraînements
   VO2max: 30 entraînements
   Threshold: 25 entraînements
   Anaerobic: 15 entraînements
   Neuromuscular: 15 entraînements
   Aerobic: 14 entraînements
   Test: 10 entraînements
   Recovery: 5 entraînements
   Mixed: 5 entraînements


In [34]:
# 3. Validateur amélioré avec détection d'entités
class EnhancedValidator:
    def __init__(self, corpus_df):
        self.corpus_df = corpus_df
    
    def search_corpus(self, query):
        """Recherche sémantique avancée dans le corpus complet"""
        query_lower = query.lower()
        best_match = None
        best_score = 0.0
        
        # Recherche dans tous les entraînements et leurs variations
        for idx, workout in self.corpus_df.iterrows():
            score = self._calculate_similarity(query_lower, workout['description'].lower())
            
            # Boost pour les variations qui matchent exactement
            if 'variations' in workout and workout.get('variations'):
                for variation in workout['variations']:
                    var_score = self._calculate_similarity(query_lower, variation.lower())
                    if var_score > score:
                        score = var_score * 1.1  # Bonus pour variation exacte
            
            if score > best_score:
                best_score = score
                best_match = workout
        
        return best_match, min(best_score, 0.95)  # Cap à 95% pour réalisme
    
    def _calculate_similarity(self, query, description):
        """Calcule la similarité entre requête et description"""
        score = 0.0
        query_words = set(query.split())
        desc_words = set(description.split())
        
        # Mots-clés importants avec poids
        keywords = {
            # Zones d'entraînement
            'aerobic': 0.3, 'endurance': 0.3, 'tempo': 0.3, 'sweet': 0.2, 'spot': 0.2,
            'threshold': 0.3, 'seuil': 0.3, 'ftp': 0.3,
            'vo2max': 0.4, 'vo2': 0.4, 'anaerobic': 0.3,
            'neuromuscular': 0.4, 'neuro': 0.3, 'sprint': 0.3,
            'recovery': 0.2, 'recup': 0.2, 'recuperation': 0.2,
            
            # Structures d'entraînement
            'pyramide': 0.4, 'pyramid': 0.4, 'test': 0.3,
            'over': 0.3, 'under': 0.3, 'mixte': 0.3, 'micro': 0.3,
            
            # Durées communes
            '30s': 0.2, '45s': 0.2, '1min': 0.2, '3min': 0.2, '5min': 0.2,
            '8min': 0.2, '10min': 0.2, '12min': 0.2, '15min': 0.2, '20min': 0.2,
            '30min': 0.2, '45min': 0.2, '1h': 0.2, '90min': 0.2, '2h': 0.2,
            
            # Répétitions communes
            '3x': 0.2, '4x': 0.2, '5x': 0.2, '6x': 0.2, '8x': 0.2, '10x': 0.2,
            
            # Intensités
            '85%': 0.2, '88%': 0.2, '90%': 0.2, '95%': 0.2, '100%': 0.2,
            '102%': 0.2, '105%': 0.2, '110%': 0.2, '115%': 0.2, '140%': 0.2, '150%': 0.2
        }
        
        # Score basé sur les mots-clés
        for word in query_words:
            if word in desc_words:
                weight = keywords.get(word, 0.1)
                score += weight
        
        # Bonus pour correspondance exacte de patterns numériques
        import re
        query_numbers = re.findall(r'\d+', query)
        desc_numbers = re.findall(r'\d+', description)
        
        for num in query_numbers:
            if num in desc_numbers:
                score += 0.15
        
        # Pénalité si mots-clés importants manquent
        important_query_words = [w for w in query_words if keywords.get(w, 0) > 0.25]
        missing_important = [w for w in important_query_words if w not in desc_words]
        score -= len(missing_important) * 0.1
        
        return max(0.0, score)
    
    def detect_missing_entities(self, query):
        """Détecte les entités manquantes avec exemples précis"""
        entities = {}
        query_lower = query.lower()
        
        # Patterns améliorés avec plus de variantes
        enhanced_patterns = {
            'duration': {
                'pattern': r'(\d+\s*(min|h|s|sec|secondes?|minutes?|heures?)|(\d+:\d+))',
                'examples': ['10min', '1h', '30s', '1h30', '90min']
            },
            'repetitions': {
                'pattern': r'(\d+\s*[x*×]\s*|\d+\s*(fois|rep|reps|répétitions?))',
                'examples': ['3x', '5*', '4 fois', '6 reps']
            },
            'intensity': {
                'pattern': r'(VO2|seuil|tempo|aerobic|sprint|endurance|threshold|anaerobic|neuromuscular|sweet\s*spot|ftp|\d+%)',
                'examples': ['tempo', 'VO2max', 'seuil', 'aerobic', '85%', 'FTP']
            },
            'recovery': {
                'pattern': r'((recup|rec|recovery|repos|pause)\s*(\d+\s*(min|s|sec))?|\d+\s*(min|s|sec)\s*(recup|rec|recovery|repos))',
                'examples': ['5min recup', 'recup 2min', '90s repos', 'recovery 3min']
            }
        }
        
        for entity_type, config in enhanced_patterns.items():
            matches = re.findall(config['pattern'], query_lower, re.IGNORECASE)
            entities[entity_type] = {
                'found': len(matches) > 0,
                'examples': config['examples'],
                'matches': matches
            }
        
        missing_entities = []
        for entity_type, info in entities.items():
            if not info['found']:
                missing_entities.append({
                    'type': entity_type,
                    'examples': info['examples']
                })
        
        return missing_entities
    
    def validate_query(self, query):
        """Validation complète avec messages d'erreur précis"""
        best_match, confidence = self.search_corpus(query)
        
        # Fallback si aucun match trouvé
        if best_match is None:
            best_match = self.corpus_df.iloc[0]  # Premier entraînement par défaut
            confidence = 0.1
        
        result = {
            'query': query,
            'confidence': confidence,
            'corpus_match': best_match,
            'is_valid': False,
            'message': '',
            'missing_entities': []
        }
        
        if confidence >= ValidationConfig.CONFIDENCE_THRESHOLD_HIGH:
            result['is_valid'] = True
            result['message'] = f"✅ Confiance élevée ({confidence:.3f}) - Génération autorisée"
            
        elif confidence >= ValidationConfig.CONFIDENCE_THRESHOLD_MEDIUM:
            result['is_valid'] = True
            result['message'] = f"⚠️ Confiance modérée ({confidence:.3f}) - Vérifiez le résultat"
            
        elif confidence >= ValidationConfig.CONFIDENCE_THRESHOLD_LOW:
            missing = self.detect_missing_entities(query)
            result['missing_entities'] = missing
            result['message'] = f"❌ Confiance insuffisante ({confidence:.3f})\n"
            
            if missing:
                entities_fr = {
                    'duration': 'durée (ex: 10min, 1h)',
                    'repetitions': 'répétitions (ex: 3x, 5*)', 
                    'intensity': 'intensité/zone (ex: VO2, tempo)',
                    'recovery': 'récupération (ex: 2min recup)'
                }
                missing_fr = [entities_fr.get(e, e) for e in missing]
                result['message'] += f"Entités manquantes: {', '.join(missing_fr)}\n"
            
            result['message'] += f"Suggestion: {best_match['description']}"
            
        else:
            result['message'] = f"❌ Requête non reconnue (confiance: {confidence:.3f})\nSuggestion: {best_match['description']}"
        
        return result

validator = EnhancedValidator(corpus_df)
print("🔍 Validateur amélioré initialisé")


🔍 Validateur amélioré initialisé


In [35]:
# 4. Générateur XML avec vérification de fidélité
class ZWOGenerator:
    def __init__(self, output_dir="workouts_demo"):
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)
    
    def generate_zwo_xml(self, workout_data, confidence):
        """Génère le XML .zwo complet"""
        workout_file = ET.Element("workout_file")
        
        # Métadonnées avec traçabilité
        author = ET.SubElement(workout_file, "author")
        author.text = "Vekta Generator"
        
        name = ET.SubElement(workout_file, "name")
        name.text = f"Entraînement {workout_data['zone']}"
        
        description = ET.SubElement(workout_file, "description")
        description.text = f"Corpus #{workout_data['id']} - Confiance: {confidence:.3f}"
        
        sporttype = ET.SubElement(workout_file, "sportType")
        sporttype.text = "bike"
        
        tags = ET.SubElement(workout_file, "tags")
        
        # Structure workout fidèle au corpus
        workout = ET.SubElement(workout_file, "workout")
        
        # Échauffement
        warmup = ET.SubElement(workout, "Warmup", 
                             Duration="600", PowerLow="0.65", PowerHigh="0.65")
        
        # Corps de séance
        structure = workout_data['structure']
        if 'reps' in structure:
            for rep in range(structure['reps']):
                # Intervalle principal
                interval = ET.SubElement(workout, "SteadyState",
                                       Duration=str(int(structure['duration'] * 60)),
                                       Power=str(workout_data['intensity']/100))
                
                # Récupération (sauf dernière rep)
                if rep < structure['reps'] - 1:
                    recovery = ET.SubElement(workout, "SteadyState",
                                           Duration=str(int(structure['recovery'] * 60)),
                                           Power="0.50")
        else:
            # Entraînement continu
            continuous = ET.SubElement(workout, "SteadyState",
                                     Duration=str(int(structure['duration'] * 60)),
                                     Power=str(workout_data['intensity']/100))
        
        # Retour au calme
        cooldown = ET.SubElement(workout, "Cooldown",
                               Duration="600", PowerLow="0.50", PowerHigh="0.50")
        
        return ET.tostring(workout_file, encoding='unicode')
    
    def save_and_validate_zwo(self, workout_data, confidence):
        """Sauvegarde et valide le fichier .zwo"""
        timestamp = datetime.now().strftime("%H%M%S")
        filename = f"demo_{workout_data['zone'].lower()}_{timestamp}.zwo"
        filepath = os.path.join(self.output_dir, filename)
        
        try:
            # Génération XML
            xml_content = self.generate_zwo_xml(workout_data, confidence)
            
            # Formatage propre
            dom = minidom.parseString(xml_content)
            pretty_xml = dom.toprettyxml(indent="  ")
            pretty_xml = '\n'.join([line for line in pretty_xml.split('\n') if line.strip()])
            
            # Sauvegarde
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(pretty_xml)
            
            # Validation par re-parsing
            validation = self.validate_xml_fidelity(filepath, workout_data)
            
            return {
                'success': True,
                'filepath': filepath,
                'filesize': os.path.getsize(filepath),
                'validation': validation
            }
            
        except Exception as e:
            return {'success': False, 'error': str(e)}
    
    def validate_xml_fidelity(self, filepath, original_data):
        """Vérifie la fidélité XML vs corpus"""
        try:
            tree = ET.parse(filepath)
            root = tree.getroot()
            
            # Calcul durée totale XML
            workout = root.find('workout')
            total_duration_xml = 0
            segments_count = 0
            
            if workout is not None:
                for segment in workout:
                    duration = int(segment.get('Duration', 0))
                    total_duration_xml += duration
                    segments_count += 1
            
            # Calcul durée attendue
            structure = original_data['structure']
            if 'reps' in structure:
                expected_duration = (
                    10 +  # Échauffement
                    (structure['reps'] * structure['duration']) +
                    ((structure['reps'] - 1) * structure['recovery']) +
                    10  # Retour au calme
                )
                expected_segments = 1 + structure['reps'] + (structure['reps'] - 1) + 1
            else:
                expected_duration = 10 + structure['duration'] + 10
                expected_segments = 3
            
            duration_match = abs(total_duration_xml/60 - expected_duration) < 0.1
            segments_match = segments_count == expected_segments
            
            fidelity_score = sum([duration_match, segments_match, True]) / 3
            
            return {
                'xml_valid': True,
                'duration_match': duration_match,
                'segments_match': segments_match,
                'fidelity_score': fidelity_score,
                'expected_duration': expected_duration,
                'actual_duration': total_duration_xml/60,
                'expected_segments': expected_segments,
                'actual_segments': segments_count
            }
            
        except Exception as e:
            return {'xml_valid': False, 'error': str(e)}

zwo_generator = ZWOGenerator()
print("🔧 Générateur XML avec validation de fidélité initialisé")


🔧 Générateur XML avec validation de fidélité initialisé


In [36]:
# 🎯 Pipeline complet - Fonction de démonstration
def demonstrate_enhanced_pipeline(query):
    """Démonstration complète du pipeline amélioré"""
    
    print(f"\n🔍 REQUÊTE: '{query}'")
    print("=" * 60)
    
    # 1. Validation
    validation = validator.validate_query(query)
    print(f"📊 Confiance: {validation['confidence']:.3f}")
    print(f"📋 {validation['message']}")
    
    if validation['missing_entities']:
        print(f"🔍 Entités manquantes: {validation['missing_entities']}")
    
    # 2. Génération si validé
    if validation['is_valid']:
        print(f"\n🔧 GÉNÉRATION XML...")
        
        # Génération du fichier
        zwo_result = zwo_generator.save_and_validate_zwo(
            validation['corpus_match'], 
            validation['confidence']
        )
        
        if zwo_result['success']:
            print(f"✅ Fichier généré: {zwo_result['filepath']}")
            print(f"📏 Taille: {zwo_result['filesize']} bytes")
            
            # Vérification de fidélité
            val = zwo_result['validation']
            print(f"\n🎯 FIDÉLITÉ:")
            print(f"   XML valide: {val['xml_valid']}")
            print(f"   Durée: {val['actual_duration']:.1f}min (attendu: {val['expected_duration']}min) {'✅' if val['duration_match'] else '❌'}")
            print(f"   Segments: {val['actual_segments']} (attendu: {val['expected_segments']}) {'✅' if val['segments_match'] else '❌'}")
            print(f"   Score fidélité: {val['fidelity_score']:.1%}")
            
            # Aperçu du XML généré
            print(f"\n📄 APERÇU XML:")
            try:
                with open(zwo_result['filepath'], 'r') as f:
                    lines = f.readlines()[:10]  # Premières 10 lignes
                    for line in lines:
                        print(f"   {line.rstrip()}")
                    if len(lines) == 10:
                        print("   ...")
            except:
                print("   Erreur lecture fichier")
        else:
            print(f"❌ Erreur génération: {zwo_result.get('error', 'Inconnue')}")
    else:
        print("\n🚫 Génération bloquée par la validation")
    
    return validation, zwo_result if validation['is_valid'] else None

print("🚀 Fonction de démonstration prête")
print("   Usage: demonstrate_enhanced_pipeline('votre requête')")


🚀 Fonction de démonstration prête
   Usage: demonstrate_enhanced_pipeline('votre requête')


In [37]:
# ✅ Test 1: Confiance élevée - Génération directe
demonstrate_enhanced_pipeline("3x 10 min tempo avec 5min récup")



🔍 REQUÊTE: '3x 10 min tempo avec 5min récup'
📊 Confiance: 0.950
📋 ✅ Confiance élevée (0.950) - Génération autorisée

🔧 GÉNÉRATION XML...
✅ Fichier généré: workouts_demo/demo_tempo_140224.zwo
📏 Taille: 607 bytes

🎯 FIDÉLITÉ:
   XML valide: True
   Durée: 60.0min (attendu: 60min) ✅
   Segments: 7 (attendu: 7) ✅
   Score fidélité: 100.0%

📄 APERÇU XML:
   <?xml version="1.0" ?>
   <workout_file>
     <author>Vekta Generator</author>
     <name>Entraînement Tempo</name>
     <description>Corpus #4 - Confiance: 0.950</description>
     <sportType>bike</sportType>
     <tags/>
     <workout>
       <Warmup Duration="600" PowerLow="0.65" PowerHigh="0.65"/>
       <SteadyState Duration="600" Power="0.85"/>
   ...


({'query': '3x 10 min tempo avec 5min récup',
  'confidence': 0.95,
  'corpus_match': id                                                              4
  description            3x 10 min tempo sweet spot avec 5min recup
  zone                                                        Tempo
  structure              {'reps': 3, 'duration': 10, 'recovery': 5}
  power_zone                                                      3
  intensity                                                      85
  variations      [3x10min tempo 5minrec, 3*10min sweet spot, 3 ...
  is_variation                                                  NaN
  Name: 19, dtype: object,
  'is_valid': True,
  'message': '✅ Confiance élevée (0.950) - Génération autorisée',
  'missing_entities': []},
 {'success': True,
  'filepath': 'workouts_demo/demo_tempo_140224.zwo',
  'filesize': 607,
  'validation': {'xml_valid': True,
   'duration_match': True,
   'segments_match': True,
   'fidelity_score': 1.0,
   'expected_duration': 6

In [38]:
# ⚠️ Test 2: Confiance modérée - Génération avec avertissement
demonstrate_enhanced_pipeline("1h endurance aerobic")



🔍 REQUÊTE: '1h endurance aerobic'
📊 Confiance: 0.950
📋 ✅ Confiance élevée (0.950) - Génération autorisée

🔧 GÉNÉRATION XML...
✅ Fichier généré: workouts_demo/demo_aerobic_140224.zwo
📏 Taille: 422 bytes

🎯 FIDÉLITÉ:
   XML valide: True
   Durée: 80.0min (attendu: 80min) ✅
   Segments: 3 (attendu: 3) ✅
   Score fidélité: 100.0%

📄 APERÇU XML:
   <?xml version="1.0" ?>
   <workout_file>
     <author>Vekta Generator</author>
     <name>Entraînement Aerobic</name>
     <description>Corpus #0 - Confiance: 0.950</description>
     <sportType>bike</sportType>
     <tags/>
     <workout>
       <Warmup Duration="600" PowerLow="0.65" PowerHigh="0.65"/>
       <SteadyState Duration="3600" Power="0.65"/>
   ...


({'query': '1h endurance aerobic',
  'confidence': 0.95,
  'corpus_match': id                                                              0
  description                1h endurance allure aerobic tranquille
  zone                                                      Aerobic
  structure                    {'duration': 60, 'continuous': True}
  power_zone                                                      2
  intensity                                                      65
  variations      [1h endur aerobic, 60min endurance facile, 1 h...
  is_variation                                                  NaN
  Name: 0, dtype: object,
  'is_valid': True,
  'message': '✅ Confiance élevée (0.950) - Génération autorisée',
  'missing_entities': []},
 {'success': True,
  'filepath': 'workouts_demo/demo_aerobic_140224.zwo',
  'filesize': 422,
  'validation': {'xml_valid': True,
   'duration_match': True,
   'segments_match': True,
   'fidelity_score': 1.0,
   'expected_duration': 80,
   'act

In [39]:
# ❌ Test 3: Confiance faible - Rejet avec entités manquantes
demonstrate_enhanced_pipeline("entraînement pyramide")



🔍 REQUÊTE: 'entraînement pyramide'
📊 Confiance: 0.440
📋 ❌ Requête non reconnue (confiance: 0.440)
Suggestion: pyramid 1-2-3-4-3-2-1min

🚫 Génération bloquée par la validation


({'query': 'entraînement pyramide',
  'confidence': 0.44000000000000006,
  'corpus_match': id                                                          23_v2
  description                              pyramid 1-2-3-4-3-2-1min
  zone                                                       VO2max
  structure       {'pyramid': [1, 2, 3, 4, 3, 2, 1], 'recovery': 1}
  power_zone                                                      5
  intensity                                                     112
  variations      [pyramide 1234321 VO2, pyramid 1-2-3-4-3-2-1mi...
  is_variation                                                 True
  Name: 116, dtype: object,
  'is_valid': False,
  'message': '❌ Requête non reconnue (confiance: 0.440)\nSuggestion: pyramid 1-2-3-4-3-2-1min',
  'missing_entities': []},
 None)

In [40]:
# 🚫 Test 4: Rejet complet - Confiance très faible
demonstrate_enhanced_pipeline("faire du vélo")



🔍 REQUÊTE: 'faire du vélo'
📊 Confiance: 0.100
📋 ❌ Requête non reconnue (confiance: 0.100)
Suggestion: 1h endurance allure aerobic tranquille

🚫 Génération bloquée par la validation


({'query': 'faire du vélo',
  'confidence': 0.1,
  'corpus_match': id                                                              0
  description                1h endurance allure aerobic tranquille
  zone                                                      Aerobic
  structure                    {'duration': 60, 'continuous': True}
  power_zone                                                      2
  intensity                                                      65
  variations      [1h endur aerobic, 60min endurance facile, 1 h...
  is_variation                                                  NaN
  Name: 0, dtype: object,
  'is_valid': False,
  'message': '❌ Requête non reconnue (confiance: 0.100)\nSuggestion: 1h endurance allure aerobic tranquille',
  'missing_entities': []},
 None)