In [None]:
# Imports et configuration
import os
import re
import json
import uuid
from typing import Dict, List, Tuple, Optional, Any
from dataclasses import dataclass, asdict
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# RAG et embeddings
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings

# Langchain pour RAG
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings

# Utilitaires
import numpy as np
from difflib import SequenceMatcher
import xml.etree.ElementTree as ET

# Startup banner: confirm the imports above succeeded and report library versions.
print("✅ Imports RAG chargés avec succès")
# NOTE(review): this probes the SentenceTransformer *class* for `__version__`
# and prints 'OK' when the attribute is absent — confirm whether the package
# version was intended instead.
print(f"📦 Sentence-transformers version: {SentenceTransformer.__version__ if hasattr(SentenceTransformer, '__version__') else 'OK'}")
print(f"📦 ChromaDB version: {chromadb.__version__}")

# Global configuration
# Embedding model identifier used as the default by VectorEmbeddingSystem.
# NOTE(review): the original comment called this an "optimized multilingual
# model"; all-MiniLM-L6-v2 is presumably English-centric — confirm it is
# adequate for French queries.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# Directory path for the Chroma store (presumably for persistence; not used in
# the cells visible here — verify).
CHROMA_PERSIST_DIR = "./chroma_db"
# Hybrid-score thresholds read by RAGPipeline._determine_confidence_level:
# score >= HIGH -> "HIGH", score >= LOW -> "MEDIUM", otherwise "LOW".
CONFIDENCE_THRESHOLD_HIGH = 0.85
CONFIDENCE_THRESHOLD_LOW = 0.65


In [None]:
# 1. SYSTÈME DE CORRECTION ORTHOGRAPHIQUE (Existant - Validé)
# Reprise du système de correction qui a fait ses preuves

class SpellChecker:
    """Dictionary-based normaliser for French cycling-workout queries.

    Correction happens in two passes:

    1. multi-word expressions listed in ``compound_expressions`` are replaced
       on whole-word boundaries;
    2. every remaining word is looked up in ``cycling_vocabulary`` (exact match
       after accent/case normalisation), then, for sufficiently long purely
       alphabetic words, matched fuzzily by Levenshtein distance.
    """

    # Fuzzy matching is only attempted between words and vocabulary keys that
    # are at least this long. With shorter strings a distance of 1-2 matches
    # almost anything (e.g. "3" is at distance 1 from the key "x").
    _MIN_FUZZY_LENGTH = 4

    # Splits a whitespace-delimited token into (leading punctuation, core,
    # trailing punctuation) so punctuation can be put back where it was.
    _EDGE_PUNCTUATION = re.compile(r'^(\W*)(.*?)(\W*)$')

    def __init__(self):
        # French cycling vocabulary: misspelling / slang -> canonical term.
        self.cycling_vocabulary = {
            # Phonetic misspellings
            'aerobik': 'aerobic',
            'seuille': 'seuil',
            'piramide': 'pyramide',
            'recupe': 'recup',
            'echauffman': 'echauffement',

            # Colloquial wording -> technical wording
            'doie': 'dois',
            'chaude': 'echauffement',
            'set': 'series',
            'pose': 'repos',
            'fini': 'finir',
            'avk': 'avec',
            'facile': 'facile',
            'mn': 'min',
            'minut': 'minutes',
            'dix': '10',

            # Compound expressions (also handled first, see below)
            'a fond': 'max',
            'cool down': 'retour au calme',
            'warm up': 'echauffement',
            'over under': 'over-under',
            'sweet spot': 'sweet-spot',

            # Training zones
            'z1': 'zone1',
            'z2': 'zone2',
            'z3': 'zone3',
            'z4': 'zone4',
            'z5': 'zone5',
            'vo2max': 'vo2',
            'vo2 max': 'vo2',
            'ftp': 'seuil',

            # Durations and repetitions
            'x': 'fois',
            'rep': 'repetitions',
            'reps': 'repetitions',
            'sec': 'secondes',
            's': 'secondes',
            'h': 'heures',
        }

        # Multi-word expressions replaced before the word-by-word pass.
        self.compound_expressions = [
            ('a fond', 'max'),
            ('cool down', 'retour au calme'),
            ('warm up', 'echauffement'),
            ('over under', 'over-under'),
            ('sweet spot', 'sweet-spot'),
            ('vo2 max', 'vo2'),
        ]

    def levenshtein_distance(self, s1: str, s2: str) -> int:
        """Return the Levenshtein (edit) distance between ``s1`` and ``s2``."""
        # Keep the longer string in the outer loop so each row is as short
        # as possible.
        if len(s1) < len(s2):
            s1, s2 = s2, s1

        if not s2:
            return len(s1)

        previous_row = list(range(len(s2) + 1))
        for i, c1 in enumerate(s1):
            current_row = [i + 1]
            for j, c2 in enumerate(s2):
                insertions = previous_row[j + 1] + 1
                deletions = current_row[j] + 1
                substitutions = previous_row[j] + (c1 != c2)
                current_row.append(min(insertions, deletions, substitutions))
            previous_row = current_row

        return previous_row[-1]

    def normalize_text(self, text: str) -> str:
        """Return ``text`` lower-cased, stripped, with combining accents removed."""
        import unicodedata
        decomposed = unicodedata.normalize('NFD', text)
        without_accents = ''.join(
            c for c in decomposed if unicodedata.category(c) != 'Mn'
        )
        return without_accents.lower().strip()

    def correct_compound_expressions(self, text: str) -> Tuple[str, List[str]]:
        """Replace known multi-word expressions in ``text``.

        Matching is case-insensitive and anchored on word boundaries, so
        "a fond" is not replaced inside "la fondation".

        Returns:
            ``(corrected_text, corrections)`` where ``corrections`` holds one
            human-readable entry per expression that matched at least once.
        """
        corrections = []
        corrected_text = text

        for original, correction in self.compound_expressions:
            pattern = re.compile(
                r'(?<!\w)' + re.escape(original) + r'(?!\w)', re.IGNORECASE
            )
            # A callable replacement keeps the correction literal (no
            # backslash/group interpretation).
            corrected_text, hits = pattern.subn(
                lambda _match, replacement=correction: replacement,
                corrected_text,
            )
            if hits:
                corrections.append(f"'{original}' → '{correction}'")

        return corrected_text, corrections

    def _canonical_words(self) -> set:
        """Return every correction target, whole and split into single words."""
        canonical = set()
        for target in self.cycling_vocabulary.values():
            canonical.add(target)
            canonical.update(target.split())
        return canonical

    def _is_fuzzy_candidate(self, normalized_word: str) -> bool:
        """Tell whether fuzzy matching may be attempted on ``normalized_word``.

        Short words, tokens containing digits and words that are already a
        correction target are left alone: rewriting them is far more likely
        to corrupt the query than to fix it.
        """
        if len(normalized_word) < self._MIN_FUZZY_LENGTH:
            return False
        if any(ch.isdigit() for ch in normalized_word):
            return False
        return normalized_word not in self._canonical_words()

    def correct_word(self, word: str) -> Tuple[str, bool]:
        """Correct a single word.

        Returns:
            ``(corrected, True)`` when the word is a vocabulary key or fuzzily
            matches one, ``(word, False)`` otherwise. Note that a direct hit
            may map a word to itself (e.g. ``'facile'``).
        """
        normalized_word = self.normalize_text(word)
        vocabulary = self.cycling_vocabulary

        # Exact lookup first.
        if normalized_word in vocabulary:
            return vocabulary[normalized_word], True

        if not self._is_fuzzy_candidate(normalized_word):
            return word, False

        # Tolerance scales with length: one edit for short words, two beyond.
        max_distance = 1 if len(normalized_word) <= 5 else 2
        best_match = None
        best_distance = max_distance + 1

        for vocab_word in vocabulary:
            # Multi-word keys belong to the compound pass; very short keys
            # would match nearly any input.
            if ' ' in vocab_word or len(vocab_word) < self._MIN_FUZZY_LENGTH:
                continue
            # The length difference is a lower bound on the edit distance.
            if abs(len(vocab_word) - len(normalized_word)) > max_distance:
                continue
            distance = self.levenshtein_distance(normalized_word, vocab_word)
            if distance < best_distance:
                best_distance = distance
                best_match = vocab_word

        if best_match is not None:
            return vocabulary[best_match], True

        return word, False

    def correct_text(self, text: str) -> Dict[str, Any]:
        """Correct a full text and return a detailed report.

        Returns:
            A dict with keys ``original_text``, ``corrected_text``,
            ``corrections`` (list of descriptions), ``correction_count``,
            ``correction_confidence`` (between 0.5 and 1.0, lower when a
            larger share of the words was corrected) and ``was_corrected``.
        """
        # Step 1: compound expressions.
        corrected_text, compound_corrections = self.correct_compound_expressions(text)

        # Step 2: word by word.
        words = corrected_text.split()
        corrected_words = []
        word_corrections = []

        for word in words:
            leading, core, trailing = self._EDGE_PUNCTUATION.match(word).groups()
            if not core:
                # Punctuation-only token: keep untouched.
                corrected_words.append(word)
                continue

            corrected_word, was_corrected = self.correct_word(core)
            if was_corrected and corrected_word != core:
                word_corrections.append(f"'{core}' → '{corrected_word}'")

            # Punctuation goes back on the side it came from; inner
            # punctuation (e.g. the hyphen of "over-under") is never touched.
            corrected_words.append(leading + corrected_word + trailing)

        final_text = ' '.join(corrected_words)
        all_corrections = compound_corrections + word_corrections

        # Confidence drops linearly with the share of corrected words,
        # floored at 0.5.
        total_words = len(words)
        corrected_count = len(all_corrections)
        correction_confidence = max(0.5, 1 - (corrected_count / max(total_words, 1)) * 0.5)

        return {
            'original_text': text,
            'corrected_text': final_text,
            'corrections': all_corrections,
            'correction_count': len(all_corrections),
            'correction_confidence': correction_confidence,
            'was_corrected': len(all_corrections) > 0
        }

# Smoke test of the spell checker: build the shared instance used by later
# cells and run it on a deliberately misspelled, colloquial French query.
spell_checker = SpellChecker()
test_query = "je doie faire dix minut de chaude, apres 3 set de 5 mn a fond et 2 min pose entre set. fini avk 10 min cool down facile"

# Print the report returned by correct_text (original, corrected text,
# list of corrections and the derived confidence).
correction_result = spell_checker.correct_text(test_query)
print("🔧 Test du système de correction:")
print(f"Original: {correction_result['original_text']}")
print(f"Corrigé: {correction_result['corrected_text']}")
print(f"Corrections ({correction_result['correction_count']}): {correction_result['corrections']}")
print(f"Confiance correction: {correction_result['correction_confidence']:.3f}")
print("✅ Système de correction opérationnel")


In [None]:
# 2. CORPUS ENRICHI AVEC MÉTADONNÉES STRUCTURÉES
# Extension du corpus existant avec structure pour embeddings

@dataclass
class WorkoutMetadata:
    """Structured metadata attached to each workout of the corpus."""
    # Unique identifier; EnhancedCorpus indexes workouts by this value.
    id: str
    # Short display name of the workout.
    name: str
    # Free-text description.
    description: str
    # Training zone label (values seen in the corpus: "vo2", "seuil",
    # "aerobic", "mixed").
    zone: str
    # Total duration in minutes.
    duration_minutes: int
    structure: str  # "simple", "complex", "complete"
    difficulty: int  # 1-5
    # Keywords compared against the query words during hybrid scoring.
    keywords: List[str]
    ftp_percentage_range: Tuple[int, int]  # (min%, max%)
    workout_type: str  # "aerobic", "tempo", "vo2", "mixed"

class EnhancedCorpus:
    """In-memory corpus of workouts, each a text paired with WorkoutMetadata.

    Attributes:
        corpus_data: list of ``{"text": str, "metadata": WorkoutMetadata}``.
        text_to_metadata: exact workout text -> metadata.
        id_to_data: metadata id -> full corpus entry.
    """

    def __init__(self):
        # Validated existing corpus plus new structured entries.
        self.corpus_data = [
            # Complete sessions (warm-up + main set + cool-down)
            {
                "text": "10min echauffement puis 3 series de 5min max avec 2min repos entre series puis 10min retour au calme",
                "metadata": WorkoutMetadata(
                    id="complete_001",
                    name="3x5min VO2max",
                    description="Séance VO2max complète avec échauffement et récupération",
                    zone="vo2",
                    duration_minutes=41,
                    structure="complete",
                    difficulty=4,
                    keywords=["vo2max", "series", "max", "echauffement", "retour au calme"],
                    ftp_percentage_range=(105, 120),
                    workout_type="vo2"
                )
            },
            {
                "text": "15 minutes echauffement progressif puis 20 minutes tempo seuil puis 10 minutes retour calme",
                "metadata": WorkoutMetadata(
                    id="complete_002",
                    name="Tempo 20min",
                    description="Séance tempo seuil avec échauffement progressif",
                    zone="seuil",
                    duration_minutes=45,
                    structure="complete",
                    difficulty=3,
                    keywords=["tempo", "seuil", "echauffement", "progressif"],
                    ftp_percentage_range=(88, 95),
                    workout_type="tempo"
                )
            },
            {
                "text": "echauffement 12min puis 4 fois 4min seuil avec 90sec repos puis retour au calme 8min",
                "metadata": WorkoutMetadata(
                    id="complete_003",
                    name="4x4min Seuil",
                    description="Intervalles seuil classiques",
                    zone="seuil",
                    duration_minutes=42,
                    structure="complete",
                    difficulty=4,
                    keywords=["seuil", "intervalles", "4x4", "repos"],
                    ftp_percentage_range=(95, 105),
                    workout_type="tempo"
                )
            },
            {
                "text": "pyramide 1-2-3-4-3-2-1 minutes en zone4 avec repos egal travail",
                "metadata": WorkoutMetadata(
                    id="complex_001",
                    name="Pyramide Zone4",
                    description="Pyramide progressive en zone seuil",
                    zone="seuil",
                    duration_minutes=32,
                    structure="complex",
                    difficulty=4,
                    keywords=["pyramide", "zone4", "progressif", "seuil"],
                    ftp_percentage_range=(88, 95),
                    workout_type="tempo"
                )
            },
            {
                "text": "6 fois 30sec max avec 30sec repos puis 5min facile puis 4 fois 2min tempo avec 1min repos",
                "metadata": WorkoutMetadata(
                    id="complex_002",
                    name="Mixed VO2+Tempo",
                    description="Séance mixte VO2max et tempo",
                    zone="mixed",
                    duration_minutes=35,
                    structure="complex",
                    difficulty=5,
                    keywords=["vo2max", "tempo", "mixte", "30sec", "2min"],
                    ftp_percentage_range=(90, 120),
                    workout_type="mixed"
                )
            },
            {
                "text": "5 fois 3min over-under alternant 90sec a 95% et 90sec a 105% avec 2min repos",
                "metadata": WorkoutMetadata(
                    id="complex_003",
                    name="Over-Under 5x3min",
                    description="Intervalles over-under autour du seuil",
                    zone="seuil",
                    duration_minutes=23,
                    structure="complex",
                    difficulty=4,
                    keywords=["over-under", "seuil", "alternant", "95%", "105%"],
                    ftp_percentage_range=(95, 105),
                    workout_type="tempo"
                )
            },
            # Simple sessions
            {
                "text": "45 minutes aerobic zone2",
                "metadata": WorkoutMetadata(
                    id="simple_001",
                    name="Aerobic 45min",
                    description="Séance aérobie continue",
                    zone="aerobic",
                    duration_minutes=45,
                    structure="simple",
                    difficulty=2,
                    keywords=["aerobic", "zone2", "continu"],
                    ftp_percentage_range=(65, 75),
                    workout_type="aerobic"
                )
            },
            {
                "text": "20 minutes tempo seuil",
                "metadata": WorkoutMetadata(
                    id="simple_002",
                    name="Tempo 20min",
                    description="Effort tempo au seuil",
                    zone="seuil",
                    duration_minutes=20,
                    structure="simple",
                    difficulty=3,
                    keywords=["tempo", "seuil", "continu"],
                    ftp_percentage_range=(88, 95),
                    workout_type="tempo"
                )
            },
            {
                "text": "8 fois 1min max avec 1min repos",
                "metadata": WorkoutMetadata(
                    id="simple_003",
                    name="8x1min VO2max",
                    description="Intervalles courts VO2max",
                    zone="vo2",
                    duration_minutes=16,
                    structure="simple",
                    difficulty=4,
                    keywords=["vo2max", "1min", "intervalles", "max"],
                    ftp_percentage_range=(105, 120),
                    workout_type="vo2"
                )
            }
        ]

        # Lookup indexes built once from corpus_data.
        self.text_to_metadata = {item["text"]: item["metadata"] for item in self.corpus_data}
        self.id_to_data = {item["metadata"].id: item for item in self.corpus_data}

    def get_all_texts(self) -> List[str]:
        """Return every workout text, in corpus order."""
        return [item["text"] for item in self.corpus_data]

    def get_metadata(self, text: str) -> Optional["WorkoutMetadata"]:
        """Return the metadata of the workout whose text equals ``text``, or None."""
        return self.text_to_metadata.get(text)

    def get_by_id(self, workout_id: str) -> Optional[Dict]:
        """Return the corpus entry with metadata id ``workout_id``, or None."""
        return self.id_to_data.get(workout_id)

    def search_by_criteria(self, zone: Optional[str] = None, difficulty: Optional[int] = None,
                          structure: Optional[str] = None, workout_type: Optional[str] = None) -> List[Dict]:
        """Return the corpus entries whose metadata equals every given criterion.

        A criterion left at ``None`` is ignored. Any other value, including
        falsy ones such as ``0`` or ``""``, is compared for equality, so
        ``difficulty=0`` matches nothing instead of silently disabling the
        filter.

        Returns:
            Matching entries in corpus order; the whole corpus when no
            criterion is given.
        """
        requested = {
            "zone": zone,
            "difficulty": difficulty,
            "structure": structure,
            "workout_type": workout_type,
        }
        active = {field: wanted for field, wanted in requested.items() if wanted is not None}

        return [
            item for item in self.corpus_data
            if all(getattr(item["metadata"], field) == wanted for field, wanted in active.items())
        ]

# Build the shared corpus instance used by later cells and print a summary:
# number of workouts, then counts per workout_type and per structure
# (np.unique with return_counts yields the label/count pairs).
corpus = EnhancedCorpus()
print(f"📚 Corpus enrichi initialisé avec {len(corpus.corpus_data)} séances")
print(f"📊 Répartition par type: {dict(zip(*np.unique([item['metadata'].workout_type for item in corpus.corpus_data], return_counts=True)))}")
print(f"📈 Répartition par structure: {dict(zip(*np.unique([item['metadata'].structure for item in corpus.corpus_data], return_counts=True)))}")
print("✅ Corpus enrichi opérationnel")


In [None]:
# 3. SYSTÈME D'EMBEDDINGS VECTORIELS
# Implémentation de la recherche sémantique avec sentence-transformers

class VectorEmbeddingSystem:
    """Sentence-embedding encoder with a per-text cache and cosine search.

    Attributes:
        model: the loaded ``SentenceTransformer``.
        embeddings_cache: text -> embedding. Entries are never evicted.
    """

    def __init__(self, model_name: Optional[str] = None):
        """Load the embedding model.

        Args:
            model_name: model identifier; ``None`` (the default) resolves to
                the module-level ``MODEL_NAME`` at call time, so re-running
                the configuration cell is honoured without redefining the
                class.
        """
        if model_name is None:
            model_name = MODEL_NAME
        print(f"🔄 Chargement du modèle d'embeddings: {model_name}")
        self.model = SentenceTransformer(model_name)
        self.embeddings_cache = {}
        print("✅ Modèle d'embeddings chargé")

    def encode_text(self, text: str) -> np.ndarray:
        """Return the embedding of ``text``, encoding it on first use only."""
        cached = self.embeddings_cache.get(text)
        if cached is not None:
            return cached

        embedding = self.model.encode(text, convert_to_numpy=True)
        self.embeddings_cache[text] = embedding
        return embedding

    def encode_corpus(self, texts: List[str]) -> np.ndarray:
        """Encode ``texts`` in one batch, cache each result, return the matrix."""
        print(f"🔄 Encodage de {len(texts)} textes en batch...")
        embeddings = self.model.encode(texts, convert_to_numpy=True, show_progress_bar=True)

        # Populate the cache so later lookups are free.
        for text, embedding in zip(texts, embeddings):
            self.embeddings_cache[text] = embedding

        print("✅ Encodage terminé")
        return embeddings

    def calculate_similarity(self, query_embedding: np.ndarray,
                           corpus_embeddings: np.ndarray) -> np.ndarray:
        """Return the cosine similarity between one query and each corpus row.

        Computed with NumPy only. A zero-norm vector is left unscaled, which
        gives it similarity 0 with everything instead of producing NaN.

        Args:
            query_embedding: 1-D vector, or 2-D array whose first row is used.
            corpus_embeddings: 2-D array, one embedding per row (a 1-D array
                is treated as a single row).

        Returns:
            1-D array with one similarity per corpus row.
        """
        query = np.asarray(query_embedding)
        if query.ndim == 1:
            query = query.reshape(1, -1)
        corpus = np.asarray(corpus_embeddings)
        if corpus.ndim == 1:
            corpus = corpus.reshape(1, -1)

        def unit_rows(matrix: np.ndarray) -> np.ndarray:
            norms = np.linalg.norm(matrix, axis=1, keepdims=True)
            norms[norms == 0] = 1.0
            return matrix / norms

        return (unit_rows(query) @ unit_rows(corpus).T)[0]

    def find_most_similar(self, query: str, corpus_texts: List[str],
                         top_k: int = 5) -> List[Tuple[str, float]]:
        """Return the ``top_k`` corpus texts most similar to ``query``.

        Texts missing from the cache are encoded together in a single batch.

        Returns:
            ``(text, similarity)`` pairs sorted by decreasing similarity;
            an empty list when the corpus is empty or ``top_k`` is not
            positive.
        """
        if len(corpus_texts) == 0 or top_k <= 0:
            return []

        query_embedding = self.encode_text(query)

        # Encode all cache misses at once (deduplicated, order preserved).
        missing = list(dict.fromkeys(
            text for text in corpus_texts if text not in self.embeddings_cache
        ))
        if missing:
            fresh = self.model.encode(missing, convert_to_numpy=True)
            for text, embedding in zip(missing, fresh):
                self.embeddings_cache[text] = embedding

        corpus_embeddings = np.array(
            [self.embeddings_cache[text] for text in corpus_texts]
        )
        similarities = self.calculate_similarity(query_embedding, corpus_embeddings)

        ranked = np.argsort(similarities)[::-1][:top_k]
        return [(corpus_texts[i], float(similarities[i])) for i in ranked]

# Build the shared embedding system (loads the model named by MODEL_NAME).
print("🚀 Initialisation du système d'embeddings vectoriels...")
embedding_system = VectorEmbeddingSystem()

# Pre-encode the whole corpus in one batch; encode_corpus also fills the
# embedding cache, so later searches reuse these vectors.
corpus_texts = corpus.get_all_texts()
corpus_embeddings = embedding_system.encode_corpus(corpus_texts)

# Report the shape of the encoded corpus; the second shape entry is printed
# as the embedding dimension.
print(f"📊 Corpus encodé: {corpus_embeddings.shape}")
print(f"🎯 Dimension des embeddings: {corpus_embeddings.shape[1]}")
print("✅ Système d'embeddings opérationnel")


In [None]:
# 4. PIPELINE RAG HYBRIDE COMPLET
# Intégration correction + recherche vectorielle + scoring intelligent

class RAGPipeline:
    """Hybrid retrieval pipeline: spell correction, vector search, re-scoring.

    The hybrid score is a weighted sum of three terms that each lie in
    [0, 1] for well-formed inputs: vector similarity (0.6), spell-correction
    confidence (0.2) and a normalised metadata bonus (0.2).
    """

    # Weights of the three score components (they sum to 1.0).
    _SIMILARITY_WEIGHT = 0.6
    _CORRECTION_WEIGHT = 0.2
    _METADATA_WEIGHT = 0.2

    # Raw metadata bonus components.
    _COMPLETE_STRUCTURE_BONUS = 0.05
    _KEYWORD_BONUS_STEP = 0.02
    _KEYWORD_BONUS_CAP = 0.1
    _DURATION_BONUS = 0.03
    # Largest raw bonus achievable; used to rescale the bonus to [0, 1] so
    # the metadata term really carries its 0.2 weight. Without this rescaling
    # the best possible hybrid score is 0.836, below the 0.85 "HIGH"
    # threshold, which made the "HIGH" confidence level unreachable.
    _MAX_METADATA_BONUS = _COMPLETE_STRUCTURE_BONUS + _KEYWORD_BONUS_CAP + _DURATION_BONUS

    # Punctuation removed from both ends of query words before keyword
    # matching ('%' and '-' are kept: keywords such as "95%" and
    # "over-under" contain them).
    _EDGE_PUNCTUATION = '.,;:!?()[]{}"\''

    def __init__(self, spell_checker: "SpellChecker", corpus: "EnhancedCorpus",
                 embedding_system: "VectorEmbeddingSystem"):
        """Store the collaborators and snapshot the corpus texts."""
        self.spell_checker = spell_checker
        self.corpus = corpus
        self.embedding_system = embedding_system
        self.corpus_texts = corpus.get_all_texts()

    def process_query(self, query: str, top_k: int = 3) -> Dict[str, Any]:
        """Run the full pipeline on ``query``.

        Returns:
            A dict with keys ``original_query``, ``corrected_query``,
            ``correction_info`` (the spell checker report), ``results``
            (sorted by decreasing ``hybrid_score``), ``best_match`` (first
            result or None) and ``processing_successful``.
        """
        # Step 1: spell correction.
        correction_result = self.spell_checker.correct_text(query)
        corrected_query = correction_result['corrected_text']

        # Step 2: semantic vector search.
        vector_results = self.embedding_system.find_most_similar(
            corrected_query, self.corpus_texts, top_k=top_k
        )

        # Steps 3 and 4: attach metadata, then compute the hybrid score.
        final_results = []
        for text, similarity_score in vector_results:
            metadata = self.corpus.get_metadata(text)
            if metadata is None:
                # A hit without metadata cannot be described or recommended.
                continue

            hybrid_score = self._calculate_hybrid_score(
                similarity_score,
                correction_result['correction_confidence'],
                metadata,
                corrected_query
            )
            final_results.append({
                'text': text,
                'similarity_score': similarity_score,
                'metadata': metadata,
                'hybrid_score': hybrid_score,
                'confidence_level': self._determine_confidence_level(hybrid_score)
            })

        final_results.sort(key=lambda x: x['hybrid_score'], reverse=True)

        return {
            'original_query': query,
            'corrected_query': corrected_query,
            'correction_info': correction_result,
            'results': final_results,
            'best_match': final_results[0] if final_results else None,
            'processing_successful': len(final_results) > 0
        }

    def _count_keyword_matches(self, corrected_query: str, keywords) -> int:
        """Count the distinct ``keywords`` present in ``corrected_query``.

        Query words are lower-cased and stripped of surrounding punctuation.
        Single-word keywords must equal a query word; multi-word keywords
        must appear as a contiguous word sequence.
        """
        tokens = [
            word.strip(self._EDGE_PUNCTUATION)
            for word in corrected_query.lower().split()
        ]
        tokens = [token for token in tokens if token]
        token_set = set(tokens)
        # Padding lets a phrase be matched on whole words with a plain
        # substring test.
        padded_query = ' ' + ' '.join(tokens) + ' '

        matches = 0
        for keyword in set(keywords):
            keyword = keyword.lower()
            if ' ' in keyword:
                matches += f' {keyword} ' in padded_query
            else:
                matches += keyword in token_set
        return matches

    def _calculate_hybrid_score(self, similarity_score: float, correction_confidence: float,
                              metadata: "WorkoutMetadata", corrected_query: str) -> float:
        """Combine similarity, correction confidence and metadata into one score.

        Args:
            similarity_score: vector similarity of the candidate.
            correction_confidence: confidence reported by the spell checker.
            metadata: candidate metadata; ``None`` yields no metadata bonus.
            corrected_query: query text after spell correction.

        Returns:
            The weighted score, capped at 1.0.
        """
        base_score = similarity_score * self._SIMILARITY_WEIGHT
        correction_bonus = correction_confidence * self._CORRECTION_WEIGHT

        metadata_bonus = 0.0
        if metadata is not None:
            # Complete sessions (warm-up + main set + cool-down).
            if metadata.structure == "complete":
                metadata_bonus += self._COMPLETE_STRUCTURE_BONUS

            # Keywords found in the query.
            keyword_matches = self._count_keyword_matches(corrected_query, metadata.keywords)
            if keyword_matches > 0:
                metadata_bonus += min(
                    self._KEYWORD_BONUS_CAP, keyword_matches * self._KEYWORD_BONUS_STEP
                )

            # Reasonable duration (20-60 min).
            if 20 <= metadata.duration_minutes <= 60:
                metadata_bonus += self._DURATION_BONUS

        normalized_bonus = metadata_bonus / self._MAX_METADATA_BONUS
        hybrid_score = base_score + correction_bonus + normalized_bonus * self._METADATA_WEIGHT

        return min(1.0, hybrid_score)

    def _determine_confidence_level(self, hybrid_score: float) -> str:
        """Map a hybrid score to "HIGH", "MEDIUM" or "LOW" via the global thresholds."""
        if hybrid_score >= CONFIDENCE_THRESHOLD_HIGH:
            return "HIGH"
        if hybrid_score >= CONFIDENCE_THRESHOLD_LOW:
            return "MEDIUM"
        return "LOW"

    def validate_query(self, query: str) -> Dict[str, Any]:
        """Process ``query`` and turn the best match into a user-facing verdict.

        Returns:
            A dict that always has ``success``, ``confidence`` and
            ``message``. HIGH and MEDIUM matches add ``workout``,
            ``correction_applied`` and ``corrections`` (MEDIUM also adds
            ``warning``); LOW matches add ``suggestions`` and
            ``potential_match``; no match at all adds ``suggestions`` only.
        """
        result = self.process_query(query)

        if not result['processing_successful']:
            return {
                'success': False,
                'confidence': 0.0,
                'message': "Aucune séance correspondante trouvée",
                'suggestions': ["Essayez des termes plus spécifiques", "Vérifiez l'orthographe"]
            }

        best_match = result['best_match']
        confidence = best_match['hybrid_score']
        confidence_level = best_match['confidence_level']

        if confidence_level == "HIGH":
            return {
                'success': True,
                'confidence': confidence,
                'message': f"Séance trouvée avec haute confiance: {best_match['metadata'].name}",
                'workout': best_match,
                'correction_applied': result['correction_info']['was_corrected'],
                'corrections': result['correction_info']['corrections']
            }
        elif confidence_level == "MEDIUM":
            return {
                'success': True,
                'confidence': confidence,
                'message': f"Séance trouvée avec confiance moyenne: {best_match['metadata'].name}",
                'workout': best_match,
                'warning': "Vérifiez que cette séance correspond à votre demande",
                'correction_applied': result['correction_info']['was_corrected'],
                'corrections': result['correction_info']['corrections']
            }
        else:
            return {
                'success': False,
                'confidence': confidence,
                'message': "Confiance insuffisante pour recommander une séance",
                'suggestions': [
                    f"Séance la plus proche: {best_match['metadata'].name}",
                    "Essayez d'être plus précis dans votre demande"
                ],
                'potential_match': best_match
            }

# Build the shared RAG pipeline from the spell checker, corpus and embedding
# system created in the previous cells.
print("🚀 Initialisation du pipeline RAG hybride...")
rag_pipeline = RAGPipeline(spell_checker, corpus, embedding_system)
print("✅ Pipeline RAG opérationnel")

# End-to-end smoke test on the same misspelled query used for the spell checker.
test_query = "je doie faire dix minut de chaude, apres 3 set de 5 mn a fond et 2 min pose entre set. fini avk 10 min cool down facile"
print(f"\n🧪 Test du pipeline RAG complet:")
print(f"Query: {test_query}")

# validate_query always returns 'success', 'confidence' and 'message'.
validation_result = rag_pipeline.validate_query(test_query)
print(f"\n📊 Résultat de validation:")
print(f"✅ Succès: {validation_result['success']}")
print(f"🎯 Confiance: {validation_result['confidence']:.3f}")
print(f"💬 Message: {validation_result['message']}")

# 'correction_applied' / 'corrections' are only present for HIGH and MEDIUM
# matches, hence the .get().
if validation_result.get('correction_applied'):
    print(f"🔧 Corrections appliquées ({len(validation_result['corrections'])}): {validation_result['corrections']}")

# 'workout' is likewise only present when a match was accepted.
if validation_result.get('workout'):
    workout = validation_result['workout']
    print(f"🏋️ Séance recommandée: {workout['metadata'].name}")
    print(f"📝 Description: {workout['metadata'].description}")
    print(f"⏱️ Durée: {workout['metadata'].duration_minutes} min")
    print(f"🎚️ Difficulté: {workout['metadata'].difficulty}/5")
    print(f"📈 Similarité vectorielle: {workout['similarity_score']:.3f}")
    print(f"🔥 Score hybride: {workout['hybrid_score']:.3f}")

# NOTE(review): this line is printed unconditionally, whatever the
# validation outcome above.
print("\n✅ Pipeline RAG complet validé avec succès!")


In [None]:
# 5. GÉNÉRATEUR DE FICHIERS .ZWO POUR ZWIFT
# Conversion des séances en format XML Zwift

@dataclass
class ZwoSegment:
    """One segment of a Zwift workout."""
    duration: int  # in seconds
    # Power bounds expressed relative to FTP. The original comment gives the
    # range 0.0-2.0, i.e. a fraction of FTP rather than a 0-100 percentage.
    power_low: float
    power_high: float
    # Optional cadence target; None when unspecified.
    # NOTE(review): unit presumably rpm — confirm.
    cadence: Optional[int] = None
    segment_type: str = "SteadyState"  # SteadyState, Warmup, Cooldown, Intervals

class ZwoGenerator:
    """Generate Zwift ``.zwo`` workout files from free-text workout descriptions.

    The text is scanned with simple keyword/regex heuristics (French
    vocabulary: ``echauffement``, ``series``, ``repos``, ``retour`` ...) to
    build a list of segments, which is then serialised as ZWO XML.
    """

    # Interval patterns, tried in order; the first one that matches wins.
    # NOTE: these must be raw strings with *single* backslashes. A doubled
    # backslash inside a raw string matches a literal "\" and never a digit.
    _INTERVAL_DURATION_PATTERNS = (
        re.compile(r'(\d+)\s*min\s*(?:max|seuil|tempo)'),
        re.compile(r'de\s*(\d+)\s*min'),
        re.compile(r'(\d+)min\s*(?:max|seuil)'),
    )
    _INTERVAL_COUNT_PATTERNS = (
        re.compile(r'(\d+)\s*(?:series|fois)'),
        re.compile(r'(\d+)\s*x'),
    )
    _REST_DURATION_PATTERNS = (
        re.compile(r'(\d+)\s*min\s*repos'),
        re.compile(r'repos\s*(\d+)\s*min'),
        re.compile(r'avec\s*(\d+)\s*min'),
    )

    def __init__(self):
        # (low, high) power targets as fractions of FTP, keyed by metadata.zone.
        self.ftp_zones = {
            'aerobic': (0.65, 0.75),      # Zone 2
            'tempo': (0.76, 0.90),        # Zone 3
            'seuil': (0.88, 1.05),        # Zone 4
            'vo2': (1.05, 1.20),          # Zone 5
            'mixed': (0.88, 1.20)         # Variable
        }

    # The WorkoutMetadata / ZwoSegment annotations are quoted because both
    # types are defined elsewhere in the notebook (forward references).
    def parse_workout_to_segments(self, workout_text: str, metadata: "WorkoutMetadata") -> List["ZwoSegment"]:
        """Turn a workout description into an ordered list of ZWO segments.

        Args:
            workout_text: Free-text description of the workout (lower-cased
                internally before matching).
            metadata: Workout metadata; ``zone`` selects the power range and
                ``duration_minutes`` is used for continuous efforts.

        Returns:
            Segments in order: optional warm-up, main set (intervals with
            rests, or a single continuous effort), optional cool-down.
        """
        segments = []

        # Basic text analysis to recover the workout structure.
        text = workout_text.lower()

        has_warmup = 'echauffement' in text or 'chaude' in text
        has_cooldown = 'retour' in text or 'cool' in text

        # Durations are resolved once so that segment creation and the
        # main-set computation below always agree on the same values.
        warmup_duration = (self._extract_duration(text, 'echauffement')
                           or self._extract_duration(text, 'chaude'))
        cooldown_duration = (self._extract_duration(text, 'retour')
                             or self._extract_duration(text, 'cool'))

        # Warm-up
        if warmup_duration:
            segments.append(ZwoSegment(
                duration=warmup_duration * 60,
                power_low=0.5,
                power_high=0.7,
                segment_type="Warmup"
            ))

        # Main set: power range comes from the zone, with a generic fallback.
        power_range = self.ftp_zones.get(metadata.zone, (0.85, 1.0))

        if 'series' in text or 'fois' in text:
            # Interval workout
            interval_duration = self._extract_interval_duration(text)
            interval_count = self._extract_interval_count(text)
            rest_duration = self._extract_rest_duration(text)

            if interval_duration and interval_count:
                for i in range(interval_count):
                    # Work interval
                    segments.append(ZwoSegment(
                        duration=interval_duration * 60,
                        power_low=power_range[0],
                        power_high=power_range[1],
                        segment_type="SteadyState"
                    ))

                    # Rest (except after the last interval)
                    if i < interval_count - 1 and rest_duration:
                        segments.append(ZwoSegment(
                            duration=rest_duration * 60,
                            power_low=0.5,
                            power_high=0.6,
                            segment_type="SteadyState"
                        ))
        else:
            # Continuous effort: total duration minus warm-up and cool-down.
            # When a keyword is present but no duration could be read,
            # 10 minutes is assumed for that part.
            main_duration = metadata.duration_minutes
            if has_warmup:
                main_duration -= warmup_duration or 10
            if has_cooldown:
                main_duration -= cooldown_duration or 10

            if main_duration > 0:
                segments.append(ZwoSegment(
                    duration=main_duration * 60,
                    power_low=power_range[0],
                    power_high=power_range[1],
                    segment_type="SteadyState"
                ))

        # Cool-down
        if cooldown_duration:
            segments.append(ZwoSegment(
                duration=cooldown_duration * 60,
                power_low=0.4,
                power_high=0.6,
                segment_type="Cooldown"
            ))

        return segments

    @staticmethod
    def _first_int_match(patterns, text: str) -> Optional[int]:
        """Return group 1 of the first pattern that matches ``text`` as an int."""
        for pattern in patterns:
            match = re.search(pattern, text)
            if match:
                return int(match.group(1))
        return None

    def _extract_duration(self, text: str, keyword: str) -> Optional[int]:
        """Extract a duration in minutes located near ``keyword``.

        Patterns are tried in order (number after the keyword first, then
        number before it); the first match wins. Returns ``None`` when the
        keyword is absent or no number can be associated with it.
        """
        # Escape the keyword so it is always matched literally.
        kw = re.escape(keyword)
        patterns = (
            rf'{kw}.*?(\d+)\s*min',
            rf'(\d+)\s*min.*?{kw}',
            rf'{kw}.*?(\d+)m',
            rf'{kw}.*?(\d+)',
        )
        return self._first_int_match(patterns, text)

    def _extract_interval_duration(self, text: str) -> Optional[int]:
        """Extract the duration (minutes) of one work interval, or ``None``."""
        return self._first_int_match(self._INTERVAL_DURATION_PATTERNS, text)

    def _extract_interval_count(self, text: str) -> Optional[int]:
        """Extract the number of intervals, or ``None``."""
        return self._first_int_match(self._INTERVAL_COUNT_PATTERNS, text)

    def _extract_rest_duration(self, text: str) -> Optional[int]:
        """Extract the rest duration (minutes) between intervals, or ``None``."""
        return self._first_int_match(self._REST_DURATION_PATTERNS, text)

    def generate_zwo_xml(self, segments: List["ZwoSegment"], metadata: "WorkoutMetadata") -> str:
        """Serialise segments and metadata as a complete ``.zwo`` XML document.

        Metadata values are XML-escaped so that characters such as ``&``,
        ``<`` or ``"`` in a name or description cannot break the document.
        Warm-up and cool-down segments are written with their power range;
        every other segment is written as ``SteadyState`` using ``power_low``
        only. ``cadence`` is not emitted.

        Args:
            segments: Segments to write, in order.
            metadata: Provides ``name``, ``description``, ``zone``,
                ``difficulty`` and ``structure``.

        Returns:
            The XML document as a string (no trailing newline).
        """
        from xml.sax.saxutils import escape

        def as_text(value) -> str:
            return escape(str(value))

        def as_attr(value) -> str:
            # Attribute values are wrapped in double quotes below.
            return escape(str(value), {'"': '&quot;'})

        lines = [
            '<?xml version="1.0" encoding="UTF-8"?>',
            '<workout_file>',
            '    <author>Vekta Pipeline</author>',
            f'    <name>{as_text(metadata.name)}</name>',
            f'    <description>{as_text(metadata.description)}</description>',
            '    <sportType>bike</sportType>',
            '    <tags>',
            f'        <tag name="{as_attr(metadata.zone)}"/>',
            f'        <tag name="difficulty_{as_attr(metadata.difficulty)}"/>',
            f'        <tag name="{as_attr(metadata.structure)}"/>',
            '    </tags>',
            '    <workout>',
        ]

        for segment in segments:
            if segment.segment_type in ("Warmup", "Cooldown"):
                lines.append(
                    f'        <{segment.segment_type} Duration="{segment.duration}" '
                    f'PowerLow="{segment.power_low:.2f}" PowerHigh="{segment.power_high:.2f}"/>'
                )
            else:
                lines.append(
                    f'        <SteadyState Duration="{segment.duration}" Power="{segment.power_low:.2f}"/>'
                )

        lines.append('    </workout>')
        lines.append('</workout_file>')

        return "\n".join(lines)

    def create_zwo_file(self, workout_text: str, metadata: "WorkoutMetadata",
                       output_dir: str = "./generated_workouts") -> str:
        """Parse a workout, render it as ZWO XML and write it to disk.

        Args:
            workout_text: Free-text description of the workout.
            metadata: Workout metadata (see ``parse_workout_to_segments``).
            output_dir: Destination directory, created if missing.

        Returns:
            Path of the written ``.zwo`` file. The file name is derived from
            the lower-cased workout name plus a timestamp.
        """
        # Create the output directory if needed.
        os.makedirs(output_dir, exist_ok=True)

        segments = self.parse_workout_to_segments(workout_text, metadata)
        xml_content = self.generate_zwo_xml(segments, metadata)

        # Keep only filesystem-safe characters in the file name.
        safe_name = re.sub(r'[^a-zA-Z0-9_-]', '_', metadata.name.lower())
        filename = f"{safe_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.zwo"
        filepath = os.path.join(output_dir, filename)

        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(xml_content)

        return filepath

# Exercise the ZWO generator
zwo_generator = ZwoGenerator()

# Reuse the result of the critical query validated above
if validation_result['success'] and validation_result.get('workout'):
    workout = validation_result['workout']
    workout_text = workout['text']
    metadata = workout['metadata']

    print(f"\n🏗️ Génération du fichier .zwo pour: {metadata.name}")
    print(f"📝 Texte: {workout_text}")

    # Parse into segments
    segments = zwo_generator.parse_workout_to_segments(workout_text, metadata)
    print(f"🧩 Segments détectés: {len(segments)}")

    for i, segment in enumerate(segments):
        print(f"  {i+1}. {segment.segment_type}: {segment.duration//60}min @ {segment.power_low:.0%}-{segment.power_high:.0%} FTP")

    # Generate the file
    zwo_filepath = zwo_generator.create_zwo_file(workout_text, metadata)
    print(f"✅ Fichier .zwo généré: {zwo_filepath}")

    # Show a preview of the XML. The file is read exactly once: calling
    # f.read() again after the first call returns "" because the stream is
    # already at end-of-file, which previously left the preview empty.
    with open(zwo_filepath, 'r', encoding='utf-8') as f:
        xml_preview = f.read()
    if len(xml_preview) > 500:
        xml_preview = xml_preview[:500] + "..."

    print(f"\n📄 Aperçu du XML généré:")
    print(xml_preview)

print("\n✅ Générateur ZWO opérationnel!")


In [None]:
# 6. TESTS COMPLETS DU PIPELINE RAG
# Validation sur différents types de requêtes

def test_rag_pipeline():
    """Run the end-to-end validation suite against the global ``rag_pipeline``.

    Each case sends a query through ``rag_pipeline.validate_query`` and passes
    when the returned ``success`` flag equals the expected one and the
    returned ``confidence`` is at least the expected minimum. Per-case details
    and a summary are printed.

    Returns:
        A list with one dict per case: ``name``, ``query``, ``passed``,
        ``confidence`` and ``success``.
    """

    test_cases = [
        {
            'name': 'Requête critique validée',
            'query': 'je doie faire dix minut de chaude, apres 3 set de 5 mn a fond et 2 min pose entre set. fini avk 10 min cool down facile',
            'expected_success': True,
            'expected_min_confidence': 0.8
        },
        {
            'name': 'Séance tempo simple',
            'query': '20 minutes tempo seuil',
            'expected_success': True,
            'expected_min_confidence': 0.85
        },
        {
            'name': 'Intervalles VO2max',
            'query': '8 fois 1 minute max avec 1 minute repos',
            'expected_success': True,
            'expected_min_confidence': 0.8
        },
        {
            'name': 'Séance aérobie',
            'query': '45min aerobic zone2',
            'expected_success': True,
            'expected_min_confidence': 0.85
        },
        {
            'name': 'Over-under complexe',
            'query': 'over-under 5x3min alternant 95% et 105%',
            'expected_success': True,
            'expected_min_confidence': 0.75
        },
        {
            'name': 'Requête ambiguë',
            'query': 'faire du sport',
            'expected_success': False,
            'expected_min_confidence': 0.0
        },
        {
            'name': 'Fautes multiples',
            'query': 'piramide aerobik avec recupe',
            'expected_success': True,
            'expected_min_confidence': 0.6
        }
    ]

    print("🧪 SUITE DE TESTS COMPLÈTE DU PIPELINE RAG")
    print("=" * 60)

    results = []

    for i, test_case in enumerate(test_cases, 1):
        print(f"\n📝 Test {i}: {test_case['name']}")
        print(f"Query: '{test_case['query']}'")

        # Run the query through the pipeline
        result = rag_pipeline.validate_query(test_case['query'])

        # A case passes only if both the success flag and the confidence match
        success_ok = result['success'] == test_case['expected_success']
        confidence_ok = result['confidence'] >= test_case['expected_min_confidence']

        test_passed = success_ok and confidence_ok

        print(f"✅ Succès: {result['success']} (attendu: {test_case['expected_success']}) {'✓' if success_ok else '✗'}")
        print(f"🎯 Confiance: {result['confidence']:.3f} (min: {test_case['expected_min_confidence']}) {'✓' if confidence_ok else '✗'}")
        print(f"💬 Message: {result['message']}")

        if result.get('correction_applied'):
            print(f"🔧 Corrections: {result['corrections']}")

        if result.get('workout'):
            workout = result['workout']
            print(f"🏋️ Séance: {workout['metadata'].name} (score: {workout['hybrid_score']:.3f})")

        print(f"🏆 Test {'RÉUSSI' if test_passed else 'ÉCHOUÉ'}")

        results.append({
            'name': test_case['name'],
            'query': test_case['query'],
            'passed': test_passed,
            'confidence': result['confidence'],
            'success': result['success']
        })

    # Summary
    print("\n" + "=" * 60)
    print("📊 RÉSUMÉ DES TESTS")
    print("=" * 60)

    passed_tests = sum(1 for r in results if r['passed'])
    total_tests = len(results)
    success_rate = passed_tests / total_tests * 100

    print(f"✅ Tests réussis: {passed_tests}/{total_tests} ({success_rate:.1f}%)")

    # The average only covers queries the pipeline reported as successful.
    # np.mean([]) would return nan, so the empty case is reported explicitly.
    successful_confidences = [r['confidence'] for r in results if r['success']]
    if successful_confidences:
        avg_confidence = np.mean(successful_confidences)
        print(f"🎯 Confiance moyenne: {avg_confidence:.3f}")
    else:
        print("🎯 Confiance moyenne: N/A (aucune requête réussie)")

    if passed_tests == total_tests:
        print("🏆 TOUS LES TESTS SONT RÉUSSIS!")
    else:
        print("⚠️ Certains tests ont échoué:")
        for failed in results:
            if not failed['passed']:
                print(f"  - {failed['name']}: {failed['query']}")

    return results

# Run the test suite and keep the per-case results for later inspection
test_results = test_rag_pipeline()

print("\n✅ Suite de tests terminée!")
