In [None]:
# Imports et configuration
import os
import re
import json
import uuid
from typing import Dict, List, Tuple, Optional, Any
from dataclasses import dataclass, asdict
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# RAG et embeddings
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings

# Langchain pour RAG
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings

# Utilitaires
import numpy as np
from difflib import SequenceMatcher
import xml.etree.ElementTree as ET

# Report that the retrieval stack imported and print the library versions.
print("‚úÖ Imports RAG charg√©s avec succ√®s")
# Falls back to 'OK' when the SentenceTransformer class exposes no __version__ attribute.
print(f"üì¶ Sentence-transformers version: {SentenceTransformer.__version__ if hasattr(SentenceTransformer, '__version__') else 'OK'}")
print(f"üì¶ ChromaDB version: {chromadb.__version__}")

# Global configuration
# NOTE(review): this model was previously described here as multilingual; check the
# all-MiniLM-L6-v2 model card to confirm it is suitable for the French workout corpus.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"  # default model for VectorEmbeddingSystem
CHROMA_PERSIST_DIR = "./chroma_db"  # not referenced by the code visible in this section
CONFIDENCE_THRESHOLD_HIGH = 0.85  # hybrid score at or above this is labelled "HIGH"
CONFIDENCE_THRESHOLD_LOW = 0.65  # hybrid score at or above this is "MEDIUM", below it "LOW"


In [None]:
# 1. SYSTÈME DE CORRECTION ORTHOGRAPHIQUE (Existant - Validé)
# Reprise du système de correction qui a fait ses preuves

class SpellChecker:
    """Dictionary-based spelling corrector for French cycling-workout queries.

    Correction runs in two passes: multi-word expressions are rewritten
    first, then each remaining token is looked up in ``cycling_vocabulary``
    (exact match first, then a conservative Levenshtein fuzzy match).
    """

    # Tokens and vocabulary keys shorter than this are matched exactly only.
    # With one or two edits allowed, nearly every short token (including
    # plain numbers such as "3") is "close" to keys like 'x', 's' or 'mn'.
    _MIN_FUZZY_LENGTH = 4

    def __init__(self):
        # French cycling vocabulary: misspelling / slang -> canonical term
        self.cycling_vocabulary = {
            # Phonetic misspellings
            'aerobik': 'aerobic',
            'seuille': 'seuil',
            'piramide': 'pyramide',
            'recupe': 'recup',
            'echauffman': 'echauffement',

            # Colloquial wording -> technical wording
            'doie': 'dois',
            'chaude': 'echauffement',
            'set': 'series',
            'pose': 'repos',
            'fini': 'finir',
            'avk': 'avec',
            'facile': 'facile',
            'mn': 'min',
            'minut': 'minutes',
            'dix': '10',

            # Multi-word expressions (handled first, see compound_expressions)
            'a fond': 'max',
            'cool down': 'retour au calme',
            'warm up': 'echauffement',
            'over under': 'over-under',
            'sweet spot': 'sweet-spot',

            # Training zones
            'z1': 'zone1',
            'z2': 'zone2',
            'z3': 'zone3',
            'z4': 'zone4',
            'z5': 'zone5',
            'vo2max': 'vo2',
            'vo2 max': 'vo2',
            'ftp': 'seuil',

            # Durations and repetitions
            'x': 'fois',
            'rep': 'repetitions',
            'reps': 'repetitions',
            'sec': 'secondes',
            's': 'secondes',
            'h': 'heures',
        }

        self.compound_expressions = [
            ('a fond', 'max'),
            ('cool down', 'retour au calme'),
            ('warm up', 'echauffement'),
            ('over under', 'over-under'),
            ('sweet spot', 'sweet-spot'),
            ('vo2 max', 'vo2'),
        ]

        # Canonical output terms (and the individual words of multi-word
        # ones). They are already correct, so they must never be fuzzy-matched
        # back onto a misspelling key; otherwise 'max' (produced from
        # 'a fond') becomes 'min' and 'repos' becomes 'repetitions'.
        self._known_terms = set()
        for target in self.cycling_vocabulary.values():
            normalized_target = self.normalize_text(target)
            self._known_terms.add(normalized_target)
            self._known_terms.update(normalized_target.split())

    def levenshtein_distance(self, s1: str, s2: str) -> int:
        """Return the Levenshtein edit distance between ``s1`` and ``s2``."""
        # Keep the longer string in s1 so each DP row has len(s2) + 1 cells.
        if len(s1) < len(s2):
            return self.levenshtein_distance(s2, s1)

        if len(s2) == 0:
            return len(s1)

        previous_row = list(range(len(s2) + 1))
        for i, c1 in enumerate(s1):
            current_row = [i + 1]
            for j, c2 in enumerate(s2):
                insertions = previous_row[j + 1] + 1
                deletions = current_row[j] + 1
                substitutions = previous_row[j] + (c1 != c2)
                current_row.append(min(insertions, deletions, substitutions))
            previous_row = current_row

        return previous_row[-1]

    def normalize_text(self, text: str) -> str:
        """Return ``text`` lower-cased, stripped, and without combining accents."""
        import unicodedata
        # NFD splits accented letters into base letter + combining mark (Mn).
        text = unicodedata.normalize('NFD', text)
        text = ''.join(c for c in text if unicodedata.category(c) != 'Mn')
        return text.lower().strip()

    def correct_compound_expressions(self, text: str) -> Tuple[str, List[str]]:
        """Rewrite the known multi-word expressions found in ``text``.

        Expressions are matched case-insensitively and only on word
        boundaries, so 'a fond' is not rewritten inside 'la fondation'.

        Returns:
            ``(rewritten_text, corrections)`` with one human-readable
            correction entry per expression that matched at least once.
        """
        corrections = []
        corrected_text = text

        for original, correction in self.compound_expressions:
            pattern = re.compile(
                r'(?<!\w)' + re.escape(original) + r'(?!\w)', re.IGNORECASE
            )
            # A callable replacement keeps the correction text literal.
            corrected_text, hits = pattern.subn(
                lambda _match, _repl=correction: _repl, corrected_text
            )
            if hits:
                corrections.append(f"'{original}' ‚Üí '{correction}'")

        return corrected_text, corrections

    def correct_word(self, word: str) -> Tuple[str, bool]:
        """Correct a single token.

        Returns:
            ``(corrected, matched)``. ``matched`` is True when the token was
            resolved through the vocabulary (exactly or by fuzzy match);
            otherwise the token is returned unchanged with False.
        """
        normalized_word = self.normalize_text(word)

        # Exact vocabulary hit
        if normalized_word in self.cycling_vocabulary:
            return self.cycling_vocabulary[normalized_word], True

        # Already a canonical term: leave it alone.
        if normalized_word in self._known_terms:
            return word, False

        # Numbers, mixed tokens ("3min", "95") and very short words are too
        # ambiguous for edit-distance matching.
        if len(normalized_word) < self._MIN_FUZZY_LENGTH or not normalized_word.isalpha():
            return word, False

        # Allow a single edit on short words, two on longer ones.
        max_distance = 1 if len(normalized_word) <= 5 else 2
        best_match = None
        best_distance = max_distance + 1

        for vocab_word in self.cycling_vocabulary:
            if ' ' in vocab_word or len(vocab_word) < self._MIN_FUZZY_LENGTH:
                continue  # compound keys and short keys are exact-match only
            if abs(len(vocab_word) - len(normalized_word)) > max_distance:
                continue  # the length gap alone already exceeds the budget
            distance = self.levenshtein_distance(normalized_word, vocab_word)
            if distance < best_distance:
                best_distance = distance
                best_match = vocab_word

        if best_match is not None:
            return self.cycling_vocabulary[best_match], True

        return word, False

    @staticmethod
    def _split_punctuation(token: str) -> Tuple[str, str, str]:
        """Split ``token`` into (leading punctuation, core, trailing punctuation)."""
        match = re.match(r'^(\W*)(.*?)(\W*)$', token, re.DOTALL)
        return match.group(1), match.group(2), match.group(3)

    def correct_text(self, text: str) -> Dict[str, Any]:
        """Correct a full query and return a detailed report.

        Returns:
            A dict with keys ``original_text``, ``corrected_text``,
            ``corrections`` (list of readable entries), ``correction_count``,
            ``correction_confidence`` (1.0 with no corrections, never below
            0.5) and ``was_corrected``.
        """
        # Step 1: multi-word expressions
        corrected_text, compound_corrections = self.correct_compound_expressions(text)

        # Step 2: token-by-token correction
        words = corrected_text.split()
        corrected_words = []
        word_corrections = []

        for word in words:
            # Only surrounding punctuation is detached; inner characters
            # (the hyphen of 'over-under', '1-2-3') stay where they are.
            leading, core, trailing = self._split_punctuation(word)
            if not core:
                corrected_words.append(word)
                continue

            corrected_word, was_corrected = self.correct_word(core)
            if was_corrected and corrected_word != core:
                word_corrections.append(f"'{core}' ‚Üí '{corrected_word}'")
            corrected_words.append(leading + corrected_word + trailing)

        final_text = ' '.join(corrected_words)
        all_corrections = compound_corrections + word_corrections

        # Confidence decreases with the share of corrected words, floor 0.5.
        total_words = len(words)
        corrected_count = len(all_corrections)
        correction_confidence = max(0.5, 1 - (corrected_count / max(total_words, 1)) * 0.5)

        return {
            'original_text': text,
            'corrected_text': final_text,
            'corrections': all_corrections,
            'correction_count': len(all_corrections),
            'correction_confidence': correction_confidence,
            'was_corrected': len(all_corrections) > 0
        }

# Smoke test of the spelling-correction stage on a deliberately noisy query.
# `spell_checker` is reused later when the RAG pipeline is built.
spell_checker = SpellChecker()
test_query = "je doie faire dix minut de chaude, apres 3 set de 5 mn a fond et 2 min pose entre set. fini avk 10 min cool down facile"

correction_result = spell_checker.correct_text(test_query)
print("üîß Test du syst√®me de correction:")
print(f"Original: {correction_result['original_text']}")
print(f"Corrig√©: {correction_result['corrected_text']}")
print(f"Corrections ({correction_result['correction_count']}): {correction_result['corrections']}")
print(f"Confiance correction: {correction_result['correction_confidence']:.3f}")
print("‚úÖ Syst√®me de correction op√©rationnel")


In [None]:
# 2. CORPUS ENRICHI AVEC MÉTADONNÉES STRUCTURÉES
# Extension du corpus existant avec structure pour embeddings

@dataclass
class WorkoutMetadata:
    """Structured metadata attached to each workout in the corpus."""
    id: str  # unique identifier, used as the key of EnhancedCorpus.id_to_data
    name: str  # short display name
    description: str  # one-sentence description
    zone: str  # training zone label, e.g. "vo2", "seuil", "aerobic", "mixed"
    duration_minutes: int  # total duration in minutes
    structure: str  # "simple", "complex", "complete"
    difficulty: int  # 1-5
    keywords: List[str]  # terms compared against the query words during hybrid scoring
    ftp_percentage_range: Tuple[int, int]  # (min%, max%)
    workout_type: str  # "aerobic", "tempo", "vo2", "mixed"

class EnhancedCorpus:
    """In-memory workout corpus: free-text descriptions paired with metadata.

    ``corpus_data`` is a list of ``{"text": str, "metadata": WorkoutMetadata}``
    entries; two lookup tables index it by text and by workout id.
    """

    def __init__(self):
        # Validated existing corpus plus new structured entries
        self.corpus_data = [
            # Complete sessions (warm-up + main set + cool-down)
            {
                "text": "10min echauffement puis 3 series de 5min max avec 2min repos entre series puis 10min retour au calme",
                "metadata": WorkoutMetadata(
                    id="complete_001",
                    name="3x5min VO2max",
                    description="S√©ance VO2max compl√®te avec √©chauffement et r√©cup√©ration",
                    zone="vo2",
                    duration_minutes=41,
                    structure="complete",
                    difficulty=4,
                    keywords=["vo2max", "series", "max", "echauffement", "retour au calme"],
                    ftp_percentage_range=(105, 120),
                    workout_type="vo2"
                )
            },
            {
                "text": "15 minutes echauffement progressif puis 20 minutes tempo seuil puis 10 minutes retour calme",
                "metadata": WorkoutMetadata(
                    id="complete_002",
                    name="Tempo 20min",
                    description="S√©ance tempo seuil avec √©chauffement progressif",
                    zone="seuil",
                    duration_minutes=45,
                    structure="complete",
                    difficulty=3,
                    keywords=["tempo", "seuil", "echauffement", "progressif"],
                    ftp_percentage_range=(88, 95),
                    workout_type="tempo"
                )
            },
            {
                "text": "echauffement 12min puis 4 fois 4min seuil avec 90sec repos puis retour au calme 8min",
                "metadata": WorkoutMetadata(
                    id="complete_003",
                    name="4x4min Seuil",
                    description="Intervalles seuil classiques",
                    zone="seuil",
                    duration_minutes=42,
                    structure="complete",
                    difficulty=4,
                    keywords=["seuil", "intervalles", "4x4", "repos"],
                    ftp_percentage_range=(95, 105),
                    workout_type="tempo"
                )
            },
            {
                "text": "pyramide 1-2-3-4-3-2-1 minutes en zone4 avec repos egal travail",
                "metadata": WorkoutMetadata(
                    id="complex_001",
                    name="Pyramide Zone4",
                    description="Pyramide progressive en zone seuil",
                    zone="seuil",
                    duration_minutes=32,
                    structure="complex",
                    difficulty=4,
                    keywords=["pyramide", "zone4", "progressif", "seuil"],
                    ftp_percentage_range=(88, 95),
                    workout_type="tempo"
                )
            },
            {
                "text": "6 fois 30sec max avec 30sec repos puis 5min facile puis 4 fois 2min tempo avec 1min repos",
                "metadata": WorkoutMetadata(
                    id="complex_002",
                    name="Mixed VO2+Tempo",
                    description="S√©ance mixte VO2max et tempo",
                    zone="mixed",
                    duration_minutes=35,
                    structure="complex",
                    difficulty=5,
                    keywords=["vo2max", "tempo", "mixte", "30sec", "2min"],
                    ftp_percentage_range=(90, 120),
                    workout_type="mixed"
                )
            },
            {
                "text": "5 fois 3min over-under alternant 90sec a 95% et 90sec a 105% avec 2min repos",
                "metadata": WorkoutMetadata(
                    id="complex_003",
                    name="Over-Under 5x3min",
                    description="Intervalles over-under autour du seuil",
                    zone="seuil",
                    duration_minutes=23,
                    structure="complex",
                    difficulty=4,
                    keywords=["over-under", "seuil", "alternant", "95%", "105%"],
                    ftp_percentage_range=(95, 105),
                    workout_type="tempo"
                )
            },
            # Simple sessions
            {
                "text": "45 minutes aerobic zone2",
                "metadata": WorkoutMetadata(
                    id="simple_001",
                    name="Aerobic 45min",
                    description="S√©ance a√©robie continue",
                    zone="aerobic",
                    duration_minutes=45,
                    structure="simple",
                    difficulty=2,
                    keywords=["aerobic", "zone2", "continu"],
                    ftp_percentage_range=(65, 75),
                    workout_type="aerobic"
                )
            },
            {
                "text": "20 minutes tempo seuil",
                "metadata": WorkoutMetadata(
                    id="simple_002",
                    name="Tempo 20min",
                    description="Effort tempo au seuil",
                    zone="seuil",
                    duration_minutes=20,
                    structure="simple",
                    difficulty=3,
                    keywords=["tempo", "seuil", "continu"],
                    ftp_percentage_range=(88, 95),
                    workout_type="tempo"
                )
            },
            {
                "text": "8 fois 1min max avec 1min repos",
                "metadata": WorkoutMetadata(
                    id="simple_003",
                    name="8x1min VO2max",
                    description="Intervalles courts VO2max",
                    zone="vo2",
                    duration_minutes=16,
                    structure="simple",
                    difficulty=4,
                    keywords=["vo2max", "1min", "intervalles", "max"],
                    ftp_percentage_range=(105, 120),
                    workout_type="vo2"
                )
            }
        ]

        # Lookup tables: description text -> metadata, workout id -> full entry
        self.text_to_metadata = {}
        self.id_to_data = {}
        for entry in self.corpus_data:
            self.text_to_metadata[entry["text"]] = entry["metadata"]
            self.id_to_data[entry["metadata"].id] = entry

    def get_all_texts(self) -> List[str]:
        """Return every workout description, in corpus order."""
        return [entry["text"] for entry in self.corpus_data]

    def get_metadata(self, text: str) -> Optional[WorkoutMetadata]:
        """Return the metadata stored for ``text``, or None if it is unknown."""
        try:
            return self.text_to_metadata[text]
        except KeyError:
            return None

    def get_by_id(self, workout_id: str) -> Optional[Dict]:
        """Return the corpus entry whose metadata id is ``workout_id``, or None."""
        try:
            return self.id_to_data[workout_id]
        except KeyError:
            return None

    def search_by_criteria(self, zone: str = None, difficulty: int = None, 
                          structure: str = None, workout_type: str = None) -> List[Dict]:
        """Return the entries whose metadata equals every supplied criterion.

        A criterion left at its default (or any other falsy value) is ignored.
        """
        requested = {
            "zone": zone,
            "difficulty": difficulty,
            "structure": structure,
            "workout_type": workout_type,
        }
        # Keep only the filters the caller actually supplied.
        active_filters = [(field, wanted) for field, wanted in requested.items() if wanted]

        return [
            entry
            for entry in self.corpus_data
            if all(getattr(entry["metadata"], field) == wanted
                   for field, wanted in active_filters)
        ]

# Build the enriched corpus and print how many entries exist per workout type
# and per structure (np.unique with return_counts gives value/count pairs).
corpus = EnhancedCorpus()
print(f"üìö Corpus enrichi initialis√© avec {len(corpus.corpus_data)} s√©ances")
print(f"üìä R√©partition par type: {dict(zip(*np.unique([item['metadata'].workout_type for item in corpus.corpus_data], return_counts=True)))}")
print(f"üìà R√©partition par structure: {dict(zip(*np.unique([item['metadata'].structure for item in corpus.corpus_data], return_counts=True)))}")
print("‚úÖ Corpus enrichi op√©rationnel")


In [None]:
# 3. SYSTÈME D'EMBEDDINGS VECTORIELS
# Implémentation de la recherche sémantique avec sentence-transformers

class VectorEmbeddingSystem:
    """Sentence-embedding encoder with an in-memory cache and cosine search."""

    def __init__(self, model_name: str = MODEL_NAME):
        """Load the sentence-transformers model and start with an empty cache."""
        print(f"üîÑ Chargement du mod√®le d'embeddings: {model_name}")
        self.model = SentenceTransformer(model_name)
        # text -> embedding vector; grows with every distinct text encoded
        self.embeddings_cache = {}
        print("‚úÖ Mod√®le d'embeddings charg√©")

    def encode_text(self, text: str) -> np.ndarray:
        """Return the embedding of ``text``, encoding it only on a cache miss."""
        if text in self.embeddings_cache:
            return self.embeddings_cache[text]

        embedding = self.model.encode(text, convert_to_numpy=True)
        self.embeddings_cache[text] = embedding
        return embedding

    def encode_corpus(self, texts: List[str]) -> np.ndarray:
        """Encode ``texts`` in one batch, cache each vector, return the matrix."""
        print(f"üîÑ Encodage de {len(texts)} textes en batch...")
        embeddings = self.model.encode(texts, convert_to_numpy=True, show_progress_bar=True)

        # Cache every row under its source text
        for text, embedding in zip(texts, embeddings):
            self.embeddings_cache[text] = embedding

        print("‚úÖ Encodage termin√©")
        return embeddings

    def calculate_similarity(self, query_embedding: np.ndarray, 
                           corpus_embeddings: np.ndarray) -> np.ndarray:
        """Return the cosine similarity of the query against each corpus row.

        ``corpus_embeddings`` must be 2-D (one row per text); a 1-D query is
        reshaped to a single row.
        """
        from sklearn.metrics.pairwise import cosine_similarity

        if query_embedding.ndim == 1:
            query_embedding = query_embedding.reshape(1, -1)

        similarities = cosine_similarity(query_embedding, corpus_embeddings)[0]
        return similarities

    def find_most_similar(self, query: str, corpus_texts: List[str], 
                         top_k: int = 5) -> List[Tuple[str, float]]:
        """Return up to ``top_k`` ``(text, similarity)`` pairs, best first.

        An empty ``corpus_texts`` yields an empty list instead of failing in
        the similarity computation, which requires a 2-D corpus matrix.
        """
        if not corpus_texts:
            return []

        query_embedding = self.encode_text(query)

        # Encode all cache misses in a single batch rather than one call per
        # text; dict.fromkeys de-duplicates while keeping the order.
        missing_texts = [
            text for text in dict.fromkeys(corpus_texts)
            if text not in self.embeddings_cache
        ]
        if missing_texts:
            fresh_embeddings = self.model.encode(missing_texts, convert_to_numpy=True)
            for text, embedding in zip(missing_texts, fresh_embeddings):
                self.embeddings_cache[text] = embedding

        corpus_embeddings = np.array(
            [self.embeddings_cache[text] for text in corpus_texts]
        )

        similarities = self.calculate_similarity(query_embedding, corpus_embeddings)

        # Indices of the highest similarities, in descending order
        sorted_indices = np.argsort(similarities)[::-1][:top_k]
        results = [(corpus_texts[i], float(similarities[i])) for i in sorted_indices]

        return results

# Create the embedding system (loads the model named by MODEL_NAME)
print("üöÄ Initialisation du syst√®me d'embeddings vectoriels...")
embedding_system = VectorEmbeddingSystem()

# Pre-encode the corpus once so later searches hit the embedding cache
corpus_texts = corpus.get_all_texts()
corpus_embeddings = embedding_system.encode_corpus(corpus_texts)

print(f"üìä Corpus encod√©: {corpus_embeddings.shape}")
print(f"üéØ Dimension des embeddings: {corpus_embeddings.shape[1]}")
print("‚úÖ Syst√®me d'embeddings op√©rationnel")


In [None]:
# 4. PIPELINE RAG HYBRIDE COMPLET
# Intégration correction + recherche vectorielle + scoring intelligent

class RAGPipeline:
    """Hybrid retrieval pipeline: spelling correction, vector search, re-scoring."""

    # Punctuation stripped from the edges of query tokens before keyword
    # matching ('%' is kept so keywords such as "95%" can still match).
    _TOKEN_EDGE_PUNCTUATION = ".,;:!?()[]\"'"

    def __init__(self, spell_checker: "SpellChecker", corpus: "EnhancedCorpus", 
                 embedding_system: "VectorEmbeddingSystem"):
        """Store the three collaborators and snapshot the corpus texts."""
        self.spell_checker = spell_checker
        self.corpus = corpus
        self.embedding_system = embedding_system
        self.corpus_texts = corpus.get_all_texts()

    def process_query(self, query: str, top_k: int = 3) -> Dict[str, Any]:
        """Run the full pipeline on ``query``.

        Returns:
            A dict with ``original_query``, ``corrected_query``,
            ``correction_info`` (the spell-checker report), ``results``
            (sorted by hybrid score, best first), ``best_match`` (first
            result or None) and ``processing_successful``.
        """
        # Step 1: spelling correction
        correction_result = self.spell_checker.correct_text(query)
        corrected_query = correction_result['corrected_text']

        # Step 2: semantic vector search
        vector_results = self.embedding_system.find_most_similar(
            corrected_query, self.corpus_texts, top_k=top_k
        )

        # Steps 3-4: attach metadata, then compute the hybrid score
        final_results = []
        for text, similarity_score in vector_results:
            metadata = self.corpus.get_metadata(text)
            hybrid_score = self._calculate_hybrid_score(
                similarity_score,
                correction_result['correction_confidence'],
                metadata,
                corrected_query
            )
            final_results.append({
                'text': text,
                'similarity_score': similarity_score,
                'metadata': metadata,
                'hybrid_score': hybrid_score,
                'confidence_level': self._determine_confidence_level(hybrid_score)
            })

        final_results.sort(key=lambda x: x['hybrid_score'], reverse=True)

        return {
            'original_query': query,
            'corrected_query': corrected_query,
            'correction_info': correction_result,
            'results': final_results,
            'best_match': final_results[0] if final_results else None,
            'processing_successful': len(final_results) > 0
        }

    def _calculate_hybrid_score(self, similarity_score: float, correction_confidence: float, 
                              metadata: "WorkoutMetadata", corrected_query: str) -> float:
        """Combine similarity, correction confidence and metadata bonuses.

        The score is ``0.6 * similarity + 0.2 * correction_confidence +
        metadata_bonus``, clamped to ``[0.0, 1.0]``. The metadata bonus is at
        most 0.18: 0.05 for a "complete" structure, 0.02 per matched keyword
        (capped at 0.10) and 0.03 for a duration of 20-60 minutes.
        """
        # Base score: vector similarity (weight 0.6)
        base_score = similarity_score * 0.6

        # Correction confidence (weight 0.2)
        correction_bonus = correction_confidence * 0.2

        # Metadata bonuses are expressed directly in final-score units and
        # together fill the remaining ~0.2 of the budget.
        metadata_bonus = 0.0

        if metadata.structure == "complete":
            metadata_bonus += 0.05

        # Keyword matching on punctuation-stripped tokens; multi-word
        # keywords ("retour au calme") are matched as whole phrases.
        tokens = [
            token.strip(self._TOKEN_EDGE_PUNCTUATION)
            for token in corrected_query.lower().split()
        ]
        tokens = [token for token in tokens if token]
        token_set = set(tokens)
        padded_query = " " + " ".join(tokens) + " "
        keyword_matches = 0
        for keyword in {kw.lower() for kw in metadata.keywords}:
            if keyword in token_set:
                keyword_matches += 1
            elif " " in keyword and f" {keyword} " in padded_query:
                keyword_matches += 1
        if keyword_matches > 0:
            metadata_bonus += min(0.1, keyword_matches * 0.02)

        # Reasonable duration (20-60 min)
        if 20 <= metadata.duration_minutes <= 60:
            metadata_bonus += 0.03

        # The bonus is added as-is. Scaling it by 0.2 a second time capped the
        # total at 0.836, below CONFIDENCE_THRESHOLD_HIGH, so "HIGH" could
        # never be reached.
        hybrid_score = base_score + correction_bonus + metadata_bonus

        # Cosine similarity can be negative; keep the score within [0, 1].
        return max(0.0, min(1.0, hybrid_score))

    def _determine_confidence_level(self, hybrid_score: float) -> str:
        """Map a hybrid score to "HIGH", "MEDIUM" or "LOW" using the global thresholds."""
        if hybrid_score >= CONFIDENCE_THRESHOLD_HIGH:
            return "HIGH"
        elif hybrid_score >= CONFIDENCE_THRESHOLD_LOW:
            return "MEDIUM"
        else:
            return "LOW"

    def validate_query(self, query: str) -> Dict[str, Any]:
        """Process ``query`` and turn the best match into a user-facing verdict.

        Returns:
            A dict that always has ``success``, ``confidence`` and
            ``message``. HIGH and MEDIUM verdicts add ``workout``,
            ``correction_applied`` and ``corrections`` (MEDIUM also adds
            ``warning``). LOW verdicts and empty result sets add
            ``suggestions`` (LOW also adds ``potential_match``).
        """
        result = self.process_query(query)

        if not result['processing_successful']:
            return {
                'success': False,
                'confidence': 0.0,
                'message': "Aucune s√©ance correspondante trouv√©e",
                'suggestions': ["Essayez des termes plus sp√©cifiques", "V√©rifiez l'orthographe"]
            }

        best_match = result['best_match']
        confidence = best_match['hybrid_score']
        confidence_level = best_match['confidence_level']

        if confidence_level == "HIGH":
            return {
                'success': True,
                'confidence': confidence,
                'message': f"S√©ance trouv√©e avec haute confiance: {best_match['metadata'].name}",
                'workout': best_match,
                'correction_applied': result['correction_info']['was_corrected'],
                'corrections': result['correction_info']['corrections']
            }
        elif confidence_level == "MEDIUM":
            return {
                'success': True,
                'confidence': confidence,
                'message': f"S√©ance trouv√©e avec confiance moyenne: {best_match['metadata'].name}",
                'workout': best_match,
                'warning': "V√©rifiez que cette s√©ance correspond √† votre demande",
                'correction_applied': result['correction_info']['was_corrected'],
                'corrections': result['correction_info']['corrections']
            }
        else:
            return {
                'success': False,
                'confidence': confidence,
                'message': "Confiance insuffisante pour recommander une s√©ance",
                'suggestions': [
                    f"S√©ance la plus proche: {best_match['metadata'].name}",
                    "Essayez d'√™tre plus pr√©cis dans votre demande"
                ],
                'potential_match': best_match
            }

# Wire the spell checker, corpus and embedding system into one pipeline
print("üöÄ Initialisation du pipeline RAG hybride...")
rag_pipeline = RAGPipeline(spell_checker, corpus, embedding_system)
print("‚úÖ Pipeline RAG op√©rationnel")

# End-to-end run on the same noisy query used for the spell-checker demo
test_query = "je doie faire dix minut de chaude, apres 3 set de 5 mn a fond et 2 min pose entre set. fini avk 10 min cool down facile"
print(f"\nüß™ Test du pipeline RAG complet:")
print(f"Query: {test_query}")

validation_result = rag_pipeline.validate_query(test_query)
print(f"\nüìä R√©sultat de validation:")
print(f"‚úÖ Succ√®s: {validation_result['success']}")
print(f"üéØ Confiance: {validation_result['confidence']:.3f}")
print(f"üí¨ Message: {validation_result['message']}")

# 'correction_applied' is only present on HIGH/MEDIUM verdicts, hence .get()
if validation_result.get('correction_applied'):
    print(f"üîß Corrections appliqu√©es ({len(validation_result['corrections'])}): {validation_result['corrections']}")

# 'workout' is only present when a session was actually recommended
if validation_result.get('workout'):
    workout = validation_result['workout']
    print(f"üèãÔ∏è S√©ance recommand√©e: {workout['metadata'].name}")
    print(f"üìù Description: {workout['metadata'].description}")
    print(f"‚è±Ô∏è Dur√©e: {workout['metadata'].duration_minutes} min")
    print(f"üéöÔ∏è Difficult√©: {workout['metadata'].difficulty}/5")
    print(f"üìà Similarit√© vectorielle: {workout['similarity_score']:.3f}")
    print(f"üî• Score hybride: {workout['hybrid_score']:.3f}")

# Printed unconditionally, whatever the verdict above
print("\n‚úÖ Pipeline RAG complet valid√© avec succ√®s!")


In [None]:
# 5. GÉNÉRATEUR DE FICHIERS .ZWO POUR ZWIFT
# Conversion des séances en format XML Zwift

@dataclass
class ZwoSegment:
    """One segment of a Zwift workout."""
    duration: int  # in seconds
    power_low: float  # lower power bound as a fraction of FTP (0.0-2.0)
    power_high: float  # upper power bound as a fraction of FTP (0.0-2.0)
    cadence: Optional[int] = None  # optional cadence; None when not specified
    segment_type: str = "SteadyState"  # SteadyState, Warmup, Cooldown, Intervals

class ZwoGenerator:
    """Build Zwift ``.zwo`` workout files from free-text workout descriptions.

    The text is analysed with keyword and regex heuristics (French cycling
    vocabulary such as "echauffement", "series", "repos", "retour"), turned
    into a list of ``ZwoSegment`` objects and serialised as ZWO XML.
    """

    # Regexes tried in order by the ``_extract_*`` helpers. In each pattern the
    # first capture group is the number being extracted.
    _INTERVAL_DURATION_PATTERNS = (
        r'(\d+)\s*min\s*(?:max|seuil|tempo)',
        r'de\s*(\d+)\s*min',
    )
    _INTERVAL_COUNT_PATTERNS = (
        r'(\d+)\s*(?:series|fois)',
        r'(\d+)\s*x',
    )
    _REST_DURATION_PATTERNS = (
        r'(\d+)\s*min\s*repos',
        r'repos\s*(\d+)\s*min',
        r'avec\s*(\d+)\s*min',
    )

    def __init__(self):
        # Power range (low, high) as a fraction of FTP for each training zone.
        self.ftp_zones = {
            'aerobic': (0.65, 0.75),      # Zone 2
            'tempo': (0.76, 0.90),        # Zone 3
            'seuil': (0.88, 1.05),        # Zone 4
            'vo2': (1.05, 1.20),          # Zone 5
            'mixed': (0.88, 1.20)         # Variable
        }

    def parse_workout_to_segments(self, workout_text: str, metadata: "WorkoutMetadata") -> List["ZwoSegment"]:
        """Turn a workout description into an ordered list of segments.

        Segments are emitted in the order warm-up, main set, cool-down:

        * A warm-up (50-70 % FTP) when "echauffement"/"chaude" is present and
          a duration can be extracted for it.
        * The main set, at the power range of ``metadata.zone`` (falling back
          to 85-100 % FTP for an unknown zone). When the text mentions
          "series" or "fois", it is a sequence of work intervals separated by
          rests at 50-60 % FTP (no rest after the last interval); nothing is
          emitted if the interval count or duration cannot be extracted.
          Otherwise it is one continuous effort lasting
          ``metadata.duration_minutes`` minus the warm-up and cool-down.
        * A cool-down (40-60 % FTP) when "retour"/"cool" is present and a
          duration can be extracted for it.

        Args:
            workout_text: Free-text description of the workout.
            metadata: Workout metadata; ``zone`` and ``duration_minutes``
                are read.

        Returns:
            The list of segments, possibly empty. Durations are in seconds.
        """
        text = workout_text.lower()
        segments = []

        # Warm-up and cool-down durations are extracted once, so the segments
        # emitted and the minutes deducted from a continuous effort always
        # agree with each other.
        has_warmup = 'echauffement' in text or 'chaude' in text
        warmup_minutes = None
        if has_warmup:
            warmup_minutes = (self._extract_duration(text, 'echauffement')
                              or self._extract_duration(text, 'chaude'))

        # 'cool' also covers 'cool down'.
        has_cooldown = 'retour' in text or 'cool' in text
        cooldown_minutes = None
        if has_cooldown:
            cooldown_minutes = (self._extract_duration(text, 'retour')
                                or self._extract_duration(text, 'cool'))

        if warmup_minutes:
            segments.append(ZwoSegment(
                duration=warmup_minutes * 60,
                power_low=0.5,
                power_high=0.7,
                segment_type="Warmup"
            ))

        power_low, power_high = self.ftp_zones.get(metadata.zone, (0.85, 1.0))

        if 'series' in text or 'fois' in text:
            # Interval workout
            interval_minutes = self._extract_interval_duration(text)
            interval_count = self._extract_interval_count(text)
            rest_minutes = self._extract_rest_duration(text)

            if interval_minutes and interval_count:
                for index in range(interval_count):
                    segments.append(ZwoSegment(
                        duration=interval_minutes * 60,
                        power_low=power_low,
                        power_high=power_high,
                        segment_type="SteadyState"
                    ))

                    # Rest between intervals, but not after the last one.
                    if rest_minutes and index < interval_count - 1:
                        segments.append(ZwoSegment(
                            duration=rest_minutes * 60,
                            power_low=0.5,
                            power_high=0.6,
                            segment_type="SteadyState"
                        ))
        else:
            # Continuous effort: whatever remains of the planned duration.
            # A warm-up/cool-down that is mentioned without a readable
            # duration is assumed to take 10 minutes.
            main_minutes = metadata.duration_minutes
            if has_warmup:
                main_minutes -= warmup_minutes or 10
            if has_cooldown:
                main_minutes -= cooldown_minutes or 10

            if main_minutes > 0:
                segments.append(ZwoSegment(
                    duration=main_minutes * 60,
                    power_low=power_low,
                    power_high=power_high,
                    segment_type="SteadyState"
                ))

        if cooldown_minutes:
            segments.append(ZwoSegment(
                duration=cooldown_minutes * 60,
                power_low=0.4,
                power_high=0.6,
                segment_type="Cooldown"
            ))

        return segments

    @staticmethod
    def _first_int_match(patterns, text: str) -> Optional[int]:
        """Return group 1 of the first pattern that matches ``text``, as an int."""
        for pattern in patterns:
            match = re.search(pattern, text)
            if match:
                return int(match.group(1))
        return None

    def _extract_duration(self, text: str, keyword: str) -> Optional[int]:
        """Extract a duration in minutes associated with ``keyword``.

        Patterns are tried in order and the first match wins: a "<n> min"
        after the keyword, a "<n> min" before it, "<n>m" after it, and
        finally any number after it.

        Returns:
            The number found, or ``None`` when no pattern matches.
        """
        # The keyword is plain text, not a regex fragment.
        kw = re.escape(keyword)
        patterns = (
            rf'{kw}.*?(\d+)\s*min',
            rf'(\d+)\s*min.*?{kw}',
            rf'{kw}.*?(\d+)m',
            rf'{kw}.*?(\d+)',
        )
        return self._first_int_match(patterns, text)

    def _extract_interval_duration(self, text: str) -> Optional[int]:
        """Extract the length of one work interval, in minutes, or ``None``."""
        return self._first_int_match(self._INTERVAL_DURATION_PATTERNS, text)

    def _extract_interval_count(self, text: str) -> Optional[int]:
        """Extract the number of work intervals, or ``None``."""
        return self._first_int_match(self._INTERVAL_COUNT_PATTERNS, text)

    def _extract_rest_duration(self, text: str) -> Optional[int]:
        """Extract the rest between intervals, in minutes, or ``None``."""
        return self._first_int_match(self._REST_DURATION_PATTERNS, text)

    def generate_zwo_xml(self, segments: List["ZwoSegment"], metadata: "WorkoutMetadata") -> str:
        """Serialise segments and metadata as a ZWO XML document.

        "Warmup" and "Cooldown" segments are written with their low/high
        power range. Every other segment type is written as a ``SteadyState``
        element whose ``Power`` is the segment's ``power_low``.

        Args:
            segments: Segments to write, in order.
            metadata: Workout metadata; ``name``, ``description``, ``zone``,
                ``difficulty`` and ``structure`` are read.

        Returns:
            The XML document as a string, without a trailing newline.
        """
        from xml.sax.saxutils import escape

        def as_text(value) -> str:
            # Escape &, < and > so free text cannot break the document.
            return escape(str(value))

        def as_attr(value) -> str:
            # Attribute values are double-quoted, so quotes are escaped too.
            return escape(str(value), {'"': '&quot;'})

        lines = [
            '<?xml version="1.0" encoding="UTF-8"?>',
            '<workout_file>',
            '    <author>Vekta Pipeline</author>',
            f'    <name>{as_text(metadata.name)}</name>',
            f'    <description>{as_text(metadata.description)}</description>',
            '    <sportType>bike</sportType>',
            '    <tags>',
            f'        <tag name="{as_attr(metadata.zone)}"/>',
            f'        <tag name="difficulty_{as_attr(metadata.difficulty)}"/>',
            f'        <tag name="{as_attr(metadata.structure)}"/>',
            '    </tags>',
            '    <workout>',
        ]

        for segment in segments:
            if segment.segment_type in ("Warmup", "Cooldown"):
                lines.append(
                    f'        <{segment.segment_type} Duration="{segment.duration}" '
                    f'PowerLow="{segment.power_low:.2f}" PowerHigh="{segment.power_high:.2f}"/>'
                )
            else:
                lines.append(
                    f'        <SteadyState Duration="{segment.duration}" Power="{segment.power_low:.2f}"/>'
                )

        lines.append('    </workout>')
        lines.append('</workout_file>')
        return "\n".join(lines)

    def create_zwo_file(self, workout_text: str, metadata: "WorkoutMetadata",
                       output_dir: str = "./generated_workouts") -> str:
        """Parse a workout description and write it to a ``.zwo`` file.

        The file name is the lower-cased workout name with every character
        outside ``[a-zA-Z0-9_-]`` replaced by ``_``, followed by a
        ``YYYYMMDD_HHMMSS`` timestamp.

        Args:
            workout_text: Free-text description of the workout.
            metadata: Workout metadata.
            output_dir: Destination directory, created if missing.

        Returns:
            Path of the file that was written.
        """
        os.makedirs(output_dir, exist_ok=True)

        segments = self.parse_workout_to_segments(workout_text, metadata)
        xml_content = self.generate_zwo_xml(segments, metadata)

        safe_name = re.sub(r'[^a-zA-Z0-9_-]', '_', metadata.name.lower())
        filename = f"{safe_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.zwo"
        filepath = os.path.join(output_dir, filename)

        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(xml_content)

        return filepath

# Demo of the ZWO generator
zwo_generator = ZwoGenerator()

# Generate a .zwo file for the workout matched by the pipeline smoke test,
# when that test succeeded and returned a workout.
if validation_result['success'] and validation_result.get('workout'):
    workout = validation_result['workout']
    workout_text = workout['text']
    metadata = workout['metadata']

    print(f"\nüèóÔ∏è G√©n√©ration du fichier .zwo pour: {metadata.name}")
    print(f"üìù Texte: {workout_text}")

    # Parse the description into segments and list them
    segments = zwo_generator.parse_workout_to_segments(workout_text, metadata)
    print(f"üß© Segments d√©tect√©s: {len(segments)}")

    for i, segment in enumerate(segments):
        print(f"  {i+1}. {segment.segment_type}: {segment.duration//60}min @ {segment.power_low:.0%}-{segment.power_high:.0%} FTP")

    # Write the file
    zwo_filepath = zwo_generator.create_zwo_file(workout_text, metadata)
    print(f"‚úÖ Fichier .zwo g√©n√©r√©: {zwo_filepath}")

    # Show a preview of the XML. The file is read exactly once: a second
    # read() on the same handle returns '' because the first one consumed it.
    with open(zwo_filepath, 'r', encoding='utf-8') as f:
        xml_preview = f.read()
    if len(xml_preview) > 500:
        xml_preview = xml_preview[:500] + "..."

    print(f"\nüìÑ Aper√ßu du XML g√©n√©r√©:")
    print(xml_preview)

print("\n‚úÖ G√©n√©rateur ZWO op√©rationnel!")


In [None]:
# 6. TESTS COMPLETS DU PIPELINE RAG
# Validation sur différents types de requêtes

def test_rag_pipeline():
    """Run a fixed set of queries through the module-level ``rag_pipeline``.

    A case passes when the ``success`` flag returned by
    ``rag_pipeline.validate_query`` equals the expected one and the returned
    confidence reaches the case's minimum. Per-case details and a summary are
    printed.

    Returns:
        A list with one dict per case, holding ``name``, ``query``,
        ``passed``, ``confidence`` and ``success``.
    """

    test_cases = [
        {
            'name': 'Requ√™te critique valid√©e',
            'query': 'je doie faire dix minut de chaude, apres 3 set de 5 mn a fond et 2 min pose entre set. fini avk 10 min cool down facile',
            'expected_success': True,
            'expected_min_confidence': 0.8
        },
        {
            'name': 'S√©ance tempo simple',
            'query': '20 minutes tempo seuil',
            'expected_success': True,
            'expected_min_confidence': 0.85
        },
        {
            'name': 'Intervalles VO2max',
            'query': '8 fois 1 minute max avec 1 minute repos',
            'expected_success': True,
            'expected_min_confidence': 0.8
        },
        {
            'name': 'S√©ance a√©robie',
            'query': '45min aerobic zone2',
            'expected_success': True,
            'expected_min_confidence': 0.85
        },
        {
            'name': 'Over-under complexe',
            'query': 'over-under 5x3min alternant 95% et 105%',
            'expected_success': True,
            'expected_min_confidence': 0.75
        },
        {
            'name': 'Requ√™te ambigu√´',
            'query': 'faire du sport',
            'expected_success': False,
            'expected_min_confidence': 0.0
        },
        {
            'name': 'Fautes multiples',
            'query': 'piramide aerobik avec recupe',
            'expected_success': True,
            'expected_min_confidence': 0.6
        }
    ]

    print("üß™ SUITE DE TESTS COMPL√àTE DU PIPELINE RAG")
    print("=" * 60)

    results = []

    for i, test_case in enumerate(test_cases, 1):
        print(f"\nüìù Test {i}: {test_case['name']}")
        print(f"Query: '{test_case['query']}'")

        result = rag_pipeline.validate_query(test_case['query'])

        # Both the success flag and the confidence floor must hold.
        success_ok = result['success'] == test_case['expected_success']
        confidence_ok = result['confidence'] >= test_case['expected_min_confidence']
        test_passed = success_ok and confidence_ok

        print(f"‚úÖ Succ√®s: {result['success']} (attendu: {test_case['expected_success']}) {'‚úì' if success_ok else '‚úó'}")
        print(f"üéØ Confiance: {result['confidence']:.3f} (min: {test_case['expected_min_confidence']}) {'‚úì' if confidence_ok else '‚úó'}")
        print(f"üí¨ Message: {result['message']}")

        if result.get('correction_applied'):
            print(f"üîß Corrections: {result['corrections']}")

        if result.get('workout'):
            workout = result['workout']
            print(f"üèãÔ∏è S√©ance: {workout['metadata'].name} (score: {workout['hybrid_score']:.3f})")

        print(f"üèÜ Test {'R√âUSSI' if test_passed else '√âCHOU√â'}")

        results.append({
            'name': test_case['name'],
            'query': test_case['query'],
            'passed': test_passed,
            'confidence': result['confidence'],
            'success': result['success']
        })

    # Summary
    print("\n" + "=" * 60)
    print("üìä R√âSUM√â DES TESTS")
    print("=" * 60)

    passed_tests = sum(1 for r in results if r['passed'])
    total_tests = len(results)
    success_rate = passed_tests / total_tests * 100

    print(f"‚úÖ Tests r√©ussis: {passed_tests}/{total_tests} ({success_rate:.1f}%)")

    # Average confidence over successful validations only. np.mean of an
    # empty list is nan (with a RuntimeWarning), so that case is reported
    # explicitly instead.
    successful_confidences = [r['confidence'] for r in results if r['success']]
    if successful_confidences:
        avg_confidence = np.mean(successful_confidences)
        print(f"üéØ Confiance moyenne: {avg_confidence:.3f}")
    else:
        print("üéØ Confiance moyenne: n/a")

    if passed_tests == total_tests:
        print("üèÜ TOUS LES TESTS SONT R√âUSSIS!")
    else:
        print("‚ö†Ô∏è Certains tests ont √©chou√©:")
        for failed in results:
            if not failed['passed']:
                print(f"  - {failed['name']}: {failed['query']}")

    return results

# Run the test suite and keep the per-case results in `test_results`
test_results = test_rag_pipeline()

print("\n‚úÖ Suite de tests termin√©e!")
