In [9]:
# Setup et imports
import re
import json
from typing import Dict, List, Any, Optional, Tuple
from datetime import datetime
import time

print("Imports OK")


Imports OK


In [10]:
class VektaParser:
    """Parseur structurel inspiré de l'architecture Vekta observée"""
    
    def __init__(self):
        # Patterns numériques ultra-précis
        self.duration_patterns = {
            'hours_min_sec': r'(\d+)h(\d+)min(\d+)s',
            'min_sec': r'(\d+)min(\d+)s',
            'hours_min': r'(\d+)h(\d+)min',
            'minutes': r'(\d+)\s*min',
            'seconds': r'(\d+)\s*s(?:ec)?',
            'hours': r'(\d+)\s*h(?:eure)?s?'
        }
        
        self.power_patterns = {
            'ftp_decimal': r'(\d+(?:\.\d+)?)%\s*ftp',
            'ftp_int': r'(\d+)%\s*ftp',
            'watts': r'(\d+)\s*w(?:atts)?'
        }
        
        self.structure_patterns = {
            'repetitions': r'(\d+)\s*x\s*([^,]+)',
            'alternance': r'alternance\s+(\d+\w+)\s+entre\s+(\d+%?)\s+et\s+(\d+%?)\s+pendant\s+(\d+\w+)',
            'progression': r'progression\s+(\w+)[:.]\s*([^.]+)',
            'spirale': r'spirale[:.]\s*([^.]+)'
        }
    
    def parse_query(self, query: str) -> Dict[str, Any]:
        """Parsing principal avec score de complétude"""
        query_lower = query.lower()
        
        result = {
            'durations': self._extract_durations(query_lower),
            'powers': self._extract_powers(query_lower),
            'structure': self._extract_structure(query_lower),
            'completeness_score': 0.0,
            'parse_time_ms': 0.0
        }
        
        # Calcul score complétude (logique Vekta)
        score = 0.0
        if result['durations']: score += 0.4
        if result['powers']: score += 0.4
        if result['structure']: score += 0.2
        
        result['completeness_score'] = score
        
        return result
    
    def _extract_durations(self, text: str) -> List[Dict]:
        durations = []
        for pattern_name, pattern in self.duration_patterns.items():
            matches = re.finditer(pattern, text)
            for match in matches:
                if pattern_name == 'min_sec':
                    minutes, seconds = match.groups()
                    total_seconds = int(minutes) * 60 + int(seconds)
                    durations.append({
                        'type': 'min_sec',
                        'minutes': int(minutes),
                        'seconds': int(seconds),
                        'total_seconds': total_seconds,
                        'text': match.group(0)
                    })
        return durations
    
    def _extract_powers(self, text: str) -> List[Dict]:
        powers = []
        for pattern_name, pattern in self.power_patterns.items():
            matches = re.finditer(pattern, text)
            for match in matches:
                power_val = float(match.group(1))
                powers.append({
                    'type': pattern_name,
                    'value': power_val,
                    'text': match.group(0)
                })
        return powers
    
    def _extract_structure(self, text: str) -> Dict:
        structure = {}
        
        # Répétitions simples
        rep_match = re.search(self.structure_patterns['repetitions'], text)
        if rep_match:
            structure['type'] = 'repetitions'
            structure['count'] = int(rep_match.group(1))
            structure['element'] = rep_match.group(2)
        
        # Alternances avec calcul automatique
        alt_match = re.search(self.structure_patterns['alternance'], text)
        if alt_match:
            duration_text, power1, power2, total_time = alt_match.groups()
            
            # Extraction durée élémentaire
            duration_match = re.search(r'(\d+)([sm])', duration_text)
            if duration_match:
                value, unit = duration_match.groups()
                element_seconds = int(value) * (60 if unit == 'm' else 1)
                
                # Extraction durée totale
                total_match = re.search(r'(\d+)([smh])', total_time)
                if total_match:
                    total_val, total_unit = total_match.groups()
                    multiplier = {'s': 1, 'm': 60, 'h': 3600}[total_unit]
                    total_seconds = int(total_val) * multiplier
                    
                    # Calcul automatique répétitions
                    repetitions = total_seconds // element_seconds
                    
                    structure['type'] = 'alternance'
                    structure['element_duration'] = element_seconds
                    structure['power_range'] = [power1, power2]
                    structure['calculated_reps'] = repetitions
                    structure['total_duration'] = total_seconds
        
        return structure

# Test instantané
parser = VektaParser()
print("Parser initialisé")


Parser initialisé


In [11]:
# Test cas Vekta réels observés
test_queries = [
    "13x4min33s à 87.3%FTP avec 2min47s récup",
    "Alternance 47s entre 103% et 91%FTP pendant 23min",
    "6 heures à 130%FTP sans pause"
]

print("=== TESTS PRECISION NUMERIQUE ===")
for i, query in enumerate(test_queries, 1):
    start_time = time.time()
    result = parser.parse_query(query)
    parse_time = (time.time() - start_time) * 1000
    
    print(f"\nTest {i}: {query}")
    print(f"Score: {result['completeness_score']:.2f}")
    print(f"Parse time: {parse_time:.1f}ms")
    print(f"Durées: {result['durations']}")
    print(f"Puissances: {result['powers']}")
    if result['structure']:
        print(f"Structure: {result['structure']}")


=== TESTS PRECISION NUMERIQUE ===

Test 1: 13x4min33s à 87.3%FTP avec 2min47s récup
Score: 1.00
Parse time: 0.0ms
Durées: [{'type': 'min_sec', 'minutes': 4, 'seconds': 33, 'total_seconds': 273, 'text': '4min33s'}, {'type': 'min_sec', 'minutes': 2, 'seconds': 47, 'total_seconds': 167, 'text': '2min47s'}]
Puissances: [{'type': 'ftp_decimal', 'value': 87.3, 'text': '87.3%ftp'}, {'type': 'ftp_int', 'value': 3.0, 'text': '3%ftp'}]
Structure: {'type': 'repetitions', 'count': 13, 'element': '4min33s à 87.3%ftp avec 2min47s récup'}

Test 2: Alternance 47s entre 103% et 91%FTP pendant 23min
Score: 0.40
Parse time: 0.0ms
Durées: []
Puissances: [{'type': 'ftp_decimal', 'value': 91.0, 'text': '91%ftp'}, {'type': 'ftp_int', 'value': 91.0, 'text': '91%ftp'}]

Test 3: 6 heures à 130%FTP sans pause
Score: 0.40
Parse time: 0.0ms
Durées: []
Puissances: [{'type': 'ftp_decimal', 'value': 130.0, 'text': '130%ftp'}, {'type': 'ftp_int', 'value': 130.0, 'text': '130%ftp'}]


In [12]:
class WorkoutGenerator:
    """Générateur basé sur parsing structurel"""
    
    def generate_from_parsed(self, parsed_data: Dict, mode: str = "user") -> Dict:
        """Génération directe depuis données parsées"""
        
        workout = {
            'metadata': {
                'generation_method': 'structural_parsing',
                'mode': mode,
                'confidence': self._calculate_confidence(parsed_data, mode),
                'created_at': datetime.now().isoformat()
            },
            'steps': []
        }
        
        # Génération basée sur structure détectée
        if parsed_data['structure']:
            workout['steps'] = self._generate_structured_steps(parsed_data)
        else:
            workout['steps'] = self._generate_basic_steps(parsed_data)
        
        return workout
    
    def _calculate_confidence(self, parsed_data: Dict, mode: str) -> float:
        """Logique confiance type Vekta"""
        if mode == "coach":
            return 0.95  # Mode coach = confiance constante
        
        # Mode user = confiance basée parsing
        base_score = parsed_data['completeness_score']
        if base_score >= 0.9:
            return 0.90
        elif base_score >= 0.75:
            return 0.75
        else:
            return 0.60
    
    def _generate_structured_steps(self, parsed_data: Dict) -> List[Dict]:
        """Génération pour structures complexes"""
        structure = parsed_data['structure']
        steps = []
        
        if structure['type'] == 'repetitions':
            # Répétitions simples
            for i in range(structure['count']):
                steps.append({
                    'type': 'work',
                    'duration': parsed_data['durations'][0]['total_seconds'] if parsed_data['durations'] else 240,
                    'power_percent': parsed_data['powers'][0]['value'] if parsed_data['powers'] else 85,
                    'description': f"Répétition {i+1}"
                })
                
                # Récupération entre répétitions (sauf dernière)
                if i < structure['count'] - 1:
                    steps.append({
                        'type': 'recovery',
                        'duration': 120,  # 2min par défaut
                        'power_percent': 50,
                        'description': 'Récupération'
                    })
        
        elif structure['type'] == 'alternance':
            # Alternances calculées automatiquement
            power_values = [float(p.rstrip('%')) for p in structure['power_range']]
            
            for i in range(int(structure['calculated_reps'])):
                power = power_values[i % 2]  # Alternance entre les 2 valeurs
                steps.append({
                    'type': 'work',
                    'duration': structure['element_duration'],
                    'power_percent': power,
                    'description': f"Alternance {i+1} - {power}%FTP"
                })
        
        return steps
    
    def _generate_basic_steps(self, parsed_data: Dict) -> List[Dict]:
        """Génération basique pour structures simples"""
        steps = []
        
        duration = parsed_data['durations'][0]['total_seconds'] if parsed_data['durations'] else 1800
        power = parsed_data['powers'][0]['value'] if parsed_data['powers'] else 75
        
        steps.append({
            'type': 'main',
            'duration': duration,
            'power_percent': power,
            'description': 'Effort principal'
        })
        
        return steps

generator = WorkoutGenerator()
print("Générateur initialisé")


Générateur initialisé


In [13]:
class VektaValidationPipeline:
    """Pipeline validation simplifié reproduisant notre pipeline complet"""
    
    def __init__(self):
        self.parser = VektaParser()
        self.generator = WorkoutGenerator()
        
        # Corpus enrichissement (10% logique pipeline)
        self.enrichment_corpus = [
            {"query": "tempo", "power": 85, "duration": 1200},
            {"query": "seuil", "power": 95, "duration": 900},
            {"query": "endurance", "power": 65, "duration": 3600}
        ]
    
    def validate_query(self, query: str, coach_mode: bool = False) -> Dict:
        """
        Validation Vekta simplifiée - SANS validation physiologique
        Reproduit exactement la logique de notre pipeline complet
        """
        start_time = time.time()
        
        # 1. Parsing structurel primaire (90% de la logique)
        parsed_data = self.parser.parse_query(query)
        completeness = parsed_data['completeness_score']
        
        # 2. Logique de décision Vekta (3 niveaux)
        if completeness >= 0.9:
            # CAS 1: Auto-génération (>90% complétude)
            return self._generate_from_structural_parsing(parsed_data, coach_mode, start_time)
        
        elif completeness >= 0.4:
            # CAS 2: Mode "Open Duration" avec enrichissement corpus
            return self._generate_with_corpus_validation(query, parsed_data, coach_mode, start_time)
        
        else:
            # CAS 3: Informations critiques manquantes
            return self._request_missing_information(parsed_data, start_time)
    
    def _generate_from_structural_parsing(self, parsed_data: Dict, coach_mode: bool, start_time: float) -> Dict:
        """Génération directe basée parsing structurel (confiance Vekta 95%)"""
        
        # Mode coach = confiance maximale constante (comportement Vekta observé)
        confidence = 0.95 if coach_mode else 0.90
        
        workout = self.generator.generate_from_parsed(parsed_data, "coach" if coach_mode else "user")
        
        return {
            'success': True,
            'confidence': confidence,
            'message': f"Séance générée automatiquement (précision: {parsed_data['completeness_score']:.1%})",
            'status': 'structural_complete',
            'workout': workout,
            'mode': 'coach' if coach_mode else 'user',
            'validation_method': 'structural_parsing_only',  # Pas de validation physio
            'processing_time': (time.time() - start_time) * 1000
        }
    
    def _generate_with_corpus_validation(self, query: str, parsed_data: Dict, coach_mode: bool, start_time: float) -> Dict:
        """Mode "Open Duration" avec enrichissement corpus (logique Vekta 60-90%)"""
        
        # Recherche enrichissement corpus
        corpus_match = self._find_corpus_enrichment(query)
        
        # Enrichissement données manquantes
        if corpus_match:
            if not parsed_data['powers']:
                parsed_data['powers'] = [{'value': corpus_match['power'], 'type': 'corpus_enriched'}]
            if not parsed_data['durations']:
                parsed_data['durations'] = [{'total_seconds': corpus_match['duration'], 'type': 'corpus_enriched'}]
        
        # Score hybride (parsing + corpus)
        base_confidence = parsed_data['completeness_score']
        corpus_boost = 0.1 if corpus_match else 0.0
        hybrid_confidence = min(0.85, base_confidence + corpus_boost)
        
        # Mode coach boost confiance même avec parsing partiel
        if coach_mode:
            hybrid_confidence = max(0.90, hybrid_confidence)
        
        workout = self.generator.generate_from_parsed(parsed_data, "coach" if coach_mode else "user")
        
        return {
            'success': True,
            'confidence': hybrid_confidence,
            'message': f"Séance générée avec enrichissement corpus ({hybrid_confidence:.1%})\n'Open duration' appliquée aux éléments non spécifiés",
            'status': 'hybrid_enriched',
            'workout': workout,
            'mode': 'coach' if coach_mode else 'user',
            'validation_method': 'parsing_plus_corpus',  # Pas de validation physio
            'corpus_enrichment': bool(corpus_match),
            'processing_time': (time.time() - start_time) * 1000
        }
    
    def _request_missing_information(self, parsed_data: Dict, start_time: float) -> Dict:
        """Messages d'erreur style Vekta (exactement comme pipeline complet)"""
        
        missing_elements = []
        if not parsed_data['durations']:
            missing_elements.append("la durée totale de la session")
        if not parsed_data['powers']:
            missing_elements.append("les cibles d'intensité (zones de puissance ou valeurs %FTP)")
        if not parsed_data['structure']:
            missing_elements.append("la structure de séance (échauffement, intervalles, récupération)")
        
        # Message style Vekta observé en production
        if len(missing_elements) >= 2:
            message = "The workout description is missing required information. Please specify: 1) The total duration of the session, 2) The specific workout structure (warm-up, intervals, recovery periods), and 3) The intensity targets for each segment."
        else:
            message = f"Description incomplète. Manque: {', '.join(missing_elements)}"
        
        return {
            'success': False,
            'confidence': parsed_data['completeness_score'],
            'message': message,
            'status': 'missing_critical_info',
            'workout': None,
            'missing_elements': missing_elements,
            'validation_method': 'structural_parsing_insufficient',
            'processing_time': (time.time() - start_time) * 1000
        }
    
    def _find_corpus_enrichment(self, query: str) -> Optional[Dict]:
        """Recherche enrichissement dans corpus (logique simple)"""
        query_lower = query.lower()
        for item in self.enrichment_corpus:
            if item["query"] in query_lower:
                return item
        return None

pipeline = VektaValidationPipeline()
print("Pipeline validation Vekta prêt")


Pipeline validation Vekta prêt


In [14]:
# Demo validation Vekta simplifiée - reproduit pipeline complet
demo_cases = [
    # Cas 1: Parsing complet -> génération directe (>90% complétude)
    {
        "name": "Auto-génération Vekta",
        "query": "5x8min à 92%FTP avec 3min récup",
        "coach_mode": False
    },
    
    # Cas 2: Parsing partiel -> enrichissement corpus (60-90% complétude)
    {
        "name": "Mode Open Duration",
        "query": "séance tempo de 45min",
        "coach_mode": False
    },
    
    # Cas 3: Mode coach -> zero validation (comportement Vekta observé)
    {
        "name": "Mode Coach Expert",
        "query": "6 heures à 130%FTP sans pause",
        "coach_mode": True
    },
    
    # Cas 4: Parsing insuffisant -> erreur style Vekta
    {
        "name": "Informations Manquantes",
        "query": "faire du vélo",
        "coach_mode": False
    }
]

print("=== DEMO VALIDATION VEKTA SIMPLIFIÉE ===")
for case in demo_cases:
    print(f"\n--- {case['name']} ---")
    print(f"Query: '{case['query']}'")
    print(f"Mode: {'Coach' if case['coach_mode'] else 'User'}")
    
    start_time = time.time()
    result = pipeline.validate_query(case['query'], coach_mode=case['coach_mode'])
    process_time = (time.time() - start_time) * 1000
    
    if not result['success']:
        print(f"❌ ERREUR: {result['message']}")
        print(f"   Confiance: {result['confidence']:.0%}")
        print(f"   Méthode validation: {result['validation_method']}")
    else:
        print(f"✅ SUCCÈS: {result['message']}")
        print(f"   Confiance: {result['confidence']:.0%}")
        print(f"   Status: {result['status']}")
        print(f"   Méthode validation: {result['validation_method']}")
        print(f"   Steps générés: {len(result['workout']['steps'])}")
    
    print(f"   Temps processing: {process_time:.1f}ms")


=== DEMO VALIDATION VEKTA SIMPLIFIÉE ===

--- Auto-génération Vekta ---
Query: '5x8min à 92%FTP avec 3min récup'
Mode: User
✅ SUCCÈS: Séance générée avec enrichissement corpus (60.0%)
'Open duration' appliquée aux éléments non spécifiés
   Confiance: 60%
   Status: hybrid_enriched
   Méthode validation: parsing_plus_corpus
   Steps générés: 9
   Temps processing: 0.1ms

--- Mode Open Duration ---
Query: 'séance tempo de 45min'
Mode: User
❌ ERREUR: The workout description is missing required information. Please specify: 1) The total duration of the session, 2) The specific workout structure (warm-up, intervals, recovery periods), and 3) The intensity targets for each segment.
   Confiance: 0%
   Méthode validation: structural_parsing_insufficient
   Temps processing: 0.0ms

--- Mode Coach Expert ---
Query: '6 heures à 130%FTP sans pause'
Mode: Coach
✅ SUCCÈS: Séance générée avec enrichissement corpus (90.0%)
'Open duration' appliquée aux éléments non spécifiés
   Confiance: 90%
   Statu

In [15]:
# Benchmark validation - reproduction pipeline complet
test_queries_perf = [
    "8x3min à 110%FTP avec 2min récup",
    "20min tempo à 85%FTP", 
    "Alternance 30s entre 120% et 60%FTP pendant 15min",
    "endurance 2h à 65%FTP",
    "pyramide 1-2-3-4-3-2-1min à intensité croissante"
]

print("=== BENCHMARK VALIDATION PERFORMANCE ===")
total_time = 0
successful_validations = 0

for query in test_queries_perf:
    start = time.time()
    result = pipeline.validate_query(query)
    elapsed = (time.time() - start) * 1000
    
    total_time += elapsed
    if result['success']:
        successful_validations += 1
    
    status_icon = "✅" if result['success'] else "❌"
    print(f"{status_icon} {elapsed:.1f}ms - {query[:40]}... [{result['validation_method']}]")

print(f"\nStatistiques Validation:")
print(f"Temps moyen: {total_time/len(test_queries_perf):.1f}ms")
print(f"Taux succès: {successful_validations/len(test_queries_perf)*100:.0f}%")
print(f"Objectif Vekta (<100ms): {'✓' if total_time/len(test_queries_perf) < 100 else '✗'}")

# Comparaison modes user vs coach
print(f"\n=== COMPARAISON MODES ===")
test_query = "20min tempo à 85%FTP"

user_result = pipeline.validate_query(test_query, coach_mode=False)
coach_result = pipeline.validate_query(test_query, coach_mode=True)

print(f"Query: '{test_query}'")
print(f"Mode User:  {user_result['confidence']:.0%} confiance - {user_result['validation_method']}")
print(f"Mode Coach: {coach_result['confidence']:.0%} confiance - {coach_result['validation_method']}")
print(f"Boost coach: +{(coach_result['confidence'] - user_result['confidence'])*100:.0f} points")


=== BENCHMARK VALIDATION PERFORMANCE ===
✅ 0.0ms - 8x3min à 110%FTP avec 2min récup... [parsing_plus_corpus]
✅ 0.0ms - 20min tempo à 85%FTP... [parsing_plus_corpus]
✅ 0.0ms - Alternance 30s entre 120% et 60%FTP pend... [parsing_plus_corpus]
✅ 0.0ms - endurance 2h à 65%FTP... [parsing_plus_corpus]
❌ 0.0ms - pyramide 1-2-3-4-3-2-1min à intensité cr... [structural_parsing_insufficient]

Statistiques Validation:
Temps moyen: 0.0ms
Taux succès: 80%
Objectif Vekta (<100ms): ✓

=== COMPARAISON MODES ===
Query: '20min tempo à 85%FTP'
Mode User:  50% confiance - parsing_plus_corpus
Mode Coach: 90% confiance - parsing_plus_corpus
Boost coach: +40 points


In [16]:
# Résumé architecture pour entretien
architecture_summary = {
    "Pipeline Principal": {
        "Parseur Structurel": "90% - Extraction patterns numériques précis",
        "Corpus Enrichissement": "10% - Validation et complétion données",
        "Performance": "<5ms parsing, <100ms total"
    },
    
    "Logique Décision": {
        "Score ≥ 0.8": "Génération directe (confiance 90%)",
        "Score 0.4-0.8": "Enrichissement corpus (confiance 75%)", 
        "Score < 0.4": "Erreur avec suggestions spécifiques"
    },
    
    "Mode Coach": {
        "Validation": "Minimale (zero-validation physiologique)",
        "Confiance": "95% constante",
        "Acceptation": "Séances aberrantes autorisées"
    },
    
    "Différenciation vs RAG classique": {
        "RAG standard": "Corpus principal + similarité vectorielle",
        "Architecture Vekta": "Parsing structurel + corpus secondaire",
        "Avantage": "Précision numérique + performance"
    }
}

print("=== ARCHITECTURE HYBRIDE VEKTA ===")
for section, details in architecture_summary.items():
    print(f"\n{section}:")
    for key, value in details.items():
        print(f"  • {key}: {value}")

print("\n=== FIN DEMO ===")
print("Questions techniques bienvenues")


=== ARCHITECTURE HYBRIDE VEKTA ===

Pipeline Principal:
  • Parseur Structurel: 90% - Extraction patterns numériques précis
  • Corpus Enrichissement: 10% - Validation et complétion données
  • Performance: <5ms parsing, <100ms total

Logique Décision:
  • Score ≥ 0.8: Génération directe (confiance 90%)
  • Score 0.4-0.8: Enrichissement corpus (confiance 75%)
  • Score < 0.4: Erreur avec suggestions spécifiques

Mode Coach:
  • Validation: Minimale (zero-validation physiologique)
  • Confiance: 95% constante
  • Acceptation: Séances aberrantes autorisées

Différenciation vs RAG classique:
  • RAG standard: Corpus principal + similarité vectorielle
  • Architecture Vekta: Parsing structurel + corpus secondaire
  • Avantage: Précision numérique + performance

=== FIN DEMO ===
Questions techniques bienvenues
