In [1]:
"""
Analyse de performance (profilage + goulots d'étranglement)
"""

import cProfile
import pstats
import io
import time
import json
import joblib
import pandas as pd
import numpy as np
from pathlib import Path

# ------------------------------------------------------------
# Path configuration
# ------------------------------------------------------------

# Serialized model file handed to PerformanceProfiler by main().
MODEL_PATH = "model.pkl"

# Folder that receives the profiling reports. It is created as soon as this
# code runs; an already existing folder is left untouched.
RESULTS_DIR = "performance_results"
Path(RESULTS_DIR).mkdir(parents=False, exist_ok=True)

# ============================================================
# Profilage et identification des goulots
# ============================================================

class PerformanceProfiler:
    """Profile a fitted model with cProfile and time its pipeline steps.

    The model is loaded once in the constructor. All reports are written
    under the module-level ``RESULTS_DIR`` folder.
    """

    def __init__(self, model_path):
        """Load the serialized model stored at *model_path*.

        NOTE(review): ``joblib.load`` relies on pickle and can therefore
        execute arbitrary code; only load model files from a trusted source.
        """
        self.model = joblib.load(model_path)
        print(f" Modèle chargé: {model_path}")
        print(f" Type de modèle: {type(self.model)}")

    def _get_sample_data(self):
        """Return one example record (feature name -> value) as a dict."""
        return {
            'NAME_CONTRACT_TYPE': 'Cash loans',
            'CODE_GENDER': 'M',
            'FLAG_OWN_CAR': 'Y',
            'FLAG_OWN_REALTY': 'Y',
            'CNT_CHILDREN': 0,
            'AMT_INCOME_TOTAL': 180000.0,
            'AMT_CREDIT': 450000.0,
            'AMT_ANNUITY': 25000.0,
            'AMT_GOODS_PRICE': 400000.0,
            'NAME_TYPE_SUITE': 'Unaccompanied',
            'NAME_INCOME_TYPE': 'Working',
            'NAME_EDUCATION_TYPE': 'Secondary / secondary special',
            'NAME_FAMILY_STATUS': 'Married',
            'NAME_HOUSING_TYPE': 'House / apartment',
            'REGION_POPULATION_RELATIVE': 0.018,
            'DAYS_BIRTH': -12000,
            'DAYS_EMPLOYED': -2000,
            'DAYS_REGISTRATION': -3000,
            'DAYS_ID_PUBLISH': -2000,
            'FLAG_EMP_PHONE': 1,
            'FLAG_WORK_PHONE': 1,
            'FLAG_PHONE': 1,
            'FLAG_EMAIL': 1,
            'OCCUPATION_TYPE': 'Laborers',
            'CNT_FAM_MEMBERS': 2.0,
            'REGION_RATING_CLIENT': 2,
            'REGION_RATING_CLIENT_W_CITY': 2,
            'WEEKDAY_APPR_PROCESS_START': 'MONDAY',
            'HOUR_APPR_PROCESS_START': 10,
            'REG_REGION_NOT_LIVE_REGION': 0,
            'REG_REGION_NOT_WORK_REGION': 0,
            'LIVE_REGION_NOT_WORK_REGION': 0,
            'REG_CITY_NOT_LIVE_CITY': 0,
            'REG_CITY_NOT_WORK_CITY': 0,
            'LIVE_CITY_NOT_WORK_CITY': 0,
            'ORGANIZATION_TYPE': 'Business Entity Type 3',
            'FLOORSMAX_AVG': 0.5,
            'LIVINGAREA_AVG': 0.4,
            'YEARS_BEGINEXPLUATATION_MODE': 0.95,
            'OBS_30_CNT_SOCIAL_CIRCLE': 2.0,
            'DEF_30_CNT_SOCIAL_CIRCLE': 0.0,
            'DAYS_LAST_PHONE_CHANGE': -1000.0,
            'PREVIOUS_LOANS_COUNT': 3.0,
            'CREDIT_INCOME_PERCENT': 2.5,
            'ANNUITY_INCOME_PERCENT': 0.14,
            'CREDIT_TERM': 18.0,
            'DAYS_EMPLOYED_PERCENT': 0.17
        }

    @staticmethod
    def _profile_calls(func, data, n_iterations):
        """Call ``func(data)`` *n_iterations* times under a fresh cProfile profiler.

        Returns the (disabled) profiler holding the collected statistics.
        """
        profiler = cProfile.Profile()
        profiler.enable()
        try:
            for _ in range(n_iterations):
                func(data)
        finally:
            # Always stop profiling, even when the profiled call raises.
            profiler.disable()
        return profiler

    @staticmethod
    def _write_stats(profiler, stream, limit):
        """Print the *limit* most expensive entries (by cumulative time) to *stream*."""
        stats = pstats.Stats(profiler, stream=stream)
        stats.strip_dirs().sort_stats('cumulative').print_stats(limit)

    def profile_prediction(self, n_iterations=100):
        """Profile ``predict`` and ``predict_proba`` of the model with cProfile.

        Both methods are called *n_iterations* times on a one-row DataFrame
        built from the sample record. The 30 most expensive entries of each
        run are written to ``cprofile_predict.txt`` and ``cprofile_proba.txt``
        under ``RESULTS_DIR``; the top 10 of ``predict`` are also printed.

        Args:
            n_iterations: Number of calls made for each profiled method.
        """
        print("\n" + "="*60)
        print(" PROFILAGE DES PREDICTIONS")
        print("="*60)

        df = pd.DataFrame([self._get_sample_data()])

        profiler_predict = self._profile_calls(self.model.predict, df, n_iterations)
        profiler_proba = self._profile_calls(self.model.predict_proba, df, n_iterations)

        # Persist the detailed reports.
        predict_path = Path(RESULTS_DIR) / "cprofile_predict.txt"
        proba_path = Path(RESULTS_DIR) / "cprofile_proba.txt"

        with open(predict_path, "w", encoding="utf-8") as f:
            self._write_stats(profiler_predict, f, 30)

        with open(proba_path, "w", encoding="utf-8") as f:
            self._write_stats(profiler_proba, f, 30)

        print(f" Profilage terminé. Résultats enregistrés dans {RESULTS_DIR}/")
        print("   - cprofile_predict.txt")
        print("   - cprofile_proba.txt")

        # Short console summary.
        s = io.StringIO()
        self._write_stats(profiler_predict, s, 10)
        print("\nTop 10 fonctions les plus coûteuses (predict):")
        print(s.getvalue())

    def _time_pipeline_steps(self, data, n_runs=30):
        """Time ``transform`` of every transformer step of the pipeline.

        Steps are visited in pipeline order and each step receives the output
        of the previous successfully timed step, so every transformer is
        measured on the kind of input it gets during a real prediction.
        The step named ``'classifier'`` and any step without a ``transform``
        method are skipped. A step whose ``transform`` raises is reported on
        the console and left out of the result; the following steps then keep
        receiving the last valid intermediate data (best effort).

        Args:
            data: Input handed to the first timed step.
            n_runs: Number of timed ``transform`` calls per step.

        Returns:
            List of ``(step_name, average_time_ms)`` tuples, in pipeline order.
        """
        timings = []
        current = data
        for name, step in self.model.named_steps.items():
            if name == 'classifier' or not hasattr(step, 'transform'):
                continue
            durations = []
            output = None
            try:
                for _ in range(n_runs):
                    # perf_counter is monotonic and high-resolution, unlike time.time().
                    start = time.perf_counter()
                    output = step.transform(current)
                    durations.append((time.perf_counter() - start) * 1000)
            except Exception as exc:
                # Report instead of silently averaging the time spent raising.
                print(f"  ! {name}: échec de transform ({type(exc).__name__}: {exc}) - étape ignorée")
                continue
            if not durations:
                continue
            timings.append((name, float(np.mean(durations))))
            current = output
        return timings

    def identify_bottlenecks(self, threshold_ms=5, n_runs=30):
        """Find pipeline steps whose average ``transform`` time exceeds a threshold.

        When the model exposes ``named_steps``, each transformer step is timed
        on the sample record and its average is printed. The list of slow
        steps is always written to ``bottlenecks.json`` under ``RESULTS_DIR``
        (an empty list when nothing is slow or the model is not a pipeline).

        Args:
            threshold_ms: A step is a bottleneck when its average time in
                milliseconds is strictly greater than this value.
            n_runs: Number of timed ``transform`` calls per step.

        Returns:
            List of ``{'component': name, 'avg_time_ms': average}`` dicts.
        """
        print("\n" + "="*60)
        print("  ANALYSE DES GOULOTS D'ÉTRANGLEMENT")
        print("="*60)

        df = pd.DataFrame([self._get_sample_data()])
        bottlenecks = []

        if hasattr(self.model, 'named_steps'):
            print(" Pipeline détecté – analyse par étape:")
            for name, avg in self._time_pipeline_steps(df, n_runs=n_runs):
                print(f"  • {name}: {avg:.3f} ms")
                if avg > threshold_ms:
                    bottlenecks.append({'component': name, 'avg_time_ms': avg})
        else:
            print("Aucun pipeline détecté (pas d'analyse par étape possible).")

        out_path = Path(RESULTS_DIR) / "bottlenecks.json"
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(bottlenecks, f, indent=2, ensure_ascii=False)

        if bottlenecks:
            print(f"\n {len(bottlenecks)} goulots détectés – voir {out_path}")
        else:
            print("\n Aucun goulot majeur détecté.")

        return bottlenecks


# ============================================================
# Fonction principale
# ============================================================

def main():
    """Run the whole analysis: cProfile profiling, then per-step timing."""
    rule = "═" * 60
    for line in (rule, " ANALYSE DE PERFORMANCE ", rule):
        print(line)

    analysis = PerformanceProfiler(MODEL_PATH)
    analysis.profile_prediction(n_iterations=100)
    analysis.identify_bottlenecks()

    print("\n Analyse terminée.")
    print(f"Résultats disponibles dans: {RESULTS_DIR}/")


# Only run the analysis when executed as a script, not on import.
if __name__ == "__main__":
    main()


════════════════════════════════════════════════════════════
 ANALYSE DE PERFORMANCE 
════════════════════════════════════════════════════════════
 Modèle chargé: model.pkl
 Type de modèle: <class 'sklearn.pipeline.Pipeline'>

 PROFILAGE DES PREDICTIONS
 Profilage terminé. Résultats enregistrés dans performance_results/
   - cprofile_predict.txt
   - cprofile_proba.txt

Top 10 fonctions les plus coûteuses (predict):
         1317270 function calls (1291029 primitive calls) in 0.311 seconds

   Ordered by: cumulative time
   List reduced from 741 to 10 due to restriction <10>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
  101/100    0.000    0.000    0.307    0.003 pipeline.py:739(predict)
  500/100    0.001    0.000    0.272    0.003 _set_output.py:314(wrapped)
      100    0.002    0.000    0.272    0.003 _column_transformer.py:1028(transform)
      100    0.001    0.000    0.259    0.003 _column_transformer.py:825(_call_func_on_transformers)
      100    0

In [5]:
"""
Benchmark des optimisations API 
Comparaison : Chargement du modèle à chaque requête vs Préchargement au début
"""

import time
import joblib
import numpy as np
import pandas as pd

# Path of the serialized model loaded by both benchmark functions below.
MODEL_PATH = "model.pkl"


def get_sample_data():
    """Build one example record, standing in for the payload of a /predict request.

    Returns a new dict (feature name -> value) on every call; the key order
    is the order of the pairs listed below.
    """
    record_items = (
        ("NAME_CONTRACT_TYPE", "Cash loans"),
        ("CODE_GENDER", "M"),
        ("FLAG_OWN_CAR", "Y"),
        ("FLAG_OWN_REALTY", "Y"),
        ("CNT_CHILDREN", 0),
        ("AMT_INCOME_TOTAL", 180000.0),
        ("AMT_CREDIT", 450000.0),
        ("AMT_ANNUITY", 25000.0),
        ("AMT_GOODS_PRICE", 400000.0),
        ("NAME_TYPE_SUITE", "Unaccompanied"),
        ("NAME_INCOME_TYPE", "Working"),
        ("NAME_EDUCATION_TYPE", "Secondary / secondary special"),
        ("NAME_FAMILY_STATUS", "Married"),
        ("NAME_HOUSING_TYPE", "House / apartment"),
        ("REGION_POPULATION_RELATIVE", 0.018),
        ("DAYS_BIRTH", -12000),
        ("DAYS_EMPLOYED", -2000),
        ("DAYS_REGISTRATION", -3000),
        ("DAYS_ID_PUBLISH", -2000),
        ("FLAG_EMP_PHONE", 1),
        ("FLAG_WORK_PHONE", 1),
        ("FLAG_PHONE", 1),
        ("FLAG_EMAIL", 1),
        ("OCCUPATION_TYPE", "Laborers"),
        ("CNT_FAM_MEMBERS", 2.0),
        ("REGION_RATING_CLIENT", 2),
        ("REGION_RATING_CLIENT_W_CITY", 2),
        ("WEEKDAY_APPR_PROCESS_START", "MONDAY"),
        ("HOUR_APPR_PROCESS_START", 10),
        ("REG_REGION_NOT_LIVE_REGION", 0),
        ("REG_REGION_NOT_WORK_REGION", 0),
        ("LIVE_REGION_NOT_WORK_REGION", 0),
        ("REG_CITY_NOT_LIVE_CITY", 0),
        ("REG_CITY_NOT_WORK_CITY", 0),
        ("LIVE_CITY_NOT_WORK_CITY", 0),
        ("ORGANIZATION_TYPE", "Business Entity Type 3"),
        ("FLOORSMAX_AVG", 0.5),
        ("LIVINGAREA_AVG", 0.4),
        ("YEARS_BEGINEXPLUATATION_MODE", 0.95),
        ("OBS_30_CNT_SOCIAL_CIRCLE", 2.0),
        ("DEF_30_CNT_SOCIAL_CIRCLE", 0.0),
        ("DAYS_LAST_PHONE_CHANGE", -1000.0),
        ("PREVIOUS_LOANS_COUNT", 3.0),
        ("CREDIT_INCOME_PERCENT", 2.5),
        ("ANNUITY_INCOME_PERCENT", 0.14),
        ("CREDIT_TERM", 18.0),
        ("DAYS_EMPLOYED_PERCENT", 0.17),
    )
    return dict(record_items)


def predict_with_reload(n_iter=100):
    """Benchmark the scenario where the model is reloaded for every request.

    Each timed iteration covers one ``joblib.load(MODEL_PATH)`` plus one
    ``predict`` call on a one-row DataFrame built from ``get_sample_data()``.

    Args:
        n_iter: Number of timed iterations.

    Returns:
        numpy array with the duration of each iteration, in milliseconds.
    """
    df = pd.DataFrame([get_sample_data()])
    times = []
    for _ in range(n_iter):
        # perf_counter is monotonic and high-resolution; time.time() can be
        # too coarse for millisecond latencies and may jump with clock changes.
        start = time.perf_counter()
        model = joblib.load(MODEL_PATH)
        model.predict(df)
        times.append((time.perf_counter() - start) * 1000)
    return np.array(times)


def predict_with_preload(n_iter=100):
    """Benchmark the scenario where the model is loaded once, up front.

    The model is loaded with ``joblib.load(MODEL_PATH)`` before timing starts;
    each timed iteration covers a single ``predict`` call on a one-row
    DataFrame built from ``get_sample_data()``.

    Args:
        n_iter: Number of timed iterations.

    Returns:
        numpy array with the duration of each iteration, in milliseconds.
    """
    model = joblib.load(MODEL_PATH)
    df = pd.DataFrame([get_sample_data()])
    times = []
    for _ in range(n_iter):
        # perf_counter is monotonic and high-resolution; time.time() can be
        # too coarse for millisecond latencies and may jump with clock changes.
        start = time.perf_counter()
        model.predict(df)
        times.append((time.perf_counter() - start) * 1000)
    return np.array(times)


def main():
    """Run both benchmarks and print mean / P95 latency plus the relative gain."""
    rule = "════════════════════════════════════════════════════════════"
    print(rule)
    print(" BENCHMARK DES OPTIMISATIONS API ")
    print(rule)

    n_iter = 100
    scenarios = (
        ("\n Test 1 : Chargement du modèle à chaque requête", predict_with_reload),
        ("\n Test 2 : Modèle préchargé en mémoire...", predict_with_preload),
    )

    mean_latencies = []
    for title, benchmark in scenarios:
        print(title)
        latencies = benchmark(n_iter)
        print(f"  • Moyenne : {latencies.mean():.3f} ms")
        print(f"  • P95     : {np.percentile(latencies, 95):.3f} ms")
        mean_latencies.append(latencies.mean())

    # Relative reduction of the mean latency, in percent of the reload case.
    reload_mean, preload_mean = mean_latencies
    gain = ((reload_mean - preload_mean) / reload_mean) * 100
    print("\n Gain de performance : {:.1f}%".format(gain))
    # NOTE(review): RESULTS_DIR is not defined in this cell; it comes from the
    # profiling code earlier in the file, and this benchmark writes no files.
    print(f"\nRésultats disponibles dans: {RESULTS_DIR}/")

    print("\n" + rule)
    print(" Benchmark terminé")
    print(rule)


# Only run the benchmark when executed as a script, not on import.
if __name__ == "__main__":
    main()


════════════════════════════════════════════════════════════
 BENCHMARK DES OPTIMISATIONS API 
════════════════════════════════════════════════════════════

 Test 1 : Chargement du modèle à chaque requête
  • Moyenne : 4.895 ms
  • P95     : 6.290 ms

 Test 2 : Modèle préchargé en mémoire...
  • Moyenne : 1.381 ms
  • P95     : 1.767 ms

 Gain de performance : 71.8%

Résultats disponibles dans: performance_results/

════════════════════════════════════════════════════════════
 Benchmark terminé
════════════════════════════════════════════════════════════
