# Training data treatment for drift analysis

In [1]:
import sys
import os

# Resolve the parent of the current working directory, which is the project root
# when this notebook lives in a sub-folder (use os.getcwd() alone if the notebook
# sits at the project root).
root_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
# Register the root once so project-local packages can be imported; the guard
# keeps re-runs of this cell from appending duplicates.
if sys.path.count(root_path) == 0:
    sys.path.append(root_path)

import pandas as pd

from config.config import BASE_DIR



## Récupération des données entrées P6

In [2]:
# Load the master dataset from the project's external-data folder.
master_dataset_path = BASE_DIR / "data" / "external" / "kaggle_master_dataset.parquet"
df = pd.read_parquet(master_dataset_path)

In [3]:
# Sanity check: display the (rows, columns) dimensions of the loaded frame.
df.shape

(356193, 581)

In [4]:
# Draw the reference ("normal") sample and the sample that will be drifted later.
# Each draw uses its own fixed seed, so both are reproducible.
# NOTE(review): the two draws are independent, so a row may appear in both samples.
df_normal = df.sample(n=1000, random_state=42)
df_drifted = df.sample(n=300, random_state=43)

## Récupération de la signature du modèle déployé

In [5]:
import requests

# Ask the locally running model service which input columns it expects.
try:
    # The timeout keeps this cell from hanging indefinitely when the service is down.
    response = requests.get("http://localhost:8000/model_signature", timeout=10)
    # Turn HTTP error statuses into exceptions instead of parsing an error body.
    response.raise_for_status()
    data = response.json()
    columns = data['columns']
except Exception as e:
    print(f"error : {e}")
    # Re-raise: later cells read `columns`, so continuing here would either fail
    # with a NameError or silently reuse a stale value from a previous run.
    raise
print(f"Retrieved {len(columns)} columns")

Retrieved 15 columns


In [6]:
# Extract the 'name' entry of every column descriptor returned by the signature
# endpoint, preserving the order of the response.
feature_names = [column_spec['name'] for column_spec in columns]
feature_names

['FE_EXT_SOURCE_MEAN',
 'BURO_MONTHS_BALANCE_SIZE_MEAN',
 'CODE_GENDER',
 'INSTAL_DPD_MEAN',
 'BURO_MONTHS_BALANCE_MAX_MIN',
 'FE_GOODS_CREDIT_RATE',
 'APPROVED_CNT_PAYMENT_MEAN',
 'YEARS_BIRTH',
 'YEARS_EMPLOYED',
 'AMT_ANNUITY',
 'NAME_FAMILY_STATUS_Married',
 'INSTAL_AMT_PAYMENT_SUM',
 'FE_EXT_SOURCE_MIN',
 'PREV_CNT_PAYMENT_MEAN',
 'FE_EXT_SOURCE_MAX']

# Création du df Normal et Drifted

In [7]:
# Keep only the columns listed in the model signature, in signature order.
# .copy() makes each result an independent frame, so assigning to a column
# later on does not trigger pandas' SettingWithCopyWarning.
df_normal = df_normal[feature_names].copy()
df_drifted = df_drifted[feature_names].copy()

In [8]:
# Preview the first rows of the reference sample after the column selection.
df_normal.head()

Unnamed: 0,FE_EXT_SOURCE_MEAN,BURO_MONTHS_BALANCE_SIZE_MEAN,CODE_GENDER,INSTAL_DPD_MEAN,BURO_MONTHS_BALANCE_MAX_MIN,FE_GOODS_CREDIT_RATE,APPROVED_CNT_PAYMENT_MEAN,YEARS_BIRTH,YEARS_EMPLOYED,AMT_ANNUITY,NAME_FAMILY_STATUS_Married,INSTAL_AMT_PAYMENT_SUM,FE_EXT_SOURCE_MIN,PREV_CNT_PAYMENT_MEAN,FE_EXT_SOURCE_MAX
83482,0.386921,0.0,1,0.0,0.0,1.0,18.0,50,0,6421.5,False,104419.08,0.199771,18.0,0.57407
118211,0.298974,24.0,1,0.0,0.0,1.2376,12.0,23,2,8748.0,False,104536.62,0.199665,12.0,0.366006
253872,0.276205,0.0,1,0.25,0.0,1.198,12.0,40,8,20488.5,True,53162.235,0.004123,12.0,0.548286
242553,0.048021,0.0,0,0.0,0.0,1.0,12.0,22,0,42853.5,False,53292.735,0.048021,12.0,0.048021
118923,0.721261,0.0,1,1.169231,0.0,1.043562,5.333333,54,4,63463.5,True,1256113.035,0.684128,5.333333,0.758393


In [9]:
# Preview the first rows of the second sample before any drift is applied.
df_drifted.head()

Unnamed: 0,FE_EXT_SOURCE_MEAN,BURO_MONTHS_BALANCE_SIZE_MEAN,CODE_GENDER,INSTAL_DPD_MEAN,BURO_MONTHS_BALANCE_MAX_MIN,FE_GOODS_CREDIT_RATE,APPROVED_CNT_PAYMENT_MEAN,YEARS_BIRTH,YEARS_EMPLOYED,AMT_ANNUITY,NAME_FAMILY_STATUS_Married,INSTAL_AMT_PAYMENT_SUM,FE_EXT_SOURCE_MIN,PREV_CNT_PAYMENT_MEAN,FE_EXT_SOURCE_MAX
341678,0.452871,36.0,1,0.041096,0.0,1.0,10.75,28,0,7560.0,False,725286.69,0.157863,10.75,0.692559
348293,0.635872,30.666667,1,0.0,0.0,1.2574,10.0,35,8,26298.0,False,461560.68,0.540206,10.0,0.741107
210104,0.44694,0.0,1,0.0,0.0,1.2112,24.0,37,7,25407.0,True,98790.12,0.21098,24.0,0.652897
291101,0.37193,0.0,1,0.0,0.0,1.118802,15.0,31,0,24592.5,False,436123.935,0.220095,15.0,0.523765
353140,0.627556,17.444444,1,0.0,0.0,1.0,17.0,61,0,6673.5,True,137248.785,0.410103,17.0,0.897654


In [10]:
# Simulate data drift on a single feature by scaling the annuity amount.
ANNUITY_DRIFT_FACTOR = 2
# The drifted values are derived from the untouched source frame `df`, so
# re-running this cell never compounds the factor. Only the sampled rows are
# read, and .assign returns a new frame instead of mutating a derived one
# in place (which is what raises SettingWithCopyWarning).
df_drifted = df_drifted.assign(
    AMT_ANNUITY=df.loc[df_drifted.index, "AMT_ANNUITY"] * ANNUITY_DRIFT_FACTOR
)

In [11]:
# Preview again to check the effect of the AMT_ANNUITY change made in the previous cell.
df_drifted.head()

Unnamed: 0,FE_EXT_SOURCE_MEAN,BURO_MONTHS_BALANCE_SIZE_MEAN,CODE_GENDER,INSTAL_DPD_MEAN,BURO_MONTHS_BALANCE_MAX_MIN,FE_GOODS_CREDIT_RATE,APPROVED_CNT_PAYMENT_MEAN,YEARS_BIRTH,YEARS_EMPLOYED,AMT_ANNUITY,NAME_FAMILY_STATUS_Married,INSTAL_AMT_PAYMENT_SUM,FE_EXT_SOURCE_MIN,PREV_CNT_PAYMENT_MEAN,FE_EXT_SOURCE_MAX
341678,0.452871,36.0,1,0.041096,0.0,1.0,10.75,28,0,15120.0,False,725286.69,0.157863,10.75,0.692559
348293,0.635872,30.666667,1,0.0,0.0,1.2574,10.0,35,8,52596.0,False,461560.68,0.540206,10.0,0.741107
210104,0.44694,0.0,1,0.0,0.0,1.2112,24.0,37,7,50814.0,True,98790.12,0.21098,24.0,0.652897
291101,0.37193,0.0,1,0.0,0.0,1.118802,15.0,31,0,49185.0,False,436123.935,0.220095,15.0,0.523765
353140,0.627556,17.444444,1,0.0,0.0,1.0,17.0,61,0,13347.0,True,137248.785,0.410103,17.0,0.897654


## Enregistrement des df en CSV

In [12]:
# Export the reference sample as CSV (the row index is written as the first column).
# The target folder is created first so the export also works on a fresh checkout
# where data/processed does not exist yet.
processed_dir = BASE_DIR / "data" / 'processed'
os.makedirs(processed_dir, exist_ok=True)
df_normal.to_csv(processed_dir / "normal_sample.csv")

In [13]:
# Export the drifted sample as CSV (the row index is written as the first column).
# The target folder is created first so the export also works on a fresh checkout
# where data/processed does not exist yet.
processed_dir = BASE_DIR / "data" / 'processed'
os.makedirs(processed_dir, exist_ok=True)
df_drifted.to_csv(processed_dir / "drifted_sample.csv")

## Appels API sur les lignes du normal puis du drifted

In [14]:
import time
import random
import requests

def _row_to_payload(row):
    """Turn one DataFrame row into a dict that can be serialised as JSON.

    Missing values (NaN/NA) become ``None`` so they are sent as JSON ``null``;
    a bare NaN is not valid JSON. NumPy scalars are unwrapped to native Python
    values because the standard JSON encoder rejects some of them.
    """
    payload = {}
    for key, value in row.items():
        is_scalar = pd.api.types.is_scalar(value)
        if is_scalar and pd.isna(value):
            payload[key] = None
        elif is_scalar and hasattr(value, "item"):
            payload[key] = value.item()
        else:
            payload[key] = value
    return payload


def api_calls(dataframe, limit=5, url="http://localhost:8000/individual_score", timeout=10):
    """
    Send one POST request per row to the scoring API and print each outcome.

    Args:
        dataframe: frame whose first ``limit`` rows are sent, one request per row.
        limit: maximum number of rows to send.
        url: scoring endpoint; defaults to the local service address.
        timeout: seconds to wait for each response before giving up on that row.

    Returns:
        None. Results and errors are printed; a failure on one row does not stop
        the remaining rows.
    """
    print(f"üöÄ D√©marrage des appels API pour {limit} lignes...")

    for index, row in dataframe.head(limit).iterrows():
        # Build the JSON body from the row (NaN -> null, NumPy scalars -> native).
        payload = _row_to_payload(row)

        # Short random pause between requests to imitate real traffic.
        time.sleep(random.uniform(0.5, 2.0))

        try:
            # The scoring endpoint is called with POST and a JSON body.
            response = requests.post(url, json=payload, timeout=timeout)

            if response.status_code == 200:
                res_data = response.json()
                # The API returns the value under 'score' (not 'probability').
                score = res_data.get('score')
                decision = res_data.get('decision')
                # Format only real numbers: a missing score must not be reported
                # as a failed call through a formatting TypeError.
                score_text = f"{score:.3f}" if isinstance(score, (int, float)) else str(score)
                print(f"‚úÖ Index {index} : Score={score_text}, D√©cision={decision}")
            else:
                print(f"‚ùå Index {index} : Erreur {response.status_code} - {response.text}")

        except Exception as e:
            # Best-effort loop: report the failure and move on to the next row.
            print(f"‚ö†Ô∏è Index {index} : Erreur lors de l'appel : {e}")

# On peut maintenant tester la fonction
# api_calls(df_normal, limit=1000)

In [15]:
api_calls(df_drifted, limit=300)

üöÄ D√©marrage des appels API pour 300 lignes...
‚úÖ Index 341678 : Score=0.141, D√©cision=Accord√©
‚úÖ Index 348293 : Score=0.112, D√©cision=Accord√©
‚úÖ Index 210104 : Score=0.600, D√©cision=Accord√©
‚úÖ Index 291101 : Score=0.545, D√©cision=Accord√©
‚úÖ Index 353140 : Score=0.076, D√©cision=Accord√©
‚úÖ Index 173167 : Score=0.310, D√©cision=Accord√©
‚úÖ Index 248103 : Score=0.208, D√©cision=Accord√©
‚úÖ Index 250746 : Score=0.213, D√©cision=Accord√©
‚úÖ Index 243362 : Score=0.453, D√©cision=Accord√©
‚úÖ Index 123988 : Score=0.408, D√©cision=Accord√©
‚úÖ Index 311681 : Score=0.110, D√©cision=Accord√©
‚úÖ Index 154365 : Score=0.189, D√©cision=Accord√©
‚úÖ Index 325149 : Score=0.135, D√©cision=Accord√©
‚úÖ Index 102804 : Score=0.670, D√©cision=Accord√©
‚úÖ Index 40868 : Score=0.194, D√©cision=Accord√©
‚úÖ Index 189223 : Score=0.188, D√©cision=Accord√©
‚úÖ Index 247288 : Score=0.209, D√©cision=Accord√©
‚úÖ Index 183855 : Score=0.088, D√©cision=Accord√©
‚úÖ Index 232734 : Score=0.510, D