In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

df = pd.read_parquet("../bpi_2017_cleaned.parquet")

# Integer-encode the categorical columns. The encoders are fitted on the whole
# log so that every label present in the file receives a code.
le_act = LabelEncoder()
le_type = LabelEncoder()

df['act_num'] = le_act.fit_transform(df['concept:name'])
df['is_new_credit'] = le_type.fit_transform(df['case:ApplicationType'])
df['weekday'] = df['time:timestamp'].dt.weekday
# Position of each event inside its case, following the current row order of df.
# NOTE(review): assumes rows are already in timestamp order within each case — confirm.
df['event_index'] = df.groupby('case:concept:name').cumcount()

# log1p compresses the range of the raw time deltas before scaling.
df['time_log'] = np.log1p(df['time_delta'])

# Chronological 80/20 split at case level: cases are ordered by the timestamp of
# their 'A_Create Application' event; the earliest 80% form the training set.
case_starts = (
    df[df['concept:name'] == 'A_Create Application']
    .sort_values('time:timestamp')
)
# drop_duplicates guards against a case appearing twice (and thus possibly in
# both splits) if it ever contains more than one creation event.
unique_cases = case_starts['case:concept:name'].drop_duplicates().values

split_idx = int(len(unique_cases) * 0.8)
train_cases = unique_cases[:split_idx]
test_cases = unique_cases[split_idx:]

# Fit the scalers on training rows only, then apply them to the whole frame.
# Fitting on the full log would leak the test set's min/max into the features
# and into the time target. Test values may therefore fall slightly outside [0, 1].
train_rows = df['case:concept:name'].isin(train_cases)

scaler_time = MinMaxScaler()
scaler_time.fit(df.loc[train_rows, ['time_log']])
df['time_norm'] = scaler_time.transform(df[['time_log']]).ravel()

scaler_index = MinMaxScaler()
scaler_index.fit(df.loc[train_rows, ['event_index']])
df['event_index_norm'] = scaler_index.transform(df[['event_index']]).ravel()

def create_sequences_enriched(case_list, dataframe, window_size=5):
    """Build sliding-window training samples from an event log.

    For every case in ``case_list`` the events are ordered by
    ``time:timestamp``; each run of ``window_size`` consecutive events becomes
    one input sample and the event right after it provides the two targets.
    Cases with ``window_size`` events or fewer contribute nothing.

    Parameters
    ----------
    case_list : array-like
        Case identifiers (values of ``case:concept:name``) to include.
    dataframe : pandas.DataFrame
        Event log containing the columns ``case:concept:name``,
        ``time:timestamp``, ``act_num``, ``is_new_credit``, ``weekday``,
        ``event_index_norm`` and ``time_norm``.
    window_size : int, default 5
        Number of past events in each input sample.

    Returns
    -------
    X : numpy.ndarray, shape (n_samples, window_size, 4)
        Per-event features in the order
        ``[act_num, is_new_credit, weekday, event_index_norm]``.
    y_act : numpy.ndarray, shape (n_samples,)
        ``act_num`` of the event following each window.
    y_time : numpy.ndarray, shape (n_samples,)
        ``time_norm`` of the event following each window.

    When no case is long enough, the arrays are empty but ``X`` keeps its
    three-dimensional shape ``(0, window_size, 4)``.
    """
    feature_cols = ['act_num', 'is_new_credit', 'weekday', 'event_index_norm']
    subset = dataframe[dataframe['case:concept:name'].isin(case_list)]

    X_parts, y_act_parts, y_time_parts = [], [], []
    offsets = np.arange(window_size)

    for _, group in subset.groupby('case:concept:name'):
        # Stable sort: events sharing a timestamp keep their order from the log
        # instead of being shuffled by the default unstable quicksort.
        group = group.sort_values('time:timestamp', kind='mergesort')

        n_events = len(group)
        if n_events <= window_size:
            continue

        features = np.column_stack([group[col].to_numpy() for col in feature_cols])

        # Row k of the index matrix is [k, k+1, ..., k+window_size-1], so the
        # fancy index yields every window at once: (n_events - window_size, window_size, 4).
        starts = np.arange(n_events - window_size)
        X_parts.append(features[starts[:, None] + offsets])

        # The target of window k is the event at position k + window_size.
        y_act_parts.append(group['act_num'].to_numpy()[window_size:])
        y_time_parts.append(group['time_norm'].to_numpy()[window_size:])

    if not X_parts:
        # Keep a well-formed 3-D shape so downstream code can rely on X.shape.
        return (
            np.empty((0, window_size, len(feature_cols))),
            np.empty((0,), dtype=np.int64),
            np.empty((0,)),
        )

    return (
        np.concatenate(X_parts),
        np.concatenate(y_act_parts),
        np.concatenate(y_time_parts),
    )

# Build the windowed samples for each split; each call returns (X, y_act, y_time).
print(" Génération des séquences enrichies...")
train_sequences = create_sequences_enriched(train_cases, df)
test_sequences = create_sequences_enriched(test_cases, df)

X_train, y_act_train, y_time_train = train_sequences
X_test, y_act_test, y_time_test = test_sequences

# Report completion and the shape of the training input tensor.
print(" Preprocessing terminé !")
print("Structure de X_train : {}".format(X_train.shape))

 Génération des séquences enrichies...
 Preprocessing terminé !
Structure de X_train : (835173, 5, 4)


In [2]:
import os

import numpy as np

# Output directory. Joining with "" appends a trailing separator, so both
# os.path.join(path, name) and plain `path + name` resolve to a file INSIDE the
# folder. Without it, `path + 'X_train.npy'` produced
# "...\Projet_Challenge_BPIX_train.npy" next to the folder instead of in it.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
path = os.path.join("C:\\Users\\ShinraS\\Desktop\\Projet_Challenge_BPI", "")
os.makedirs(path, exist_ok=True)

# Save the training arrays
np.save(os.path.join(path, 'X_train.npy'), X_train)
np.save(os.path.join(path, 'y_act_train.npy'), y_act_train)
np.save(os.path.join(path, 'y_time_train.npy'), y_time_train)

# Save the test arrays
np.save(os.path.join(path, 'X_test.npy'), X_test)
np.save(os.path.join(path, 'y_act_test.npy'), y_act_test)
np.save(os.path.join(path, 'y_time_test.npy'), y_time_test)

print("Toutes les matrices ont été sauvegardées")

Toutes les matrices ont été sauvegardées


In [3]:
import os

import joblib

# Persist the fitted activity encoder and time scaler.
# os.path.join inserts the directory separator that `path + 'le_act.joblib'`
# omitted (which wrote "...\Projet_Challenge_BPIle_act.joblib" next to the
# folder instead of inside it); it works whether or not `path` already ends
# with a separator.
os.makedirs(path, exist_ok=True)
joblib.dump(le_act, os.path.join(path, 'le_act.joblib'))
joblib.dump(scaler_time, os.path.join(path, 'scaler_time.joblib'))

print("Décodeur 'le_act' et 'scaler_time' sauvegardés !")

Décodeur 'le_act' et 'scaler_time' sauvegardés !
