In [23]:
import os
import pickle

import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from mlflow.tracking import MlflowClient
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score,
    make_scorer,
)
from sklearn.model_selection import GridSearchCV, StratifiedKFold

from sklearn.cluster import KMeans

from sklearn.metrics.pairwise import rbf_kernel, linear_kernel

from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

import pickle

# Data import and preparation

In [2]:
# Load the cleaned dataset from a local pickle file into `df`.
# NOTE: pickle.load can execute arbitrary code while unpickling, so only open
# files that come from a trusted source.
with open("../data/cleaned_data.pkl", "rb") as f:
    df = pickle.load(f)

In [6]:
# Chronological split: February-August 2017 for training, September-November 2017
# for testing.
# Half-open intervals [start, next_period_start) are used on purpose: a bound such
# as `<= "2017-08-31"` is compared as midnight of that day, so it would silently
# drop every transaction of the last day if DateTransaction carries a time of day.
# For date-only values both forms select exactly the same rows.
# NOTE(review): the dtype of DateTransaction is not visible here — confirm.
train_index = (df["DateTransaction"] >= "2017-02-01") & (
    df["DateTransaction"] < "2017-09-01"
)
test_index = (df["DateTransaction"] >= "2017-09-01") & (
    df["DateTransaction"] < "2017-12-01"
)

train = df[train_index]
test = df[test_index]

# The boolean masks are as long as df; free them once the subsets exist.
del train_index, test_index

# Variables to discard before modelling
to_discard = ["ZIBZIN", "IDAvisAutorisationCheque", "DateTransaction", "CodeDecision"]

In [7]:
# Remove the discarded columns from both subsets (test first, then train).
test, train = (subset.drop(columns=to_discard) for subset in (test, train))

# Separate the target "FlagImpaye" from the feature columns.
X_train, y_train = train.drop(columns=["FlagImpaye"]), train["FlagImpaye"]
X_test, y_test = test.drop(columns=["FlagImpaye"]), test["FlagImpaye"]

## Standardization

Standardization of the quantitative variables. The categorical variable "JourSemaine" remains unchanged; note that in the preview below it is a single integer-coded column, not a one-hot encoding.

In [12]:
# "JourSemaine" is passed through untouched; every other feature column is scaled.
# Index.difference returns the remaining labels in sorted order, which is the
# column order the scaled frames will have.
cols_to_keep = ["JourSemaine"]
cols_to_scale = X_train.columns.difference(cols_to_keep)

In [8]:
# Preview the training features (a bare last expression is rendered by the notebook).
X_train

Unnamed: 0,Montant,VerifianceCPT1,VerifianceCPT2,VerifianceCPT3,D2CB,ScoringFP1,ScoringFP2,ScoringFP3,TauxImpNb_RB,TauxImpNB_CPM,EcartNumCheq,NbrMagasin3J,DiffDateTr1,DiffDateTr2,DiffDateTr3,CA3TRetMtt,CA3TR,Heure,JourSemaine
0,20.00,0,0,0,551,0.000000,0.000000,0.000000,37.186668,52.076034,0,1,4.000000,4.0,4.0,20.00,0.00,27134,2
1,20.00,0,0,0,551,0.000000,0.000000,0.000000,48.844716,52.076034,1,2,1.797685,4.0,4.0,28.61,8.61,27817,2
2,57.64,0,0,0,549,0.000000,0.000000,0.000000,73.118280,52.076034,0,1,4.000000,4.0,4.0,57.64,0.00,28058,2
3,54.29,1,1,1,267,0.000000,0.000000,0.000000,110.056926,53.554234,0,1,4.000000,4.0,4.0,54.29,0.00,28128,2
4,26.90,0,0,0,549,0.003769,8.586333,0.001192,45.368313,52.076034,1,1,1.997106,4.0,4.0,59.15,32.25,29607,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3888464,137.90,0,0,0,451,0.008264,7.010333,0.001799,45.368313,52.076034,0,1,4.000000,4.0,4.0,137.90,0.00,77684,2
3888465,73.76,0,0,0,526,0.007380,-8.307757,0.001744,222.222222,52.076034,0,1,4.000000,4.0,4.0,73.76,0.00,78875,2
3888466,61.21,0,0,0,489,0.006588,-4.133844,0.001519,60.699193,52.076034,0,1,4.000000,4.0,4.0,61.21,0.00,78896,2
3888467,58.90,0,0,0,538,0.006082,17.139117,0.001282,45.368313,52.076034,1,1,2.040174,4.0,4.0,304.50,245.60,78917,2


In [10]:
# Single scaler instance: it is fitted on the training features and then reused,
# already fitted, to transform the test features.
scaler = StandardScaler()

In [15]:
# Fit the scaler on the selected training columns, wrap the scaled values back into
# a DataFrame with the original index and column labels, then re-concatenate the
# untouched columns to rebuild the full data matrix.
X_train_scaled_part = pd.DataFrame(
    scaler.fit_transform(X_train[cols_to_scale]),
    columns=cols_to_scale,
    index=X_train.index,
)
X_train_scaled = pd.concat(
    [X_train_scaled_part, X_train[cols_to_keep]],
    axis=1,
)

In [16]:
# Apply the already-fitted scaler to the test columns (transform only, no refit),
# then rebuild the full test matrix the same way as for the training set.
X_test_scaled_part = pd.DataFrame(
    scaler.transform(X_test[cols_to_scale]),
    columns=cols_to_scale,
    index=X_test.index,
)
X_test_scaled = pd.concat(
    [X_test_scaled_part, X_test[cols_to_keep]],
    axis=1,
)

# Landmarks approximation

## Functions preparation

In [17]:
def select_landmarks(X, n_landmarks, method="random", random_state=42):
    """
    Select landmark points from the dataset X.

    Parameters
    ----------
    X : array (n_samples, n_features)
        Data the landmarks are taken from.
    n_landmarks : int
        Number of landmarks to select.
    method : str
        'random' draws ``n_landmarks`` distinct rows of X;
        'kmeans' fits KMeans with ``n_landmarks`` clusters and uses the
        cluster centers as landmarks.
    random_state : int
        Seed used for the random draw or for the KMeans initialisation.

    Returns
    -------
    landmarks : array (n_landmarks, n_features)
        The selected landmark points.
    indices : array (n_landmarks,) or None
        Row positions of the landmarks in X for 'random'; None for 'kmeans',
        because cluster centers are generally not rows of X.

    Raises
    ------
    ValueError
        If ``method`` is neither 'random' nor 'kmeans'.
    """
    if method == "random":
        # Use a private generator instead of np.random.seed(): reseeding the global
        # NumPy RNG would silently change every later random draw in the notebook.
        # RandomState(seed) yields the same draw as the seeded global generator.
        rng = np.random.RandomState(random_state)
        indices = rng.choice(X.shape[0], n_landmarks, replace=False)
        return X[indices], indices

    elif method == "kmeans":
        kmeans = KMeans(n_clusters=n_landmarks, random_state=random_state, n_init=10)
        kmeans.fit(X)
        # Landmarks are the cluster centers
        return kmeans.cluster_centers_, None

    else:
        raise ValueError("Méthode inconnue. Utilisez 'random' ou 'kmeans'.")

In [27]:
def kernel_approximation(X, landmarks, kernel_type="gaussian", gamma=1.0):
    """
    Map X onto landmark-similarity features.

    Every row of X is replaced by the vector of its kernel similarities to
    each landmark.

    Parameters
    ----------
    X : array (n_samples, n_features)
        Points to project.
    landmarks : array (n_landmarks, n_features)
        Reference points the similarities are computed against.
    kernel_type : str
        'gaussian' (RBF kernel) or 'linear'.
    gamma : float
        Passed to the RBF kernel; not used by the linear kernel.

    Returns
    -------
    X_transformed : array (n_samples, n_landmarks)

    Raises
    ------
    ValueError
        If ``kernel_type`` is neither 'gaussian' nor 'linear'.
    """
    # Reject unsupported kernels up front.
    if kernel_type not in ("gaussian", "linear"):
        raise ValueError("kernel_type doit être 'gaussian' ou 'linear'")

    if kernel_type == "linear":
        return linear_kernel(X, landmarks)

    # Remaining case: Gaussian similarity between each point and each landmark.
    return rbf_kernel(X, landmarks, gamma=gamma)

## Landmarks approximation

## Selecting landmarks

In [None]:
# Number of landmarks.
# NOTE(review): the square-root rule, floor(sqrt(n_samples)), gives 1971 for the
# 3,888,468 training rows, but the value is hard-coded to 400 — presumably to keep
# the kernel matrix within memory; confirm which setting is intended.
n_landmarks = 400

np.int64(1971)

In [20]:
# KMeans cluster centers of the scaled training set serve as landmarks; only the
# first element of the returned (landmarks, indices) pair is kept.
landmarks = select_landmarks(
    X_train_scaled.to_numpy(),
    n_landmarks=n_landmarks,
    method="kmeans",
)[0]

In [24]:
# Save landmarks so they can be reloaded without recomputing them.
# NOTE(review): the file name says "2k" while n_landmarks is set to 400 in this
# notebook — confirm the name still matches the saved landmark set.
with open("../data/landmarks_svm_2k.pkl", "wb") as f:
    pickle.dump(landmarks, f)

## Compute Gaussian kernel approximation

In [28]:
# RBF bandwidth shared by the train and test projections.
gamma = 0.1

# Each projection is a dense (n_samples, n_landmarks) matrix, so memory grows with
# both dimensions: the MemoryError recorded below this cell comes from a
# (3888468, 1971) float64 array.
X_train_transformed = kernel_approximation(
    X_train_scaled.to_numpy(), landmarks, kernel_type="gaussian", gamma=gamma
)

X_test_transformed = kernel_approximation(
    X_test_scaled.to_numpy(), landmarks, kernel_type="gaussian", gamma=gamma
)

MemoryError: Unable to allocate 57.1 GiB for an array with shape (3888468, 1971) and data type float64

## Train the SVM model with precomputed kernel