In [33]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from scipy.stats import zscore
import scipy.stats as stats
from scipy.stats import chi2_contingency
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_curve, auc
from mlxtend.evaluate import permutation_test
import statsmodels.api as sm
import warnings
from category_encoders import TargetEncoder

# Ignorer les avertissements de dépassement de capacité et de division par zéro
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [34]:
df_app = pd.read_csv('./data/application_train_vf.csv')

In [35]:
df_app['date_mensuelle'] = pd.to_datetime(df_app['date_mensuelle'])

In [37]:
X = df_app.drop(columns = ['SK_ID_CURR'], axis=1).copy()
X = X[X['date_mensuelle']<'2020-01-01']
y = df_app[df_app['date_mensuelle']<'2020-01-01']['TARGET']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify = y)
X_train.drop(columns = 'TARGET',inplace =True)
X_test.drop(columns = 'TARGET',inplace =True)

In [38]:
df_app.drop(columns = ['date_mensuelle','Unnamed: 0','SK_ID_CURR','TARGET']).select_dtypes(include ='number').columns.to_list()

['CNT_CHILDREN',
 'AMT_INCOME_TOTAL',
 'AMT_CREDIT',
 'AMT_ANNUITY',
 'AMT_GOODS_PRICE',
 'REGION_POPULATION_RELATIVE',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_REGISTRATION',
 'DAYS_ID_PUBLISH',
 'OWN_CAR_AGE',
 'FLAG_MOBIL',
 'FLAG_EMP_PHONE',
 'FLAG_WORK_PHONE',
 'FLAG_CONT_MOBILE',
 'FLAG_PHONE',
 'FLAG_EMAIL',
 'CNT_FAM_MEMBERS',
 'REGION_RATING_CLIENT',
 'REGION_RATING_CLIENT_W_CITY',
 'HOUR_APPR_PROCESS_START',
 'REG_REGION_NOT_LIVE_REGION',
 'REG_REGION_NOT_WORK_REGION',
 'LIVE_REGION_NOT_WORK_REGION',
 'REG_CITY_NOT_LIVE_CITY',
 'REG_CITY_NOT_WORK_CITY',
 'LIVE_CITY_NOT_WORK_CITY',
 'EXT_SOURCE_1',
 'EXT_SOURCE_2',
 'EXT_SOURCE_3',
 'APARTMENTS_AVG',
 'BASEMENTAREA_AVG',
 'YEARS_BEGINEXPLUATATION_AVG',
 'YEARS_BUILD_AVG',
 'COMMONAREA_AVG',
 'ELEVATORS_AVG',
 'ENTRANCES_AVG',
 'FLOORSMAX_AVG',
 'FLOORSMIN_AVG',
 'LANDAREA_AVG',
 'LIVINGAPARTMENTS_AVG',
 'LIVINGAREA_AVG',
 'NONLIVINGAPARTMENTS_AVG',
 'NONLIVINGAREA_AVG',
 'APARTMENTS_MODE',
 'BASEMENTAREA_MODE',
 'YEARS_BEGINEXP

In [39]:

liste_gini = []

for col in df_app.drop(columns = ['date_mensuelle','Unnamed: 0','SK_ID_CURR','TARGET','FLAG_MOBIL']).select_dtypes(include ='number').columns : 
    # NaN
    moyenne = X_train[col].mean()
    X_train[col] = X_train[col].fillna(moyenne)
    X_test[col] = X_test[col].fillna(moyenne)
    # Constante
    X_train_scaled = X_train[col]
    X_test_scaled = X_test[col]
    X_train_scaled = sm.add_constant(X_train_scaled)
    X_test_scaled = sm.add_constant(X_test_scaled)
    
    # Fit 
    model = sm.Logit(y_train, X_train_scaled)
    result = model.fit_regularized(method='l1', disp=False)
    
    # Pred
    y_pred = result.predict(X_test_scaled)
    y_pred = (y_pred > 0.5).astype(int)
    
    y_prob = result.predict(X_test_scaled)
    
    # Calculer le taux de faux positifs (FPR), le taux de vrais positifs (TPR) et les seuils
    fpr, tpr, thresholds = roc_curve(y_test, y_prob)
    
    # Calculer l'aire sous la courbe ROC (AUC)
    roc_auc = auc(fpr, tpr)
    
    # Calculer le coefficient de Gini
    gini_coefficient = 2 * roc_auc - 1
    liste_gini.append(gini_coefficient)
    print(f'{col} gini : {gini_coefficient}')
df_selec_var_num = pd.DataFrame(liste_gini,index = df_app.drop(columns = ['date_mensuelle','Unnamed: 0','SK_ID_CURR','TARGET','FLAG_MOBIL']).select_dtypes(include ='number').columns, columns =['Gini'] )

CNT_CHILDREN gini : 0.03216619164155565
AMT_INCOME_TOTAL gini : 0.04213384721289759
AMT_CREDIT gini : 0.04107928355513213
AMT_ANNUITY gini : 0.00598573174288175
AMT_GOODS_PRICE gini : 0.06942864076560973
REGION_POPULATION_RELATIVE gini : 0.05953560374272504
DAYS_BIRTH gini : 0.15614267742882615
DAYS_EMPLOYED gini : -0.053450406972955444
DAYS_REGISTRATION gini : 0.08274027632448533
DAYS_ID_PUBLISH gini : 0.09341884774246134
OWN_CAR_AGE gini : 0.05660736343806838
FLAG_EMP_PHONE gini : 0.0577214498634655
FLAG_WORK_PHONE gini : 0.03612975627112758
FLAG_CONT_MOBILE gini : -0.00017725304652438378
FLAG_PHONE gini : 0.044516028673380115
FLAG_EMAIL gini : 0.00795671263833797
CNT_FAM_MEMBERS gini : 0.003484653084320666
REGION_RATING_CLIENT gini : 0.09586979337717971
REGION_RATING_CLIENT_W_CITY gini : 0.1002639857983958
HOUR_APPR_PROCESS_START gini : 0.05513527510663718
REG_REGION_NOT_LIVE_REGION gini : 0.0014036208315642007
REG_REGION_NOT_WORK_REGION gini : -0.0010641907410500195
LIVE_REGION_NOT

In [48]:
df_app[df_selec_var_num[df_selec_var_num['Gini']>0.05].index.to_list()].isna().sum()

AMT_GOODS_PRICE                    277
REGION_POPULATION_RELATIVE           0
DAYS_BIRTH                           0
DAYS_REGISTRATION                    0
DAYS_ID_PUBLISH                      0
OWN_CAR_AGE                     201962
FLAG_EMP_PHONE                       0
REGION_RATING_CLIENT                 0
REGION_RATING_CLIENT_W_CITY          0
HOUR_APPR_PROCESS_START              0
REG_CITY_NOT_LIVE_CITY               0
REG_CITY_NOT_WORK_CITY               0
EXT_SOURCE_1                    172564
EXT_SOURCE_2                       656
EXT_SOURCE_3                     60640
YEARS_BEGINEXPLUATATION_AVG     149238
YEARS_BEGINEXPLUATATION_MODE    149238
YEARS_BEGINEXPLUATATION_MEDI    149238
DAYS_LAST_PHONE_CHANGE               1
FLAG_DOCUMENT_3                      0
dtype: int64

In [41]:

liste_gini = []

for col in df_app.drop(columns = ['date_mensuelle','Unnamed: 0','SK_ID_CURR','TARGET','FLAG_MOBIL']).select_dtypes(include ='object').columns : 
    # NaN
    mode = X_train[col].mode()
    X_train[col] = X_train[col].fillna(mode)
    X_test[col] = X_test[col].fillna(mode)
    
    # Encoding
    target_encoder = TargetEncoder()
    target_encoder.fit(X_train[col],y_train)
    X_train[col] = target_encoder.transform(X_train[col])
    X_test[col] = target_encoder.transform(X_test[col])
    
    # Constante
    X_train_scaled = X_train[col]
    X_test_scaled = X_test[col]
    X_train_scaled = sm.add_constant(X_train_scaled)
    X_test_scaled = sm.add_constant(X_test_scaled)
    
    # Fit 
    model = sm.Logit(y_train, X_train_scaled)
    result = model.fit_regularized(method='l1', disp=False)
    
    # Pred
    y_pred = result.predict(X_test_scaled)
    y_pred = (y_pred > 0.5).astype(int)
    
    y_prob = result.predict(X_test_scaled)
    
    # Calculer le taux de faux positifs (FPR), le taux de vrais positifs (TPR) et les seuils
    fpr, tpr, thresholds = roc_curve(y_test, y_prob)
    
    # Calculer l'aire sous la courbe ROC (AUC)
    roc_auc = auc(fpr, tpr)
    
    # Calculer le coefficient de Gini
    gini_coefficient = 2 * roc_auc - 1
    liste_gini.append(gini_coefficient)
    print(f'{col} gini : {gini_coefficient}')
df_selec_var_cat = pd.DataFrame(liste_gini,index = df_app.drop(columns = ['date_mensuelle','Unnamed: 0','SK_ID_CURR','TARGET','FLAG_MOBIL']).select_dtypes(include ='object').columns, columns =['Gini'] )

NAME_CONTRACT_TYPE gini : 0.03136489394830777
CODE_GENDER gini : 0.08583451088335092
FLAG_OWN_CAR gini : 0.04389878775885969
FLAG_OWN_REALTY gini : -0.018553179424603883
NAME_TYPE_SUITE gini : 0.016179542185958073
NAME_INCOME_TYPE gini : 0.11789442882124268
NAME_EDUCATION_TYPE gini : 0.09234891635236497
NAME_FAMILY_STATUS gini : 0.08034609933033665
NAME_HOUSING_TYPE gini : 0.032453535601962846
OCCUPATION_TYPE gini : 0.1510958211733926
WEEKDAY_APPR_PROCESS_START gini : -0.022032665430325782
ORGANIZATION_TYPE gini : 0.1456220603896501
FONDKAPREMONT_MODE gini : 0.051587361157631406
HOUSETYPE_MODE gini : 0.08105337102680088
WALLSMATERIAL_MODE gini : 0.08749990234468674
EMERGENCYSTATE_MODE gini : 0.08847882887199354


In [46]:
liste = df_selec_var_cat[df_selec_var_cat['Gini']>0.08].index.to_list()

In [47]:
df_app[liste].isna().sum()

CODE_GENDER                 0
NAME_INCOME_TYPE            0
NAME_EDUCATION_TYPE         0
NAME_FAMILY_STATUS          0
OCCUPATION_TYPE         96049
ORGANIZATION_TYPE           0
HOUSETYPE_MODE         153493
WALLSMATERIAL_MODE     155543
EMERGENCYSTATE_MODE    144998
dtype: int64