# Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from scipy.stats import zscore
import scipy.stats as stats
from scipy.stats import chi2_contingency
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_curve, auc
from mlxtend.evaluate import permutation_test
import statsmodels.api as sm

In [2]:
data = pd.read_csv('./data/df_train.csv')

In [3]:
data.shape

(306107, 48)

# Preprocessing

In [4]:
data.drop(columns = 'Unnamed: 0',inplace = True)

In [5]:
data.isna().sum()/len(data)*100 

CNT_CHILDREN                    0.000000
AMT_INCOME_TOTAL                0.000000
AMT_CREDIT                      0.000000
REGION_POPULATION_RELATIVE      0.000000
DAYS_BIRTH                      0.000000
DAYS_EMPLOYED                   0.000000
DAYS_REGISTRATION               0.000000
DAYS_ID_PUBLISH                 0.000000
FLAG_EMP_PHONE                  0.000000
FLAG_WORK_PHONE                 0.000000
FLAG_PHONE                      0.000000
REGION_RATING_CLIENT            0.000000
REGION_RATING_CLIENT_W_CITY     0.000000
HOUR_APPR_PROCESS_START         0.000000
REG_REGION_NOT_LIVE_REGION      0.000000
REG_REGION_NOT_WORK_REGION      0.000000
REG_CITY_NOT_LIVE_CITY          0.000000
REG_CITY_NOT_WORK_CITY          0.000000
LIVE_CITY_NOT_WORK_CITY         0.000000
FLAG_DOCUMENT_2                 0.000000
FLAG_DOCUMENT_3                 0.000000
FLAG_DOCUMENT_6                 0.000000
FLAG_DOCUMENT_8                 0.000000
FLAG_DOCUMENT_9                 0.000000
FLAG_DOCUMENT_11

In [6]:
data.drop(columns = ['FONDKAPREMONT_MODE','WALLSMATERIAL_MODE','HOUSETYPE_MODE','EMERGENCYSTATE_MODE','OCCUPATION_TYPE'], inplace = True)

In [7]:
data.dropna(inplace = True)

# Encoding

In [8]:
for col in data.select_dtypes(include = 'object').columns:
    print(f"{col} : {data[col].nunique()}")

NAME_CONTRACT_TYPE : 2
CODE_GENDER : 3
FLAG_OWN_CAR : 2
FLAG_OWN_REALTY : 2
NAME_TYPE_SUITE : 7
NAME_INCOME_TYPE : 8
NAME_EDUCATION_TYPE : 5
NAME_FAMILY_STATUS : 5
NAME_HOUSING_TYPE : 6
WEEKDAY_APPR_PROCESS_START : 7
ORGANIZATION_TYPE : 58


### Impact encoding

Pour les variables qu'on ne peut pas OHE en raison de leur nombre de modalités

In [9]:
impact_dicts = {}
categorical_columns = data.select_dtypes(include = 'object')

for categorical_feature in data['ORGANIZATION_TYPE'] :#.select_dtypes(include = 'object').columns :#

    category_means = data.groupby(categorical_feature)['TARGET'].mean()
    category_impacts = category_means - category_means.mean()

    impact_dicts[categorical_feature] = category_impacts.to_dict()

    data['encoded_' + categorical_feature] = data[categorical_feature].map(impact_dicts[categorical_feature])
    data.drop(columns= categorical_feature, inplace=True)   

KeyError: 'Transport: type 3'

### OHE

In [None]:
categorical_columns = data.select_dtypes(include='object').columns

data = pd.get_dummies(data, columns=categorical_columns, dtype='int',drop_first=True)

# Train test split

In [None]:
X = data.drop('TARGET', axis=1)
y = data['TARGET']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=889)

# LOGIT

In [None]:
import statsmodels.api as sm

X_train_scaled = sm.add_constant(X_train)

model = sm.Logit(y_train, X_train)
result = model.fit_regularized(method='l1')

# Affichage des résultats
print(result.summary())


# Résultats

In [None]:
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

X_test_scaled = sm.add_constant(X_test)


y_prob = result.predict(X_test)


fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (AUC = {:.2f})'.format(roc_auc))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

# Feature importance

In [None]:
coefficients = result.params
pvalues = result.pvalues

var_signif = [variable for variable, p_value in zip(X.columns, pvalues) if p_value < 0.05]

# Plot des coefficients avec des couleurs différentes en fonction des p-values (transposé)
colors = ['red' if p > 0.05 else 'green' for p in pvalues]

plt.figure(figsize=(15, 15))
plt.barh(range(len(coefficients)), coefficients, color=colors)
plt.yticks(range(len(coefficients)), X.columns)
plt.xlabel('Coefficients')
plt.title('Logistic Regression Coefficients with P-values (Transposed)')
plt.show()

In [None]:
var_signif

In [None]:
data.shape

# Deuxième modèle avec uniquement les var correlées à + de 5% avec TARGET

In [None]:
X_bis = data[var_signif]

X_train, X_test, y_train, y_test = train_test_split(X_bis, y, test_size=0.2, random_state=889)


X_train_scaled = sm.add_constant(X_train)

model = sm.Logit(y_train, X_train)
result = model.fit_regularized(method='l1')

In [None]:
print(result.summary())

In [None]:
#X_test_cst = sm.add_constant(X_test)


y_prob = result.predict(X_test)


fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (AUC = {:.2f})'.format(roc_auc))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

In [None]:
coefficients = result.params
pvalues = result.pvalues

var_signif = [variable for variable, p_value in zip(X.columns, pvalues) if p_value < 0.05]

# Plot des coefficients avec des couleurs différentes en fonction des p-values (transposé)
colors = ['red' if p > 0.05 else 'green' for p in pvalues]

plt.figure(figsize=(15, 15))
plt.barh(range(len(coefficients)), coefficients, color=colors)
plt.yticks(range(len(coefficients)), X.columns)
plt.xlabel('Coefficients')
plt.title('Logistic Regression Coefficients with P-values (Transposed)')
plt.show()