In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import scorecardpy as sc
from feature_engine.encoding import WoEEncoder

In [2]:
# Download the development (training) and testing samples, in that order.
train_data, test_data = (
    pd.read_csv(f'https://files.challengerocket.com/files/lions-den-ing-2024/{sample}_sample.csv')
    for sample in ('development', 'testing')
)

In [3]:
# Raw column names of the samples, grouped by the kind of variable they hold.
# Each list is a plain list of column labels; later cells edit them in place.

# Columns treated as discrete (the two identifier columns are listed here too).
discrete_variables = [
    'ID', 'customer_id',
    'Var1', 'Var15', 'Var16',
    'Var20', 'Var21', 'Var22', 'Var23',
    'Var29', 'Var4', 'Var5', 'Var9',
    'Var24', 'Var30', 'Var6',
]

# Columns treated as continuous.
continuous_variables = ['Var7', 'Var8', 'Var10', 'Var17', 'Var25', 'Var26', '_r_']

# Two-valued columns, including the modelling target.
binary_variables = ['target', 'Application_status', 'Var18', 'Var19', 'Var27', 'Var28']

# Unordered categorical columns.
categorical_nominal_variables = ['Var2', 'Var3', 'Var11', 'Var12', 'Var14']

# Date/time columns.
datetime_variables = ['application_date', 'Var13']


In [4]:
# Workbook that describes the variables.
names_xlsx = pd.read_excel('./variables_description.xlsx')
# Column-rename dictionary: value of 'Column' -> value of 'Description',
# both rendered as strings. The first five rows of the sheet are left out.
names = {
    f"{column_code}": f"{description}"
    for column_code, description in zip(
        names_xlsx['Column'].iloc[5:], names_xlsx['Description'].iloc[5:]
    )
}
"""
training_data = training_data.rename(columns=names)
test_data = test_data.rename(columns = names)
"""

'\ntraining_data = training_data.rename(columns=names)\ntest_data = test_data.rename(columns = names)\n'

In [5]:
# Collect the per-type lists so they can all be relabelled in one pass.
subtypes_list = [
    discrete_variables,
    continuous_variables,
    binary_variables,
    categorical_nominal_variables,
    datetime_variables,
]

# Replace each raw column code by its description from `names`; entries without
# a description stay as they are. Slice assignment edits the existing list
# objects, so `discrete_variables` & co. see the new labels as well.
for subtype in subtypes_list:
    subtype[:] = [names.get(variable, variable) for variable in subtype]

In [6]:
# Show the discrete-variable list to check the relabelling done by the previous cell.
discrete_variables

['ID',
 'customer_id',
 'Number of applicants',
 'Application data: number of children of main applicant',
 'Application data: number of dependences of main applicant',
 'Number of requests during the last 3 months (External data)',
 'Number of requests during the last 6 months (External data)',
 'Number of requests during the last 9 months (External data)',
 'Number of requests during the last 12 months (External data)',
 'Credit bureau score (Exterval data)',
 'Application amount',
 'Credit duration (months)',
 'Application data: income of main applicant',
 'Limit on credit card',
 'Average income (Exterval data)',
 'Payment frequency']

In [7]:
# Rename the dataset columns and trim both samples the same way.
# NOTE: this cell overwrites `train_data` / `test_data`; running it a second
# time fails because "ID" has already been moved into the index.
variables_to_drop = ["customer_id", 'Application_status'] + datetime_variables


def _prepare_sample(raw_sample):
    """Return `raw_sample` relabelled, indexed by ID, labelled rows only, unused columns removed.

    Steps, in order: rename columns with `names`, move "ID" into the index,
    drop rows whose 'target' is missing, drop every column listed in
    `variables_to_drop`.
    """
    relabelled = raw_sample.rename(columns=names)
    indexed = relabelled.set_index("ID")
    with_target = indexed.dropna(subset=['target'])
    return with_target.drop(variables_to_drop, axis=1)


train_data = _prepare_sample(train_data)
test_data = _prepare_sample(test_data)

In [8]:
# List the columns that remain in the training frame after renaming and dropping.
train_data.columns

Index(['target', 'Number of applicants', 'Loan purpose',
       'Distribution channel', 'Application amount',
       'Credit duration (months)', 'Payment frequency', 'Installment amount',
       'Value of the goods (car)',
       'Application data: income of main applicant',
       'Application data: income of second applicant',
       'Application data: profession of main applicant',
       'Application data: profession of second applicant',
       'Application data: marital status of main applicant',
       'Application data: number of children of main applicant',
       'Application data: number of dependences of main applicant',
       'Spendings estimation', 'Property ownership for property renovation',
       'Clasification of the vehicle (Car, Motorbike)',
       'Number of requests during the last 3 months (External data)',
       'Number of requests during the last 6 months (External data)',
       'Number of requests during the last 9 months (External data)',
       'Number o

In [9]:
# Count how many training rows fall into each target class.
train_data['target'].value_counts()

target
0.0    35591
1.0     1127
Name: count, dtype: int64

In [10]:
# Column names meant to be skipped.
# NOTE(review): no other visible cell reads this list, and both columns were
# already dropped from the frames above — presumably intended for woebin's
# `var_skip` argument; confirm or remove.
variables_to_skip=["customer_id", "application_date"]

In [11]:
# Interactive help lookup for sc.woebin (IPython `?` syntax); exploratory only.
sc.woebin?

[0;31mSignature:[0m
[0msc[0m[0;34m.[0m[0mwoebin[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mdt[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0my[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mx[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mvar_skip[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbreaks_list[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mspecial_values[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mstop_limit[0m[0;34m=[0m[0;36m0.1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcount_distr_limit[0m[0;34m=[0m[0;36m0.05[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbin_num_limit[0m[0;34m=[0m[0;36m8[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpositive[0m[0;34m=[0m[0;34m'bad|1'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mno_cores[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mprint_step[0m[0;34m=[0m[0;36m0[0m[0;34m,

In [12]:
# Class counts of the target once more (repeats the earlier value_counts check).
train_data.target.value_counts()

target
0.0    35591
1.0     1127
Name: count, dtype: int64

In [13]:
# Inspect the "creditability" column of the frame returned by sc.germancredit()
# to see how its labels are spelled.
sc.germancredit()["creditability"]

0      good
1       bad
2      good
3      good
4       bad
       ... 
995    good
996    good
997    good
998     bad
999    good
Name: creditability, Length: 1000, dtype: object

In [14]:
# Recode the numeric target (1 / 0) into the string labels 'bad' / 'good'.
# `replace` is used instead of `map` so the cell is idempotent: values that are
# already 'bad' / 'good' match no key and are kept, whereas `map` would turn
# every unmatched value into NaN when the cell is run a second time.
train_data['target'] = train_data['target'].replace({1: 'bad', 0: 'good'})

In [15]:
# WoE binning of the training frame against the target (predictor selection is
# left at woebin's defaults).
bins = sc.woebin(train_data, y="target")
# Apply the bins, strip the '_woe' suffix from the column names, and store the
# target as 1 ('bad') / 0 ('good').
train_woe_data = (
    sc.woebin_ply(train_data, bins)
    .rename(columns=lambda column: column.removesuffix('_woe'))
    .assign(target=train_data['target'].map({'bad': 1, 'good': 0}))
)

[INFO] creating woe binning ...


  datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()
  datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()
  datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()
  init_bin = dtm.groupby('bin', group_keys=False)['y'].agg([n0, n1])\
  init_bin = dtm.groupby('bin', group_keys=False)['y'].agg([n0, n1])\
  .stack().replace('missing', np.nan) \
  init_bin = dtm.groupby('bin', group_keys=False)['y'].agg([n0, n1])\
  init_bin = dtm.groupby('bin', group_keys=False)['y'].agg([n0, n1])\
  init_bin = dtm.groupby('bin', group_keys=False)['y'].agg([n0, n1])\
  ).groupby(['variable', 'rowid', 'bin_chr'], group_keys=False).agg({'bad':sum,'good':sum})\
  ).groupby(['variable', 

[INFO] converting into woe values ...


In [16]:
# Display the WoE-encoded training frame.
train_woe_data

Unnamed: 0_level_0,target,Average income (Exterval data),Number of requests during the last 3 months (External data),Number of requests during the last 12 months (External data),Distribution channel,Application data: marital status of main applicant,Credit bureau score (Exterval data),Credit duration (months),Number of requests during the last 6 months (External data),Application data: number of dependences of main applicant,...,Installment amount,Amount on current account,Property ownership for property renovation,Loan purpose,_r_,Payment frequency,Value of the goods (car),Application data: income of main applicant,Spendings estimation,Application amount
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11034977,0,-0.030511,-0.044806,-0.011903,-0.104102,-0.288743,-0.378480,-0.230699,-0.007991,0.079092,...,-0.050065,0.237729,-0.258372,-0.260049,-0.044592,-0.006905,0.081302,-0.079888,-0.078735,0.303246
11034978,0,-0.030511,-0.032632,-0.140003,0.001819,0.288064,-0.378480,-0.230699,-0.073105,0.000838,...,-0.050065,0.237729,0.078159,-0.103599,-0.258946,-0.006905,0.027256,1.087539,-0.259186,-0.177443
11034979,0,-0.030511,-0.044806,-0.011903,-0.104102,-0.288743,0.173599,0.050616,-0.007991,0.079092,...,-0.050065,0.237729,0.078159,0.313798,0.012624,-0.006905,0.081302,-0.079888,-0.078735,0.303246
11034980,0,-0.030511,-0.032632,-0.140003,0.001819,-0.288743,0.173599,-0.118368,-0.073105,0.079092,...,0.109400,0.237729,0.078159,-0.103599,-0.258946,0.027242,0.027256,-0.079888,-0.078735,-0.177443
11034982,0,-0.030511,-0.032632,-0.011903,-0.104102,0.066858,0.173599,-0.118368,-0.007991,-0.055207,...,-0.050065,0.237729,-0.258372,-0.260049,0.361491,-0.006905,0.081302,1.087539,-0.259186,0.303246
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11084969,0,0.048583,-0.044806,-0.011903,0.322787,0.066858,-0.378480,0.258912,0.157178,0.079092,...,-0.203619,-0.042444,0.078159,0.313798,0.012624,0.027242,0.081302,-0.186832,0.039338,0.303246
11084970,0,0.048583,-0.032632,-0.140003,0.001819,-0.288743,-0.378480,0.050616,-0.073105,-0.055207,...,-0.203619,-0.052411,-0.258372,-0.260049,-0.044592,0.027242,0.081302,-0.186832,-0.078735,-0.177443
11084972,1,0.048583,-0.044806,-0.011903,-0.104102,0.066858,-0.378480,-0.230699,-0.007991,-0.055207,...,-0.063350,-0.042444,0.078159,-0.103599,0.012624,-0.006905,-0.105154,-0.079888,0.039338,-0.177443
11084974,0,0.048583,-0.032632,-0.140003,-0.104102,0.066858,-0.378480,0.050616,-0.073105,0.079092,...,-0.203619,-0.223985,0.078159,-0.103599,-0.187319,-0.006905,-0.690105,-0.186832,0.039338,-0.177443


In [18]:
# Save the WoE-encoded training frame as a pickle for later use.
# The output folder is created first so the cell also works when the folder
# does not exist yet (e.g. on a fresh checkout). The folder name keeps its
# existing spelling because other code may read from this exact path.
woe_output_path = Path("./Intermidiate_data/Data_after_woe_binning.pk")
woe_output_path.parent.mkdir(parents=True, exist_ok=True)
train_woe_data.to_pickle(woe_output_path)

In [17]:
# NOTE(review): leftover commented-out call. It refers to `training_data`,
# while the frame used in the visible cells is named `train_data`.
#bins = sc.woebin(training_data, y="target")