# Data Preparation

In [340]:
import pandas as pd
import numpy as np
from IPython.display import display

DEV = True

loans = pd.read_csv('data/loan_dev.csv', dtype={'date':object}) if DEV else pd.read_csv('kaggle/loan_comp.csv', dtype={'date':object}, sep=';')
account = pd.read_csv('data/account.csv', dtype={'date':object})
cards = pd.read_csv('data/card_dev.csv', dtype={'issued':object}) if DEV else pd.read_csv('kaggle/card_comp.csv', dtype={'issued':object}, sep=';')
client = pd.read_csv('data/client.csv')
disposition = pd.read_csv('data/disp.csv')
district = pd.read_csv('data/district.csv')
trans = pd.read_csv('data/trans_dev.csv', dtype={'date':object}) if DEV else pd.read_csv('kaggle/trans_comp.csv', dtype={'date':object}, sep=';')

pd.set_option('display.max_columns', None)


  trans = pd.read_csv('data/trans_dev.csv', dtype={'date':object}) if DEV else pd.read_csv('kaggle/trans_comp.csv', dtype={'date':object}, sep=';')


## Loan preparation

### Save

In [341]:
loans.rename(columns={'date':'loan_date', 'amount': 'loan_amount', 'duration': 'loan_duration', 'payments': 'loan_payments' }, inplace=True)
loans.to_csv('data_processed/' + ('loan_dev' if DEV else 'loan_comp') + '.csv', index=False)
loans

Unnamed: 0,loan_id,account_id,loan_date,loan_amount,loan_duration,loan_payments,status
0,5314,1787,930705,96396,12,8033,-1
1,5316,1801,930711,165960,36,4610,1
2,6863,9188,930728,127080,60,2118,1
3,5325,1843,930803,105804,36,2939,1
4,7240,11013,930906,274740,60,4579,1
...,...,...,...,...,...,...,...
323,6818,9030,961212,155616,48,3242,1
324,5625,3189,961215,222180,60,3703,-1
325,6805,8972,961221,45024,48,938,1
326,7233,10963,961225,115812,36,3217,1


## Account preparation

In [342]:
account.drop(['date'], inplace=True, axis=1)

### Save

In [343]:
account.rename(columns={'frequency':'account_frequency', 'district_id': 'account_district_id'}, inplace=True)
account.to_csv('data_processed/account.csv', index=False)
account

Unnamed: 0,account_id,account_district_id,account_frequency
0,576,55,monthly issuance
1,3818,74,monthly issuance
2,704,55,monthly issuance
3,2378,16,monthly issuance
4,2632,24,monthly issuance
...,...,...,...
4495,124,55,monthly issuance
4496,3958,59,monthly issuance
4497,777,30,monthly issuance
4498,1573,63,monthly issuance


## Card preparation

There are only a few loans associated with a credit card

In [344]:
cards.drop(["issued"], inplace=True, axis=1)
cards = cards.fillna({'type': 'unkown'})

### Save

In [345]:
cards.rename(columns={'type':'card_type', 'disp_id': 'card_disp_id'}, inplace=True)
cards.to_csv('data_processed/' + ('card_dev' if DEV else 'card_comp') + '.csv', index=False)

## Client preparation

### Extract gender and normalize dates
Since the birth date of each client is represented as YYMMDD when 'Male' or YY(MM+50)DD when 'Female' the gender was extracted as a new feature and the Female client's birth date modified to YYMMDD, the same as Male clients.
Female = False
Male = True

In [346]:
from datetime import datetime, timedelta

def extract_gender(date):
    month = int(str(date)[2:4])
    gender = ''
    if month > 12:
        month -= 50
        gender = 'Female'
    else: 
        gender = 'Male'
    new_date = str(date)[:2] + str(month).zfill(2) + str(date)[4:]
    return (new_date, gender)

In [347]:
genders = []
dates = []

for birth_number in client['birth_number']:
    (new_date, gender) = extract_gender(birth_number)
    dates.append(new_date)
    genders.append(gender)

client['birth_number'] = dates
client['gender'] = genders

### Save

In [348]:
client.rename(columns={'district_id': 'client_district_id'}, inplace=True)
client.to_csv('data_processed/client.csv', index=False)

## Disposition preparation

### Remove disponent clients and add to the account's owner entry
Without this chenage more than one client were associated to a single account_id. This was a problem when merging Dispositions Loans tables since this would lead to several entries with the same loan_id and target classes replicated.

In [349]:
disp_owners = disposition.loc[disposition['type'] == 'OWNER'].copy()
disp_disponent = disposition.loc[disposition['type'] == 'DISPONENT'].copy()

has_disponent = [id in disp_disponent['account_id'].values for id in disp_owners['account_id'].values]

disp_owners['has_disponent'] = has_disponent
disp_owners.drop('type', inplace=True, axis=1)
display(disp_owners)

Unnamed: 0,disp_id,client_id,account_id,has_disponent
0,1,1,1,False
1,2,2,2,True
3,4,4,3,True
5,6,6,4,False
6,7,7,5,False
...,...,...,...,...
5363,13623,13931,11333,False
5364,13647,13955,11349,True
5366,13660,13968,11359,False
5367,13663,13971,11362,False


### Save

In [350]:
disp_owners.to_csv('data_processed/disp.csv', index=False)

## District preparation

### Clean missing values

By analysing the box plots, we found out that the data values for "no. of commited crimes '95" and "unemploymant rate '95" are not symmetrical and there are outliers, therefore it would be best to use median instead of mean to replace the cells with '?'. 
Even tho, the missing values will be replaced according to a **linear regression**.

In [351]:

def replace_by_median(district):
    district_clean = district.loc[(district["unemploymant rate '95"] != '?') & (district["no. of commited crimes '95"] != '?')]
    median_unemploymant = district_clean["unemploymant rate '95"].astype(float).median()
    median_commited_crimes = district_clean["no. of commited crimes '95"].astype(int).median()

    # "no. of commited crimes '95" = '?' => median_commited_crimes
    # "unemploymant rate '95" = '?' => median_unemploymant
    district.loc[district["no. of commited crimes '95"] == '?', "no. of commited crimes '95"] = median_commited_crimes
    district.loc[district["unemploymant rate '95"] == '?', "unemploymant rate '95"] = median_unemploymant

def replace_by_regression(district):
    from sklearn.preprocessing import OrdinalEncoder
    from sklearn.linear_model import LinearRegression
    oe = OrdinalEncoder()
    lr = LinearRegression()

    district_copy = district.copy()
    ordinal_cols = ['name', 'region']
    district_copy[ordinal_cols] = oe.set_params(encoded_missing_value=-1).fit_transform(district_copy[ordinal_cols])

    district_clean = district_copy.loc[(district["unemploymant rate '95"] != '?') & (district["no. of commited crimes '95"] != '?')]
    X = district_clean.drop(["no. of commited crimes '95", "unemploymant rate '95"], axis=1)
    
    y1 = district_clean["no. of commited crimes '95"].astype(int)
    lr.fit(X, y1)
    y1_pred = lr.predict(district_copy.loc[(district["no. of commited crimes '95"] == '?')].drop(["no. of commited crimes '95", "unemploymant rate '95"], axis=1))
    district.loc[(district["no. of commited crimes '95"] == '?'), "no. of commited crimes '95"] = y1_pred
    
    y2 = district_clean["unemploymant rate '95"].astype(float)
    lr.fit(X, y2)
    y2_pred = lr.predict(district_copy.loc[(district["unemploymant rate '95"] == '?')].drop(["no. of commited crimes '95", "unemploymant rate '95"], axis=1))
    district.loc[(district["unemploymant rate '95"] == '?'), "unemploymant rate '95"] = y2_pred

replace_by_regression(district)  

district.head()

Unnamed: 0,code,name,region,no. of inhabitants,no. of municipalities with inhabitants < 499,no. of municipalities with inhabitants 500-1999,no. of municipalities with inhabitants 2000-9999,no. of municipalities with inhabitants >10000,no. of cities,ratio of urban inhabitants,average salary,unemploymant rate '95,unemploymant rate '96,no. of enterpreneurs per 1000 inhabitants,no. of commited crimes '95,no. of commited crimes '96
0,1,Hl.m. Praha,Prague,1204953,0,0,0,1,1,100.0,12541,0.29,0.43,167,85677,99107
1,2,Benesov,central Bohemia,88884,80,26,6,2,5,46.7,8507,1.67,1.85,132,2159,2674
2,3,Beroun,central Bohemia,75232,55,26,4,1,5,41.7,8980,1.95,2.21,111,2824,2813
3,4,Kladno,central Bohemia,149893,63,29,6,2,6,67.4,9753,4.64,5.05,109,5244,5892
4,5,Kolin,central Bohemia,95616,65,30,4,1,6,51.4,9307,3.85,4.43,118,2616,3040


### Data Cleaning

In [352]:
# values of "no. of municipalities with inhabitants 2000-9999" and "no. of municipalities with inhabitants >10000" are really small compared to the other ones
district['no. of municipalities with inhabitants > 2000'] = district['no. of municipalities with inhabitants 2000-9999'] + district['no. of municipalities with inhabitants >10000']

district.drop(columns=['no. of municipalities with inhabitants 2000-9999',
'no. of municipalities with inhabitants >10000'], inplace=True)

cols = district.columns.tolist()
cols = cols[:6] + cols[-1:] + cols[6:-1]
district = district[cols]

district.head()

Unnamed: 0,code,name,region,no. of inhabitants,no. of municipalities with inhabitants < 499,no. of municipalities with inhabitants 500-1999,no. of municipalities with inhabitants > 2000,no. of cities,ratio of urban inhabitants,average salary,unemploymant rate '95,unemploymant rate '96,no. of enterpreneurs per 1000 inhabitants,no. of commited crimes '95,no. of commited crimes '96
0,1,Hl.m. Praha,Prague,1204953,0,0,1,1,100.0,12541,0.29,0.43,167,85677,99107
1,2,Benesov,central Bohemia,88884,80,26,8,5,46.7,8507,1.67,1.85,132,2159,2674
2,3,Beroun,central Bohemia,75232,55,26,5,5,41.7,8980,1.95,2.21,111,2824,2813
3,4,Kladno,central Bohemia,149893,63,29,8,6,67.4,9753,4.64,5.05,109,5244,5892
4,5,Kolin,central Bohemia,95616,65,30,5,6,51.4,9307,3.85,4.43,118,2616,3040


### Feature Engineering - Ratios of entrepreneurs, urban inhabitants and rate growth

In [353]:
# Create ratios and convert percentage to 0-1
district['ratio enterpreneurs'] = district['no. of enterpreneurs per 1000 inhabitants'] / 1000
district['ratio of urban inhabitants'] = district['ratio of urban inhabitants'] / 100

# Change rate to ratio growth
district['unemploymant rate growth'] = pd.to_numeric(district["unemploymant rate '96 "]) - pd.to_numeric(district["unemploymant rate '95"])
district['crime rate growth'] = (pd.to_numeric(district["no. of commited crimes '96 "]) - pd.to_numeric(district["no. of commited crimes '95"]))/pd.to_numeric(district["no. of inhabitants"])

district.drop(columns=['name', 'no. of enterpreneurs per 1000 inhabitants', 'no. of inhabitants',
"unemploymant rate '96 ", "no. of commited crimes '96 ",
"unemploymant rate '95", "no. of commited crimes '95"], inplace=True)

district.head()

Unnamed: 0,code,region,no. of municipalities with inhabitants < 499,no. of municipalities with inhabitants 500-1999,no. of municipalities with inhabitants > 2000,no. of cities,ratio of urban inhabitants,average salary,ratio enterpreneurs,unemploymant rate growth,crime rate growth
0,1,Prague,0,0,1,1,1.0,12541,0.167,0.14,0.011146
1,2,central Bohemia,80,26,8,5,0.467,8507,0.132,0.18,0.005794
2,3,central Bohemia,55,26,5,5,0.417,8980,0.111,0.26,-0.000146
3,4,central Bohemia,63,29,8,6,0.674,9753,0.109,0.41,0.004323
4,5,central Bohemia,65,30,5,6,0.514,9307,0.118,0.58,0.004434


### Save

In [354]:
# TODO - clean district table
district.to_csv('data_processed/district.csv', index=False)
district

Unnamed: 0,code,region,no. of municipalities with inhabitants < 499,no. of municipalities with inhabitants 500-1999,no. of municipalities with inhabitants > 2000,no. of cities,ratio of urban inhabitants,average salary,ratio enterpreneurs,unemploymant rate growth,crime rate growth
0,1,Prague,0,0,1,1,1.000,12541,0.167,0.14,0.011146
1,2,central Bohemia,80,26,8,5,0.467,8507,0.132,0.18,0.005794
2,3,central Bohemia,55,26,5,5,0.417,8980,0.111,0.26,-0.000146
3,4,central Bohemia,63,29,8,6,0.674,9753,0.109,0.41,0.004323
4,5,central Bohemia,65,30,5,6,0.514,9307,0.118,0.58,0.004434
...,...,...,...,...,...,...,...,...,...,...,...
72,73,north Moravia,17,49,14,7,0.564,8746,0.090,0.41,0.000429
73,74,north Moravia,0,0,1,1,1.000,10673,0.100,0.69,-0.001343
74,75,north Moravia,67,30,6,5,0.646,8819,0.099,0.28,0.003202
75,76,north Moravia,31,32,15,7,0.512,8369,0.107,1.15,-0.007294


## Transaction preparation

### Replace missing values

In [355]:
# Replace type 'withdrawal in cash' with 'withdrawal'
trans.replace({'type': 'withdrawal in cash'}, 'withdrawal', inplace=True)
# Replace k_symbol '' with nan
trans.replace({'k_symbol': ' '}, np.nan, inplace=True)
# Replace operation 'credit card withdrawal' with 'withdrawal in cash'
trans.replace({'operation': 'credit card withdrawal'}, 'withdrawal in cash', inplace=True)

In [356]:
# Replace NaN of operation with mode for each type
credit_operation_mode = trans[trans['type'] == 'credit']['operation'].mode()[0]
withdrawal_operation_mode = trans[trans['type'] == 'withdrawal']['operation'].mode()[0]
trans['operation'] = trans.apply(lambda x: credit_operation_mode if (x['type'] == 'credit' and pd.isnull(x['operation'])) else (withdrawal_operation_mode if (x['type'] == 'withdrawal' and pd.isnull(x['operation'])) else x['operation']), axis=1)

In [357]:
# Drop columns with more than 70% of missing values
trans.dropna(thresh=len(trans) * 0.3, axis=1, inplace=True)
trans

Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol
0,1548749,5270,930113,credit,credit in cash,800.0,800.0,
1,1548750,5270,930114,credit,collection from another bank,44749.0,45549.0,
2,3393738,11265,930114,credit,credit in cash,1000.0,1000.0,
3,3122924,10364,930117,credit,credit in cash,1100.0,1100.0,
4,1121963,3834,930119,credit,credit in cash,700.0,700.0,
...,...,...,...,...,...,...,...,...
396680,515914,1763,961231,withdrawal,withdrawal in cash,14.6,67769.5,payment for statement
396681,516262,1765,961231,withdrawal,withdrawal in cash,14.6,19708.1,payment for statement
396682,520019,1775,961231,withdrawal,withdrawal in cash,14.6,15944.5,payment for statement
396683,517894,1769,961231,withdrawal,withdrawal in cash,14.6,34679.4,payment for statement


In [358]:
trans.to_csv('data_processed/' + ('trans_dev_no_mv' if DEV else 'trans_comp_no_mv') + '.csv', index=False)

### Feature Engineering - Monthly Income and last balance for each account_id

In [359]:
trans['date'] = trans['date'].apply(lambda x: '19'+x[:2]+'-'+x[2:4]+'-'+x[4:])
trans['date'] = pd.to_datetime(trans['date']).dt.date

In [360]:
aux = trans.groupby(['account_id']).agg({'date': ['min', 'max']})
aux.columns = ['min_date', 'max_date']
aux = aux.reset_index(drop=False)
aux['date_diff'] = (aux['max_date']-aux['min_date'])/np.timedelta64(1, 'M')
aux['date_diff'] = aux['date_diff'].astype(int)
trans = pd.merge(trans, aux[['account_id', 'date_diff']], on='account_id', how='left')
display(trans)

Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol,date_diff
0,1548749,5270,1993-01-13,credit,credit in cash,800.0,800.0,,10
1,1548750,5270,1993-01-14,credit,collection from another bank,44749.0,45549.0,,10
2,3393738,11265,1993-01-14,credit,credit in cash,1000.0,1000.0,,7
3,3122924,10364,1993-01-17,credit,credit in cash,1100.0,1100.0,,9
4,1121963,3834,1993-01-19,credit,credit in cash,700.0,700.0,,19
...,...,...,...,...,...,...,...,...,...
396680,515914,1763,1996-12-31,withdrawal,withdrawal in cash,14.6,67769.5,payment for statement,9
396681,516262,1765,1996-12-31,withdrawal,withdrawal in cash,14.6,19708.1,payment for statement,5
396682,520019,1775,1996-12-31,withdrawal,withdrawal in cash,14.6,15944.5,payment for statement,46
396683,517894,1769,1996-12-31,withdrawal,withdrawal in cash,14.6,34679.4,payment for statement,23


In [361]:
def agg_func(data):
    credits = data[(data['type'] == 'credit') & (data['operation'] != 'collection from another bank')]
    withdrawals = data[(data['type'] == 'withdrawal') & (data['operation'] != 'remittance to another bank')]

    return pd.Series({
        'monthly_credit': (credits['amount'].sum() / (abs(data['date_diff'].iat[0]) + 1)) if credits.shape[0] != 0 else 0, 
        'monthly_withdrawal': (withdrawals['amount'].sum() / (abs(data['date_diff'].iat[0]) + 1)) if withdrawals.shape[0] != 0 else 0,
        'last_balance': data['balance'].iat[-1],
        'mean_balance': data['balance'].mean(),
        'min_balance': data['balance'].min(),
    })
trans_agg = trans.groupby(['account_id']).apply(agg_func)
trans_agg = trans_agg.reset_index(drop=False)
trans_agg['monthly_diff'] = (trans_agg['monthly_credit'] - trans_agg['monthly_withdrawal']).astype(int)
trans_agg = trans_agg.drop(['monthly_credit', 'monthly_withdrawal'], axis=1)
trans_agg

Unnamed: 0,account_id,last_balance,mean_balance,min_balance,monthly_diff
0,1,12674.5,17558.700000,1000.0,-1153
1,2,27855.2,32590.624074,1100.0,-14402
2,4,24957.6,23648.623077,800.0,-1135
3,6,33523.8,30313.279839,900.0,-1986
4,7,28305.4,23088.850000,900.0,14152
...,...,...,...,...,...
3365,11333,19647.7,36827.658173,-1291.0,618
3366,11349,24704.4,59352.833333,200.0,-39068
3367,11359,25697.2,36480.185034,1000.0,-24614
3368,11362,24199.5,30869.781308,1000.0,5510


### Save

In [362]:
trans_agg.to_csv('data_processed/' + ('trans_dev' if DEV else 'trans_comp') + '.csv', index=False)

## Data Integration

In [363]:
loans = pd.read_csv('data_processed/loan_dev.csv', dtype={'date':object}) if DEV else pd.read_csv('data_processed/loan_comp.csv')
account = pd.read_csv('data_processed/account.csv', dtype={'date':object})
cards = pd.read_csv('data_processed/card_dev.csv', dtype={'issued':object}) if DEV else pd.read_csv('data_processed/card_comp.csv')
client = pd.read_csv('data_processed/client.csv')
disposition = pd.read_csv('data_processed/disp.csv')
district = pd.read_csv('data_processed/district.csv')
trans = pd.read_csv('data_processed/trans_dev.csv', dtype={'date':object}) if DEV else pd.read_csv('data_processed/trans_comp.csv')

pd.set_option('display.max_columns', None)


### Join tables

In [364]:
data = pd.merge(loans, account, left_on="account_id", right_on="account_id", how='left')
data = pd.merge(data, disp_owners, left_on="account_id", right_on="account_id", how='left')
data = pd.merge(data, client, left_on="client_id", right_on="client_id", how='left')
data = pd.merge(data, cards, left_on="disp_id", right_on="card_disp_id", how='left')
data = pd.merge(data, trans, left_on='account_id', right_on='account_id', how='left')
data = pd.merge(data, district, left_on="client_district_id", right_on="code", how='left')
display(data.head())

Unnamed: 0,loan_id,account_id,loan_date,loan_amount,loan_duration,loan_payments,status,account_district_id,account_frequency,disp_id,client_id,has_disponent,birth_number,client_district_id,gender,card_id,card_disp_id,card_type,last_balance,mean_balance,min_balance,monthly_diff,code,region,no. of municipalities with inhabitants < 499,no. of municipalities with inhabitants 500-1999,no. of municipalities with inhabitants > 2000,no. of cities,ratio of urban inhabitants,average salary,ratio enterpreneurs,unemploymant rate growth,crime rate growth
0,5314,1787,930705,96396,12,8033,-1,30,weekly issuance,2166,2166,False,470722,30,Female,,,,20100.0,12250.0,1100.0,6700,30,west Bohemia,15,13,10,10,0.818,9650,0.1,0.29,-0.001909
1,5316,1801,930711,165960,36,4610,1,46,monthly issuance,2181,2181,False,680722,46,Male,,,,52208.9,52083.859459,700.0,14733,46,east Bohemia,48,20,10,10,0.735,8369,0.117,0.52,-0.002094
2,6863,9188,930728,127080,60,2118,1,45,monthly issuance,11006,11314,False,360602,45,Male,,,,20272.8,30060.954167,800.0,-6859,45,east Bohemia,85,19,7,5,0.535,8390,0.132,0.61,0.000539
3,5325,1843,930803,105804,36,2939,1,12,monthly issuance,2235,2235,False,400420,14,Female,,,,34292.7,41297.48,1000.0,-11054,14,south Bohemia,69,27,11,9,0.748,10045,0.135,0.29,-0.001739
4,7240,11013,930906,274740,60,4579,1,1,weekly issuance,13231,13539,False,780907,63,Male,,,,41142.9,57188.211111,600.0,-30531,63,south Moravia,38,36,6,5,0.505,8288,0.11,0.73,-0.001179


### Feature Engineering - Has credit card or not

In [365]:
data['has_card'] = data['card_id'].notnull()
display(data.head())

Unnamed: 0,loan_id,account_id,loan_date,loan_amount,loan_duration,loan_payments,status,account_district_id,account_frequency,disp_id,client_id,has_disponent,birth_number,client_district_id,gender,card_id,card_disp_id,card_type,last_balance,mean_balance,min_balance,monthly_diff,code,region,no. of municipalities with inhabitants < 499,no. of municipalities with inhabitants 500-1999,no. of municipalities with inhabitants > 2000,no. of cities,ratio of urban inhabitants,average salary,ratio enterpreneurs,unemploymant rate growth,crime rate growth,has_card
0,5314,1787,930705,96396,12,8033,-1,30,weekly issuance,2166,2166,False,470722,30,Female,,,,20100.0,12250.0,1100.0,6700,30,west Bohemia,15,13,10,10,0.818,9650,0.1,0.29,-0.001909,False
1,5316,1801,930711,165960,36,4610,1,46,monthly issuance,2181,2181,False,680722,46,Male,,,,52208.9,52083.859459,700.0,14733,46,east Bohemia,48,20,10,10,0.735,8369,0.117,0.52,-0.002094,False
2,6863,9188,930728,127080,60,2118,1,45,monthly issuance,11006,11314,False,360602,45,Male,,,,20272.8,30060.954167,800.0,-6859,45,east Bohemia,85,19,7,5,0.535,8390,0.132,0.61,0.000539,False
3,5325,1843,930803,105804,36,2939,1,12,monthly issuance,2235,2235,False,400420,14,Female,,,,34292.7,41297.48,1000.0,-11054,14,south Bohemia,69,27,11,9,0.748,10045,0.135,0.29,-0.001739,False
4,7240,11013,930906,274740,60,4579,1,1,weekly issuance,13231,13539,False,780907,63,Male,,,,41142.9,57188.211111,600.0,-30531,63,south Moravia,38,36,6,5,0.505,8288,0.11,0.73,-0.001179,False


### Drop columns with more than 70% of missing values

In [366]:
# Drop columns with more than 70% of missing values
status = data['status']
data.drop(['status'], axis=1, inplace=True)
data.dropna(thresh=len(data) * 0.3, axis=1, inplace=True)
data = pd.concat([status, data], axis=1)
display(data.head())

Unnamed: 0,status,loan_id,account_id,loan_date,loan_amount,loan_duration,loan_payments,account_district_id,account_frequency,disp_id,client_id,has_disponent,birth_number,client_district_id,gender,last_balance,mean_balance,min_balance,monthly_diff,code,region,no. of municipalities with inhabitants < 499,no. of municipalities with inhabitants 500-1999,no. of municipalities with inhabitants > 2000,no. of cities,ratio of urban inhabitants,average salary,ratio enterpreneurs,unemploymant rate growth,crime rate growth,has_card
0,-1,5314,1787,930705,96396,12,8033,30,weekly issuance,2166,2166,False,470722,30,Female,20100.0,12250.0,1100.0,6700,30,west Bohemia,15,13,10,10,0.818,9650,0.1,0.29,-0.001909,False
1,1,5316,1801,930711,165960,36,4610,46,monthly issuance,2181,2181,False,680722,46,Male,52208.9,52083.859459,700.0,14733,46,east Bohemia,48,20,10,10,0.735,8369,0.117,0.52,-0.002094,False
2,1,6863,9188,930728,127080,60,2118,45,monthly issuance,11006,11314,False,360602,45,Male,20272.8,30060.954167,800.0,-6859,45,east Bohemia,85,19,7,5,0.535,8390,0.132,0.61,0.000539,False
3,1,5325,1843,930803,105804,36,2939,12,monthly issuance,2235,2235,False,400420,14,Female,34292.7,41297.48,1000.0,-11054,14,south Bohemia,69,27,11,9,0.748,10045,0.135,0.29,-0.001739,False
4,1,7240,11013,930906,274740,60,4579,1,weekly issuance,13231,13539,False,780907,63,Male,41142.9,57188.211111,600.0,-30531,63,south Moravia,38,36,6,5,0.505,8288,0.11,0.73,-0.001179,False


### Feature Engineering - Age of the client at the loan time

In [367]:
def get_datetime(date):
    year = int(str(date)[0:2]) + 1900
    month = int(str(date)[2:4])
    day = int(str(date)[4:])
    return datetime(year, month, day)

def calc_age(birth_date, other_date):
    diff:timedelta = get_datetime(other_date) - get_datetime(birth_date)
    return round(diff.days / 365.25)


In [368]:
data['age_on_loan_request'] = data.apply(lambda x: calc_age(x['birth_number'], x['loan_date']), axis=1)
data.drop('loan_date', inplace=True, axis=1)
data.drop('birth_number', inplace=True, axis=1)
data

Unnamed: 0,status,loan_id,account_id,loan_amount,loan_duration,loan_payments,account_district_id,account_frequency,disp_id,client_id,has_disponent,client_district_id,gender,last_balance,mean_balance,min_balance,monthly_diff,code,region,no. of municipalities with inhabitants < 499,no. of municipalities with inhabitants 500-1999,no. of municipalities with inhabitants > 2000,no. of cities,ratio of urban inhabitants,average salary,ratio enterpreneurs,unemploymant rate growth,crime rate growth,has_card,age_on_loan_request
0,-1,5314,1787,96396,12,8033,30,weekly issuance,2166,2166,False,30,Female,20100.0,12250.000000,1100.0,6700,30,west Bohemia,15,13,10,10,0.818,9650,0.100,0.29,-0.001909,False,46
1,1,5316,1801,165960,36,4610,46,monthly issuance,2181,2181,False,46,Male,52208.9,52083.859459,700.0,14733,46,east Bohemia,48,20,10,10,0.735,8369,0.117,0.52,-0.002094,False,25
2,1,6863,9188,127080,60,2118,45,monthly issuance,11006,11314,False,45,Male,20272.8,30060.954167,800.0,-6859,45,east Bohemia,85,19,7,5,0.535,8390,0.132,0.61,0.000539,False,57
3,1,5325,1843,105804,36,2939,12,monthly issuance,2235,2235,False,14,Female,34292.7,41297.480000,1000.0,-11054,14,south Bohemia,69,27,11,9,0.748,10045,0.135,0.29,-0.001739,False,53
4,1,7240,11013,274740,60,4579,1,weekly issuance,13231,13539,False,63,Male,41142.9,57188.211111,600.0,-30531,63,south Moravia,38,36,6,5,0.505,8288,0.110,0.73,-0.001179,False,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
323,1,6818,9030,155616,48,3242,72,monthly issuance,10813,11121,True,72,Female,60694.1,44197.509884,200.0,5313,72,north Moravia,32,50,11,4,0.626,8994,0.110,0.99,-0.002052,False,26
324,-1,5625,3189,222180,60,3703,29,monthly issuance,3855,3855,False,29,Male,59578.8,55230.444068,800.0,4583,29,west Bohemia,52,10,6,6,0.556,8843,0.113,0.78,0.001531,False,58
325,1,6805,8972,45024,48,938,70,monthly issuance,10742,11050,False,70,Female,38384.3,41994.907692,800.0,8387,70,north Moravia,0,2,13,7,0.899,10177,0.081,1.12,0.000806,False,40
326,1,7233,10963,115812,36,3217,16,monthly issuance,13172,13480,False,16,Male,41878.1,56646.516129,1100.0,2204,16,south Bohemia,74,21,11,8,0.569,8427,0.107,0.42,0.000415,False,44


### Discretizations

##### Age on loan request

In [369]:
data['age_on_loan_request_disc'] = pd.cut(x=data['age_on_loan_request'], bins=[0, 19, 29, 39, 49, 59, 69, 79, 99], labels=['0-19', '21-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80-99'])
data['age_on_loan_request_disc'] = data['age_on_loan_request_disc'].astype(str)
data

Unnamed: 0,status,loan_id,account_id,loan_amount,loan_duration,loan_payments,account_district_id,account_frequency,disp_id,client_id,has_disponent,client_district_id,gender,last_balance,mean_balance,min_balance,monthly_diff,code,region,no. of municipalities with inhabitants < 499,no. of municipalities with inhabitants 500-1999,no. of municipalities with inhabitants > 2000,no. of cities,ratio of urban inhabitants,average salary,ratio enterpreneurs,unemploymant rate growth,crime rate growth,has_card,age_on_loan_request,age_on_loan_request_disc
0,-1,5314,1787,96396,12,8033,30,weekly issuance,2166,2166,False,30,Female,20100.0,12250.000000,1100.0,6700,30,west Bohemia,15,13,10,10,0.818,9650,0.100,0.29,-0.001909,False,46,40-49
1,1,5316,1801,165960,36,4610,46,monthly issuance,2181,2181,False,46,Male,52208.9,52083.859459,700.0,14733,46,east Bohemia,48,20,10,10,0.735,8369,0.117,0.52,-0.002094,False,25,21-29
2,1,6863,9188,127080,60,2118,45,monthly issuance,11006,11314,False,45,Male,20272.8,30060.954167,800.0,-6859,45,east Bohemia,85,19,7,5,0.535,8390,0.132,0.61,0.000539,False,57,50-59
3,1,5325,1843,105804,36,2939,12,monthly issuance,2235,2235,False,14,Female,34292.7,41297.480000,1000.0,-11054,14,south Bohemia,69,27,11,9,0.748,10045,0.135,0.29,-0.001739,False,53,50-59
4,1,7240,11013,274740,60,4579,1,weekly issuance,13231,13539,False,63,Male,41142.9,57188.211111,600.0,-30531,63,south Moravia,38,36,6,5,0.505,8288,0.110,0.73,-0.001179,False,15,0-19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
323,1,6818,9030,155616,48,3242,72,monthly issuance,10813,11121,True,72,Female,60694.1,44197.509884,200.0,5313,72,north Moravia,32,50,11,4,0.626,8994,0.110,0.99,-0.002052,False,26,21-29
324,-1,5625,3189,222180,60,3703,29,monthly issuance,3855,3855,False,29,Male,59578.8,55230.444068,800.0,4583,29,west Bohemia,52,10,6,6,0.556,8843,0.113,0.78,0.001531,False,58,50-59
325,1,6805,8972,45024,48,938,70,monthly issuance,10742,11050,False,70,Female,38384.3,41994.907692,800.0,8387,70,north Moravia,0,2,13,7,0.899,10177,0.081,1.12,0.000806,False,40,40-49
326,1,7233,10963,115812,36,3217,16,monthly issuance,13172,13480,False,16,Male,41878.1,56646.516129,1100.0,2204,16,south Bohemia,74,21,11,8,0.569,8427,0.107,0.42,0.000415,False,44,40-49


### Save Data

In [370]:
if DEV: data.to_csv('data_processed/complete/data.csv', index=False)
else: data.to_csv('data_processed/complete/data_comp.csv', index=False)

### Encoding

In [371]:
def encode_data(df, onehot_columns, other_columns):
    from sklearn.preprocessing import OneHotEncoder
    ohe = OneHotEncoder(sparse=False)
    onehot_cols = [col for col in onehot_columns if col in df.keys()]
    onehot_encoded = ohe.fit_transform(df[onehot_cols])
    onehot_encoded = pd.DataFrame(onehot_encoded, columns=ohe.get_feature_names(onehot_cols))
    df = df.drop(onehot_cols, axis=1)
    df = pd.concat([df, onehot_encoded], axis=1)    

    #from category_encoders import CatBoostEncoder
    #cat_cols = [col for col in other_columns if col in df.keys()]
    #cbe = CatBoostEncoder(return_df=True)
    #df[cat_cols] = cbe.fit_transform(df[cat_cols], get_target(df))

    from sklearn.preprocessing import OrdinalEncoder
    le = OrdinalEncoder()
    ordinal_cols = [col for col in other_columns if col in df.keys()]
    df[ordinal_cols] = le.set_params(encoded_missing_value=-1).fit_transform(df[ordinal_cols])
    
    return df

In [372]:
# categorical_columns = list(data.select_dtypes("object").columns)
data = encode_data(data, ['account_frequency', 'gender'], ['region', 'age_on_loan_request_disc'])

if DEV: data.to_csv('data_processed/complete/enc_data.csv', index=False)
else: data.to_csv('data_processed/complete/enc_data_comp.csv', index=False)

data



Unnamed: 0,status,loan_id,account_id,loan_amount,loan_duration,loan_payments,account_district_id,disp_id,client_id,has_disponent,client_district_id,last_balance,mean_balance,min_balance,monthly_diff,code,region,no. of municipalities with inhabitants < 499,no. of municipalities with inhabitants 500-1999,no. of municipalities with inhabitants > 2000,no. of cities,ratio of urban inhabitants,average salary,ratio enterpreneurs,unemploymant rate growth,crime rate growth,has_card,age_on_loan_request,age_on_loan_request_disc,account_frequency_issuance after transaction,account_frequency_monthly issuance,account_frequency_weekly issuance,gender_Female,gender_Male
0,-1,5314,1787,96396,12,8033,30,2166,2166,False,30,20100.0,12250.000000,1100.0,6700,30,7.0,15,13,10,10,0.818,9650,0.100,0.29,-0.001909,False,46,3.0,0.0,0.0,1.0,1.0,0.0
1,1,5316,1801,165960,36,4610,46,2181,2181,False,46,52208.9,52083.859459,700.0,14733,46,2.0,48,20,10,10,0.735,8369,0.117,0.52,-0.002094,False,25,1.0,0.0,1.0,0.0,0.0,1.0
2,1,6863,9188,127080,60,2118,45,11006,11314,False,45,20272.8,30060.954167,800.0,-6859,45,2.0,85,19,7,5,0.535,8390,0.132,0.61,0.000539,False,57,4.0,0.0,1.0,0.0,0.0,1.0
3,1,5325,1843,105804,36,2939,12,2235,2235,False,14,34292.7,41297.480000,1000.0,-11054,14,5.0,69,27,11,9,0.748,10045,0.135,0.29,-0.001739,False,53,4.0,0.0,1.0,0.0,1.0,0.0
4,1,7240,11013,274740,60,4579,1,13231,13539,False,63,41142.9,57188.211111,600.0,-30531,63,6.0,38,36,6,5,0.505,8288,0.110,0.73,-0.001179,False,15,0.0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
323,1,6818,9030,155616,48,3242,72,10813,11121,True,72,60694.1,44197.509884,200.0,5313,72,4.0,32,50,11,4,0.626,8994,0.110,0.99,-0.002052,False,26,1.0,0.0,1.0,0.0,1.0,0.0
324,-1,5625,3189,222180,60,3703,29,3855,3855,False,29,59578.8,55230.444068,800.0,4583,29,7.0,52,10,6,6,0.556,8843,0.113,0.78,0.001531,False,58,4.0,0.0,1.0,0.0,0.0,1.0
325,1,6805,8972,45024,48,938,70,10742,11050,False,70,38384.3,41994.907692,800.0,8387,70,4.0,0,2,13,7,0.899,10177,0.081,1.12,0.000806,False,40,3.0,0.0,1.0,0.0,1.0,0.0
326,1,7233,10963,115812,36,3217,16,13172,13480,False,16,41878.1,56646.516129,1100.0,2204,16,5.0,74,21,11,8,0.569,8427,0.107,0.42,0.000415,False,44,3.0,0.0,1.0,0.0,0.0,1.0
