# Data Preparation

In [None]:
import pandas as pd
import numpy as np
from IPython.display import display

# Switch between the development files (True) and the competition files (False).
DEV = True

# Tables that exist in a development and a competition flavour. The `date` /
# `issued` columns are read with dtype=object.
if DEV:
    loans = pd.read_csv('data/loan_dev.csv', dtype={'date': object})
    cards = pd.read_csv('data/card_dev.csv', dtype={'issued': object})
    trans = pd.read_csv('data/trans_dev.csv', dtype={'date': object})
else:
    loans = pd.read_csv('kaggle/loan_comp.csv', dtype={'date': object}, sep=';')
    cards = pd.read_csv('kaggle/card_comp.csv', dtype={'issued': object}, sep=';')
    trans = pd.read_csv('kaggle/trans_comp.csv', dtype={'date': object}, sep=';')

# Tables shared by both modes.
account = pd.read_csv('data/account.csv', dtype={'date': object})
client = pd.read_csv('data/client.csv')
disposition = pd.read_csv('data/disp.csv')
district = pd.read_csv('data/district.csv')

# Show every column when a frame is displayed.
pd.set_option('display.max_columns', None)


## Loan preparation

### Save

In [None]:
# Prefix the generic loan column names with `loan_`.
loan_column_names = {
    'date': 'loan_date',
    'amount': 'loan_amount',
    'duration': 'loan_duration',
    'payments': 'loan_payments',
}
loans = loans.rename(columns=loan_column_names)

loans_path = 'data_processed/' + ('loan_dev' if DEV else 'loan_comp') + '.csv'
loans.to_csv(loans_path, index=False)
loans

## Account preparation

In [None]:
# The `date` column of the account table is discarded.
account = account.drop(columns=['date'])

### Save

In [None]:
# Prefix the account columns with `account_`, then persist the table.
account_column_names = {
    'frequency': 'account_frequency',
    'district_id': 'account_district_id',
}
account = account.rename(columns=account_column_names)
account.to_csv('data_processed/account.csv', index=False)
account

## Card preparation

There are only a few loans associated with a credit card

In [None]:
# Discard the issue date and fill a missing card type with the placeholder
# 'unkown' (spelling reproduced exactly, since it is a data value).
cards = cards.drop(columns=["issued"]).fillna({'type': 'unkown'})

### Save

In [None]:
# Prefix the card columns with `card_`, then persist the table.
card_column_names = {'type': 'card_type', 'disp_id': 'card_disp_id'}
cards = cards.rename(columns=card_column_names)

cards_path = 'data_processed/' + ('card_dev' if DEV else 'card_comp') + '.csv'
cards.to_csv(cards_path, index=False)

## Client preparation

### Extract gender and normalize dates
Each client's birth number is encoded as YYMMDD for male clients and as YY(MM+50)DD for female clients. The gender is therefore extracted into a new `gender` feature with the values `'Male'` and `'Female'`, and the birth numbers of female clients are rewritten as YYMMDD so that every client uses the same date format.

In [None]:
from datetime import datetime, timedelta

def extract_gender(date):
    """Split a birth number into a normalised birth date and a gender.

    The two middle digits hold the month for male clients and the month plus
    50 for female clients.

    Parameters
    ----------
    date : int or str
        Birth number in ``YYMMDD`` form (``YY(MM+50)DD`` for women). A value
        that lost its leading zeros, e.g. because the column was parsed as an
        integer, is left-padded back to six digits before it is sliced.

    Returns
    -------
    tuple of (str, str)
        The birth date as a six-character ``YYMMDD`` string and the gender,
        either ``'Female'`` or ``'Male'``.
    """
    # Pad first: str(50101) would otherwise be sliced as year '50', month '10'.
    digits = str(date).zfill(6)
    month = int(digits[2:4])
    if month > 12:
        month -= 50
        gender = 'Female'
    else:
        gender = 'Male'
    new_date = digits[:2] + str(month).zfill(2) + digits[4:]
    return (new_date, gender)

In [None]:
# Decode every birth number once, then split the (date, gender) pairs into
# the two columns.
decoded = [extract_gender(birth_number) for birth_number in client['birth_number']]

client['birth_number'] = [new_date for new_date, _ in decoded]
client['gender'] = [gender for _, gender in decoded]

### Save

In [None]:
# Prefix the district reference with `client_`, then persist the table.
client = client.rename(columns={'district_id': 'client_district_id'})
client.to_csv('data_processed/client.csv', index=False)

## Disposition preparation

### Remove disponent clients and add to the account's owner entry
Without this change, more than one client was associated with a single account_id. This was a problem when merging the Disposition and Loan tables, since it would produce several entries with the same loan_id and replicated target classes.

In [None]:
# Split the dispositions by role. `.copy()` makes each part an independent
# frame, so the column added below is not written to a slice of `disposition`.
disp_owners = disposition.loc[disposition['type'] == 'OWNER'].copy()
disp_disponent = disposition.loc[disposition['type'] == 'DISPONENT'].copy()

# Flag every owner whose account also has a disponent. `isin` performs the
# membership test in one vectorised pass instead of scanning the disponent
# array once per owner (and no longer shadows the builtin `id`).
has_disponent = disp_owners['account_id'].isin(disp_disponent['account_id'])

disp_owners['has_disponent'] = has_disponent
# Only owners remain, so the `type` column carries no information any more.
disp_owners = disp_owners.drop(columns='type')
display(disp_owners)

### Save

In [None]:
# Persist the owner-only dispositions, including the `has_disponent` flag.
disp_owners.to_csv('data_processed/disp.csv', index=False)

## District preparation

### Clean missing values

By analysing the box plots, we found that the values of "no. of commited crimes '95" and "unemploymant rate '95" are not symmetrically distributed and contain outliers, so the median would be a better replacement than the mean for the cells containing '?'. 
Even so, the missing values are replaced using a **linear regression**.

In [None]:

def replace_by_median(district):
    """Replace the '?' cells of the two '95 columns with column medians.

    The medians are computed over the rows in which neither of the two
    columns is '?'. ``district`` is modified in place; nothing is returned.
    """
    unemployment_col = "unemploymant rate '95"
    crimes_col = "no. of commited crimes '95"

    unemployment_missing = district[unemployment_col] == '?'
    crimes_missing = district[crimes_col] == '?'

    # Rows usable for the medians: both values present.
    complete_rows = district.loc[~unemployment_missing & ~crimes_missing]
    unemployment_median = complete_rows[unemployment_col].astype(float).median()
    crimes_median = complete_rows[crimes_col].astype(int).median()

    district.loc[crimes_missing, crimes_col] = crimes_median
    district.loc[unemployment_missing, unemployment_col] = unemployment_median

def replace_by_regression(district):
    """Replace the '?' cells of the two '95 columns with regression estimates.

    For each of "no. of commited crimes '95" and "unemploymant rate '95" a
    linear regression is fitted on the rows where both columns are known,
    using every other column as a predictor (``name`` and ``region`` are
    ordinal-encoded first). The fitted model then predicts the '?' cells.

    ``district`` is modified in place; nothing is returned. A column without
    any '?' is skipped, so calling the function again on already cleaned data
    is a no-op instead of failing on an empty prediction set.
    """
    crimes_col = "no. of commited crimes '95"
    unemployment_col = "unemploymant rate '95"
    # Target column -> type its known values are cast to before fitting.
    targets = [(crimes_col, int), (unemployment_col, float)]

    missing = {col: district[col] == '?' for col, _ in targets}
    if not any(mask.any() for mask in missing.values()):
        return

    from sklearn.preprocessing import OrdinalEncoder
    from sklearn.linear_model import LinearRegression

    # Work on a copy so the text columns of `district` keep their original values.
    district_copy = district.copy()
    ordinal_cols = ['name', 'region']
    oe = OrdinalEncoder()
    district_copy[ordinal_cols] = oe.set_params(encoded_missing_value=-1).fit_transform(district_copy[ordinal_cols])

    predictors = district_copy.drop([crimes_col, unemployment_col], axis=1)
    complete = ~missing[crimes_col] & ~missing[unemployment_col]
    X = predictors.loc[complete]

    for col, cast in targets:
        if not missing[col].any():
            continue
        lr = LinearRegression()
        lr.fit(X, district_copy.loc[complete, col].astype(cast))
        district.loc[missing[col], col] = lr.predict(predictors.loc[missing[col]])

# Fill the '?' cells of the two '95 columns in place using the regression strategy.
replace_by_regression(district)  

district.head()

### Data Cleaning

In [None]:
# values of "no. of municipalities with inhabitants 2000-9999" and "no. of municipalities with inhabitants >10000" are really small compared to the other ones
# The counts of municipalities with 2000-9999 and with >10000 inhabitants are
# very small compared to the other size classes, so they are merged into one column.
district['no. of municipalities with inhabitants > 2000'] = (
    district['no. of municipalities with inhabitants 2000-9999']
    + district['no. of municipalities with inhabitants >10000']
)

district = district.drop(columns=['no. of municipalities with inhabitants 2000-9999',
                                  'no. of municipalities with inhabitants >10000'])

# Move the merged column (currently last) to position 6.
cols = district.columns.tolist()
cols = cols[:6] + cols[-1:] + cols[6:-1]
# `.copy()` makes the reordered frame independent, so the column assignments
# in the following cell do not trigger SettingWithCopyWarning.
district = district[cols].copy()

district.head()

### Feature Engineering - Ratios of entrepreneurs, urban inhabitants and rate growth

In [None]:
# Create ratios and convert percentage to 0-1
# Express the per-1000 figure and the percentage as fractions.
district['ratio enterpreneurs'] = district['no. of enterpreneurs per 1000 inhabitants'].div(1000)
district['ratio of urban inhabitants'] = district['ratio of urban inhabitants'].div(100)

# Year-over-year change: plain difference for unemployment, difference per
# inhabitant for committed crimes.
unemployment_96 = pd.to_numeric(district["unemploymant rate '96 "])
unemployment_95 = pd.to_numeric(district["unemploymant rate '95"])
crimes_96 = pd.to_numeric(district["no. of commited crimes '96 "])
crimes_95 = pd.to_numeric(district["no. of commited crimes '95"])
inhabitants = pd.to_numeric(district["no. of inhabitants"])

district['unemploymant rate growth'] = unemployment_96 - unemployment_95
district['crime rate growth'] = (crimes_96 - crimes_95) / inhabitants

# The source columns of the derived features are no longer needed.
consumed_columns = [
    'name',
    'no. of enterpreneurs per 1000 inhabitants',
    'no. of inhabitants',
    "unemploymant rate '96 ",
    "no. of commited crimes '96 ",
    "unemploymant rate '95",
    "no. of commited crimes '95",
]
district.drop(columns=consumed_columns, inplace=True)

district.head()

### Save

In [None]:
# Persist the cleaned district table and display it.
district.to_csv('data_processed/district.csv', index=False)
district

## Transaction preparation

### Replace missing values

In [None]:
# Replace type 'withdrawal in cash' with 'withdrawal'
trans.replace({'type': 'withdrawal in cash'}, 'withdrawal', inplace=True)
# Replace k_symbol '' with nan
trans.replace({'k_symbol': ' '}, np.nan, inplace=True)
# Replace operation 'credit card withdrawal' with 'withdrawal in cash'
trans.replace({'operation': 'credit card withdrawal'}, 'withdrawal in cash', inplace=True)

In [None]:
# Replace NaN of operation with mode for each type
credit_operation_mode = trans[trans['type'] == 'credit']['operation'].mode()[0]
withdrawal_operation_mode = trans[trans['type'] == 'withdrawal']['operation'].mode()[0]
trans['operation'] = trans.apply(lambda x: credit_operation_mode if (x['type'] == 'credit' and pd.isnull(x['operation'])) else (withdrawal_operation_mode if (x['type'] == 'withdrawal' and pd.isnull(x['operation'])) else x['operation']), axis=1)

In [None]:
# Drop columns with more than 70% of missing values
# Keep only the columns with at least 30% non-missing values, i.e. drop the
# columns with more than 70% of missing values.
min_non_missing = len(trans) * 0.3
trans = trans.dropna(axis=1, thresh=min_non_missing)
trans

In [None]:
# Persist the transactions after the missing-value treatment.
trans.to_csv('data_processed/' + ('trans_dev_no_mv' if DEV else 'trans_comp_no_mv') + '.csv', index=False)

### Feature Engineering - Monthly Income and last balance for each account_id

In [None]:
# Turn the 'YYMMDD' strings into '19YY-MM-DD' and parse them into date objects.
raw_date = trans['date']
iso_date = '19' + raw_date.str[:2] + '-' + raw_date.str[2:4] + '-' + raw_date.str[4:]
trans['date'] = pd.to_datetime(iso_date).dt.date

In [None]:
# First and last transaction date of every account.
aux = trans.groupby(['account_id']).agg({'date': ['min', 'max']})
aux.columns = ['min_date', 'max_date']
aux = aux.reset_index(drop=False)

# Length of each account's transaction history in whole months (truncated).
# The divisor is an explicit average month of 30.436875 days (365.2425 / 12),
# the length the 'M' timedelta unit stood for. Dividing by
# np.timedelta64(1, 'M') itself is rejected by pandas 2.x because the unit is
# ambiguous.
average_month = pd.Timedelta(days=30.436875)
history_length = pd.to_datetime(aux['max_date']) - pd.to_datetime(aux['min_date'])
aux['date_diff'] = (history_length / average_month).astype(int)

trans = pd.merge(trans, aux[['account_id', 'date_diff']], on='account_id', how='left')
display(trans)

In [None]:
def agg_func(data):
    """Summarise the transactions of one account.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows of a single account with the columns ``type``, ``operation``,
        ``amount``, ``balance`` and ``date_diff`` (read from the first row).
        When a ``date`` column is present the rows are ordered by it before
        the last balance is taken.

    Returns
    -------
    pandas.Series
        ``monthly_credit`` and ``monthly_withdrawal`` (amount sums divided by
        ``abs(date_diff) + 1``, excluding 'collection from another bank' and
        'remittance to another bank' respectively), ``last_balance``,
        ``mean_balance``, ``min_balance`` and ``negative_balance`` (1 if the
        balance was ever below zero, else 0).
    """
    # Order chronologically so that "last" means the most recent transaction
    # regardless of the row order of the input. Mergesort is stable, so rows
    # sharing a date keep their original relative order.
    if 'date' in data.columns:
        data = data.sort_values('date', kind='mergesort')

    incoming = data[(data['type'] == 'credit') & (data['operation'] != 'collection from another bank')]
    outgoing = data[(data['type'] == 'withdrawal') & (data['operation'] != 'remittance to another bank')]

    # +1 keeps the divisor positive for histories shorter than one month.
    months = abs(data['date_diff'].iat[0]) + 1
    lowest_balance = data['balance'].min()

    return pd.Series({
        'monthly_credit': (incoming['amount'].sum() / months) if incoming.shape[0] != 0 else 0,
        'monthly_withdrawal': (outgoing['amount'].sum() / months) if outgoing.shape[0] != 0 else 0,
        'last_balance': data['balance'].iat[-1],
        'mean_balance': data['balance'].mean(),
        'min_balance': lowest_balance,
        'negative_balance': 1 if lowest_balance < 0 else 0,
    })
# One summary row per account, with account_id restored as a regular column.
trans_agg = trans.groupby(['account_id']).apply(agg_func).reset_index(drop=False)

# Net monthly cash flow, truncated to an integer; the two source columns are
# then removed.
monthly_net = trans_agg['monthly_credit'] - trans_agg['monthly_withdrawal']
trans_agg['monthly_diff'] = monthly_net.astype(int)
trans_agg = trans_agg.drop(columns=['monthly_credit', 'monthly_withdrawal'])
trans_agg

### Save

In [None]:
# Persist the per-account transaction summary.
trans_agg.to_csv('data_processed/' + ('trans_dev' if DEV else 'trans_comp') + '.csv', index=False)

## Data Integration

In [None]:
# Reload the tables written by the preparation sections above.
# The dtype overrides name the columns as they exist in the processed files:
# `loan_date` and `birth_number` hold YYMMDD codes and are kept as strings so
# that leading zeros survive the CSV round trip. The previous overrides
# (`date`, `issued`) referred to columns that were renamed or dropped before
# saving, so they had no effect.
loans = pd.read_csv('data_processed/' + ('loan_dev' if DEV else 'loan_comp') + '.csv', dtype={'loan_date': object})
account = pd.read_csv('data_processed/account.csv')
cards = pd.read_csv('data_processed/' + ('card_dev' if DEV else 'card_comp') + '.csv')
client = pd.read_csv('data_processed/client.csv', dtype={'birth_number': object})
disposition = pd.read_csv('data_processed/disp.csv')
district = pd.read_csv('data_processed/district.csv')
trans = pd.read_csv('data_processed/' + ('trans_dev' if DEV else 'trans_comp') + '.csv')

pd.set_option('display.max_columns', None)


### Join tables

In [None]:
# Build one row per loan by left-joining every processed table onto `loans`.
# The dispositions come from `disposition`, the owner-only table reloaded
# from data_processed/disp.csv in the previous cell, rather than from the
# in-memory `disp_owners` frame, so this section depends only on the saved files.
data = pd.merge(loans, account, on="account_id", how='left')
data = pd.merge(data, disposition, on="account_id", how='left')
data = pd.merge(data, client, on="client_id", how='left')
data = pd.merge(data, cards, left_on="disp_id", right_on="card_disp_id", how='left')
data = pd.merge(data, trans, on='account_id', how='left')
data = pd.merge(data, district, left_on="client_district_id", right_on="code", how='left')
display(data.head())

### Feature Engineering - Has credit card or not

In [None]:
# A loan has a card when the left join with `cards` produced a non-null card_id.
data['has_card'] = data['card_id'].notnull()
display(data.head())

### Drop columns with more than 70% of missing values

In [None]:
# Drop columns with more than 70% of missing values
status = data['status']
data.drop(['status'], axis=1, inplace=True)
data.dropna(thresh=len(data) * 0.3, axis=1, inplace=True)
data = pd.concat([status, data], axis=1)
display(data.head())

### Feature Engineering - Age of the client at the loan time

In [None]:
def get_datetime(date):
    """Convert a ``YYMMDD`` code of the 1900s into a ``datetime``.

    Parameters
    ----------
    date : int or str
        Six-digit ``YYMMDD`` code. Integer-like input that lost its leading
        zeros (e.g. ``50101`` for 1905-01-01) is padded back to six digits.

    Returns
    -------
    datetime.datetime
        Midnight of the encoded day, with 1900 added to the two-digit year.

    Raises
    ------
    ValueError
        If the value is not numeric or does not encode a valid calendar date.
    """
    # Normalise through int so '930705', 930705 and 930705.0 all become
    # '930705', then restore any leading zeros dropped by integer parsing.
    digits = str(int(date)).zfill(6)
    year = int(digits[0:2]) + 1900
    month = int(digits[2:4])
    day = int(digits[4:])
    return datetime(year, month, day)

def calc_age(birth_date, other_date):
    """Return the age in years at ``other_date``, rounded to the nearest integer.

    Both arguments are date codes understood by ``get_datetime``. The elapsed
    days are divided by 365.25 before rounding.
    """
    elapsed: timedelta = get_datetime(other_date) - get_datetime(birth_date)
    years = elapsed.days / 365.25
    return round(years)


In [None]:
# Age of the client when the loan was requested; the two date columns are
# dropped once the age has been derived from them.
data['age_on_loan_request'] = [
    calc_age(birth, loan_day)
    for birth, loan_day in zip(data['birth_number'], data['loan_date'])
]
data = data.drop(columns=['loan_date', 'birth_number'])
data

### Discretizations

##### Age on loan request

In [None]:
# Bucket the age into decades labelled '0-9' ... '80-89'.
# NOTE(review): with pd.cut's default right-closed bins an age of exactly 0
# would presumably fall outside the first bucket — confirm this is acceptable.
data['age_on_loan_request_disc'] = pd.cut(x=data['age_on_loan_request'], bins=[0, 9, 19, 29, 39, 49, 59, 69, 79, 89], labels=['0-9', '10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80-89'])
# Store the bucket labels as plain strings.
data['age_on_loan_request_disc'] = data['age_on_loan_request_disc'].astype(str)
data

#### Payments

In [None]:
from sklearn.preprocessing import KBinsDiscretizer

# Five ordinal bins for the loan payments, with the edges chosen by k-means.
payments_binner = KBinsDiscretizer(n_bins=5, strategy='kmeans', encode='ordinal')
payments_column = data['loan_payments'].to_numpy().reshape(-1, 1)
data['loan_payments_disc'] = payments_binner.fit_transform(payments_column)

#### Monthly Diff

In [None]:
from sklearn.preprocessing import KBinsDiscretizer

# Five ordinal bins for the monthly difference, with the edges chosen by k-means.
monthly_diff_binner = KBinsDiscretizer(n_bins=5, strategy='kmeans', encode='ordinal')
monthly_diff_column = data['monthly_diff'].to_numpy().reshape(-1, 1)
data['monthly_diff_disc'] = monthly_diff_binner.fit_transform(monthly_diff_column)

### Save Data

In [None]:
# Persist the integrated (not yet encoded) data set for the active mode.
complete_path = 'data_processed/complete/data.csv' if DEV else 'data_processed/complete/data_comp.csv'
data.to_csv(complete_path, index=False)

### Encoding

In [None]:
def encode_data(df, onehot_columns, other_columns):
    """One-hot encode some columns and ordinal-encode others.

    Column names given in either list but absent from ``df`` are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    onehot_columns : list of str
        Columns replaced by one indicator column per category, named
        ``<column>_<category>`` and appended after the remaining columns.
    other_columns : list of str
        Columns overwritten with ordinal codes; missing values become -1.

    Returns
    -------
    pandas.DataFrame
        A new frame with the encodings applied.
    """
    from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

    onehot_cols = [col for col in onehot_columns if col in df.keys()]
    if onehot_cols:
        ohe = OneHotEncoder()
        # The default sparse result is densified explicitly instead of passing
        # `sparse=False`, a keyword that newer scikit-learn releases no longer
        # accept; `get_feature_names_out` replaces the removed
        # `get_feature_names`.
        onehot_encoded = pd.DataFrame(
            ohe.fit_transform(df[onehot_cols]).toarray(),
            columns=ohe.get_feature_names_out(onehot_cols),
            # Reuse the input index so the concat aligns row by row even when
            # `df` does not carry a default RangeIndex.
            index=df.index,
        )
        df = pd.concat([df.drop(onehot_cols, axis=1), onehot_encoded], axis=1)
    else:
        # Nothing to one-hot encode: copy so the caller's frame stays untouched.
        df = df.copy()

    #from category_encoders import CatBoostEncoder
    #cat_cols = [col for col in other_columns if col in df.keys()]
    #cbe = CatBoostEncoder(return_df=True)
    #df[cat_cols] = cbe.fit_transform(df[cat_cols], get_target(df))

    ordinal_cols = [col for col in other_columns if col in df.keys()]
    if ordinal_cols:
        le = OrdinalEncoder()
        df[ordinal_cols] = le.set_params(encoded_missing_value=-1).fit_transform(df[ordinal_cols])

    return df

In [None]:
# categorical_columns = list(data.select_dtypes("object").columns)
data = encode_data(
    data,
    ['account_frequency'],
    ['region', 'age_on_loan_request_disc', 'gender'],
)

# Shorten the generated indicator column names.
indicator_names = {
    'account_frequency_monthly issuance': 'a_freq_monthly_issuance',
    'account_frequency_weekly issuance': 'a_freq_weekly_issuance',
    'account_frequency_issuance after transaction': 'a_freq_issuance_after_transaction',
}
data = data.rename(columns=indicator_names)

# Persist the encoded data set for the active mode.
encoded_path = 'data_processed/complete/enc_data.csv' if DEV else 'data_processed/complete/enc_data_comp.csv'
data.to_csv(encoded_path, index=False)

data