# Data Preparation

In [None]:
import pandas as pd
import numpy as np


# Load the raw tables. The 'date' columns are read as text (object dtype)
# because later cells slice them by character position (YYMMDD).
date_as_text = {'date': object}

client = pd.read_csv('data/client.csv')
district = pd.read_csv('data/district.csv')
disposition = pd.read_csv('data/disp.csv')
loan_dev = pd.read_csv('data/loan_dev.csv', dtype=date_as_text)
trans_dev = pd.read_csv('data/trans_dev.csv', dtype=date_as_text)


One row of the district table has the missing-value marker '?' in both the "no. of commited crimes '95" and "unemploymant rate '95" columns. We need to impute a value for it, using either the median or the mean of the column. To decide between the two, we analyse the symmetry and the outliers of each column.

In [None]:
import matplotlib.pyplot as plt

# Keep only the districts where both 1995 indicators are known
# ('?' is the missing-value marker in these columns).
unemployment_known = district["unemploymant rate '95"] != '?'
crimes_known = district["no. of commited crimes '95"] != '?'
district_clean = district.loc[unemployment_known & crimes_known]

# Box plot of the known unemployment rates, to judge symmetry and outliers.
unemployment_95 = district_clean["unemploymant rate '95"].astype('float')
plt.title("Box plot of unemploymant rate '95")
plt.boxplot(unemployment_95)


In [None]:
# Box plot of the known 1995 crime counts, to judge symmetry and outliers.
crimes_95 = district_clean["no. of commited crimes '95"].astype('int')
plt.title("Box plot of no. of commited crimes '95")
plt.boxplot(crimes_95)

By analysing the box plots, we found that the values of "no. of commited crimes '95" and "unemploymant rate '95" are not symmetrically distributed and contain outliers; therefore it is best to use the median instead of the mean to replace the cells containing '?'.

In [None]:

# Medians are computed on the rows where both 1995 indicators are known.
median_unemploymant = district_clean["unemploymant rate '95"].astype(float).median()
median_commited_crimes = district_clean["no. of commited crimes '95"].astype(int).median()

# Overwrite every '?' cell with the median of its column.
median_by_column = {
    "no. of commited crimes '95": median_commited_crimes,
    "unemploymant rate '95": median_unemploymant,
}
for column_name, column_median in median_by_column.items():
    district.loc[district[column_name] == '?', column_name] = column_median

# With the '?' markers gone, convert both columns from object to numeric dtype.
for column_name in ("unemploymant rate '95", "no. of commited crimes '95"):
    district[column_name] = pd.to_numeric(district[column_name])


# Add ratios
# district_df['ratio enterpreneurs'] = district_df['no. of enterpreneurs per 1000 inhabitants'] / 1000
# district_df['ratio of urban inhabitants'] = district_df['ratio of urban inhabitants'] / 100

# district.drop(columns=['name', 'no. of enterpreneurs per 1000 inhabitants', 'no. of inhabitants',
# "unemploymant rate '96", "no. of commited crimes '96",
# "unemploymant rate '95", "no. of commited crimes '95"], inplace=True)

### Extract gender and normalize dates
Since the birth date of each client is encoded as YYMMDD for male clients and as YY(MM+50)DD for female clients, the gender was extracted as a new feature and each female client's birth date was normalized to YYMMDD, the same format used for male clients.

The new `gender` feature is encoded as:
- Female = False
- Male = True

In [None]:
from datetime import datetime, timedelta

def extract_gender(date):
    """Split an encoded birth number into a plain date string and a gender flag.

    Characters 2-3 of the value hold the month; a month above 12 means the
    client is female and 50 is subtracted to recover the real month.

    Returns:
        tuple: (date string with the decoded, zero-padded month,
                True for male / False for female).
    """
    text = str(date)
    encoded_month = int(text[2:4])
    is_male = encoded_month <= 12
    month = encoded_month if is_male else encoded_month - 50
    return (text[:2] + str(month).zfill(2) + text[4:], is_male)

In [None]:
# Decode every birth number once, then split the (date, gender) pairs
# into the two columns stored on the client table.
decoded = [extract_gender(birth_number) for birth_number in client['birth_number']]

dates = [pair[0] for pair in decoded]
genders = [pair[1] for pair in decoded]

client['birth_number'] = dates
client['gender'] = genders

### Remove disponent clients and flag their presence on the account owner's entry
Without this change, more than one client was associated with a single account_id. This was a problem when merging the Disposition and Loan tables, since it would produce several entries with the same loan_id and replicate the target class.

In [None]:
# Split the dispositions by role. The .copy() makes disp_owners an independent
# frame, so adding a column below does not raise pandas' SettingWithCopyWarning.
disp_owners = disposition.query("type == 'OWNER'").copy()
disp_disponent = disposition.query("type == 'DISPONENT'")

# Flag the owners whose account also has a disponent. Series.isin compares against
# the account_id *values*; the `in` operator on a Series tests its index labels
# instead, which would give the wrong flags here.
has_disponent = disp_owners['account_id'].isin(disp_disponent['account_id']).tolist()

disp_owners['has_disponent'] = has_disponent
# Every remaining row is an owner, so the 'type' column carries no information.
disp_owners = disp_owners.drop('type', axis=1)
print(disp_owners)

In [None]:
# Attach each owner's disposition (and therefore account_id) to the client record,
# then join the loans to those records on the shared account_id.
client_accounts = client.merge(disp_owners, how='inner', left_on="client_id", right_on="client_id")
data = loan_dev.merge(client_accounts, how='inner', left_on="account_id", right_on="account_id")
data

### Combine the loan date and the client's birth date into the client's age at the time of the loan

In [None]:
def get_datetime(date):
    """Convert a YYMMDD value into a ``datetime``, reading the year as 19YY."""
    digits = str(date)
    yy, mm, dd = digits[0:2], digits[2:4], digits[4:]
    return datetime(1900 + int(yy), int(mm), int(dd))

def calc_age(birth_date, other_date):
    """Return the age in years at ``other_date``, rounded to the nearest integer.

    Both arguments are YYMMDD values parsed with ``get_datetime``; a year is
    counted as 365.25 days.
    """
    elapsed: timedelta = get_datetime(other_date) - get_datetime(birth_date)
    years = elapsed.days / 365.25
    return round(years)


In [None]:
# Age of the client when the loan was requested, computed from the two YYMMDD
# columns, which are dropped once the age has been derived.
data['age_on_loan_request'] = [
    calc_age(birth, loan_date)
    for birth, loan_date in zip(data['birth_number'], data['date'])
]
data.drop(columns=['date', 'birth_number'], inplace=True)
data

### Prepare Transaction table

In [None]:
# Keep only the columns that are filled in for at least 30% of the rows,
# i.e. drop the columns with more than 70% missing values.
min_filled_rows = len(trans_dev) * 0.3
trans_dev.dropna(axis=1, thresh=min_filled_rows, inplace=True)
trans_dev

In [None]:
# Replace type 'withdrawal in cash' with 'withdrawal'
trans_dev.replace({'type': 'withdrawal in cash'}, 'withdrawal', inplace=True)

# Most frequent operation for each transaction type.
credit_operation_mode = trans_dev[trans_dev['type'] == 'credit']['operation'].mode()[0]
withdrawal_operation_mode = trans_dev[trans_dev['type'] == 'withdrawal']['operation'].mode()[0]

# Fill each missing operation with the mode of the row's type. Mapping the type
# column yields NaN for any other type, so those rows keep their missing value.
# This is vectorized: no Python-level function call per transaction row.
operation_fill = trans_dev['type'].map({
    'credit': credit_operation_mode,
    'withdrawal': withdrawal_operation_mode,
})
trans_dev['operation'] = trans_dev['operation'].fillna(operation_fill)


In [None]:
# Work on copies so the date parsing below leaves loan_dev / trans_dev untouched.
loan_clone = loan_dev.copy()
trans_clone = trans_dev.copy()

# Dates are stored as YYMMDD text: prefix the century and parse with an explicit format.
loan_clone['date'] = pd.to_datetime('19' + loan_clone['date'], format='%Y%m%d')
trans_clone['date'] = pd.to_datetime('19' + trans_clone['date'], format='%Y%m%d')

# One row per (loan, transaction on the loan's account). Columns present in both
# tables get the _x (loan) and _y (transaction) suffixes.
loan_trans = pd.merge(loan_clone, trans_clone, left_on='account_id', right_on='account_id', how='left')

# Signed distance of each transaction from the loan date, in average-length months
# (365.2425 / 12 days = 30 days 10:29:06). A fixed-length Timedelta is used because
# recent pandas versions reject np.timedelta64(1, 'M') as an ambiguous unit.
average_month = pd.Timedelta(days=30, hours=10, minutes=29, seconds=6)
loan_trans['date_diff'] = (loan_trans['date_y'] - loan_trans['date_x']) / average_month
# NOTE(review): astype(int) raises if any loan has no transactions (NaN after the
# left merge) — confirm every loan account has at least one transaction.
loan_trans['date_diff'] = loan_trans['date_diff'].astype(int)

def agg_func(data):
    """Summarise the transactions attached to one loan.

    Args:
        data: rows of the loan/transaction merge for a single loan. Reads the
            'type', 'operation', 'amount_y', 'date_diff', 'date_y' and
            'balance' columns.

    Returns:
        pd.Series with 'mensal_credit' and 'mensal_withdrawal' (total amount
        divided by the number of months spanned, 0 when no row matches) and
        'last_balance' (balance of the chronologically last row).
    """
    # The first/last lookups below are positional, so the rows must be in
    # chronological order. A stable sort keeps the incoming order of
    # transactions that share the same date.
    data = data.sort_values('date_y', kind='mergesort')

    def per_month(rows):
        # Total amount over the inclusive month span between first and last row.
        if rows.shape[0] == 0:
            return 0
        months = rows['date_diff'].iat[-1] - rows['date_diff'].iat[0] + 1
        return rows['amount_y'].sum() / months

    credits = data[(data['type'] == 'credit') & (data['operation'] != 'collection from another bank')]
    withdrawals = data[((data['type'] == 'withdrawal') | (data['type'] == 'withdrawal in cash')) & (data['operation'] != 'remittance to another bank')]
    return pd.Series({
        'mensal_credit': per_month(credits),
        'mensal_withdrawal': per_month(withdrawals),
        'last_balance': data['balance'].iat[-1],
    })
# One row of aggregated transaction features per loan, with loan_id restored
# as a regular column.
per_loan = loan_trans.groupby(['loan_id']).apply(agg_func)
res = per_loan.reset_index()

# Keep only the net monthly flow (credits minus withdrawals), truncated to an integer.
net_monthly = res['mensal_credit'] - res['mensal_withdrawal']
res = res.assign(monthly_diff=net_monthly.astype(int))
res = res.drop(columns=['mensal_credit', 'mensal_withdrawal'])

res

In [None]:
# Attach the per-loan transaction features; the left join keeps every row of data.
data = data.merge(res, how='left', left_on='loan_id', right_on='loan_id')
data

### Save Data

In [None]:
# Write the prepared table to disk without the DataFrame index.
output_path = 'data_processed/data.csv'
data.to_csv(output_path, index=False)