<a href="https://colab.research.google.com/github/SiddharthaPand4/bad_loan_prediction/blob/main/Loan_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Extract the dataset archive; later cells read their inputs from
# ./senior_ds_test/data/train/.
!unzip senior_ds_test.zip

In [None]:
import pandas as pd
import numpy as np
import json
import tensorflow as tf

In [None]:
# Read the per-applicant flag table. Later cells access its `uid`,
# `NAME_CONTRACT_TYPE` and `TARGET` columns.
train_flag = pd.read_csv("./senior_ds_test/data/train/train_flag.csv")

In [None]:
# Preview the first few rows of the flag table.
train_flag.head()

In [None]:
# List the distinct contract types present in the flag table.
train_flag.NAME_CONTRACT_TYPE.unique()

In [None]:
# Convert the DataFrame into a list of plain Python rows (one list per CSV row)
# so the rows can be indexed positionally when building the per-applicant dict.
train_flag_list = train_flag.values.tolist()

In [None]:
# Index every applicant by the first column of the flag table; the second and
# third columns are stored as that applicant's contract type and target.
# If the same key appears twice, the later row wins.
applicant_data = {
    record[0]: {"contract_type": record[1], "target": record[2]}
    for record in train_flag_list
}

print(len(applicant_data))

In [None]:
# Quick summary of the flag table: overall shape, then descriptive statistics
# for the contract type, uid and target columns.
print(train_flag.shape)
print(train_flag.NAME_CONTRACT_TYPE.describe())
print(train_flag.uid.describe())
print(train_flag.TARGET.describe())

In [None]:
# Load the raw account history. The context manager guarantees the file handle
# is closed as soon as parsing finishes (the bare open() never closed it).
# Later cells index `data` as a list of per-applicant lists of record dicts.
with open("./senior_ds_test/data/train/accounts_data_train.json") as json_file:
    data = json.load(json_file)

In [None]:
# Sanity checks on the raw JSON:
#   * every record inside one inner list carries the same uid, and
#   * no uid shows up in more than one inner list.

print(len(data))
uid_set = set()
for person_records in data:
    # The first record's uid must not have been seen in an earlier list.
    assert person_records[0]["uid"] not in uid_set
    uids_in_group = set()
    for record in person_records:
        record_uid = record["uid"]
        uids_in_group.add(record_uid)
        uid_set.add(record_uid)
        # More than one distinct uid here means the list mixes applicants.
        assert len(uids_in_group) == 1

# One distinct uid per inner list overall.
assert len(uid_set) == len(data)
print(len(uid_set))

In [None]:
# Inspect the first applicant's list of account records.
data[0]

In [None]:
def preprocess_account_data(account_data, res):
    """Aggregate each applicant's loan records into summary features.

    Parameters
    ----------
    account_data : iterable of lists of dicts
        One inner list per applicant. Every record is read for the keys
        "uid", "loan_amount", "amount_overdue", "closed_date",
        "credit_type" and "payment_hist_string". The uid of the first
        record identifies the applicant. Empty inner lists are skipped.
    res : dict
        Mapping uid -> feature dict. It must already contain an entry for
        every uid found in ``account_data``. It is updated in place.

    Returns
    -------
    dict
        The same ``res`` object, where each processed applicant has gained
        the keys "total_loan_amount", "total_amount_overdue",
        "loans_running", "total_days_overdue" and "credit_type_count".

    Raises
    ------
    KeyError
        If a uid from ``account_data`` has no entry in ``res``.
    AssertionError
        If a payment history string's length is not a multiple of 3.
    ValueError
        If a 3-character chunk of the payment history is not an integer.
    """
    for person_data in account_data:
        if not person_data:
            # No records means no uid to attach features to; previously this
            # raised IndexError on person_data[0].
            continue

        features = res[person_data[0]["uid"]]
        features["total_loan_amount"] = 0
        features["total_amount_overdue"] = 0
        features["loans_running"] = 0
        features["total_days_overdue"] = 0
        credit_type_count = {}
        features["credit_type_count"] = credit_type_count

        for loan_data in person_data:
            features["total_loan_amount"] += loan_data["loan_amount"]
            features["total_amount_overdue"] += loan_data["amount_overdue"]

            # A loan with a falsy closed_date counts as still running.
            if not loan_data["closed_date"]:
                features["loans_running"] += 1

            credit_type = loan_data["credit_type"]
            credit_type_count[credit_type] = credit_type_count.get(credit_type, 0) + 1

            # The payment history is consumed as consecutive 3-character
            # integer fields whose values are summed.
            payment_hist = loan_data["payment_hist_string"]
            assert len(payment_hist) % 3 == 0
            for i in range(0, len(payment_hist), 3):
                features["total_days_overdue"] += int(payment_hist[i: i + 3])

    return res

In [None]:
# Add the account aggregates to every applicant that has account records.
# preprocess_account_data returns the dict it was given, so `account_data`
# and `applicant_data` refer to the same object after this cell.
account_data = preprocess_account_data(data, applicant_data)
print(len(account_data))

In [None]:
# Spot-check one applicant's entry (the one at position 8 in dict key order).
account_data[list(account_data.keys())[8]]

In [None]:
# Load the raw enquiry history, replacing the account data held in `data`.
# The context manager guarantees the file handle is closed once parsing
# finishes (the bare open() never closed it).
with open("./senior_ds_test/data/train/enquiry_data_train.json") as json_file:
    data = json.load(json_file)

In [None]:
# Sanity checks on the enquiry JSON:
#   * every enquiry inside one inner list carries the same uid, and
#   * no uid shows up in more than one inner list.

print(len(data))
uid_set = set()
for person_enquiries in data:
    # The first enquiry's uid must not have been seen in an earlier list.
    assert person_enquiries[0]["uid"] not in uid_set
    uids_in_group = set()
    for enquiry in person_enquiries:
        enquiry_uid = enquiry["uid"]
        uids_in_group.add(enquiry_uid)
        uid_set.add(enquiry_uid)
        # More than one distinct uid here means the list mixes applicants.
        assert len(uids_in_group) == 1

# One distinct uid per inner list overall.
assert len(uid_set) == len(data)
print(len(uid_set))

In [None]:
# Inspect the enquiry records of the applicant at index 10.
data[10]

In [None]:
def preprocess_enquiry_data(enquiry_data, account_data_dict):
    """Aggregate each applicant's enquiries into summary features.

    Parameters
    ----------
    enquiry_data : iterable of lists of dicts
        One inner list per applicant. Every enquiry is read for the keys
        "uid", "enquiry_amt" and "enquiry_type". The uid of the first
        enquiry identifies the applicant. Empty inner lists are skipped.
    account_data_dict : dict
        Mapping uid -> feature dict. It must already contain an entry for
        every uid found in ``enquiry_data``. It is updated in place.

    Returns
    -------
    dict
        The same ``account_data_dict`` object, where each processed applicant
        has gained "total_enquiry_amount" and "enquiry_type_count". An
        applicant whose entry has no "credit_type_count" key also receives
        zero/empty defaults for the five account-level features.

    Raises
    ------
    KeyError
        If a uid from ``enquiry_data`` has no entry in ``account_data_dict``.
    """
    for person_data in enquiry_data:
        if not person_data:
            # No enquiries means no uid to attach features to; previously
            # this raised IndexError on person_data[0].
            continue

        features = account_data_dict[person_data[0]["uid"]]

        # Applicants without account aggregates get neutral defaults so every
        # processed entry ends up with the same set of feature keys.
        if "credit_type_count" not in features:
            features["total_loan_amount"] = 0
            features["total_amount_overdue"] = 0
            features["loans_running"] = 0
            features["total_days_overdue"] = 0
            features["credit_type_count"] = {}

        features["total_enquiry_amount"] = 0
        enquiry_type_count = {}
        features["enquiry_type_count"] = enquiry_type_count
        for enquiry in person_data:
            features["total_enquiry_amount"] += enquiry["enquiry_amt"]

            enquiry_type = enquiry["enquiry_type"]
            enquiry_type_count[enquiry_type] = enquiry_type_count.get(enquiry_type, 0) + 1

    return account_data_dict

In [None]:
# Add the enquiry aggregates. preprocess_enquiry_data returns the dict it was
# given, so `applicant_data` is rebound to the same, now fully enriched, dict.
applicant_data = preprocess_enquiry_data(data, account_data)
print(len(applicant_data))

In [None]:
# Spot-check the first applicant's entry (first key in dict order).
applicant_data[list(applicant_data.keys())[0]]

In [None]:
# Collect every enquiry type that occurs for any applicant.
enquiry_types = {
    enquiry_type
    for details in applicant_data.values()
    for enquiry_type in details["enquiry_type_count"]
}

print(len(enquiry_types))

In [None]:
# Map each enquiry type to a fixed column offset. Set iteration order for
# strings changes between interpreter runs (hash randomization), so the types
# are sorted first to keep the feature layout the same on every run.
# key=str keeps the sort well-defined even if a non-string value is present.
enquiry_type_index = {e: i for i, e in enumerate(sorted(enquiry_types, key=str))}
enquiry_type_index

In [None]:
# Collect every credit type that occurs for any applicant.
credit_types = {
    credit_type
    for details in applicant_data.values()
    for credit_type in details["credit_type_count"]
}

print(len(credit_types))

In [None]:
# Map each credit type to a fixed column offset. Set iteration order for
# strings changes between interpreter runs (hash randomization), so the types
# are sorted first to keep the feature layout the same on every run.
# key=str keeps the sort well-defined even if a non-string value is present.
credit_type_index = {e: i for i, e in enumerate(sorted(credit_types, key=str))}
credit_type_index

In [None]:
# Re-inspect the first applicant's entry before flattening it into a row.
applicant_data[list(applicant_data.keys())[0]]

In [None]:
# Flatten every applicant's feature dict into one numeric row laid out as:
#   5 aggregate totals | cash-loan flag | per-credit-type counts |
#   per-enquiry-type counts | target
SCALAR_KEYS = (
    "total_loan_amount",
    "total_amount_overdue",
    "loans_running",
    "total_days_overdue",
    "total_enquiry_amount",
)
n_credit_types = len(credit_types)
n_enquiry_types = len(enquiry_types)

train_data = []
for details in applicant_data.values():
    scalar_features = [details[key] for key in SCALAR_KEYS]
    is_cash_loan = details["contract_type"] == "Cash loans"
    label = details["target"]

    # Count vectors: one slot per known type, zero where the applicant has none.
    credit_counts = [0] * n_credit_types
    for type_name, occurrences in details["credit_type_count"].items():
        credit_counts[credit_type_index[type_name]] = occurrences

    enquiry_counts = [0] * n_enquiry_types
    for type_name, occurrences in details["enquiry_type_count"].items():
        enquiry_counts[enquiry_type_index[type_name]] = occurrences

    train_data.append(
        [*scalar_features, is_cash_loan, *credit_counts, *enquiry_counts, label]
    )

In [None]:
# Number of rows and number of columns per row (features plus the target).
print(len(train_data), len(train_data[0]))

In [None]:
# Convert the row lists to a NumPy array. Note that this rebinds `data`,
# which previously held the parsed enquiry JSON.
data = np.array(train_data)

In [None]:
# Drop every row that contains at least one NaN in any column.
row_has_nan = np.isnan(data).any(axis=1)
data = data[~row_has_nan]

print(data.shape)
print(type(data))

In [None]:
# Rows whose target equals 1. The target is the last value appended to every
# row, so it is addressed as column -1 instead of a hard-coded position that
# only holds for one particular number of credit/enquiry types.
pos_data = data[data[:, -1] == 1]
pos_data.shape

In [None]:
# Rows whose target (last column) equals 0.
non_pos_data = data[data[:, -1] == 0]
print(non_pos_data.shape)

# Undersample the target-0 rows to match the number of target-1 rows.
# Sampling WITHOUT replacement avoids duplicated rows, which could otherwise
# end up on both sides of the train/val/test split. Replacement is used only
# when there are not enough target-0 rows to draw from.
n_pos = pos_data.shape[0]
n_non_pos = non_pos_data.shape[0]
idx = np.random.choice(n_non_pos, size=n_pos, replace=n_pos > n_non_pos)
neg_data = non_pos_data[idx, :]
print(neg_data.shape)

In [None]:
# Stack the target-1 rows and the sampled target-0 rows into one array,
# rebinding `data` to the balanced set.
data = np.concatenate((pos_data, neg_data))
# Shuffle the rows in place so the slices taken later mix both classes.
# NOTE(review): no random seed is set in the visible cells, so the order
# differs from run to run.
np.random.shuffle(data)

print(data.shape)
print(type(data))

In [None]:
# Slice the shuffled rows into train (first 80%), validation (next 10%) and
# test (remaining rows).
split = 0.8
n_rows = data.shape[0]
train_end = int(split * n_rows)
val_end = int(0.9 * n_rows)

train = data[:train_end]
val = data[train_end:val_end]
test = data[val_end:]
print(data.shape, train.shape, val.shape, test.shape)

In [None]:
# Separate features from the target in each split. The target is the last
# column of every row, so it is addressed relative to the end of the row
# rather than by a hard-coded index that depends on how many credit/enquiry
# types the data happens to contain.
train_x, train_y = train[:, :-1], train[:, -1]
print(train_x.shape, train_y.shape)
val_x, val_y = val[:, :-1], val[:, -1]
print(val_x.shape, val_y.shape)
test_x, test_y = test[:, :-1], test[:, -1]
print(test_x.shape, test_y.shape)

In [None]:
# Per-feature mean and standard deviation, computed on the training split only.
means = train_x.mean(axis=0)
stds = train_x.std(axis=0)
print(means.shape, stds.shape)

In [None]:
# Standardize every split with the training statistics. The small constant
# added to the denominator prevents division by zero for constant features.
train_x, val_x, test_x = (
    (features - means) / (stds + 1e-9)
    for features in (train_x, val_x, test_x)
)

In [None]:
# Number of training rows whose target is 0 (class-balance check).
(train_y == 0).sum()

In [None]:
# Feed-forward binary classifier: two ReLU hidden layers (128 and 32 units)
# followed by a single sigmoid output unit.
hidden_layers = [
    tf.keras.layers.Dense(units=width, activation="relu") for width in (128, 32)
]
output_layer = tf.keras.layers.Dense(units=1, activation="sigmoid")
model = tf.keras.Sequential(hidden_layers + [output_layer])

# Only accuracy is tracked; tf.keras.metrics.AUC() could be added to `metrics`.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=["accuracy"],
)

In [None]:
# Train for 50 epochs on the standardized training split, passing the
# validation split so Keras reports validation loss/accuracy as well.
model.fit(train_x, train_y, epochs=50, validation_data=(val_x, val_y))