<a href="https://colab.research.google.com/github/SiddharthaPand4/bad_loan_prediction/blob/main/Loan_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!unzip senior_ds_test.zip

In [None]:
import pandas as pd
import numpy as np
import json
import tensorflow as tf

In [None]:
def preprocess_account_data(account_data, res):
    """Aggregate each applicant's loan records into summary fields, in place.

    Args:
        account_data: iterable of per-applicant lists. Each inner list holds
            loan dicts with the keys "uid", "loan_amount", "amount_overdue",
            "closed_date", "credit_type" and "payment_hist_string".
        res: dict keyed by uid whose values are dicts. The uid is read from the
            first loan of each inner list and must already be a key of ``res``
            (otherwise a KeyError is raised).

    Returns:
        The same ``res`` object. For every applicant that has at least one loan
        record, ``res[uid]`` gains:
            total_loan_amount    -- sum of "loan_amount"
            total_amount_overdue -- sum of "amount_overdue"
            loans_running        -- number of loans with a falsy "closed_date"
            total_days_overdue   -- sum of all 3-character chunks of
                                    "payment_hist_string", parsed as integers
            credit_type_count    -- {credit_type: number of loans}

    Raises:
        AssertionError: a payment history string is not a multiple of 3 long.
        ValueError: a 3-character chunk of a payment history is not an integer.
    """
    for person_data in account_data:
        # An applicant with no loan records has no uid to read; skip instead of
        # failing on person_data[0].
        if not person_data:
            continue

        summary = res[person_data[0]["uid"]]
        summary["total_loan_amount"] = 0
        summary["total_amount_overdue"] = 0
        summary["loans_running"] = 0
        summary["total_days_overdue"] = 0
        summary["credit_type_count"] = {}
        credit_counts = summary["credit_type_count"]

        for loan_data in person_data:
            summary["total_loan_amount"] += loan_data["loan_amount"]
            summary["total_amount_overdue"] += loan_data["amount_overdue"]

            # A loan without a closing date is counted as still running.
            if not loan_data["closed_date"]:
                summary["loans_running"] += 1

            credit_type = loan_data["credit_type"]
            credit_counts[credit_type] = credit_counts.get(credit_type, 0) + 1

            # The history is consumed as consecutive fixed-width (3 character)
            # integer fields.
            payment_hist = loan_data["payment_hist_string"]
            assert len(payment_hist) % 3 == 0, (
                "payment_hist_string length must be a multiple of 3"
            )
            summary["total_days_overdue"] += sum(
                int(payment_hist[start:start + 3])
                for start in range(0, len(payment_hist), 3)
            )

    return res

In [None]:
def preprocess_enquiry_data(enquiry_data, account_data_dict):
    """Add per-applicant enquiry aggregates to ``account_data_dict``, in place.

    Args:
        enquiry_data: iterable of per-applicant lists. Each inner list holds
            enquiry dicts with the keys "uid", "enquiry_amt" and
            "enquiry_type".
        account_data_dict: dict keyed by uid whose values are dicts. The uid is
            read from the first enquiry of each inner list and must already be
            a key (otherwise a KeyError is raised).

    Returns:
        The same ``account_data_dict`` object. For every applicant that has at
        least one enquiry, the entry gains:
            total_enquiry_amount -- sum of "enquiry_amt"
            enquiry_type_count   -- {enquiry_type: number of enquiries}
        If the entry has no "credit_type_count" yet, the five account fields
        (total_loan_amount, total_amount_overdue, loans_running,
        total_days_overdue, credit_type_count) are initialised to zero / empty.
    """
    for person_data in enquiry_data:
        # An applicant with no enquiries has no uid to read; skip instead of
        # failing on person_data[0].
        if not person_data:
            continue

        summary = account_data_dict[person_data[0]["uid"]]

        # No account aggregates were stored for this applicant: zero-fill them
        # so every applicant with enquiries exposes the same set of fields.
        if "credit_type_count" not in summary:
            summary["total_loan_amount"] = 0
            summary["total_amount_overdue"] = 0
            summary["loans_running"] = 0
            summary["total_days_overdue"] = 0
            summary["credit_type_count"] = {}

        summary["total_enquiry_amount"] = 0
        summary["enquiry_type_count"] = {}
        enquiry_counts = summary["enquiry_type_count"]

        for enquiry in person_data:
            summary["total_enquiry_amount"] += enquiry["enquiry_amt"]
            enquiry_type = enquiry["enquiry_type"]
            enquiry_counts[enquiry_type] = enquiry_counts.get(enquiry_type, 0) + 1

    return account_data_dict

In [None]:
def _read_json(path):
    """Parse the JSON file at ``path`` and close the handle afterwards."""
    with open(path) as json_file:
        return json.load(json_file)


def _index_types(applicant_data, field):
    """Map every category found under ``details[field]`` to a column offset."""
    seen = set()
    for details in applicant_data.values():
        seen.update(details.get(field, {}))
    # Sort (by string form, so mixed key types cannot break the sort) to keep
    # the column layout identical from run to run; bare set order is not stable.
    return {name: offset for offset, name in enumerate(sorted(seen, key=str))}


def _count_vector(counts, index):
    """Spread a ``{category: count}`` dict over a dense list laid out by ``index``."""
    vector = [0] * len(index)
    for name, count in counts.items():
        # A category that is absent from the index has no column to land in.
        if name in index:
            vector[index[name]] = count
    return vector


def load_data(mode="train", enquiry_index=None, credit_index=None):
    """Build the numeric feature matrix for one data split.

    Reads ``./senior_ds_test/data/{mode}/{mode}_flag.csv`` plus the matching
    ``accounts_data_{mode}.json`` and ``enquiry_data_{mode}.json`` files,
    aggregates them per applicant and lays the result out as one row each:

        total_loan_amount, total_amount_overdue, loans_running,
        total_days_overdue, total_enquiry_amount, is-"Cash loans" flag,
        one count per credit type, one count per enquiry type
        [, target  -- only when mode == "train"]

    Args:
        mode: split name; "train" additionally reads the third CSV column as
            the target and appends it as the last column.
        enquiry_index: ``{enquiry_type: column offset}`` used to lay out the
            enquiry counts when ``mode`` is not "train". ``None`` means empty.
        credit_index: ``{credit_type: column offset}`` used to lay out the
            credit counts when ``mode`` is not "train". ``None`` means empty.

    Returns:
        Tuple ``(arr_data, enquiry_type_index, credit_type_index, uid_arr)``:
        the feature matrix with rows containing NaN removed, the category
        indexes built from *this* split's data, and the uids of the kept rows
        (empty array in "train" mode, where uids are not collected).
    """
    enquiry_index = {} if enquiry_index is None else enquiry_index
    credit_index = {} if credit_index is None else credit_index
    is_train = mode == "train"
    data_dir = f"./senior_ds_test/data/{mode}"

    flags = pd.read_csv(f"{data_dir}/{mode}_flag.csv")
    applicant_data = {}
    for row in flags.values.tolist():
        # Columns are read positionally: uid, contract type, then the target.
        applicant_data[row[0]] = {"contract_type": row[1]}
        if is_train:
            applicant_data[row[0]]["target"] = row[2]
    print(len(applicant_data))

    # Both preprocessors update the per-applicant dicts in place and hand the
    # same mapping back.
    applicant_data = preprocess_account_data(
        _read_json(f"{data_dir}/accounts_data_{mode}.json"), applicant_data
    )
    print(len(applicant_data))
    applicant_data = preprocess_enquiry_data(
        _read_json(f"{data_dir}/enquiry_data_{mode}.json"), applicant_data
    )
    print(len(applicant_data))

    enquiry_type_index = _index_types(applicant_data, "enquiry_type_count")
    print(len(enquiry_type_index))
    print(enquiry_type_index)
    credit_type_index = _index_types(applicant_data, "credit_type_count")
    print(len(credit_type_index))
    print(credit_type_index)

    # Training lays columns out by the indexes just built; any other mode must
    # reuse the caller-supplied layout so its columns line up with training.
    active_credit_index = credit_type_index if is_train else credit_index
    active_enquiry_index = enquiry_type_index if is_train else enquiry_index

    numeric_fields = (
        "total_loan_amount",
        "total_amount_overdue",
        "loans_running",
        "total_days_overdue",
        "total_enquiry_amount",
    )

    train_data = []
    uid_list = []
    for uid, details in applicant_data.items():
        # Applicants missing from the account and/or enquiry files never got
        # these fields; treat them as zero / empty rather than raising KeyError.
        row = [details.get(field, 0) for field in numeric_fields]
        row.append(details["contract_type"] == "Cash loans")
        row.extend(
            _count_vector(details.get("credit_type_count", {}), active_credit_index)
        )
        row.extend(
            _count_vector(details.get("enquiry_type_count", {}), active_enquiry_index)
        )
        if is_train:
            row.append(details["target"])
        else:
            uid_list.append(uid)
        train_data.append(row)

    arr_data = np.array(train_data)
    keep = ~np.isnan(arr_data).any(axis=1)
    arr_data = arr_data[keep]
    print(type(arr_data))
    print(arr_data.shape)

    uid_arr = np.array(uid_list)
    if not is_train:
        # Drop the same rows from the uids so row i of the matrix still
        # belongs to uid i.
        uid_arr = uid_arr[keep]
    return arr_data, enquiry_type_index, credit_type_index, uid_arr

In [None]:
# Build the labelled training matrix first; its category indexes define the
# column layout that the unlabeled test matrix must reuse.
data, enquiry_type_index, credit_type_index, uids = load_data(mode="train")
print("\ntest data starts now\n")
test_data, _, _, test_uids = load_data(
    mode="test",
    enquiry_index=enquiry_type_index,
    credit_index=credit_type_index,
)

In [None]:
# Shape of the unlabeled test feature matrix (no target column is appended
# outside "train" mode).
print(test_data.shape)

In [None]:
# Rows whose label (stored in the last column) equals 1.
is_positive = data[:, -1] == 1
pos_data = data[is_positive]
pos_data.shape

In [None]:
# Rows whose label equals 0. The label is addressed as the last column (as in
# the positive-class cell) instead of a fixed position, so this keeps working
# when the number of category columns changes.
non_pos_data = data[data[:, -1] == 0]
print(non_pos_data.shape)

# Undersample the label-0 rows down to the number of label-1 rows. Draw without
# replacement whenever enough rows exist, so the same row cannot appear twice
# (and later end up on both sides of the train/val/test split). A seeded
# generator makes the sample reproducible on re-run.
n_pos = pos_data.shape[0]
rng = np.random.default_rng(42)
idx = rng.choice(
    non_pos_data.shape[0],
    size=n_pos,
    replace=non_pos_data.shape[0] < n_pos,
)
neg_data = non_pos_data[idx, :]
print(neg_data.shape)

In [None]:
# Stack the two classes and shuffle the rows in place.
# NOTE: this rebinds `data` to the balanced subset, so the class-selection
# cells above must not be re-run after this one without reloading `data`.
data = np.concatenate([pos_data, neg_data], axis=0)
np.random.shuffle(data)

print(data.shape)
print(type(data))

In [None]:
# Contiguous split of the shuffled rows: first 80% train, next 10% validation,
# final 10% held-out test.
split = 0.8
n_rows = data.shape[0]
train_end = int(split * n_rows)
val_end = int(0.9 * n_rows)

train = data[:train_end]
val = data[train_end:val_end]
test = data[val_end:]
print(data.shape, train.shape, val.shape, test.shape)

In [None]:
# In every split the last column is the label and everything before it is a
# feature.
train_x, train_y = train[:, :-1], train[:, -1]
print(train_x.shape, train_y.shape)

val_x, val_y = val[:, :-1], val[:, -1]
print(val_x.shape, val_y.shape)

test_x, test_y = test[:, :-1], test[:, -1]
print(test_x.shape, test_y.shape)

In [None]:
# Per-feature statistics, taken from the training features only.
means = train_x.mean(axis=0)
stds = train_x.std(axis=0)
print(means.shape, stds.shape)

In [None]:
def _standardize(features):
    """Centre and scale ``features`` with the training-set ``means``/``stds``.

    The small constant keeps the division finite for zero-variance columns.
    """
    return (features - means) / (stds + 0.000000001)


# NOTE: the first three assignments overwrite their inputs, so running this
# cell twice standardizes already-standardized data.
train_x = _standardize(train_x)
val_x = _standardize(val_x)
test_x = _standardize(test_x)
real_test = _standardize(test_data)

In [None]:
# Number of label-0 rows in the training split. Counted inside NumPy rather
# than with the builtin sum(), which walks the array one element at a time.
np.count_nonzero(train_y == 0)

In [None]:
# Binary classifier: two hidden blocks (16 units, then 4 units), each a Dense
# layer followed by LeakyReLU(alpha=0.1) and Dropout, ending in a single
# sigmoid unit.
model = tf.keras.Sequential()
for hidden_units, dropout_rate in ((16, 0.2), (4, 0.25)):
    model.add(tf.keras.layers.Dense(units=hidden_units))
    model.add(tf.keras.layers.LeakyReLU(alpha=0.1))
    model.add(tf.keras.layers.Dropout(dropout_rate))
model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# No optimizer is passed, so the Keras default is used.
model.compile(
    loss='binary_crossentropy',
    metrics=["accuracy", tf.keras.metrics.AUC()],
)

In [None]:
# Train on the balanced training split, reporting validation metrics after
# every epoch.
model.fit(
    train_x,
    train_y,
    epochs=500,
    batch_size=4096,
    validation_data=(val_x, val_y),
)

In [None]:
# Score the held-out split in a single batch covering every row.
model.evaluate(test_x, test_y, batch_size=len(test_x))

In [None]:
# Predict on the standardized, unlabeled test-set features.
preds = model.predict(real_test)
print(preds.shape)

In [None]:
# Pair each test uid with its prediction row; a repeated uid keeps only its
# last prediction.
uid_preds = {uid: pred for uid, pred in zip(test_uids, preds)}
print(len(uid_preds))

In [None]:
# One row per uid: the uid and its prediction row.
pred_records = list(uid_preds.items())
pred_df = pd.DataFrame(pred_records, columns=["uid", "pred"])

In [None]:
# Preview the first rows of the uid/pred table.
pred_df.head()

In [None]:
# Each prediction is still a one-element row from model.predict; replace it
# with its single value. The dtype guard makes the cell safe to re-run: once
# the column holds plain numbers it is no longer object-typed and is left
# untouched (indexing a float would raise).
if pred_df["pred"].dtype == object:
    pred_df["pred"] = pred_df["pred"].map(lambda arr: arr[0])
pred_df.head()

In [None]:
# Write the uid/pred table to CSV, leaving out the DataFrame index.
pred_df.to_csv("final_submission.csv", index=False)