# Proposal of Learning To Rank Models as a Method for Credit Scoring

In [264]:
import pandas as pd
from utils.MockData import create_data
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRanker
from xgboost import XGBRanker
import numpy as np 

## Format of the Input Data
### Data Table
* Key:
    + ID :
* Input:
    + Gender :
    + Age :
    + Days_of_Delay :
    + Installments :
    + Value :
* Target :
    + Debt_Status : 
### Query Table
* Key :
    + ID :
* Input :
    + QID :

In [265]:
# Parameters controlling the synthetic data set.
data_size = 100_000  # number of rows to generate (one per ID)
debt_rate = 0.5  # probability that a row is drawn with Debt_Status == 1
gender_rate = 0.5  # probability that a row is drawn with Gender == 1

In [266]:
# Build the synthetic loan table, one row per ID.
# Each column is drawn with a single vectorized NumPy call rather than one
# Python-level RNG call per row.
df_data = pd.DataFrame()
df_data["ID"] = np.arange(1, data_size + 1)
df_data["Gender"] = np.random.choice([0, 1], data_size, p=[1 - gender_rate, gender_rate])
# randint's upper bound is exclusive, so ages fall in 18..74.
df_data["Age"] = np.random.randint(18, 75, size=data_size)
df_data["Debt_Status"] = np.random.choice([0, 1], data_size, p=[1 - debt_rate, debt_rate])

# Debt details are only generated for rows whose Debt_Status is non-zero;
# every other row gets 0 in all three columns.
# NOTE(review): these columns are therefore non-zero exactly when
# Debt_Status is 1, so the target can be recovered directly from the
# features. Any model trained on them will score perfectly on this mock data.
in_debt = df_data["Debt_Status"].to_numpy() != 0
df_data["Days_of_Delay"] = np.where(in_debt, np.random.randint(10, 180, size=data_size), 0)
df_data["Installments"] = np.where(in_debt, np.random.randint(1, 24, size=data_size), 0)
df_data["Value"] = np.where(in_debt, np.random.randint(100, 5000, size=data_size), 0)
df_data.head()

Unnamed: 0,ID,Gender,Age,Debt_Status,Days_of_Delay,Installments,Value
0,1,1,51,1,32,6,1549
1,2,1,47,0,0,0,0
2,3,0,48,0,0,0,0
3,4,0,60,1,124,1,3795
4,5,0,38,0,0,0,0


In [267]:
# Build the query table: every ID from df_data is assigned one of three
# query groups (QID 1, 2 or 3) at random, with weights 0.34 / 0.33 / 0.33.
query_ids = np.random.choice([1, 2, 3], data_size, p=[0.34, 0.33, 0.33])
df_query = pd.DataFrame(
    {
        "ID": df_data["ID"].copy(deep=True),
        "QID": query_ids,
    }
)
df_query.head()

Unnamed: 0,ID,QID
0,1,2
1,2,2
2,3,1
3,4,1
4,5,2


In [268]:
# Attach each row's query group by inner-joining the data table and the
# query table on their shared "ID" key.
df_merged = df_data.merge(df_query, on="ID", how="inner")
df_merged.head()

Unnamed: 0,ID,Gender,Age,Debt_Status,Days_of_Delay,Installments,Value,QID
0,1,1,51,1,32,6,1549,2
1,2,1,47,0,0,0,0,2
2,3,0,48,0,0,0,0,1
3,4,0,60,1,124,1,3795,1
4,5,0,38,0,0,0,0,2


### Split Data

In [269]:
# Hold out 20% of the rows for evaluation, then sort each split by "QID" so
# that rows belonging to the same query group are contiguous.
train, test = (
    part.sort_values("QID").reset_index(drop=True)
    for part in train_test_split(df_merged, test_size=0.2)
)
# Number of rows per query group in each split, listed in ascending QID order
# so the counts line up with the sorted rows above.
train_query, test_query = (
    part["QID"].value_counts().sort_index() for part in (train, test)
)

### Generate Model of LTR

In [270]:
# Columns fed to the ranker as inputs.
features = [
    "Gender",
    "Age",
    "Days_of_Delay",
    "Installments",
    "Value",
]
# Column holding the label; kept as a one-element list, so indexing a frame
# with it selects a single-column DataFrame.
target_col = ["Debt_Status"]

In [271]:
# Pairwise learning-to-rank model with 10 boosting rounds.
model = XGBRanker(n_estimators=10, objective="rank:pairwise")

# Held-out split used for evaluation during training, with its per-query
# group sizes.
eval_pairs = [(test[features], test[target_col])]
eval_group_sizes = [list(test_query)]

# NOTE(review): in the mock data the Days_of_Delay / Installments / Value
# features are zero exactly when Debt_Status is 0, so a perfect validation
# score here reflects label leakage rather than model quality.
model.fit(
    train[features],
    train[target_col],
    group=train_query,
    eval_set=eval_pairs,
    eval_group=eval_group_sizes,
    verbose=2,
)
 

[0]	validation_0-ndcg@32:1.00000
[2]	validation_0-ndcg@32:1.00000
[4]	validation_0-ndcg@32:1.00000
[6]	validation_0-ndcg@32:1.00000
[8]	validation_0-ndcg@32:1.00000
[9]	validation_0-ndcg@32:1.00000


In [272]:
# Score every test row, then list the row positions from highest to lowest
# predicted score.
preds = model.predict(test[features])
ascending_idx = np.argsort(preds)
# NOTE(review): this orders all test rows together, across QID groups, and
# keeps every row despite the "topk" name. Confirm that a global ordering is
# intended rather than a per-query top-k.
topk_idx = ascending_idx[::-1]
topk_idx

array([19999,  8907,  8853, ...,  8219, 15846,  9999])