# Proposal of Learning To Rank Models as a Method for Credit Security

In [26]:
import pandas as pd
from utils.MockData import create_data
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score
from lightgbm import LGBMRanker
from xgboost import XGBRanker
import numpy as np 

## Format of the Input Data
### Data Table
* Key:
    + ID : Unique identificator for client in the database
* Input:
    + Gender : If 1 client identifies themselves as Female, if 0 client identifies themselves as Male
    + Age : Age o client between 18 e 75
    + Days_of_Delay : Total days of delay of the client debt.
    + Installments : Total number of installments for the client current debts.
    + Value : Total value to be paid of the client debt.
* Target :
    + Debt_Status : If 1 client has overdue payments of debts.
    + Debt Chance : Defined as the probability that a given client would be into debt at the next month.
### Query Table
* Key :
    + ID : Unique identificator for client in the database
* Input :
    + QID : Initiative to each the client is been analyzed for credit (Telhanorte, Ponto Frio, etc)

In [27]:
# Constants of data set generation
data_size = 100000
debt_rate = 0.5
gender_rate = 0.5

In [28]:
# Create Data DataFrame
df_data = pd.DataFrame()
df_data["ID"] = [x for x in range(1,data_size+1)]
df_data['Gender'] = np.random.choice([0, 1], data_size, p=[1-gender_rate, gender_rate])
df_data["Age"] = [np.random.randint(18, 75) for _ in df_data.ID]
df_data["Debt_Status"] = np.random.choice([0, 1], data_size, p=[1-debt_rate, debt_rate])
df_data["Debt_Chance"] = [np.random.randint(0,80) if x == 0 else np.random.randint(70,100) for x in df_data.Debt_Status]
df_data['Days_of_Delay'] = [0 if x == 0 else np.random.randint(10, 180) for x in df_data.Debt_Status]
df_data['Installments'] = [0 if x == 0 else np.random.randint(1, 24) for x in df_data.Debt_Status]
df_data['Value'] = [0 if x == 0 else np.random.randint(100, 5000) for x in df_data.Debt_Status]
df_data.head()

Unnamed: 0,ID,Gender,Age,Debt_Status,Debt_Chance,Days_of_Delay,Installments,Value
0,1,0,50,1,80,20,17,3105
1,2,1,43,0,24,0,0,0
2,3,1,37,0,2,0,0,0
3,4,1,68,1,74,63,4,953
4,5,0,32,0,63,0,0,0


In [29]:
# Create Query Dataframe
df_query = pd.DataFrame()
df_query["ID"] = df_data.ID.copy(deep = True)
df_query['QID'] = np.random.choice([1,2,3], data_size,p = [0.34,0.33,0.33])
df_query.head()

Unnamed: 0,ID,QID
0,1,3
1,2,3
2,3,2
3,4,2
4,5,3


In [30]:
# Create Merged Dataframe
df_merged = pd.merge(df_data, df_query, how = "inner",on="ID")
df_merged.head()

Unnamed: 0,ID,Gender,Age,Debt_Status,Debt_Chance,Days_of_Delay,Installments,Value,QID
0,1,0,50,1,80,20,17,3105,3
1,2,1,43,0,24,0,0,0,3
2,3,1,37,0,2,0,0,0,2
3,4,1,68,1,74,63,4,953,2
4,5,0,32,0,63,0,0,0,3


### Split Data

In [31]:
train, test = train_test_split(df_merged, test_size=0.2, random_state=42)
# Order Dataframes by "QID" column
train = train.sort_values('QID').reset_index(drop=True)
test = test.sort_values('QID').reset_index(drop=True)
# Count the total of each instance in each dataframe
train_query = train["QID"].value_counts().sort_index()
test_query = test["QID"].value_counts().sort_index()

## Generate Model of LTR

In [32]:
# Define feature and target columns
features = ["Gender","Age","Days_of_Delay","Installments","Value"]
target_col = ['Debt_Chance']

In [33]:
model = XGBRanker(
    n_estimators=1000,
    objective="rank:ndcg",
    ndcg_exp_gain = False,
    random_state = 42
    )
model.fit(
    train[features],
    train[target_col],
    group=train_query,
    eval_set=[(test[features], test[target_col])],
    eval_group=[list(test_query)],
    verbose = 100,
)
 

[0]	validation_0-ndcg@32:0.85009
[100]	validation_0-ndcg@32:0.85589
[200]	validation_0-ndcg@32:0.85920
[300]	validation_0-ndcg@32:0.85799
[400]	validation_0-ndcg@32:0.86568
[500]	validation_0-ndcg@32:0.86263
[600]	validation_0-ndcg@32:0.86720
[700]	validation_0-ndcg@32:0.86456
[800]	validation_0-ndcg@32:0.86237
[900]	validation_0-ndcg@32:0.86228
[999]	validation_0-ndcg@32:0.86393


## Evaluation Of Results


### Discounted Cumulative Gain (CG)
Cumulative Gain (CG) is defined as the sum of the relevance score, in our case the 'Debt_Chance', for a given query.
$$ CG = \sum_{i=1}^{N} G(i) $$
The problem with this metric is that independent of the order the result will be the same, so to account to the position of a given item on the list we add a penalty per position.
$$ DCG = \sum_{i=1}^{N} \frac{G(i)}{\log_{2}(i+1)} $$
With this adjust maximum DCG can only be achieved when the items where sorted with descending order.

### Ideal Discounted Cumulative Gain (IDCG)
Given a ranking of any kind, we can correctly assume that a ideal order of items exist and that for that given order, the DCG will be maximum. That value is defined as Ideal Discounted Cumulative Gain.


### Normalized Discounted Cumulative Gain (NDCG)
NDCG is a metric for quality of ranking that takes to account relative position of all items when evaluating the results. The value of NDCG is determined by comparing the relevance of the items returned by the algorithm to the relevance of the item that a hypothetical “ideal” algorithm would return.
$$ NDCG =\frac{DCG}{IDCG} $$
The NDCG can only range between 0 and 1, since is a relative metric to the ideal ranking for a query, it allow us to compare any query to any other query disregarding size as a relevant factor for the result

### Example
Consider a list of item where the relevance score are as shows :
$$ G_{X} = [3,1,5,2,4]$$
A ideal ranking of this items based on the relevance score provided is  :
$$ G_{X_{ranked}} = [5,4,3,2,1]$$
For the pure Cumulative Gain we can see that the result of $X$ and $X_{ranked}$ is the same:
$$ CG(G_{X}) = \sum^{5}_{i=1} G_{X}(i) = 3+1+5+2+4 = 15 $$
$$ CG(G_{X_{ranked}}) = \sum^{5}_{i=1} G_{X}(i) = 5+4+3+2+1 = 15 $$
Now if we consider de Discounted Cumulative Gain we can see that $G_{X_{ranked}}$ is better ranked than $G_{X}$:
$$ DCG(G_{X}) = \sum_{i=1}^{N} \frac{G_{X}(i)}{\log_{2}(i+1)}$$
$$ DCG(G_{X}) = \frac{3}{\log_{2}(2)} + \frac{1}{\log_{2}(3)} + \frac{5}{\log_{2}(4)} + \frac{2}{\log_{2}(5)} + \frac{4}{\log_{2}(6)}$$
$$ DCG(G_{X}) \approx 8.53 $$
$$ DCG(G_{X_{ranked}}) = \sum_{i=1}^{N} \frac{G_{X_{ranked}}(i)}{\log_{2}(i+1)}$$
$$ DCG(G_{X_{ranked}}) = \frac{5}{\log_{2}(2)} + \frac{4}{\log_{2}(3)} + \frac{3}{\log_{2}(4)} + \frac{2}{\log_{2}(5)} + \frac{1}{\log_{2}(6)}$$
$$ DCG(G_{X_{ranked}}) \approx 10.27 $$
Since $DCG(G_{X_{ranked}})$ is the result for the ideal ranking order we can confirm that :
$$ DCG(G_{X_{ranked}}) = IDCG(G_{X}) $$
So if we decided to calculate the NDCG for the both lists we would have:
$$ NDCG(G_{X}) = \frac{DCG(G_{X})}{IDCG(G_{X})} \approx \frac{8.53}{10.27} \approx 0.85 $$
$$ NDCG(G_{X_{ranked}}) = \frac{DCG(G_{X_{ranked}})}{IDCG(G_{X})} \approx \frac{10.27}{10.27} \approx 1 $$

In [34]:
results = test.copy(deep=True)
results.drop(columns = ["Gender","Age","Days_of_Delay","Installments"], inplace=True)
# results.drop(columns = 'Debt_Chance', inplace=True)
# results.drop(columns = 'Debt_Status', inplace=True)
results['score']= model.predict(test.iloc[:][features])

In [35]:
df_class_1 = results[results.QID == 1].copy(deep = True)
df_class_1.sort_values(by = 'score', ascending = True, inplace=True)
df_class_1.reset_index(drop = True, inplace=True)
df_class_1

Unnamed: 0,ID,Debt_Status,Debt_Chance,Value,QID,score
0,81671,0,7,0,1,-10.434545
1,25268,0,51,0,1,-10.434545
2,44366,0,46,0,1,-10.434545
3,82248,0,7,0,1,-10.434545
4,16051,0,55,0,1,-10.434545


In [36]:
df_class_2 = results[results.QID == 2].copy(deep = True)
df_class_2.sort_values(by = 'score', ascending = True, inplace=True)
df_class_2.reset_index(drop = True, inplace=True)
df_class_2

Unnamed: 0,ID,Debt_Status,Debt_Chance,Value,QID,score
0,11314,0,79,0,2,-10.434545
1,52666,0,65,0,2,-10.434545
2,8247,0,65,0,2,-10.434545
3,30045,0,79,0,2,-10.434545
4,79380,0,52,0,2,-10.434545


In [37]:
df_class_3 = results[results.QID == 3].copy(deep = True)
df_class_3.sort_values(by = 'score', ascending = True, inplace=True)
df_class_3.reset_index(drop = True, inplace=True)
df_class_3

Unnamed: 0,ID,Debt_Status,Debt_Chance,Value,QID,score
0,72879,0,6,0,3,-10.434545
1,84936,0,58,0,3,-10.434545
2,73788,0,12,0,3,-10.434545
3,97107,0,10,0,3,-10.434545
4,88401,0,69,0,3,-10.434545
...,...,...,...,...,...,...
6748,80492,1,92,891,3,7.593489
6749,74893,1,88,882,3,7.598911
6750,13853,1,72,870,3,7.821071
6751,95721,1,92,885,3,7.860017
