# Proposal of Learning To Rank Models as a Method for Credit Security

In [166]:
import pandas as pd
from utils.MockData import create_data
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score
from lightgbm import LGBMRanker
from xgboost import XGBRanker
import numpy as np 

## Format of the Input Data
### Data Table
* Key:
    + ID : Unique identifier for the client in the database
* Input:
    + Gender : 0 if the client identifies as Male, 1 otherwise.
    + Age : Age of the client, between 18 and 75.
    + Days_of_Delay : Total days of delay on the client's debt.
    + Installments : Total number of installments for the client's current debts.
    + Value : Total value still to be paid on the client's debt.
* Target :
    + Debt_Status : 1 if the client has overdue debt payments, 0 otherwise.
### Query Table
* Key :
    + ID : Unique identifier for the client in the database
* Input :
    + QID : Initiative for which the client is being analyzed for credit (Telhanorte, Ponto Frio, etc.)

In [167]:
# Parameters that control the synthetic data set generated below
data_size = 100_000  # number of clients (rows) to generate
gender_rate = 0.5    # probability that Gender is drawn as 1
debt_rate = 0.5      # probability that Debt_Status is drawn as 1

In [168]:
# Create Data DataFrame
# Every column is drawn with a single vectorized NumPy call instead of one
# Python-level np.random call per row, which is much faster for large data_size.
df_data = pd.DataFrame()
df_data["ID"] = np.arange(1, data_size + 1)
df_data["Gender"] = np.random.choice([0, 1], data_size, p=[1 - gender_rate, gender_rate])
# np.random.randint excludes its upper bound, so 76 is required for age 75
# to be reachable (ages are documented as ranging from 18 to 75).
df_data["Age"] = np.random.randint(18, 76, size=data_size)
df_data["Debt_Status"] = np.random.choice([0, 1], data_size, p=[1 - debt_rate, debt_rate])

# Boolean mask of the clients flagged as being in debt; it drives all the
# debt-dependent columns below.
in_debt = df_data["Debt_Status"].to_numpy() == 1

# Relevance label: 0-49 for clients without debt, 60-99 for clients in debt.
df_data["Debt_Chance"] = np.where(
    in_debt,
    np.random.randint(60, 100, size=data_size),
    np.random.randint(0, 50, size=data_size),
)
# Debt details are only populated for clients in debt; everyone else gets 0.
df_data["Days_of_Delay"] = np.where(in_debt, np.random.randint(10, 180, size=data_size), 0)
df_data["Installments"] = np.where(in_debt, np.random.randint(1, 24, size=data_size), 0)
df_data["Value"] = np.where(in_debt, np.random.randint(100, 5000, size=data_size), 0)
df_data.head()

Unnamed: 0,ID,Gender,Age,Debt_Status,Debt_Chance,Days_of_Delay,Installments,Value
0,1,0,37,1,68,177,17,4058
1,2,0,39,1,72,105,10,661
2,3,1,22,1,95,15,21,3128
3,4,0,35,0,47,0,0,0
4,5,1,70,0,15,0,0,0


In [169]:
# Create Query Dataframe
# Each client ID is assigned to one of three query groups (QID 1, 2 or 3),
# drawn with probabilities 0.34 / 0.33 / 0.33.
df_query = pd.DataFrame({"ID": df_data["ID"].copy(deep=True)})
df_query["QID"] = np.random.choice([1, 2, 3], data_size, p=[0.34, 0.33, 0.33])
df_query.head()

Unnamed: 0,ID,QID
0,1,1
1,2,1
2,3,1
3,4,2
4,5,2


In [170]:
# Create Merged Dataframe
# Attach the query group (QID) to every client row by joining on the shared ID key.
df_merged = df_data.merge(df_query, how="inner", on="ID")
df_merged.head()

Unnamed: 0,ID,Gender,Age,Debt_Status,Debt_Chance,Days_of_Delay,Installments,Value,QID
0,1,0,37,1,68,177,17,4058,1
1,2,0,39,1,72,105,10,661,1
2,3,1,22,1,95,15,21,3128,1
3,4,0,35,0,47,0,0,0,2
4,5,1,70,0,15,0,0,0,2


### Split Data

In [171]:
# Hold out 20% of the merged rows for evaluation (fixed seed for a repeatable split)
train, test = train_test_split(df_merged, test_size=0.2, random_state=42)

# Rows of the same query must be contiguous, so order both frames by "QID"
# and renumber their indexes from zero.
train = train.sort_values(by="QID", ignore_index=True)
test = test.sort_values(by="QID", ignore_index=True)

# Number of rows per query, listed in ascending QID order to match the
# row ordering established above.
train_query = train["QID"].value_counts().sort_index()
test_query = test["QID"].value_counts().sort_index()

## Generate Model of LTR

In [172]:
# Columns fed to the ranker as inputs
features = [
    "Gender",
    "Age",
    "Days_of_Delay",
    "Installments",
    "Value",
]
# Column used as the relevance label (kept as a one-element list)
target_col = ["Debt_Chance"]

In [173]:
# Gradient-boosted ranker trained with the NDCG ranking objective.
# NOTE(review): ndcg_exp_gain=False presumably makes the gain use the label
# value directly instead of an exponential of it; the labels here go up to 99.
# Confirm against the XGBoost learning-to-rank documentation.
ranker_options = {
    "n_estimators": 1000,
    "objective": "rank:ndcg",
    "ndcg_exp_gain": False,
}
model = XGBRanker(**ranker_options)

# The held-out rows double as the validation set whose NDCG is logged
# every 100 boosting rounds.
validation_pair = (test[features], test[target_col])
model.fit(
    train[features],
    train[target_col],
    group=train_query,
    eval_set=[validation_pair],
    eval_group=[list(test_query)],
    verbose=100,
)
 

[0]	validation_0-ndcg@32:0.82615
[100]	validation_0-ndcg@32:0.78985
[200]	validation_0-ndcg@32:0.77928
[300]	validation_0-ndcg@32:0.78245
[400]	validation_0-ndcg@32:0.78262
[500]	validation_0-ndcg@32:0.78582
[600]	validation_0-ndcg@32:0.79360
[700]	validation_0-ndcg@32:0.79715
[800]	validation_0-ndcg@32:0.79936
[900]	validation_0-ndcg@32:0.79728
[999]	validation_0-ndcg@32:0.79818


## Evaluation Of Results


### Discounted Cumulative Gain (DCG)
Cumulative Gain (CG) is defined as the sum of the relevance score, in our case the 'Debt_Chance', for a given query.
$$ CG = \sum_{i=1}^{N} G(i) $$
The problem with this metric is that the result is the same regardless of the order of the items, so to account for the position of a given item in the list we add a penalty per position.
$$ DCG = \sum_{i=1}^{N} \frac{G(i)}{\log_{2}(i+1)} $$
With this adjustment, the maximum DCG can only be achieved when the items are sorted in descending order of relevance.

### Ideal Discounted Cumulative Gain (IDCG)
Given a ranking of any kind, we can correctly assume that an ideal order of the items exists and that, for that order, the DCG will be maximal. That value is defined as the Ideal Discounted Cumulative Gain.


### Normalized Discounted Cumulative Gain (NDCG)
NDCG is a metric of ranking quality that takes into account the relative position of all items when evaluating the results. The value of NDCG is determined by comparing the relevance of the items returned by the algorithm to the relevance of the items that a hypothetical “ideal” algorithm would return.
$$ NDCG =\frac{DCG}{IDCG} $$
NDCG ranges between 0 and 1. Since it is measured relative to the ideal ranking for a query, it allows us to compare any query with any other query, regardless of how many items each one contains.

In [174]:
# Build the inspection table from the held-out rows: drop the input columns
# that are not needed for review, keeping ID, Debt_Status, Debt_Chance,
# Value and QID so they can be compared against the predicted score.
results = test.drop(columns=["Gender", "Age", "Days_of_Delay", "Installments"])
# Ranking score predicted by the model for every held-out row
results["score"] = model.predict(test[features])

In [175]:
# Held-out clients of query group 1, ordered from lowest to highest predicted score
df_class_1 = (
    results.loc[results["QID"] == 1]
    .sort_values(by="score", ascending=True)
    .reset_index(drop=True)
)
df_class_1.head()

Unnamed: 0,ID,Debt_Status,Debt_Chance,Value,QID,score
0,49788,0,44,0,1,-5.436675
1,64779,0,17,0,1,-5.436675
2,46686,0,12,0,1,-5.436675
3,49079,0,26,0,1,-5.436675
4,47310,0,29,0,1,-5.436675


In [176]:
# Held-out clients of query group 2, ordered from lowest to highest predicted score
df_class_2 = (
    results.loc[results["QID"] == 2]
    .sort_values(by="score", ascending=True)
    .reset_index(drop=True)
)
df_class_2.head()

Unnamed: 0,ID,Debt_Status,Debt_Chance,Value,QID,score
0,59931,0,12,0,2,-5.436675
1,81671,0,27,0,2,-5.436675
2,45194,0,0,0,2,-5.436675
3,48079,0,20,0,2,-5.436675
4,56543,0,44,0,2,-5.436675


In [177]:
# Held-out clients of query group 3, ordered from lowest to highest predicted score
df_class_3 = (
    results.loc[results["QID"] == 3]
    .sort_values(by="score", ascending=True)
    .reset_index(drop=True)
)
df_class_3.head()

Unnamed: 0,ID,Debt_Status,Debt_Chance,Value,QID,score
0,15393,0,41,0,3,-5.436675
1,38078,0,18,0,3,-5.436675
2,82752,0,25,0,3,-5.436675
3,90656,0,29,0,3,-5.436675
4,7221,0,21,0,3,-5.436675
