In [1]:
import pandas as pd
# Read the full-sample CSV into `data` and preview its first rows.
full_sample_csv = './Data/20240205fullsample_new.csv'
data = pd.read_csv(full_sample_csv)
data.head()

Unnamed: 0,group,id,loan_id,EID,installment,action,action_num_should,action_num_actual,cumu_action0,cumu_action1,...,reward_recovery_rate_nn13,reward_recovery_rate_nn14,delay_due_days,delay_due_days_diff,recovery_rate_total,loan_done,installment_timestep,recovery_rate_weights,recovery_rate_weighted,sample
0,train,51,/+6C2lDDYJgzzCXpn96AFA==,4,1,no_action,0,0,0,0,...,0.033353,0.034837,0,0,0.0,0,1,1.062157,0.20535,rlsimulator
1,train,53,/+6C2lDDYJgzzCXpn96AFA==,4,2,self,1,1,1,0,...,0.052457,0.031081,2,2,0.0,0,1,0.868746,0.0,rlsimulator
2,train,54,/+6C2lDDYJgzzCXpn96AFA==,4,2,family,2,2,1,1,...,0.052457,0.019864,7,5,0.0,0,2,0.868746,0.0,rlsimulator
3,train,55,/+6C2lDDYJgzzCXpn96AFA==,4,2,acquiantance,3,3,1,1,...,0.088502,0.045893,7,0,0.0,0,3,0.868746,0.0,rlsimulator
4,train,56,/+6C2lDDYJgzzCXpn96AFA==,4,2,sms,4,4,1,1,...,0.088502,0.097007,8,1,0.185833,0,4,0.868746,0.177079,rlsimulator


`installment_timestep` is a 1-based counter over the rows of each (`loan_id`, `installment`) pair: it gives the order of each record within a given installment of a given loan.

In [2]:
# Number the rows inside every (loan_id, installment) group, starting at 1.
rows_by_installment = data.groupby(['loan_id', 'installment'])
data['installment_timestep'] = rows_by_installment.cumcount().add(1)

In [3]:
# Keep only the rows whose `sample` column equals 'rlsimulator'.
is_simulator_row = data['sample'].eq('rlsimulator')
data_sim = data.loc[is_simulator_row]
data_sim

Unnamed: 0,group,id,loan_id,EID,installment,action,action_num_should,action_num_actual,cumu_action0,cumu_action1,...,reward_recovery_rate_nn13,reward_recovery_rate_nn14,delay_due_days,delay_due_days_diff,recovery_rate_total,loan_done,installment_timestep,recovery_rate_weights,recovery_rate_weighted,sample
0,train,51,/+6C2lDDYJgzzCXpn96AFA==,4,1,no_action,0,0,0,0,...,0.033353,0.034837,0,0,0.000000,0,1,1.062157,0.205350,rlsimulator
1,train,53,/+6C2lDDYJgzzCXpn96AFA==,4,2,self,1,1,1,0,...,0.052457,0.031081,2,2,0.000000,0,1,0.868746,0.000000,rlsimulator
2,train,54,/+6C2lDDYJgzzCXpn96AFA==,4,2,family,2,2,1,1,...,0.052457,0.019864,7,5,0.000000,0,2,0.868746,0.000000,rlsimulator
3,train,55,/+6C2lDDYJgzzCXpn96AFA==,4,2,acquiantance,3,3,1,1,...,0.088502,0.045893,7,0,0.000000,0,3,0.868746,0.000000,rlsimulator
4,train,56,/+6C2lDDYJgzzCXpn96AFA==,4,2,sms,4,4,1,1,...,0.088502,0.097007,8,1,0.185833,0,4,0.868746,0.177079,rlsimulator
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200009,test,63266,ZWocy2KIKqJbgKZVoXZIFA==,59,4,sms,4,0,4,4,...,,,15,3,0.000000,0,4,0.109583,0.000000,rlsimulator
200010,test,63268,ZWocy2KIKqJbgKZVoXZIFA==,59,5,self,1,1,5,4,...,0.031653,0.045248,2,2,0.000000,0,1,0.148145,0.000000,rlsimulator
200011,test,63269,ZWocy2KIKqJbgKZVoXZIFA==,59,5,family,2,0,5,5,...,,,5,3,0.000000,0,2,0.148145,0.000000,rlsimulator
200012,test,63270,ZWocy2KIKqJbgKZVoXZIFA==,59,5,acquiantance,3,0,6,5,...,,,8,3,0.000000,0,3,0.148145,0.000000,rlsimulator


In [4]:
# Column groups used to select and assemble the simulator dataset.
loan_id = ["loan_id"]
bank_features = ["action_num_actual"]
user_features = [
    "gender", "age", "amount", "num_loan", "duration",
    "year_ratio", "diff_city", "marriage", "kids",
    "month_in", "housing", "edu", "motivation",
]
# State columns of the current row; the same columns, renamed to y_*, form the
# next-state target.
current_state = [
    "installment", "installment_timestep",
    "state_cum_overduelength", "remaining_debt",
    "state_capital", "state_interests", "state_penalty",
]
other_labels = ["installment_done", "loan_done", "recovery_rate_weighted"]

In [5]:
from tqdm.auto import tqdm


def build_next_state_targets(frame, state_cols, rename_map, key="loan_id"):
    """Build the next-state target rows for ``frame``.

    For every row, the target is the ``state_cols`` values of the following row
    that has the same ``key`` value. The last row of each ``key`` group (which
    includes single-row groups) has no successor and receives its own state.

    This replaces a per-loan Python loop that filtered the whole frame once per
    loan and grew the result with ``pd.concat`` on every iteration.

    Parameters
    ----------
    frame : pandas.DataFrame
        Source rows; must contain ``key`` and every column in ``state_cols``.
    state_cols : list of str
        Columns that describe the state.
    rename_map : dict
        Mapping passed to ``DataFrame.rename(columns=...)`` on the result.
    key : str, default "loan_id"
        Column that identifies which rows belong together.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``frame``, in the same row order, with a fresh
        0..n-1 index and the renamed state columns.
    """
    states = frame[state_cols].reset_index(drop=True)
    keys = frame[key].reset_index(drop=True)

    # Work on row positions rather than on the state values themselves, so the
    # state columns never pass through NaN and keep their original dtypes.
    positions = pd.Series(range(len(keys)), index=keys.index)
    next_position = positions.groupby(keys, sort=False).shift(-1)

    # shift(-1) leaves NaN where a group has no following row; such rows point
    # at themselves.
    next_position = next_position.fillna(positions).astype("int64")

    targets = states.iloc[next_position.to_numpy()].reset_index(drop=True)
    return targets.rename(columns=rename_map)


loan_id_list = data_sim["loan_id"].unique().tolist()

# Current-state column -> name of the matching next-state target column.
col_matching = {
    "installment": "y_installment",
    "installment_timestep": "y_installment_timestep",
    "state_cum_overduelength": "y_state_cum_overduelength",
    "remaining_debt": "y_remaining_debt",
    "state_capital": "y_state_capital",
    "state_interests": "y_state_interests",
    "state_penalty": "y_state_penalty",
}

target_state = build_next_state_targets(data_sim, current_state, col_matching)

# Every simulator row must have exactly one target row.
assert len(target_state) == len(data_sim)

target_state

  0%|          | 0/13920 [00:00<?, ?it/s]

Unnamed: 0,y_installment,y_installment_timestep,y_state_cum_overduelength,y_remaining_debt,y_state_capital,y_state_interests,y_state_penalty
0,2,1,2,2000.000000,666.666667,53.333333,3.333333
1,2,2,7,2000.000000,666.666667,53.333333,11.666667
2,2,3,7,2000.000000,666.666667,53.333333,11.666667
3,2,4,8,2000.000000,666.666667,53.333333,13.333333
4,3,1,0,1666.666667,666.666667,53.333333,0.000000
...,...,...,...,...,...,...,...
200009,5,1,2,1740.000000,1740.000000,130.500000,8.700000
200010,5,2,-15,1740.000000,1740.000000,130.500000,-37.700000
200011,5,3,24,1740.000000,1740.000000,130.500000,78.300000
200012,5,4,-1,1740.000000,1740.000000,130.500000,8.700000


In [6]:
# Assemble the simulator table column-wise: group label, identifiers and
# features, current state, next-state targets, then the remaining labels.
#
# pd.concat(axis=1) pairs rows by index label. target_state is numbered
# 0..n-1, whereas data_sim keeps the labels it had in `data`, so both sides are
# renumbered here and the rows are paired by position instead.
if len(target_state) != len(data_sim):
    raise ValueError(
        f"target_state has {len(target_state)} rows but data_sim has "
        f"{len(data_sim)}; they cannot be paired row by row"
    )

sim_rows = data_sim.reset_index(drop=True)
input_columns = ["group"] + loan_id + bank_features + user_features + current_state

data_sim_full = pd.concat(
    [
        sim_rows[input_columns],
        target_state.reset_index(drop=True),
        sim_rows[other_labels],
    ],
    axis=1,
)
data_sim_full

Unnamed: 0,group,loan_id,action_num_actual,gender,age,amount,num_loan,duration,year_ratio,diff_city,...,y_installment,y_installment_timestep,y_state_cum_overduelength,y_remaining_debt,y_state_capital,y_state_interests,y_state_penalty,installment_done,loan_done,recovery_rate_weighted
0,train,/+6C2lDDYJgzzCXpn96AFA==,0,1,18,2000,1,6,16,40421.53,...,2,1,2,2000.000000,666.666667,53.333333,3.333333,1,0,0.205350
1,train,/+6C2lDDYJgzzCXpn96AFA==,1,1,18,2000,1,6,16,40421.53,...,2,2,7,2000.000000,666.666667,53.333333,11.666667,0,0,0.000000
2,train,/+6C2lDDYJgzzCXpn96AFA==,2,1,18,2000,1,6,16,40421.53,...,2,3,7,2000.000000,666.666667,53.333333,11.666667,0,0,0.000000
3,train,/+6C2lDDYJgzzCXpn96AFA==,3,1,18,2000,1,6,16,40421.53,...,2,4,8,2000.000000,666.666667,53.333333,13.333333,0,0,0.000000
4,train,/+6C2lDDYJgzzCXpn96AFA==,4,1,18,2000,1,6,16,40421.53,...,3,1,0,1666.666667,666.666667,53.333333,0.000000,1,0,0.177079
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200009,test,ZWocy2KIKqJbgKZVoXZIFA==,0,1,30,2900,1,5,18,0.00,...,5,1,2,1740.000000,1740.000000,130.500000,8.700000,1,0,0.000000
200010,test,ZWocy2KIKqJbgKZVoXZIFA==,1,1,30,2900,1,5,18,0.00,...,5,2,-15,1740.000000,1740.000000,130.500000,-37.700000,0,0,0.000000
200011,test,ZWocy2KIKqJbgKZVoXZIFA==,0,1,30,2900,1,5,18,0.00,...,5,3,24,1740.000000,1740.000000,130.500000,78.300000,0,0,0.000000
200012,test,ZWocy2KIKqJbgKZVoXZIFA==,0,1,30,2900,1,5,18,0.00,...,5,4,-1,1740.000000,1740.000000,130.500000,8.700000,0,0,0.000000


## Save the data

In [7]:
# Write the assembled table to CSV without the index column.
csv_output_path = './Res/simulator_data.csv'
data_sim_full.to_csv(csv_output_path, index=False)

In [8]:
# Write the assembled table to an Excel workbook without the index column.
xlsx_output_path = './Res/simulator_data.xlsx'
data_sim_full.to_excel(xlsx_output_path, index=False)