In [69]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder


In [70]:
data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

In [71]:
pd.set_option('display.max_columns', None)
data.head()

Unnamed: 0,customer_id,firstname,lastname,record_number,days_since_opened,days_since_confirmed,primary_term,final_term,days_till_primary_close,days_till_final_close,loans_credit_limit,loans_next_payment_summary,loans_outstanding_balance,loans_max_overdue_amount,loans_credit_cost_rate,loans_within_5_days,loans_within_5_to_30_days,loans_within_30_to_60_days,loans_within_60_to_90_days,loans_over_90_days,is_zero_loans_within_5_days,is_zero_loans_within_5_to_30_days,is_zero_loans_within_30_to_60_days,is_zero_loans_within_60_to_90_days,is_zero_loans_over_90_days,utilization,over_limit_count,max_over_limit_count,is_zero_utilization,is_zero_over_limit_count,is_zero_max_over_limit_count,encoded_payment_0,encoded_payment_1,encoded_payment_2,encoded_payment_3,encoded_payment_4,encoded_payment_5,encoded_payment_6,encoded_payment_7,encoded_payment_8,encoded_payment_9,encoded_payment_10,encoded_payment_11,encoded_payment_12,encoded_payment_13,encoded_payment_14,encoded_payment_15,encoded_payment_16,encoded_payment_17,encoded_payment_18,encoded_payment_19,encoded_payment_20,encoded_payment_21,encoded_payment_22,encoded_payment_23,encoded_payment_24,encoded_loans_account_holder_type,encoded_loans_credit_status,encoded_loans_credit_type,encoded_loans_account_currency,primary_close_flag,final_close_flag
0,4814dfa2-45a8-46b9-8102-98ecbbec2d2a,Navya,Vasa,7,10,8,12,10,13,5,11,2,3,2,4,6,13,5,4,8,Yes,No,Yes,Yes,Yes,16,2,17,Yes,Yes,Yes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,2.0,0.0,1.0,3.0,4.0,1,3,4,1,0,0
1,bb0abe41-cc89-4a1d-bb1c-48bd212ae00d,Azad,Warrior,5,15,6,7,6,9,1,5,6,3,2,7,6,16,5,4,8,Yes,Yes,Yes,Yes,Yes,16,2,17,Yes,Yes,Yes,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,4.0,1,3,4,1,0,0
2,edfd1a62-05da-4cfd-b8ae-521f05f0fd5f,Piya,Jha,1,15,17,8,5,9,1,6,2,3,2,4,6,16,5,4,8,Yes,Yes,Yes,Yes,Yes,16,2,17,Yes,Yes,Yes,0.0,,0.0,0.0,0.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,4.0,1,3,4,1,0,0
3,3eaa4c2a-2316-48f4-b1b0-4aafe2286630,Rhea,Varma,11,7,2,11,13,14,8,14,3,5,2,0,6,16,5,4,8,Yes,Yes,Yes,Yes,Yes,18,2,17,No,Yes,Yes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,4.0,1,2,4,1,0,0
4,c9a73239-baae-40eb-8855-dab07767ea86,Sahil,Dave,12,14,9,8,11,6,13,5,6,3,2,4,6,16,5,4,8,Yes,Yes,Yes,Yes,Yes,16,2,17,Yes,Yes,Yes,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,4.0,1,3,4,1,0,0


In [72]:
print(data['loans_within_60_to_90_days'].unique()) 
print(data['is_zero_loans_within_5_days'].unique()) 
print(data['is_zero_loans_within_5_to_30_days'].unique()) 
print(data['is_zero_loans_within_30_to_60_days'].unique()) 
print(data['is_zero_loans_within_60_to_90_days'].unique()) 
print(data['is_zero_loans_over_90_days'].unique()) 
print(data['is_zero_utilization'].unique())
print(data['is_zero_over_limit_count'].unique())
print(data['is_zero_max_over_limit_count'].unique())

[4 1]
['Yes' 'No']
['No' 'Yes']
['Yes' 'No']
['Yes' 'No']
['Yes' 'No']
['Yes' 'No']
['Yes' 'No']
['Yes' 'No']


In [73]:
print(data['loans_within_60_to_90_days'].value_counts()) 


loans_within_60_to_90_days
4    1106641
1         33
Name: count, dtype: int64


In [74]:
data['is_zero_loans_within_5_days'] = data['is_zero_loans_within_5_days'].map({'Yes': 1, 'No': 0})
data['is_zero_loans_within_5_to_30_days'] = data['is_zero_loans_within_5_to_30_days'].map({'Yes': 1, 'No': 0})
data['is_zero_loans_within_30_to_60_days'] = data['is_zero_loans_within_30_to_60_days'].map({'Yes': 1, 'No': 0})
data['is_zero_loans_within_60_to_90_days'] = data['is_zero_loans_within_60_to_90_days'].map({'Yes': 1, 'No': 0})
data['is_zero_loans_over_90_days'] = data['is_zero_loans_over_90_days'].map({'Yes': 1, 'No': 0})
data['is_zero_utilization'] = data['is_zero_utilization'].map({'Yes': 1, 'No': 0})
data['is_zero_over_limit_count'] = data['is_zero_over_limit_count'].map({'Yes': 1, 'No': 0})
data['is_zero_max_over_limit_count'] = data['is_zero_max_over_limit_count'].map({'Yes': 1, 'No': 0})

In [75]:
test_data['is_zero_loans_within_5_days'] = test_data['is_zero_loans_within_5_days'].map({'Yes': 1, 'No': 0})
test_data['is_zero_loans_within_5_to_30_days'] = test_data['is_zero_loans_within_5_to_30_days'].map({'Yes': 1, 'No': 0})
test_data['is_zero_loans_within_30_to_60_days'] = test_data['is_zero_loans_within_30_to_60_days'].map({'Yes': 1, 'No': 0})
test_data['is_zero_loans_within_60_to_90_days'] = test_data['is_zero_loans_within_60_to_90_days'].map({'Yes': 1, 'No': 0})
test_data['is_zero_loans_over_90_days'] = test_data['is_zero_loans_over_90_days'].map({'Yes': 1, 'No': 0})
test_data['is_zero_utilization'] = test_data['is_zero_utilization'].map({'Yes': 1, 'No': 0})
test_data['is_zero_over_limit_count'] = test_data['is_zero_over_limit_count'].map({'Yes': 1, 'No': 0})
test_data['is_zero_max_over_limit_count'] = test_data['is_zero_max_over_limit_count'].map({'Yes': 1, 'No': 0})

In [76]:
data.tail()

Unnamed: 0,customer_id,firstname,lastname,record_number,days_since_opened,days_since_confirmed,primary_term,final_term,days_till_primary_close,days_till_final_close,loans_credit_limit,loans_next_payment_summary,loans_outstanding_balance,loans_max_overdue_amount,loans_credit_cost_rate,loans_within_5_days,loans_within_5_to_30_days,loans_within_30_to_60_days,loans_within_60_to_90_days,loans_over_90_days,is_zero_loans_within_5_days,is_zero_loans_within_5_to_30_days,is_zero_loans_within_30_to_60_days,is_zero_loans_within_60_to_90_days,is_zero_loans_over_90_days,utilization,over_limit_count,max_over_limit_count,is_zero_utilization,is_zero_over_limit_count,is_zero_max_over_limit_count,encoded_payment_0,encoded_payment_1,encoded_payment_2,encoded_payment_3,encoded_payment_4,encoded_payment_5,encoded_payment_6,encoded_payment_7,encoded_payment_8,encoded_payment_9,encoded_payment_10,encoded_payment_11,encoded_payment_12,encoded_payment_13,encoded_payment_14,encoded_payment_15,encoded_payment_16,encoded_payment_17,encoded_payment_18,encoded_payment_19,encoded_payment_20,encoded_payment_21,encoded_payment_22,encoded_payment_23,encoded_payment_24,encoded_loans_account_holder_type,encoded_loans_credit_status,encoded_loans_credit_type,encoded_loans_account_currency,primary_close_flag,final_close_flag
1106669,c341a4e3-b4e2-455b-80b2-2be2655e6e85,Gatik,Boase,19,1,5,8,11,2,6,18,2,3,2,10,6,16,5,4,8,1,1,1,1,1,16,2,17,1,1,1,0.0,0.0,0.0,0.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,4.0,1,3,5,1,0,0
1106670,29d8337c-13c9-4bb6-b794-762c386b1866,Jhanvi,Andra,10,5,8,4,3,5,7,18,2,3,2,6,6,16,5,4,8,1,1,1,1,1,16,2,17,1,1,1,,0.0,0.0,0.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,,3.0,4.0,3.0,3.0,3.0,4.0,1,3,5,1,0,0
1106671,e3b374f2-d96e-4c8b-901a-be80a8bea3cc,Lakshit,Kulkarni,3,8,7,13,0,4,9,3,2,3,2,2,6,16,5,4,8,1,1,1,1,1,16,2,17,1,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,4.0,1,3,4,1,0,0
1106672,b81e5b47-5c1b-4302-897c-2edc179d583f,Parinaaz,Chaudry,4,10,4,16,9,6,13,3,2,3,2,4,6,16,5,4,8,1,1,1,1,1,16,2,17,1,1,1,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,3.0,3.0,3.0,3.0,,3.0,4.0,3.0,3.0,3.0,4.0,1,3,4,1,0,0
1106673,62419cd3-7d86-4484-a684-93ace05cbbc0,Faiyaz,Mani,12,1,10,10,13,10,8,8,2,3,2,2,6,16,5,4,8,1,0,1,1,1,16,2,17,1,1,1,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,4.0,1,2,3,1,0,0


In [77]:
test_data.tail()

Unnamed: 0,customer_id,firstname,lastname,record_number,days_since_opened,days_since_confirmed,primary_term,final_term,days_till_primary_close,days_till_final_close,loans_credit_limit,loans_next_payment_summary,loans_outstanding_balance,loans_max_overdue_amount,loans_credit_cost_rate,loans_within_5_days,loans_within_5_to_30_days,loans_within_30_to_60_days,loans_within_60_to_90_days,loans_over_90_days,is_zero_loans_within_5_days,is_zero_loans_within_5_to_30_days,is_zero_loans_within_30_to_60_days,is_zero_loans_within_60_to_90_days,is_zero_loans_over_90_days,utilization,over_limit_count,max_over_limit_count,is_zero_utilization,is_zero_over_limit_count,is_zero_max_over_limit_count,encoded_payment_0,encoded_payment_1,encoded_payment_2,encoded_payment_3,encoded_payment_4,encoded_payment_5,encoded_payment_6,encoded_payment_7,encoded_payment_8,encoded_payment_9,encoded_payment_10,encoded_payment_11,encoded_payment_12,encoded_payment_13,encoded_payment_14,encoded_payment_15,encoded_payment_16,encoded_payment_17,encoded_payment_18,encoded_payment_19,encoded_payment_20,encoded_payment_21,encoded_payment_22,encoded_payment_23,encoded_payment_24,encoded_loans_account_holder_type,encoded_loans_credit_status,encoded_loans_credit_type,encoded_loans_account_currency
474284,cd83731c-a9ec-47de-9071-36914c800f35,Yashvi,Saini,1,10,4,7,11,6,13,5,6,3,2,2,6,16,5,4,8,1,1,1,1,1,16,2,17,1,1,1,0.0,0.0,0.0,0.0,0.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,4.0,1,3,4,1
474285,562f1181-ad6e-467e-82fb-e38b443e9c3a,Ranbir,Sarna,8,3,11,7,3,11,12,13,2,3,2,6,6,16,5,4,8,1,1,1,1,1,16,2,17,1,1,1,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,4.0,1,3,5,1
474286,558ae4c7-e2d1-4e92-936d-ca0dda807e9f,Inaaya,Varghese,12,6,13,12,10,16,7,16,2,3,2,7,6,13,5,4,8,1,0,1,1,1,16,2,17,1,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,2.0,0.0,0.0,1.0,1.0,1,3,4,1
474287,26fa2f73-8c6d-4280-8610-13170a3062dc,Prisha,Kale,16,2,16,0,7,10,8,8,2,3,2,7,6,16,5,4,8,1,0,1,1,1,16,2,17,1,1,1,0.0,,0.0,0.0,0.0,0.0,1.0,0.0,,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,4.0,1,2,3,1
474288,9541aae2-7992-4e5d-810a-33db4444e3bb,Divij,Upadhyay,8,10,14,12,10,13,5,12,2,3,2,2,6,16,5,4,8,1,1,1,1,1,16,2,17,1,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1,3,4,1


In [78]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1106674 entries, 0 to 1106673
Data columns (total 62 columns):
 #   Column                              Non-Null Count    Dtype  
---  ------                              --------------    -----  
 0   customer_id                         1106674 non-null  object 
 1   firstname                           1106674 non-null  object 
 2   lastname                            1106674 non-null  object 
 3   record_number                       1106674 non-null  int64  
 4   days_since_opened                   1106674 non-null  int64  
 5   days_since_confirmed                1106674 non-null  int64  
 6   primary_term                        1106674 non-null  int64  
 7   final_term                          1106674 non-null  int64  
 8   days_till_primary_close             1106674 non-null  int64  
 9   days_till_final_close               1106674 non-null  int64  
 10  loans_credit_limit                  1106674 non-null  int64  
 11  loans_next_

In [79]:
data.isnull()

Unnamed: 0,customer_id,firstname,lastname,record_number,days_since_opened,days_since_confirmed,primary_term,final_term,days_till_primary_close,days_till_final_close,loans_credit_limit,loans_next_payment_summary,loans_outstanding_balance,loans_max_overdue_amount,loans_credit_cost_rate,loans_within_5_days,loans_within_5_to_30_days,loans_within_30_to_60_days,loans_within_60_to_90_days,loans_over_90_days,is_zero_loans_within_5_days,is_zero_loans_within_5_to_30_days,is_zero_loans_within_30_to_60_days,is_zero_loans_within_60_to_90_days,is_zero_loans_over_90_days,utilization,over_limit_count,max_over_limit_count,is_zero_utilization,is_zero_over_limit_count,is_zero_max_over_limit_count,encoded_payment_0,encoded_payment_1,encoded_payment_2,encoded_payment_3,encoded_payment_4,encoded_payment_5,encoded_payment_6,encoded_payment_7,encoded_payment_8,encoded_payment_9,encoded_payment_10,encoded_payment_11,encoded_payment_12,encoded_payment_13,encoded_payment_14,encoded_payment_15,encoded_payment_16,encoded_payment_17,encoded_payment_18,encoded_payment_19,encoded_payment_20,encoded_payment_21,encoded_payment_22,encoded_payment_23,encoded_payment_24,encoded_loans_account_holder_type,encoded_loans_credit_status,encoded_loans_credit_type,encoded_loans_account_currency,primary_close_flag,final_close_flag
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1106669,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1106670,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False
1106671,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1106672,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False


In [80]:
for column in data.columns:
    data[column].fillna(data[column].mode(), inplace=True)


In [81]:
data

Unnamed: 0,customer_id,firstname,lastname,record_number,days_since_opened,days_since_confirmed,primary_term,final_term,days_till_primary_close,days_till_final_close,loans_credit_limit,loans_next_payment_summary,loans_outstanding_balance,loans_max_overdue_amount,loans_credit_cost_rate,loans_within_5_days,loans_within_5_to_30_days,loans_within_30_to_60_days,loans_within_60_to_90_days,loans_over_90_days,is_zero_loans_within_5_days,is_zero_loans_within_5_to_30_days,is_zero_loans_within_30_to_60_days,is_zero_loans_within_60_to_90_days,is_zero_loans_over_90_days,utilization,over_limit_count,max_over_limit_count,is_zero_utilization,is_zero_over_limit_count,is_zero_max_over_limit_count,encoded_payment_0,encoded_payment_1,encoded_payment_2,encoded_payment_3,encoded_payment_4,encoded_payment_5,encoded_payment_6,encoded_payment_7,encoded_payment_8,encoded_payment_9,encoded_payment_10,encoded_payment_11,encoded_payment_12,encoded_payment_13,encoded_payment_14,encoded_payment_15,encoded_payment_16,encoded_payment_17,encoded_payment_18,encoded_payment_19,encoded_payment_20,encoded_payment_21,encoded_payment_22,encoded_payment_23,encoded_payment_24,encoded_loans_account_holder_type,encoded_loans_credit_status,encoded_loans_credit_type,encoded_loans_account_currency,primary_close_flag,final_close_flag
0,4814dfa2-45a8-46b9-8102-98ecbbec2d2a,Navya,Vasa,7,10,8,12,10,13,5,11,2,3,2,4,6,13,5,4,8,1,0,1,1,1,16,2,17,1,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,2.0,0.0,1.0,3.0,4.0,1,3,4,1,0,0
1,bb0abe41-cc89-4a1d-bb1c-48bd212ae00d,Azad,Warrior,5,15,6,7,6,9,1,5,6,3,2,7,6,16,5,4,8,1,1,1,1,1,16,2,17,1,1,1,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,4.0,1,3,4,1,0,0
2,edfd1a62-05da-4cfd-b8ae-521f05f0fd5f,Piya,Jha,1,15,17,8,5,9,1,6,2,3,2,4,6,16,5,4,8,1,1,1,1,1,16,2,17,1,1,1,0.0,,0.0,0.0,0.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,4.0,1,3,4,1,0,0
3,3eaa4c2a-2316-48f4-b1b0-4aafe2286630,Rhea,Varma,11,7,2,11,13,14,8,14,3,5,2,0,6,16,5,4,8,1,1,1,1,1,18,2,17,0,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,4.0,1,2,4,1,0,0
4,c9a73239-baae-40eb-8855-dab07767ea86,Sahil,Dave,12,14,9,8,11,6,13,5,6,3,2,4,6,16,5,4,8,1,1,1,1,1,16,2,17,1,1,1,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,4.0,1,3,4,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1106669,c341a4e3-b4e2-455b-80b2-2be2655e6e85,Gatik,Boase,19,1,5,8,11,2,6,18,2,3,2,10,6,16,5,4,8,1,1,1,1,1,16,2,17,1,1,1,0.0,0.0,0.0,0.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,4.0,1,3,5,1,0,0
1106670,29d8337c-13c9-4bb6-b794-762c386b1866,Jhanvi,Andra,10,5,8,4,3,5,7,18,2,3,2,6,6,16,5,4,8,1,1,1,1,1,16,2,17,1,1,1,,0.0,0.0,0.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,,3.0,4.0,3.0,3.0,3.0,4.0,1,3,5,1,0,0
1106671,e3b374f2-d96e-4c8b-901a-be80a8bea3cc,Lakshit,Kulkarni,3,8,7,13,0,4,9,3,2,3,2,2,6,16,5,4,8,1,1,1,1,1,16,2,17,1,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,4.0,1,3,4,1,0,0
1106672,b81e5b47-5c1b-4302-897c-2edc179d583f,Parinaaz,Chaudry,4,10,4,16,9,6,13,3,2,3,2,4,6,16,5,4,8,1,1,1,1,1,16,2,17,1,1,1,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,3.0,3.0,3.0,3.0,,3.0,4.0,3.0,3.0,3.0,4.0,1,3,4,1,0,0


In [82]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1106674 entries, 0 to 1106673
Data columns (total 62 columns):
 #   Column                              Non-Null Count    Dtype  
---  ------                              --------------    -----  
 0   customer_id                         1106674 non-null  object 
 1   firstname                           1106674 non-null  object 
 2   lastname                            1106674 non-null  object 
 3   record_number                       1106674 non-null  int64  
 4   days_since_opened                   1106674 non-null  int64  
 5   days_since_confirmed                1106674 non-null  int64  
 6   primary_term                        1106674 non-null  int64  
 7   final_term                          1106674 non-null  int64  
 8   days_till_primary_close             1106674 non-null  int64  
 9   days_till_final_close               1106674 non-null  int64  
 10  loans_credit_limit                  1106674 non-null  int64  
 11  loans_next_

In [83]:
data = data.drop(['customer_id', 'firstname', 'lastname'], axis = 1)

In [84]:
data

Unnamed: 0,record_number,days_since_opened,days_since_confirmed,primary_term,final_term,days_till_primary_close,days_till_final_close,loans_credit_limit,loans_next_payment_summary,loans_outstanding_balance,loans_max_overdue_amount,loans_credit_cost_rate,loans_within_5_days,loans_within_5_to_30_days,loans_within_30_to_60_days,loans_within_60_to_90_days,loans_over_90_days,is_zero_loans_within_5_days,is_zero_loans_within_5_to_30_days,is_zero_loans_within_30_to_60_days,is_zero_loans_within_60_to_90_days,is_zero_loans_over_90_days,utilization,over_limit_count,max_over_limit_count,is_zero_utilization,is_zero_over_limit_count,is_zero_max_over_limit_count,encoded_payment_0,encoded_payment_1,encoded_payment_2,encoded_payment_3,encoded_payment_4,encoded_payment_5,encoded_payment_6,encoded_payment_7,encoded_payment_8,encoded_payment_9,encoded_payment_10,encoded_payment_11,encoded_payment_12,encoded_payment_13,encoded_payment_14,encoded_payment_15,encoded_payment_16,encoded_payment_17,encoded_payment_18,encoded_payment_19,encoded_payment_20,encoded_payment_21,encoded_payment_22,encoded_payment_23,encoded_payment_24,encoded_loans_account_holder_type,encoded_loans_credit_status,encoded_loans_credit_type,encoded_loans_account_currency,primary_close_flag,final_close_flag
0,7,10,8,12,10,13,5,11,2,3,2,4,6,13,5,4,8,1,0,1,1,1,16,2,17,1,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,2.0,0.0,1.0,3.0,4.0,1,3,4,1,0,0
1,5,15,6,7,6,9,1,5,6,3,2,7,6,16,5,4,8,1,1,1,1,1,16,2,17,1,1,1,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,4.0,1,3,4,1,0,0
2,1,15,17,8,5,9,1,6,2,3,2,4,6,16,5,4,8,1,1,1,1,1,16,2,17,1,1,1,0.0,,0.0,0.0,0.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,4.0,1,3,4,1,0,0
3,11,7,2,11,13,14,8,14,3,5,2,0,6,16,5,4,8,1,1,1,1,1,18,2,17,0,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,4.0,1,2,4,1,0,0
4,12,14,9,8,11,6,13,5,6,3,2,4,6,16,5,4,8,1,1,1,1,1,16,2,17,1,1,1,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,4.0,1,3,4,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1106669,19,1,5,8,11,2,6,18,2,3,2,10,6,16,5,4,8,1,1,1,1,1,16,2,17,1,1,1,0.0,0.0,0.0,0.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,4.0,1,3,5,1,0,0
1106670,10,5,8,4,3,5,7,18,2,3,2,6,6,16,5,4,8,1,1,1,1,1,16,2,17,1,1,1,,0.0,0.0,0.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,,3.0,4.0,3.0,3.0,3.0,4.0,1,3,5,1,0,0
1106671,3,8,7,13,0,4,9,3,2,3,2,2,6,16,5,4,8,1,1,1,1,1,16,2,17,1,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,4.0,1,3,4,1,0,0
1106672,4,10,4,16,9,6,13,3,2,3,2,4,6,16,5,4,8,1,1,1,1,1,16,2,17,1,1,1,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,3.0,3.0,3.0,3.0,,3.0,4.0,3.0,3.0,3.0,4.0,1,3,4,1,0,0


In [85]:
test_data = test_data.drop(['customer_id', 'firstname', 'lastname'], axis = 1)

In [86]:
test_data.tail()

Unnamed: 0,record_number,days_since_opened,days_since_confirmed,primary_term,final_term,days_till_primary_close,days_till_final_close,loans_credit_limit,loans_next_payment_summary,loans_outstanding_balance,loans_max_overdue_amount,loans_credit_cost_rate,loans_within_5_days,loans_within_5_to_30_days,loans_within_30_to_60_days,loans_within_60_to_90_days,loans_over_90_days,is_zero_loans_within_5_days,is_zero_loans_within_5_to_30_days,is_zero_loans_within_30_to_60_days,is_zero_loans_within_60_to_90_days,is_zero_loans_over_90_days,utilization,over_limit_count,max_over_limit_count,is_zero_utilization,is_zero_over_limit_count,is_zero_max_over_limit_count,encoded_payment_0,encoded_payment_1,encoded_payment_2,encoded_payment_3,encoded_payment_4,encoded_payment_5,encoded_payment_6,encoded_payment_7,encoded_payment_8,encoded_payment_9,encoded_payment_10,encoded_payment_11,encoded_payment_12,encoded_payment_13,encoded_payment_14,encoded_payment_15,encoded_payment_16,encoded_payment_17,encoded_payment_18,encoded_payment_19,encoded_payment_20,encoded_payment_21,encoded_payment_22,encoded_payment_23,encoded_payment_24,encoded_loans_account_holder_type,encoded_loans_credit_status,encoded_loans_credit_type,encoded_loans_account_currency
474284,1,10,4,7,11,6,13,5,6,3,2,2,6,16,5,4,8,1,1,1,1,1,16,2,17,1,1,1,0.0,0.0,0.0,0.0,0.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,4.0,1,3,4,1
474285,8,3,11,7,3,11,12,13,2,3,2,6,6,16,5,4,8,1,1,1,1,1,16,2,17,1,1,1,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,4.0,1,3,5,1
474286,12,6,13,12,10,16,7,16,2,3,2,7,6,13,5,4,8,1,0,1,1,1,16,2,17,1,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,2.0,0.0,0.0,1.0,1.0,1,3,4,1
474287,16,2,16,0,7,10,8,8,2,3,2,7,6,16,5,4,8,1,0,1,1,1,16,2,17,1,1,1,0.0,,0.0,0.0,0.0,0.0,1.0,0.0,,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,4.0,1,2,3,1
474288,8,10,14,12,10,13,5,12,2,3,2,2,6,16,5,4,8,1,1,1,1,1,16,2,17,1,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1,3,4,1


In [87]:
for column in data.columns:
    print(data[column].unique())

[ 7  5  1 11 12 10 33  8 19  3 18  4  2 15  6  9 16 14 21 13 23 20 29 17
 30 24 27 22 25 32 26 34 35 28 36 31 45 38 40 37 39 41 43 42 47 46]
[10 15  7 14 11 13  6  9  5  4  1  2  0  8 19 17 18  3 16 12]
[ 8  6 17  2  9 13  5  7  0 12  4 16  1 14 11 10  3 15]
[12  7  8 11 14 13  9 17  4  3  6  2  1 10  0 15 16  5]
[10  6  5 13 11  7  0 12 14 15  8 16  1  9  3  2  4]
[13  9 14  6  3  8  2 16  7  1  0  4  5 11 15 12 10]
[ 5  1  8 13  6 12 15 14  3 11  9  0  4  2 10  7]
[11  5  6 14  9  1 13  3 12  4 16  8 19  2  0 10 18 17  7 15]
[2 6 3 4 1 5 0]
[3 5 2 4 1]
[2 1 3]
[ 4  7  0 11  5 10  2 13  9  6  3  8  1 12]
[ 6  0  3  5  2 13 16  7  1]
[13 16 18  0  3 12  6 15  2 11  4 14  7 19  1 10]
[5 8 2 7 1 9]
[4 1]
[ 8 13 14 10 19  2]
[1 0]
[0 1]
[1 0]
[1 0]
[1 0]
[16 18  1  7  9  6 13 12  0  5  3 15  2 11 19 10  8  4 17 14]
[ 2  5  1  8 17  6 13  3  9  7 18  4 15 10 16 14  0 11 12 19]
[17  4  0  3  5 11  9 14 19 15 13  1 16 12  6 10  7  8  2 18]
[1 0]
[1 0]
[1 0]
[ 0. nan  3.  1.  2.]
[ 0. nan  1.

In [88]:
data['encoded_payment_0'] = data['encoded_payment_0'].apply(pd.to_numeric, errors='coerce')

In [89]:
payments = ['encoded_payment_0',
            'encoded_payment_1',
            'encoded_payment_2',
            'encoded_payment_3',
            'encoded_payment_4',
            'encoded_payment_5',
            'encoded_payment_6',
            'encoded_payment_7',
            'encoded_payment_8',
            'encoded_payment_9',
            'encoded_payment_10',
            'encoded_payment_11',
            'encoded_payment_12',
            'encoded_payment_13',
            'encoded_payment_14',
            'encoded_payment_15',
            'encoded_payment_16',
            'encoded_payment_17',
            'encoded_payment_18',
            'encoded_payment_19',
            'encoded_payment_20',
            'encoded_payment_21',
            'encoded_payment_22',
            'encoded_payment_23',
            'encoded_payment_24']

for column in payments:
    data[column].fillna(data[column].mode(), inplace=True)


In [90]:
data

Unnamed: 0,record_number,days_since_opened,days_since_confirmed,primary_term,final_term,days_till_primary_close,days_till_final_close,loans_credit_limit,loans_next_payment_summary,loans_outstanding_balance,loans_max_overdue_amount,loans_credit_cost_rate,loans_within_5_days,loans_within_5_to_30_days,loans_within_30_to_60_days,loans_within_60_to_90_days,loans_over_90_days,is_zero_loans_within_5_days,is_zero_loans_within_5_to_30_days,is_zero_loans_within_30_to_60_days,is_zero_loans_within_60_to_90_days,is_zero_loans_over_90_days,utilization,over_limit_count,max_over_limit_count,is_zero_utilization,is_zero_over_limit_count,is_zero_max_over_limit_count,encoded_payment_0,encoded_payment_1,encoded_payment_2,encoded_payment_3,encoded_payment_4,encoded_payment_5,encoded_payment_6,encoded_payment_7,encoded_payment_8,encoded_payment_9,encoded_payment_10,encoded_payment_11,encoded_payment_12,encoded_payment_13,encoded_payment_14,encoded_payment_15,encoded_payment_16,encoded_payment_17,encoded_payment_18,encoded_payment_19,encoded_payment_20,encoded_payment_21,encoded_payment_22,encoded_payment_23,encoded_payment_24,encoded_loans_account_holder_type,encoded_loans_credit_status,encoded_loans_credit_type,encoded_loans_account_currency,primary_close_flag,final_close_flag
0,7,10,8,12,10,13,5,11,2,3,2,4,6,13,5,4,8,1,0,1,1,1,16,2,17,1,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,2.0,0.0,1.0,3.0,4.0,1,3,4,1,0,0
1,5,15,6,7,6,9,1,5,6,3,2,7,6,16,5,4,8,1,1,1,1,1,16,2,17,1,1,1,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,4.0,1,3,4,1,0,0
2,1,15,17,8,5,9,1,6,2,3,2,4,6,16,5,4,8,1,1,1,1,1,16,2,17,1,1,1,0.0,,0.0,0.0,0.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,4.0,1,3,4,1,0,0
3,11,7,2,11,13,14,8,14,3,5,2,0,6,16,5,4,8,1,1,1,1,1,18,2,17,0,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,4.0,1,2,4,1,0,0
4,12,14,9,8,11,6,13,5,6,3,2,4,6,16,5,4,8,1,1,1,1,1,16,2,17,1,1,1,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,4.0,1,3,4,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1106669,19,1,5,8,11,2,6,18,2,3,2,10,6,16,5,4,8,1,1,1,1,1,16,2,17,1,1,1,0.0,0.0,0.0,0.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,4.0,1,3,5,1,0,0
1106670,10,5,8,4,3,5,7,18,2,3,2,6,6,16,5,4,8,1,1,1,1,1,16,2,17,1,1,1,,0.0,0.0,0.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,,3.0,4.0,3.0,3.0,3.0,4.0,1,3,5,1,0,0
1106671,3,8,7,13,0,4,9,3,2,3,2,2,6,16,5,4,8,1,1,1,1,1,16,2,17,1,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,4.0,1,3,4,1,0,0
1106672,4,10,4,16,9,6,13,3,2,3,2,4,6,16,5,4,8,1,1,1,1,1,16,2,17,1,1,1,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,3.0,3.0,3.0,3.0,,3.0,4.0,3.0,3.0,3.0,4.0,1,3,4,1,0,0


In [91]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1106674 entries, 0 to 1106673
Data columns (total 59 columns):
 #   Column                              Non-Null Count    Dtype  
---  ------                              --------------    -----  
 0   record_number                       1106674 non-null  int64  
 1   days_since_opened                   1106674 non-null  int64  
 2   days_since_confirmed                1106674 non-null  int64  
 3   primary_term                        1106674 non-null  int64  
 4   final_term                          1106674 non-null  int64  
 5   days_till_primary_close             1106674 non-null  int64  
 6   days_till_final_close               1106674 non-null  int64  
 7   loans_credit_limit                  1106674 non-null  int64  
 8   loans_next_payment_summary          1106674 non-null  int64  
 9   loans_outstanding_balance           1106674 non-null  int64  
 10  loans_max_overdue_amount            1106674 non-null  int64  
 11  loans_credi

In [92]:
categorical_columns = ['is_zero_loans_within_5_days', 'is_zero_loans_within_5_to_30_days', 'is_zero_loans_within_30_to_60_days', 'is_zero_loans_within_60_to_90_days', 'is_zero_loans_over_90_days', 'is_zero_utilization', 'is_zero_over_limit_count', 'is_zero_max_over_limit_count', 'encoded_loans_account_holder_type', 'encoded_loans_credit_status', 'encoded_loans_credit_type', 'encoded_loans_account_currency']
label_encoder = LabelEncoder()
for column in categorical_columns:
    data[column] = label_encoder.fit_transform(data[column])
    test_data[column] = label_encoder.transform(test_data[column])

In [93]:
labels = ['primary_close_flag', 'final_close_flag']
y_train = data[labels]

In [94]:
X_train = data.drop(labels, axis=1)
X_test = test_data

In [95]:
model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', enable_categorical=True)

In [96]:
model.fit(X_train, y_train)

In [97]:
y_pred = model.predict(X_test)

In [98]:
y_pred

array([[0., 0.],
       [0., 0.],
       [0., 0.],
       ...,
       [0., 0.],
       [0., 0.],
       [0., 0.]])

In [99]:
result_df = pd.DataFrame(y_pred, columns = ['primary_close_flag', 'final_close_flag'])
result_df.to_csv('submission.csv', index=False)