In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/debt-default-prediction/X_test.csv
/kaggle/input/debt-default-prediction/DataDictionary.xlsx
/kaggle/input/debt-default-prediction/valid.csv
/kaggle/input/debt-default-prediction/train.csv


# Train dataset is read and the rows with missing target value is removed from the dataset.

In [2]:
X_train = pd.read_csv('/kaggle/input/debt-default-prediction/train.csv')
X_train.dropna(axis=0, subset=['loan_status'], inplace=True)
X_train.head()
y_train = X_train.loan_status
X_train.drop(['loan_status'], axis = 1,inplace= True )


In [3]:
print(X_train.shape)
missing_val_count_by_column = (X_train.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column >0])

(517788, 144)
id                       517788
member_id                517788
emp_title                 34051
emp_length                31300
url                      517788
                          ...  
settlement_status        498528
settlement_date          498528
settlement_amount        498528
settlement_percentage    498528
settlement_term          498528
Length: 104, dtype: int64


In [4]:
num_cols_with_missing = (missing_val_count_by_column >200000).sum()
num_cols_with_missing

58

In [5]:
# Identify columns with more than 50,000 missing values
cols_to_drop = missing_val_count_by_column[missing_val_count_by_column > 200000].index

# Ensure that the columns to drop actually exist in X_train
cols_to_drop = [col for col in cols_to_drop if col in X_train.columns]

# Drop columns from X_train
X_train.drop(cols_to_drop, axis=1, inplace=True)

# Print the shape of X_train after dropping columns
print(X_train.shape)

(517788, 86)


In [6]:
X_valid = pd.read_csv('/kaggle/input/debt-default-prediction/valid.csv')
X_valid.dropna(axis=0, subset=['loan_status'], inplace=True)

y_valid = X_valid.loan_status
X_valid.drop(['loan_status'], axis = 1,inplace= True )
train_columns  = X_train.columns
X_valid = X_valid[train_columns]
X_valid.shape

(172596, 86)

In [None]:
X_test = pd.read_csv('/kaggle/input/debt-default-prediction/X_test.csv')
X_test = X_test[train_columns]
X_test.shape

* **Let us handle the rest of the missing values with imputations , numerical values with the mean of the column and the object columns with the most frequent value.**

In [7]:
unique_values_counts = X_train.nunique()

# Find columns where number of unique values is equal to 1
columns_with_same_value = unique_values_counts[unique_values_counts == 1].index.tolist()

# Print columns with the same value for every entry

X_train = X_train.drop(columns=columns_with_same_value)
X_valid = X_valid.drop(columns=columns_with_same_value)
X_test = X_test.drop(columns=columns_with_same_value)
print(X_train.shape, X_valid.shape, X_test.shape)

(517788, 81) (172596, 81)


In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

my_imputer = SimpleImputer()
numerical_cols = [cname for cname in X_train.columns if 
                X_train[cname].dtype in ['int64', 'float64']]

categorical_cols = [cname for cname in X_train.columns if 
                    X_train[cname].dtype == 'object']

numerical_imputer = SimpleImputer(strategy='mean')
X_train_numerical = X_train[numerical_cols].copy()
X_valid_numerical = X_valid[numerical_cols].copy()
X_test_numerical = X_test[numerical_cols].copy()
X_train_numerical = numerical_imputer.fit_transform(X_train_numerical)
X_valid_numerical = numerical_imputer.transform(X_valid_numerical)
X_test_numerical = numerical_imputer.transform(X_test_numerical)


# Preprocessing for categorical data
categorical_imputer = SimpleImputer(strategy='most_frequent')
X_train_categorical = X_train[categorical_cols].copy()
X_valid_categorical = X_valid[categorical_cols].copy()
X_test_categorical = X_test[categorical_cols].copy()
X_train_categorical = categorical_imputer.fit_transform(X_train_categorical)
X_valid_categorical = categorical_imputer.transform(X_valid_categorical)
X_test_categorical = categorical_imputer.transform(X_test_categorical)

# Convert back to DataFrame

X_train_numerical = pd.DataFrame(X_train_numerical, columns=numerical_cols)
X_train_categorical = pd.DataFrame(X_train_categorical, columns=categorical_cols)
X_valid_numerical = pd.DataFrame(X_valid_numerical, columns=numerical_cols)
X_valid_categorical = pd.DataFrame(X_valid_categorical, columns=categorical_cols)
X_test_numerical = pd.DataFrame(X_test_numerical, columns=numerical_cols)
X_test_categorical = pd.DataFrame(X_test_categorical, columns=categorical_cols)



# Now you can combine the numerical and categorical data
X_train = pd.concat([X_train_numerical, X_train_categorical], axis=1)
X_valid =  pd.concat([X_valid_numerical, X_valid_categorical], axis=1)
X_test=  pd.concat([X_test_numerical, X_test_categorical], axis=1)



In [9]:
missing_val_count_by_column = (X_train.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column >0])
missing_val_count_by_column_valid = (X_valid.isnull().sum())
print(missing_val_count_by_column_valid[missing_val_count_by_column_valid >0])
print(X_train.shape, X_valid.shape)

Series([], dtype: int64)
Series([], dtype: int64)
(517788, 81) (172596, 81)


In [10]:
X_train[categorical_cols].head()

Unnamed: 0,term,grade,sub_grade,emp_title,emp_length,home_ownership,verification_status,issue_d,purpose,title,zip_code,addr_state,earliest_cr_line,initial_list_status,last_pymnt_d,last_credit_pull_d,application_type,disbursement_method,debt_settlement_flag
0,36 months,A,A4,Paralegal,1 year,MORTGAGE,Not Verified,Aug-2017,debt_consolidation,Debt consolidation,740xx,OK,Feb-2003,f,Dec-2018,Dec-2018,Individual,Cash,N
1,36 months,D,D2,Teacher,10+ years,MORTGAGE,Verified,Jul-2014,debt_consolidation,Debt consolidation,337xx,FL,Mar-1982,w,Jul-2017,Jul-2017,Individual,Cash,N
2,36 months,A,A4,owner,4 years,MORTGAGE,Not Verified,Mar-2016,credit_card,Credit card refinancing,786xx,TX,Jul-1997,f,Oct-2017,Oct-2017,Individual,Cash,N
3,36 months,B,B5,Teacher,10+ years,MORTGAGE,Source Verified,Jan-2015,debt_consolidation,Debt consolidation,780xx,TX,Apr-1998,f,Jan-2018,Dec-2017,Individual,Cash,N
4,36 months,A,A4,Senior UX designer,< 1 year,MORTGAGE,Source Verified,Mar-2016,credit_card,Credit card refinancing,191xx,PA,Jan-2001,w,Dec-2017,Jul-2018,Individual,Cash,N


In [11]:
X_train['emp_length'].unique()

array(['1 year', '10+ years', '4 years', '< 1 year', '5 years', '2 years',
       '7 years', '9 years', '3 years', '8 years', '6 years'],
      dtype=object)

In [12]:
from sklearn.preprocessing import OrdinalEncoder
#custom_order_grade = ['A', 'B','C','D','E','F','G']
custom_order_subgrade = ['A1','A2','A3','A4','A5','B1','B2','B3','B4','B5','C1','C2','C3','C4','C5','D1','D2','D3','D4','D5','E1','E2','E3','E4','E5','F1','F2','F3','F4','F5','G1','G2','G3','G4','G5']
ordinal_encoder = OrdinalEncoder(categories=[custom_order_subgrade])
X_train['sub_grade'] = ordinal_encoder.fit_transform(X_train[['sub_grade']])


In [13]:
X_valid['sub_grade'] = ordinal_encoder.transform(X_valid[['sub_grade']]) 
X_test['sub_grade'] = ordinal_encoder.transform(X_test[['sub_grade']]) 

custom_order_emp_length = ['< 1 year', '1 year', '2 years', '3 years','4 years',  '5 years','6 years', '7 years','8 years','9 years','10+ years']
ordinal_encoder2 = OrdinalEncoder(categories=[custom_order_emp_length])
X_train['emp_length'] = ordinal_encoder2.fit_transform(X_train[['emp_length']])
X_valid['emp_length'] = ordinal_encoder2.transform(X_valid[['emp_length']])
X_test['emp_length'] = ordinal_encoder2.transform(X_test[['emp_length']])
print('done')

done


In [14]:
X_valid['sub_grade'].head()

0    16.0
1    11.0
2    11.0
3     6.0
4     8.0
Name: sub_grade, dtype: float64

In [15]:

X_train_modified= X_train.drop(columns = ['grade'])
X_valid_modified = X_valid.drop(columns = ['grade'])
X_test_modified = X_test.drop(columns = ['grade'])# redundant feature with grade


In [16]:
X_valid_modified.shape

(172596, 80)

In [17]:
# List to store features with unequal unique values
columns_with_different_unique_values = []

categorical_cols = [cname for cname in X_train_modified.columns if 
                    X_train_modified[cname].dtype == 'object']
#print(categorical_cols)
# Iterate over each feature
for col in categorical_cols:
    # Get unique values of the feature in train and validation sets
    train_unique_values = set(X_train_modified[col].unique())
    valid_unique_values = set(X_valid_modified[col].unique())
    #print(col,X_train[col].nunique() ,X_valid[col].nunique())
    
    # Check if unique values are not equal
    if not valid_unique_values.issubset(train_unique_values):
        columns_with_different_unique_values.append(col)
        print(col, X_train_modified[col].nunique(),X_valid_modified[col].nunique())


# Print features with unequal unique values
print("Features with unequal unique values between X_train and X_valid:")
print(columns_with_different_unique_values)

emp_title 175168 70130
title 27117 10637
zip_code 909 886
earliest_cr_line 709 670
last_pymnt_d 133 131
last_credit_pull_d 132 128
Features with unequal unique values between X_train and X_valid:
['emp_title', 'title', 'zip_code', 'earliest_cr_line', 'last_pymnt_d', 'last_credit_pull_d']


In [18]:
good_label_cols = [col for col in categorical_cols if 
                   set(X_valid_modified[col]).issubset(set(X_train_modified[col]))]

bad_label_cols = list(set(categorical_cols)-set(good_label_cols))
        
print('Categorical columns that will be ordinal encoded:', good_label_cols)
print('\nCategorical columns that will be dropped from the dataset:', bad_label_cols)        

Categorical columns that will be ordinal encoded: ['term', 'home_ownership', 'verification_status', 'issue_d', 'purpose', 'addr_state', 'initial_list_status', 'application_type', 'disbursement_method', 'debt_settlement_flag']

Categorical columns that will be dropped from the dataset: ['emp_title', 'earliest_cr_line', 'last_pymnt_d', 'zip_code', 'last_credit_pull_d', 'title']


The bad labels are removed since in the valid data set those features contain different labels from the train dataset

In [19]:
X_train_pre_encoding = X_train_modified.drop(bad_label_cols, axis=1)
X_valid_pre_encoding= X_valid_modified.drop(bad_label_cols, axis=1)

In [20]:
set(X_train_pre_encoding['term'].unique()) == set(X_valid_pre_encoding['term'].unique())

True

In [21]:
X_valid_pre_encoding['term'].unique()

array([' 60 months', ' 36 months'], dtype=object)

In [22]:
X_train_pre_encoding['term'].unique()

array([' 36 months', ' 60 months'], dtype=object)

In [23]:
from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder()
X_train_encoded = X_train_pre_encoding.copy()
X_valid_encoded = X_valid_pre_encoding.copy()
print(set(X_train_encoded['term'].unique()) == set(X_valid_encoded['term'].unique()))
X_train_encoded[good_label_cols] = ordinal_encoder.fit_transform(X_train_pre_encoding[good_label_cols])
X_valid_encoded[good_label_cols] = ordinal_encoder.transform(X_valid_pre_encoding[good_label_cols])


True


In [24]:
categorical_cols = [cname for cname in X_train_encoded.columns if X_train_encoded[cname].dtype == 'object']
categorical_cols

[]

In [30]:
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

X_train_new  = X_train_encoded
X_valid_new = X_valid_encoded

# Define and train an XGBoost classifier
model = xgb.XGBClassifier()
model.fit(X_train_new, y_train)
y_pred_initial = model.predict(X_valid_new)
accuracy_initial = accuracy_score(y_valid, y_pred_initial)
precision_initial = precision_score(y_valid, y_pred_initial)
print(f"Accuracy with initial features: {accuracy_initial}")
print(f"precision with initial features: {precision_initial}")
# Get feature importance scores
feature_importances = model.feature_importances_

# Print feature importance scores
# for i, score in enumerate(feature_importances):
#     print(f"Feature '{X_train_new.columns[i]}' importance: {score}")

# Use feature importance scores to select features
selected_features = [X_train_new.columns[i] for i, score in enumerate(feature_importances) if score > 0.00005]

# Select only the selected features for both training and validation sets
X_train_selected = X_train_new[selected_features]
X_valid_selected = X_valid_new[selected_features]

# Train and evaluate the model using only selected features
model_selected = xgb.XGBClassifier()
model_selected.fit(X_train_selected, y_train)
y_pred_selected = model_selected.predict(X_valid_selected)
accuracy_selected = accuracy_score(y_valid, y_pred_selected)
precision = precision_score(y_valid, y_pred_selected)
print(f"Accuracy with selected features: {accuracy_selected}")
print(f"precision with selected features: {precision}")


Accuracy with initial features: 0.9994669633131706
precision with initial features: 0.9992604963814177
Accuracy with selected features: 0.9994727571902013
precision with selected features: 0.9992687993352721


In [31]:
X_train_new.head()

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,...,emp_length,home_ownership,verification_status,issue_d,purpose,addr_state,initial_list_status,application_type,disbursement_method,debt_settlement_flag
0,14000.0,14000.0,14000.0,7.35,434.53,44000.0,34.7,0.0,0.0,7.0,...,1.0,1.0,0.0,21.0,2.0,36.0,0.0,0.0,0.0,0.0
1,2000.0,2000.0,2000.0,16.29,70.61,11420.0,20.61,0.0,0.0,10.0,...,10.0,1.0,2.0,64.0,2.0,9.0,1.0,0.0,0.0,0.0
2,5000.0,5000.0,4750.0,7.39,155.28,65000.0,11.04,0.0,0.0,7.0,...,4.0,1.0,0.0,88.0,1.0,43.0,0.0,0.0,0.0,0.0
3,20600.0,20600.0,20600.0,11.99,684.12,46956.0,24.43,0.0,0.0,11.0,...,10.0,1.0,1.0,53.0,2.0,43.0,0.0,0.0,0.0,0.0
4,10000.0,10000.0,10000.0,7.39,310.56,115000.0,17.94,0.0,0.0,17.0,...,0.0,1.0,1.0,88.0,1.0,38.0,1.0,0.0,0.0,0.0


Now let us try a PCA analaysis

In [32]:
X_valid_new.head()

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,...,emp_length,home_ownership,verification_status,issue_d,purpose,addr_state,initial_list_status,application_type,disbursement_method,debt_settlement_flag
0,14575.0,14575.0,14575.0,17.57,366.71,36500.0,20.55,0.0,0.0,14.0,...,10.0,5.0,1.0,86.0,2.0,34.0,0.0,0.0,0.0,0.0
1,10000.0,10000.0,10000.0,13.59,339.79,66000.0,14.73,0.0,0.0,5.0,...,3.0,5.0,1.0,78.0,6.0,9.0,1.0,0.0,0.0,0.0
2,17000.0,17000.0,17000.0,13.35,575.67,65000.0,25.06,0.0,0.0,13.0,...,10.0,1.0,2.0,121.0,2.0,43.0,1.0,0.0,0.0,0.0
3,4500.0,4500.0,4500.0,9.93,145.06,21600.0,11.39,0.0,0.0,10.0,...,10.0,5.0,1.0,124.0,2.0,36.0,0.0,0.0,0.0,0.0
4,6250.0,6250.0,6250.0,12.99,210.56,35000.0,11.49,0.0,3.0,15.0,...,10.0,5.0,0.0,29.0,2.0,14.0,0.0,0.0,0.0,0.0


In [27]:
from sklearn.decomposition import PCA
pca = PCA(n_components = 30)
X_train_pca = pca.fit_transform(X_train_selected)
component_names  = [f"PC{i+1}" for i in range(X_train_pca.shape[1])]
X_train_pca = pd.DataFrame(X_train_pca, columns= component_names)
X_valid_pca = pca.transform(X_valid_selected)
X_valid_pca = pd.DataFrame(X_valid_pca, columns= component_names)
X_valid_pca.head() 
#X_pca.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC21,PC22,PC23,PC24,PC25,PC26,PC27,PC28,PC29,PC30
0,-172636.165911,-16956.493622,5690.137564,16445.02834,-3283.793656,-11101.573946,9405.835633,5241.698347,1831.155843,-4211.143775,...,79.722847,-7.905431,-16.34996,1.567377,-7.980903,3.744865,2.041882,-4.609702,3.241429,-20.069712
1,-173517.112414,10038.653755,1069.741881,-23635.311012,1189.725468,1021.789765,4310.628143,1459.5036,3994.160455,-1614.140163,...,-88.044186,-53.02967,-6.783695,1.410515,14.185092,-0.770911,5.255154,-2.449595,11.190191,-5.325664
2,-98780.876521,18586.093497,-50000.898131,-15362.811735,-7412.970509,-12752.566155,6201.656092,-2499.136006,2023.705463,-2617.34213,...,-47.259163,57.5898,-51.141792,8.682393,-20.694235,1.772766,4.67967,-4.95074,-2.835614,5.405579
3,-213026.274572,-36658.444172,22835.840909,6031.520282,-1503.53166,14157.155799,-378.22587,1820.876647,3038.278944,-4612.953419,...,-37.656923,-48.662101,-52.553066,-21.418673,-13.389966,4.026859,-4.1831,1.680589,3.921442,5.93911
4,-191165.782132,-23404.53698,25299.994622,25123.106486,1337.573496,14387.348197,2401.17037,8707.6882,3918.446987,-3892.393058,...,4.140626,-50.728688,41.867348,2.484935,8.090784,-2.640288,-0.567593,-1.208332,-7.820421,6.628247


In [28]:
loadings   = pd.DataFrame(pca.components_.T,
                         columns  = component_names,
                         
                         )
loadings = pd.DataFrame(pca.components_.T, columns=component_names)
loadings

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC21,PC22,PC23,PC24,PC25,PC26,PC27,PC28,PC29,PC30
0,0.01256357,0.03305516,-0.004413627,0.0446978,-0.02016777,-0.3454754,0.07456085,0.002270701,-0.3022442,0.2265631,...,0.000935,0.00105,0.000555,-0.000171,1.4e-05,0.0005580946,9.298123e-05,6.221951e-05,0.000198,0.0002066279
1,0.0125632,0.03303395,-0.004422193,0.04469345,-0.02014475,-0.3454307,0.0744497,0.002387397,-0.3017526,0.2263122,...,-0.003668,0.000498,-0.000959,0.002004,0.000102,-0.00019697,-0.000748066,-0.001030106,-4e-06,0.0003520252
2,0.01255986,0.03300416,-0.004426352,0.04468475,-0.02012181,-0.3455072,0.07442168,0.002541334,-0.3013244,0.2260692,...,0.000963,4.9e-05,0.000531,-0.000396,-0.000108,-0.0004405995,0.0005329857,0.001176163,-5e-05,-0.0004435754
3,-2.174997e-06,-1.459602e-06,-7.255573e-06,-1.482441e-05,-1.830511e-05,-2.863113e-05,6.946099e-05,-1.090489e-05,-0.0001302146,3.667499e-05,...,-0.002914,0.009332,-0.000342,0.014171,-0.000213,-0.0001209417,-0.01690034,-0.0009063154,0.019328,0.02807561
4,0.0003413667,0.0009994234,-0.0001230551,0.001244501,-0.000649202,-0.009905592,0.002461035,-0.0001338449,-0.008754419,0.006243431,...,0.057051,-0.050455,-0.004332,-0.046291,-0.002133,-2.403247e-05,0.003178156,-0.01338894,-0.002684,-0.003727909
5,0.1282808,0.9259356,0.3226253,-0.1336408,0.005216145,0.06339068,-0.0006857844,0.003380857,0.00761357,-0.007625684,...,-9e-06,2e-05,-1e-06,7e-06,-2e-06,1.322188e-07,-9.021322e-07,-3.466821e-06,-1e-06,7.754663e-07
6,1.255041e-07,4.163664e-07,-3.691896e-07,-3.085662e-07,7.338015e-07,2.245097e-06,-1.287164e-06,4.120695e-06,-6.457747e-06,-5.586121e-06,...,0.000159,-0.002595,0.000285,0.002956,-0.002059,-0.003596436,-0.01053874,0.002742885,-0.005205,0.003121826
7,-1.913236e-07,1.681716e-07,2.382027e-07,-1.362958e-06,6.530813e-07,1.311712e-06,1.847897e-06,3.224005e-06,-6.603977e-07,-1.365626e-06,...,0.000668,-0.001317,1.5e-05,0.00065,0.000162,-0.001612185,-0.001374116,0.0004418704,-0.001332,-0.002073064
8,0.04331137,0.062876,-0.04097019,0.3468032,-0.332818,0.1006779,0.4288144,-0.2632321,-0.2976625,0.05551961,...,-0.000372,-0.001572,-2e-06,-0.00012,-0.000173,-0.0004169874,2.246359e-05,-1.00994e-05,-5.7e-05,-9.01432e-05
9,5.640709e-06,7.385352e-06,-2.772627e-05,-8.225003e-05,-0.0002381862,-7.35056e-05,0.0005386105,-0.0006521568,-0.0003820512,0.0002033722,...,0.055197,0.375291,0.002789,0.077919,0.266165,0.6601191,-0.1206357,0.04847354,0.013918,0.5682512


In [29]:
model_with_pca = xgb.XGBClassifier()
model_with_pca.fit(X_train_pca, y_train)
y_pred_pca = model_with_pca.predict(X_valid_pca)
accuracy_pca = accuracy_score(y_valid, y_pred_pca)
precision_pca = precision_score(y_valid, y_pred_pca)
print(f"Accuracy with selected features: {accuracy_pca}")
print(f"precision with selected features: {precision_pca}")

Accuracy with selected features: 0.9950462351387054
precision with selected features: 0.9933237486159998
