In [8]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

In [28]:
df = pd.read_csv("credit_data_10000.csv")
df.head()

Unnamed: 0,customer_id,age,gender,monthly_income,total_credit_limit,monthly_emi_outflow,current_outstanding,credit_utilization_ratio,repayment_history_score,dpd_last_3_months,num_hard_inquiries_last_6m,target
0,1,21,Female,42863,239939,6432,121620,0.506879,5,2,1,stable
1,2,51,Male,45232,207422,9739,160254,0.772601,50,35,2,stable
2,3,47,Male,27875,137476,8724,87092,0.633511,92,0,1,stable
3,4,23,Male,16292,114662,3909,101615,0.886212,9,28,1,stable
4,5,61,Female,22203,152953,3811,65782,0.43008,51,51,0,stable


In [29]:
money_cols = ['monthly_income', 'total_credit_limit', 'monthly_emi_outflow','current_outstanding' ]

df[money_cols] = df[money_cols].astype(int)

print(df[money_cols].head())

   monthly_income  total_credit_limit  monthly_emi_outflow  \
0           42863              239939                 6432   
1           45232              207422                 9739   
2           27875              137476                 8724   
3           16292              114662                 3909   
4           22203              152953                 3811   

   current_outstanding  
0               121620  
1               160254  
2                87092  
3               101615  
4                65782  


In [30]:
df.head()

Unnamed: 0,customer_id,age,gender,monthly_income,total_credit_limit,monthly_emi_outflow,current_outstanding,credit_utilization_ratio,repayment_history_score,dpd_last_3_months,num_hard_inquiries_last_6m,target
0,1,21,Female,42863,239939,6432,121620,0.506879,5,2,1,stable
1,2,51,Male,45232,207422,9739,160254,0.772601,50,35,2,stable
2,3,47,Male,27875,137476,8724,87092,0.633511,92,0,1,stable
3,4,23,Male,16292,114662,3909,101615,0.886212,9,28,1,stable
4,5,61,Female,22203,152953,3811,65782,0.43008,51,51,0,stable


In [31]:
# 3. Encode categoricals
le_gender = LabelEncoder()
le_target = LabelEncoder()
df['gender_enc'] = le_gender.fit_transform(df['gender'])
df['target_enc'] = le_target.fit_transform(df['target'])

In [32]:
# 4. Split and train XGBoost
features = [
    'age','gender_enc','monthly_income','total_credit_limit',
    'monthly_emi_outflow','current_outstanding','credit_utilization_ratio',
    'repayment_history_score','dpd_last_3_months','num_hard_inquiries_last_6m'
]
X = df[features]
y = df['target_enc']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [33]:
model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [18]:
y_pred = model.predict(X_test)
print(df.head(10))
print(classification_report(y_test, y_pred, target_names=le_target.classes_))

   customer_id  age gender  monthly_income  total_credit_limit  \
0            1   59   Male           18667               96797   
1            2   42   Male           44482              235349   
2            3   46   Male           10614               31066   
3            4   43   Male           12370               36907   
4            5   26   Male           24071              189757   
5            6   52  Other           16078              126769   
6            7   37   Male           12291               61889   
7            8   56   Male           52339              107954   
8            9   27   Male           23461              121520   
9           10   63   Male           54814              147411   

   monthly_emi_outflow  current_outstanding  credit_utilization_ratio  \
0                 2937                79777                  0.824171   
1                 5315               135654                  0.576397   
2                 2733                22530           

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [34]:
from sklearn.metrics import accuracy_score

# 1. Predict on the held-out test set
y_pred = model.predict(X_test)

# 2a. Using accuracy_score
acc = accuracy_score(y_test, y_pred)
print(f"Test accuracy (accuracy_score): {acc:.3f}")

# 2b. By manual comparison
acc_manual = (y_pred == y_test).mean()
print(f"Test accuracy (manual):         {acc_manual:.3f}")

Test accuracy (accuracy_score): 0.996
Test accuracy (manual):         0.996
