In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier
import joblib

In [2]:
# Cell 2: Load Data
df = pd.read_csv("Creditscore_train_cleaned.csv")
X = df.drop('Credit_Score', axis=1)
y = df['Credit_Score']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [5]:
# Cell 3: Preprocessing Pipeline
class CreditScorePreprocessor:
    def __init__(self):
        self.num_features = ['Annual_Income', 'Num_Bank_Accounts', 
                           'Interest_Rate', 'Outstanding_Debt', 'Monthly_Balance', 'Monthly_expense']
        self.cat_features = ['Occupation', 'Credit_Mix']
        self.ord_features = ['Payment_Behaviour']
        
        self.scaler = MinMaxScaler()
        self.onehot = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        self.ordinal = OrdinalEncoder()
        
    def fit_transform(self, X):
        # Scale numeric
        X_num = self.scaler.fit_transform(X[self.num_features])
        
        # Encode categorical
        X_cat = self.onehot.fit_transform(X[self.cat_features])
        
        # Encode ordinal
        X_ord = self.ordinal.fit_transform(X[self.ord_features])
        
        return np.hstack([X_num, X_cat, X_ord])
        
    def transform(self, X):
        X_num = self.scaler.transform(X[self.num_features])
        X_cat = self.onehot.transform(X[self.cat_features])
        X_ord = self.ordinal.transform(X[self.ord_features])
        return np.hstack([X_num, X_cat, X_ord])

preprocessor = CreditScorePreprocessor()
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [7]:
# Cell 2: Model Configuration
lgbm_params = {
    'n_estimators': 500,
    'learning_rate': 0.05,
    'max_depth': 7,
    'random_state': 42,
    'force_row_wise': True,  # Remove overhead warning
    'verbose': -1,          # Reduce verbosity
    'n_jobs': -1,          # Use all cores
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'min_child_samples': 20,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1
}

In [10]:
# Initialize and train model
model = LGBMClassifier(**lgbm_params)
model.fit(
    X_train_processed, 
    y_train,
    eval_set=[(X_test_processed, y_test)],
    eval_metric='multi_logloss',
)



In [11]:
# Cell 5: Evaluation
def evaluate_model(model, X, y, label=""):
    preds = model.predict(X)
    print(f"\n{label} Results:")
    print(classification_report(y, preds))
    
evaluate_model(model, X_train_processed, y_train, "Training")
evaluate_model(model, X_test_processed, y_test, "Test")

# Save artifacts
joblib.dump(preprocessor, 'credit_score_preprocessor.joblib')
joblib.dump(model, 'credit_score_model.joblib')




Training Results:
              precision    recall  f1-score   support

           0       0.78      0.72      0.75     21610
           1       0.81      0.79      0.80     39435
           2       0.64      0.74      0.69     12783

    accuracy                           0.76     73828
   macro avg       0.74      0.75      0.75     73828
weighted avg       0.77      0.76      0.77     73828


Test Results:
              precision    recall  f1-score   support

           0       0.73      0.67      0.70      5403
           1       0.77      0.76      0.76      9858
           2       0.57      0.69      0.62      3196

    accuracy                           0.72     18457
   macro avg       0.69      0.70      0.69     18457
weighted avg       0.72      0.72      0.72     18457





['credit_score_model.joblib']

In [12]:
# Cell 6: Predictions on New Data
def predict_credit_score(data):
    preprocessor = joblib.load('credit_score_preprocessor.joblib')
    model = joblib.load('credit_score_model.joblib')
    
    X_processed = preprocessor.transform(data)
    predictions = model.predict(X_processed)
    probabilities = model.predict_proba(X_processed)
    
    return predictions, probabilities

# Example usage:
test_data = pd.read_csv("Creditscore_test_cleaned.csv")
preds, probs = predict_credit_score(test_data)

results_df = pd.DataFrame({
    'Predicted_Score': preds,
    'Good_Prob': probs[:,0],
    'Standard_Prob': probs[:,1],
    'Poor_Prob': probs[:,2]
})



In [13]:
print(results_df.head())

   Predicted_Score  Good_Prob  Standard_Prob  Poor_Prob
0                2   0.198790       0.305899   0.495311
1                2   0.131193       0.332902   0.535905
2                2   0.124516       0.315959   0.559525
3                2   0.087142       0.315362   0.597496
4                2   0.137323       0.327387   0.535290


In [15]:
import xgboost as xgb
xgb_params = {
    'max_depth': 6,
    'learning_rate': 0.01,
    'n_estimators': 1000,
    'objective': 'multi:softprob',
    'num_class': 3,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'min_child_weight': 3,
    'gamma': 0.1,
    'random_state': 42,
    'n_jobs': -1
}

model = xgb.XGBClassifier(**xgb_params)
model.fit(
    X_train_processed,
    y_train,
    eval_set=[(X_test_processed, y_test)],
    verbose=1
)


[0]	validation_0-mlogloss:1.09334
[1]	validation_0-mlogloss:1.08845
[2]	validation_0-mlogloss:1.08348
[3]	validation_0-mlogloss:1.07848
[4]	validation_0-mlogloss:1.07354
[5]	validation_0-mlogloss:1.06915
[6]	validation_0-mlogloss:1.06437
[7]	validation_0-mlogloss:1.05961
[8]	validation_0-mlogloss:1.05494
[9]	validation_0-mlogloss:1.05039
[10]	validation_0-mlogloss:1.04587
[11]	validation_0-mlogloss:1.04185
[12]	validation_0-mlogloss:1.03785
[13]	validation_0-mlogloss:1.03356
[14]	validation_0-mlogloss:1.02975
[15]	validation_0-mlogloss:1.02565
[16]	validation_0-mlogloss:1.02153
[17]	validation_0-mlogloss:1.01777
[18]	validation_0-mlogloss:1.01394
[19]	validation_0-mlogloss:1.01000
[20]	validation_0-mlogloss:1.00612
[21]	validation_0-mlogloss:1.00297
[22]	validation_0-mlogloss:1.00004
[23]	validation_0-mlogloss:0.99634
[24]	validation_0-mlogloss:0.99287
[25]	validation_0-mlogloss:0.98940
[26]	validation_0-mlogloss:0.98590
[27]	validation_0-mlogloss:0.98254
[28]	validation_0-mlogloss:0.9



AttributeError: 'super' object has no attribute '__sklearn_tags__'

AttributeError: 'super' object has no attribute '__sklearn_tags__'

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.8, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=0.1, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.01, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=6, max_leaves=None,
              min_child_weight=3, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=1000, n_jobs=-1, num_class=3,
              num_parallel_tree=None, ...)

In [16]:
def evaluate_model(model, X, y, label=""):
    preds = model.predict(X)
    print(f"\n{label} Results:")
    print(classification_report(y, preds))

evaluate_model(model, X_train_processed, y_train, "Training")
evaluate_model(model, X_test_processed, y_test, "Test")

# Cell 7: Save Model
import joblib
joblib.dump(preprocessor, 'credit_score_preprocessor.joblib')
joblib.dump(model, 'credit_score_xgb_model.joblib')

# Cell 8: Prediction Function
def predict_credit_score(data):
    preprocessor = joblib.load('credit_score_preprocessor.joblib')
    model = joblib.load('credit_score_xgb_model.joblib')
    
    X_processed = preprocessor.transform(data)
    predictions = model.predict(X_processed)
    probabilities = model.predict_proba(X_processed)
    
    return predictions, probabilities


Training Results:
              precision    recall  f1-score   support

           0       0.75      0.67      0.71     21610
           1       0.79      0.75      0.77     39435
           2       0.55      0.73      0.63     12783

    accuracy                           0.72     73828
   macro avg       0.70      0.72      0.70     73828
weighted avg       0.74      0.72      0.73     73828


Test Results:
              precision    recall  f1-score   support

           0       0.72      0.64      0.67      5403
           1       0.77      0.72      0.74      9858
           2       0.52      0.71      0.60      3196

    accuracy                           0.70     18457
   macro avg       0.67      0.69      0.67     18457
weighted avg       0.71      0.70      0.70     18457

