Import **Libraries**

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

Load **Dataset**

In [31]:
train_df=pd.read_csv('train.csv')
test_df=pd.read_csv('test.csv')

  train_df=pd.read_csv('train.csv')


In [32]:
train_df.head()
# test_df.head()

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,0x1602,CUS_0xd40,January,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,_,809.98,26.82262,22 Years and 1 Months,No,49.574949,80.41529543900253,High_spent_Small_value_payments,312.49408867943663,Good
1,0x1603,CUS_0xd40,February,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,31.94496,,No,49.574949,118.28022162236736,Low_spent_Large_value_payments,284.62916249607184,Good
2,0x1604,CUS_0xd40,March,Aaron Maashoh,-500,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,28.609352,22 Years and 3 Months,No,49.574949,81.699521264648,Low_spent_Medium_value_payments,331.2098628537912,Good
3,0x1605,CUS_0xd40,April,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,31.377862,22 Years and 4 Months,No,49.574949,199.4580743910713,Low_spent_Small_value_payments,223.45130972736783,Good
4,0x1606,CUS_0xd40,May,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,Good,809.98,24.797347,22 Years and 5 Months,No,49.574949,41.420153086217326,High_spent_Medium_value_payments,341.48923103222177,Good


**Feature Engineering**

In [33]:
train_df.drop(columns=['ID','Customer_ID','Name','Month','SSN'],inplace=True)
test_df.drop(columns=['ID','Customer_ID','Name','Month','SSN'],inplace=True)


In [34]:
train_df['Age'] = pd.to_numeric(train_df['Age'], errors='coerce')
train_df.loc[train_df['Age'] < 0, 'Age'] = np.nan

test_df['Age'] = pd.to_numeric(test_df['Age'], errors='coerce')
test_df.loc[test_df['Age'] < 0, 'Age'] = np.nan

In [35]:
# Clean 'Credit_History_Age': convert "22 Years and 5 Months" to months (e.g., 269)
def convert_credit_age(val):
    try:
        years = int(val.split(' ')[0])
        months = int(val.split(' ')[3])
        return years * 12 + months
    except:
        return np.nan

train_df['Credit_History_Age'] = train_df['Credit_History_Age'].apply(convert_credit_age)
test_df['Credit_History_Age'] = test_df['Credit_History_Age'].apply(convert_credit_age)


In [36]:
# Clean 'Num_of_Delayed_Payment': remove underscores and convert
train_df['Num_of_Delayed_Payment'] = train_df['Num_of_Delayed_Payment'].astype(str).str.extract('(\d+)')
train_df['Num_of_Delayed_Payment'] = pd.to_numeric(train_df['Num_of_Delayed_Payment'], errors='coerce')

test_df['Num_of_Delayed_Payment'] = test_df['Num_of_Delayed_Payment'].astype(str).str.extract('(\d+)')
test_df['Num_of_Delayed_Payment'] = pd.to_numeric(test_df['Num_of_Delayed_Payment'], errors='coerce')

In [37]:
# Handle other numeric columns with possible bad characters
numeric_cols = ['Annual_Income', 'Monthly_Inhand_Salary', 'Outstanding_Debt',
                'Credit_Utilization_Ratio', 'Total_EMI_per_month', 'Amount_invested_monthly',
                'Monthly_Balance', 'Changed_Credit_Limit', 'Num_Credit_Inquiries','Num_of_Loan']
for col in numeric_cols:
    train_df[col] = pd.to_numeric(train_df[col], errors='coerce')
    test_df[col] = pd.to_numeric(test_df[col], errors='coerce')

In [38]:
train_df.isnull().sum()

Unnamed: 0,0
Age,5825
Occupation,0
Annual_Income,6980
Monthly_Inhand_Salary,15002
Num_Bank_Accounts,0
Num_Credit_Card,0
Interest_Rate,0
Num_of_Loan,4785
Type_of_Loan,11408
Delay_from_due_date,0


In [39]:
# List numeric columns with missing values (from  data)
numeric_cols_with_nan = ['Age', 'Annual_Income', 'Monthly_Inhand_Salary', 'Num_of_Delayed_Payment',
                         'Changed_Credit_Limit', 'Num_Credit_Inquiries', 'Outstanding_Debt','Num_of_Loan',
                         'Amount_invested_monthly', 'Monthly_Balance', 'Credit_History_Age', 'Total_EMI_per_month']

# List categorical columns with missing values
categorical_cols_with_nan = ['Type_of_Loan']

# Fill numeric columns with median values
for col in numeric_cols_with_nan:
    median_value = train_df[col].median()
    train_df[col].fillna(median_value, inplace=True)

    median_value = test_df[col].median()
    test_df[col].fillna(median_value, inplace=True)

# Fill categorical columns with mode (most frequent value)
for col in categorical_cols_with_nan:
    mode_value = train_df[col].mode()[0]  # mode() returns a Series; take first
    train_df[col].fillna(mode_value, inplace=True)

    mode_value = test_df[col].mode()[0]  # mode() returns a Series; take first
    test_df[col].fillna(mode_value, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df[col].fillna(median_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df[col].fillna(median_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting val

In [40]:
train_df.isnull().sum()

Unnamed: 0,0
Age,0
Occupation,0
Annual_Income,0
Monthly_Inhand_Salary,0
Num_Bank_Accounts,0
Num_Credit_Card,0
Interest_Rate,0
Num_of_Loan,0
Type_of_Loan,0
Delay_from_due_date,0


In [41]:
train_df.head()

Unnamed: 0,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,Delay_from_due_date,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,23.0,Scientist,19114.12,1824.843333,3,4,3,4.0,"Auto Loan, Credit-Builder Loan, Personal Loan,...",3,...,_,809.98,26.82262,265.0,No,49.574949,80.415295,High_spent_Small_value_payments,312.494089,Good
1,23.0,Scientist,19114.12,3093.745,3,4,3,4.0,"Auto Loan, Credit-Builder Loan, Personal Loan,...",-1,...,Good,809.98,31.94496,219.0,No,49.574949,118.280222,Low_spent_Large_value_payments,284.629162,Good
2,33.0,Scientist,19114.12,3093.745,3,4,3,4.0,"Auto Loan, Credit-Builder Loan, Personal Loan,...",3,...,Good,809.98,28.609352,267.0,No,49.574949,81.699521,Low_spent_Medium_value_payments,331.209863,Good
3,23.0,Scientist,19114.12,3093.745,3,4,3,4.0,"Auto Loan, Credit-Builder Loan, Personal Loan,...",5,...,Good,809.98,31.377862,268.0,No,49.574949,199.458074,Low_spent_Small_value_payments,223.45131,Good
4,23.0,Scientist,19114.12,1824.843333,3,4,3,4.0,"Auto Loan, Credit-Builder Loan, Personal Loan,...",6,...,Good,809.98,24.797347,269.0,No,49.574949,41.420153,High_spent_Medium_value_payments,341.489231,Good


In [42]:
print(train_df['Occupation'].unique())



['Scientist' '_______' 'Teacher' 'Engineer' 'Entrepreneur' 'Developer'
 'Lawyer' 'Media_Manager' 'Doctor' 'Journalist' 'Manager' 'Accountant'
 'Musician' 'Mechanic' 'Writer' 'Architect']


In [43]:
train_df['Occupation'] = train_df['Occupation'].replace('_______', np.nan)
most_common = train_df['Occupation'].mode()[0]
train_df['Occupation'].fillna(most_common, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['Occupation'].fillna(most_common, inplace=True)


In [44]:
print(train_df['Occupation'].unique())

['Scientist' 'Lawyer' 'Teacher' 'Engineer' 'Entrepreneur' 'Developer'
 'Media_Manager' 'Doctor' 'Journalist' 'Manager' 'Accountant' 'Musician'
 'Mechanic' 'Writer' 'Architect']


In [45]:
print(train_df['Type_of_Loan'].unique())



['Auto Loan, Credit-Builder Loan, Personal Loan, and Home Equity Loan'
 'Credit-Builder Loan' 'Auto Loan, Auto Loan, and Not Specified' ...
 'Home Equity Loan, Auto Loan, Auto Loan, and Auto Loan'
 'Payday Loan, Student Loan, Mortgage Loan, and Not Specified'
 'Personal Loan, Auto Loan, Mortgage Loan, Student Loan, and Student Loan']


In [46]:
import numpy as np

allowed_loans = {
    'Auto Loan',
    'Credit-Builder Loan',
    'Personal Loan',
    'Home Equity Loan',
    'Mortgage Loan',
    'Payday Loan',
    'Student Loan'
}

def pick_allowed_loan(raw):
    if pd.isna(raw) or raw.strip() == '':
        return np.nan
    raw = raw.replace(' and ', ',')
    loans = [loan.strip() for loan in raw.split(',')]
    filtered = [loan for loan in loans if loan in allowed_loans]
    return filtered[0] if filtered else np.nan

train_df['Type_of_Loan'] = train_df['Type_of_Loan'].apply(pick_allowed_loan)

# Replace NaN with mode
mode_loan = train_df['Type_of_Loan'].mode()[0]
train_df['Type_of_Loan'].fillna(mode_loan, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['Type_of_Loan'].fillna(mode_loan, inplace=True)


In [47]:
print(train_df['Type_of_Loan'].unique())



['Auto Loan' 'Credit-Builder Loan' 'Personal Loan' 'Payday Loan'
 'Student Loan' 'Mortgage Loan' 'Home Equity Loan']


In [48]:
print(train_df['Payment_of_Min_Amount'].unique())



['No' 'NM' 'Yes']


In [49]:
train_df['Payment_of_Min_Amount'] = train_df['Payment_of_Min_Amount'].replace('NM', 'No')


In [50]:
print(train_df['Payment_of_Min_Amount'].unique())

['No' 'Yes']


In [51]:
print(train_df['Payment_Behaviour'].unique())



['High_spent_Small_value_payments' 'Low_spent_Large_value_payments'
 'Low_spent_Medium_value_payments' 'Low_spent_Small_value_payments'
 'High_spent_Medium_value_payments' '!@9#%8'
 'High_spent_Large_value_payments']


In [52]:
valid_behaviours = {
    'High_spent_Small_value_payments',
    'High_spent_Medium_value_payments',
    'High_spent_Large_value_payments',
    'Low_spent_Small_value_payments',
    'Low_spent_Medium_value_payments',
    'Low_spent_Large_value_payments'
}

def clean_payment_behaviour(x):
    if x in valid_behaviours:
        return x
    else:
        return None  # or np.nan

train_df['Payment_Behaviour'] = train_df['Payment_Behaviour'].apply(clean_payment_behaviour)

# Optionally, fill missing values (None/NaN) with mode or drop rows
mode_behaviour = train_df['Payment_Behaviour'].mode()[0]
train_df['Payment_Behaviour'].fillna(mode_behaviour, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['Payment_Behaviour'].fillna(mode_behaviour, inplace=True)


In [53]:
print(train_df['Payment_Behaviour'].unique())

['High_spent_Small_value_payments' 'Low_spent_Large_value_payments'
 'Low_spent_Medium_value_payments' 'Low_spent_Small_value_payments'
 'High_spent_Medium_value_payments' 'High_spent_Large_value_payments']


In [54]:
print(train_df['Credit_Mix'].unique())



['_' 'Good' 'Standard' 'Bad']


In [55]:


# Replace '_' with NaN
train_df['Credit_Mix'] = train_df['Credit_Mix'].replace('_', np.nan)

# Optionally fill NaN with mode (most frequent valid value)
mode_credit_mix = train_df['Credit_Mix'].mode()[0]
train_df['Credit_Mix'].fillna(mode_credit_mix, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['Credit_Mix'].fillna(mode_credit_mix, inplace=True)


In [56]:
print(train_df['Credit_Mix'].unique())

['Standard' 'Good' 'Bad']


In [57]:
print(train_df['Credit_Score'].unique())



['Good' 'Standard' 'Poor']


Encoding **Categorical Columns**

In [58]:
cat_cols_train = ['Occupation', 'Type_of_Loan', 'Payment_of_Min_Amount', 'Payment_Behaviour',
            'Credit_Mix', 'Credit_Score']
for col in cat_cols_train:
    train_df[col] = train_df[col].astype(str)
    train_df[col] = LabelEncoder().fit_transform(train_df[col])

cat_cols_test = ['Occupation', 'Type_of_Loan', 'Payment_of_Min_Amount', 'Payment_Behaviour',
            'Credit_Mix']
for col in cat_cols_test:
    test_df[col] = test_df[col].astype(str)
    test_df[col] = LabelEncoder().fit_transform(test_df[col])

In [59]:
from sklearn.preprocessing import LabelEncoder
import joblib

cat_cols_train = ['Occupation', 'Type_of_Loan', 'Payment_of_Min_Amount', 'Payment_Behaviour', 'Credit_Mix']

label_encoders = {}

for col in cat_cols_train:
    le = LabelEncoder()
    train_df[col] = train_df[col].astype(str)
    train_df[col] = le.fit_transform(train_df[col])
    label_encoders[col] = le

# Save the label encoders dictionary
joblib.dump(label_encoders, 'label_encoders.pkl')





['label_encoders.pkl']

In [60]:
numeric_cols = train_df.select_dtypes(include=['number']).columns.tolist()
print(len(numeric_cols))
for i in numeric_cols:
    print(i)


23
Age
Occupation
Annual_Income
Monthly_Inhand_Salary
Num_Bank_Accounts
Num_Credit_Card
Interest_Rate
Num_of_Loan
Type_of_Loan
Delay_from_due_date
Num_of_Delayed_Payment
Changed_Credit_Limit
Num_Credit_Inquiries
Credit_Mix
Outstanding_Debt
Credit_Utilization_Ratio
Credit_History_Age
Payment_of_Min_Amount
Total_EMI_per_month
Amount_invested_monthly
Payment_Behaviour
Monthly_Balance
Credit_Score


Scaling Using **StandardScalar**

In [61]:
X = train_df.drop('Credit_Score', axis=1)
y = train_df['Credit_Score']

#  Split into training and testing sets (e.g., 70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#  Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']

Train model using **RandomForest**

In [62]:
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [70]:
#check for web integration
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestClassifier

# check input
default_inputs = {
    'Age': 22,
    'Occupation': '0',
    'Annual_Income': 0,
    'Monthly_Inhand_Salary': 0,
    'Num_Bank_Accounts': 0,
    'Num_Credit_Card': 8,
    'Interest_Rate': 38.0,
    'Num_of_Loan': 8,
    'Type_of_Loan': '2',          # risky loan type
    'Delay_from_due_date': 900000,
    'Num_of_Delayed_Payment': 30,
    'Changed_Credit_Limit': -4000,
    'Num_Credit_Inquiries': 20,
    'Credit_Mix': '1',
    'Outstanding_Debt': 60000,
    'Credit_Utilization_Ratio': 1.0,
    'Credit_History_Age': 0,
    'Payment_of_Min_Amount': '1',
    'Total_EMI_per_month': 0,
    'Amount_invested_monthly': 0,
    'Payment_Behaviour': '2',
    'Monthly_Balance': -5000
}



# Convert to DataFrame
input_df = pd.DataFrame([default_inputs])

# Load encoder and scale
label_encoders = joblib.load('label_encoders.pkl')
scaler = joblib.load('scaler.pkl')
model = joblib.load('credit_score_model.pkl')  # or use directly if model already in memory

# Encode categorical features
for col in label_encoders:
    le = label_encoders[col]
    input_df[col] = le.transform(input_df[col])

# Scale input
scaled_input = scaler.transform(input_df)
print(scaled_input)

# Predict
prediction = model.predict(scaled_input)
score_map = {0: 'Bad', 1: 'Standard', 2: 'Good'}
print("Predicted Credit Score:", score_map[int(prediction[0])])


[[-1.33892619e-01 -1.68419373e+00 -1.20776762e-01 -1.36589777e+00
  -1.46605723e-01 -1.12278212e-01 -7.51820414e-02  9.77068210e-02
  -3.55208469e-01  6.06588616e+04  3.62182408e-05 -5.98034169e+02
  -3.88582857e-02 -4.83823666e-01  5.11468021e+01 -6.10742301e+00
  -2.32801050e+00  9.54846886e-01 -1.70145836e-01 -9.89015984e-01
  -4.93147636e-01 -2.53706438e+01]]
Predicted Credit Score: Good


In [66]:
# Show known labels for 'Occupation'
print("Occupation labels:", label_encoders['Type_of_Loan'].classes_)


Occupation labels: ['0' '1' '2' '3' '4' '5' '6']


In [64]:
print(train_df['Payment_of_Min_Amount'].unique())


[0 1]


In [67]:
X_train = pd.DataFrame(X_train, columns=X.columns)
X_train.head()

Unnamed: 0,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,Delay_from_due_date,...,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance
0,-0.135385,1.289906,-0.013594,2.974056,-0.121357,-0.120076,-0.132492,0.010999,1.167797,-0.945132,...,-0.127746,-0.483824,-0.924851,1.164648,1.449871,-1.047288,-0.143308,2.988067,0.041033,0.76329
1,-0.127923,-1.684194,-0.04977,1.436071,-0.112941,-0.151265,-0.132492,-0.041026,-0.862877,-1.214734,...,-0.132975,-0.483824,-0.373043,1.643947,1.807665,-1.047288,-0.170146,-0.180963,-1.561508,2.400732
2,-0.111505,0.374799,-0.090106,-0.160291,-0.13819,-0.151265,-0.14735,0.010999,-0.862877,-0.675531,...,-0.122518,0.794635,-0.273949,0.537745,1.786618,-1.047288,-0.161204,-0.05756,-0.493148,-0.183666
3,-0.115983,1.06113,-0.063767,-0.318854,-0.104525,-0.143468,-0.092163,0.02834,-0.862877,-0.877732,...,-0.117289,0.794635,0.056465,0.160568,-1.380909,0.954847,-0.153955,0.220207,-0.493148,0.684947
4,4.600348,-0.769086,-0.06317,1.007302,-0.087693,-0.143468,-0.140982,-0.006343,0.15246,-0.810332,...,-0.117289,0.794635,-0.422398,-0.93178,-1.107302,-1.047288,-0.159097,0.671315,-0.493148,0.696682


**Evaluation**

In [68]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
import numpy as np

# Predict using  trained model
y_pred = model.predict(X_test)

# If  model supports probability prediction, get probabilities
if hasattr(model, "predict_proba"):
    y_proba = model.predict_proba(X_test)
else:
    y_proba = None

# Confusion Matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Classification Report
print("\n Classification Report:")
print(classification_report(y_test, y_pred, target_names=['Bad', 'Standard', 'Good']))

# Accuracy
print("\n Accuracy Score:", accuracy_score(y_test, y_pred))

# ROC-AUC Score (for multiclass)
if y_proba is not None:
    try:
        roc_auc = roc_auc_score(y_test, y_proba, multi_class='ovr')
        print(" ROC-AUC Score (OvR):", roc_auc)
    except:
        print(" Could not compute ROC-AUC (check label format).")
else:
    print(" Model does not support probability predictions; ROC-AUC not available.")


Confusion Matrix:
[[ 3707    52  1563]
 [  198  6864  1743]
 [ 1211  1917 12745]]

 Classification Report:
              precision    recall  f1-score   support

         Bad       0.72      0.70      0.71      5322
    Standard       0.78      0.78      0.78      8805
        Good       0.79      0.80      0.80     15873

    accuracy                           0.78     30000
   macro avg       0.77      0.76      0.76     30000
weighted avg       0.78      0.78      0.78     30000


 Accuracy Score: 0.7772
 ROC-AUC Score (OvR): 0.9043741532118381


In [69]:
import joblib

# Save the trained best RandomForest model
joblib.dump(model, 'credit_score_model.pkl')


['credit_score_model.pkl']

Try Out **XGBoost**

In [None]:
#try hyperparameter tuning  but randomforest performs better than xgboost in my case
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)

grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

print(" Best Parameters:", grid_search.best_params_)

best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)

print("\n Classification Report:")
print(classification_report(y_test, y_pred, target_names=['Bad', 'Standard', 'Good']))

print(" Accuracy Score:", accuracy_score(y_test, y_pred))


Fitting 3 folds for each of 32 candidates, totalling 96 fits
✅ Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.8}

✅ Classification Report:
              precision    recall  f1-score   support

         Bad       0.63      0.67      0.65      5322
    Standard       0.74      0.68      0.71      8805
        Good       0.75      0.77      0.76     15873

    accuracy                           0.72     30000
   macro avg       0.71      0.70      0.70     30000
weighted avg       0.73      0.72      0.72     30000

✅ Accuracy Score: 0.724
