In [10]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, accuracy_score, confusion_matrix
import joblib

In [3]:
# Read the source workbook into a DataFrame.
# The path is relative, so the file must be inside the backend folder
# (the directory this notebook is run from).
file_path = "External_Cibil_Dataset.xlsx"

# `org_file` and `df` start out bound to the same DataFrame object; later
# cells rebind `df` to derived frames while `org_file` keeps this one.
df = org_file = pd.read_excel(file_path)


In [4]:
# Columns excluded from modelling.
drop_cols = [
    "PROSPECTID",
    "last_prod_enq2",
    "first_prod_enq2",
    "Approved_Flag",
]

# errors="ignore" skips any listed column that is not present, which keeps
# this cell safe to re-run after the columns have already been removed.
df = df.drop(drop_cols, axis="columns", errors="ignore")

In [5]:
# Encode the categorical columns as integer codes.
# `.cat.codes` numbers the categories in sorted order and encodes a missing
# value as -1, so missing categories are NOT affected by the median fill below.
for col in ("MARITALSTATUS", "EDUCATION", "GENDER"):
    df[col] = df[col].astype("category").cat.codes

# Fill missing numeric values with the per-column median.
# numeric_only=True keeps the median well defined even if a non-numeric column
# is still present (pandas >= 2.0 raises TypeError otherwise). The result is
# assigned to `df` instead of using inplace=True, so no other frame is mutated.
df = df.fillna(df.median(numeric_only=True))

# Integer codes recorded for this dataset (re-check if the categories change):
#   MARITALSTATUS: Married=0, Single=1
#   EDUCATION:     12th=0, Graduate=1, Others=2, Post-Graduate=3,
#                  Professional=4, SSC=5, Under Graduate=6
#   GENDER:        F=0, M=1

In [6]:
# Model inputs: the top features according to a RandomForest ranking.
selected_features = [
    "enq_L3m",
    "num_std",
    "time_since_recent_enq",
    "num_std_12mts",
    "enq_L6m",
    "AGE",
    "recent_level_of_deliq",
    "time_since_recent_deliquency",
    "Time_With_Curr_Empr",
    "time_since_recent_payment",
    "NETMONTHLYINCOME",
]

# Regression target and feature matrix.
y = df["Credit_Score"]
X = df[selected_features]

In [7]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Z-score normalisation. The scaler is fitted on the training rows only and
# the same fitted statistics are then applied to both splits.
# NOTE: X_train / X_test are overwritten with the scaled arrays, so re-running
# this cell without re-running the split would refit on already-scaled data.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Train the Random Forest regressor on the scaled training features.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate on the held-out split with regression metrics only.
# accuracy_score / confusion_matrix are classification metrics and raise a
# ValueError when given the continuous predictions of a regressor, so they
# are not computed on the raw scores here.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Save the trained model and scaler
joblib.dump(model, "credit_score_model.pkl")
joblib.dump(scaler, "scaler.pkl")

print("Model and Scaler saved successfully!")

In [12]:
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error
import numpy as np

# Convert a continuous credit score into one of three categorical bands so
# that regression output can be evaluated with classification metrics.
def score_to_category(score, poor_max=650, average_max=750):
    """Return the credit band ("Poor", "Average" or "Good") for ``score``.

    Args:
        score: Numeric credit score (true or predicted).
        poor_max: Inclusive upper bound of the "Poor" band. Defaults to 650.
        average_max: Inclusive upper bound of the "Average" band.
            Defaults to 750.

    Returns:
        "Poor" if ``score <= poor_max``, "Average" if
        ``score <= average_max``, otherwise "Good".

    Raises:
        ValueError: If ``score`` is missing (NaN / None). Every comparison
            with NaN is False, so without this check a missing score would
            silently be labelled "Good".
    """
    if pd.isna(score):
        raise ValueError("score must be a number, got a missing value")
    if score <= poor_max:
        return "Poor"
    if score <= average_max:
        return "Average"
    return "Good"

# Bin the true and the predicted scores into the same bands.
y_test_cat = y_test.map(score_to_category)
y_pred_cat = pd.Series(y_pred).map(score_to_category)

# Classification-style view of the regressor's output.
# No `labels` argument is passed, so the confusion matrix rows/columns follow
# the sorted label order: "Average", "Good", "Poor" (not severity order).
cm = confusion_matrix(y_test_cat, y_pred_cat)
accuracy = accuracy_score(y_test_cat, y_pred_cat)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", cm)


Accuracy: 0.9644526684846124
Confusion Matrix:
 [[9555    1   61]
 [  20   10    0]
 [ 283    0  338]]


In [13]:
from sklearn.metrics import mean_squared_error, r2_score

# Regression quality on the raw (unbinned) scores: R² plus RMSE, the latter
# being the square root of the mean squared error.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("RMSE:", rmse)
print("R² Score:", r2)


RMSE: 11.474665328247497
R² Score: 0.6852256104870834
