In [None]:
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


# Load the raw loan applications. A forward slash is used on purpose: the
# previous "Datasets\loan.csv" contained the invalid escape sequence "\l"
# (a SyntaxWarning on recent Python versions) and a backslash separator only
# resolves on Windows, whereas "/" works on every platform.
df = pd.read_csv("Datasets/loan.csv")


# Header names may carry stray whitespace; normalise them so the column
# lookups below match.
df.columns = df.columns.str.strip()


# Columns removed before modelling. Any name that is absent from the CSV is
# skipped instead of raising (errors="ignore").
cols_to_drop = [
    "application_id",
    "customer_id",
    "application_date",
    "fraud_flag",
    "fraud_type",
    "residential_address",
    "interest_rate_offered",
    "existing_emis_monthly",
    "number_of_dependents",
    "purpose_of_loan",
    "debt_to_income_ratio"
]
df.drop(columns=cols_to_drop, errors="ignore", inplace=True)


# Target is the loan status; every remaining column is a candidate feature.
y = df["loan_status"]
X = df.drop("loan_status", axis=1)


# Split the features by dtype: object columns are treated as categorical,
# int64/float64 columns as numeric.
cat_cols = X.select_dtypes(include="object").columns.tolist()
num_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()


# Impute missing values: per-column median for numeric columns, most
# frequent value (first mode) for categorical columns.
X[num_cols] = X[num_cols].fillna(X[num_cols].median())
for col in cat_cols:
    X[col] = X[col].fillna(X[col].mode()[0])


# Closed vocabularies for selected categorical columns. Any value outside a
# column's list is collapsed into the catch-all label "Other" below.
allowed_categories = {
    "employment_status": ["Retired", "Unemployed", "Self-Employed", "Salaried", "Business Owner", "Student"],
    "gender": ["Female", "Other", "Male"],
    "loan_type": ["Business Loan", "Car Loan", "Education Loan", "Personal Loan", "Home Loan"],
    "property_ownership_status": ["Rented", "Owned", "Jointly Owned"]
}


# Clean and integer-encode every categorical column in a single pass. The
# fitted encoder is kept per column so the same mapping can be reapplied to
# new rows later.
label_encoders = {}
for col in cat_cols:
    cleaned = X[col].astype(str).str.strip()
    permitted = allowed_categories.get(col)
    if permitted is not None:
        cleaned = cleaned.where(cleaned.isin(permitted), "Other")
    le = LabelEncoder()
    X[col] = le.fit_transform(cleaned)
    label_encoders[col] = le


# Standardise the numeric columns.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split performed further down -- confirm this is intended.
scaler = StandardScaler()
scaler.fit(X[num_cols])
X[num_cols] = scaler.transform(X[num_cols])


# Column order the model is trained on; reused to align inference inputs.
FEATURES = list(X.columns)


# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


# class_weight="balanced" weights classes inversely to their frequency in
# the training labels.
rf = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    class_weight="balanced"
)
rf.fit(X_train, y_train)


y_pred = rf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
# Precision is undefined for a class that is never predicted. By default
# scikit-learn reports 0.0 for it and emits an UndefinedMetricWarning on
# every call; zero_division=0 keeps the same 0.0 value without the warning.
print(
    "\nClassification Report:\n",
    classification_report(y_test, y_pred, zero_division=0),
)


# Example applicant used to exercise the trained pipeline end to end.
# The keys must cover every column in FEATURES, otherwise the column
# selection below raises KeyError.
sample_customer = {
    "loan_type": "Personal Loan",
    "loan_amount_requested": 800000,
    "loan_tenure_months": 72,
    "employment_status": "Unemployed",
    "monthly_income": 18000,
    "cibil_score": 540,
    "property_ownership_status": "Rented",
    "applicant_age": 21,
    "gender": "Male"
}

sample_df = pd.DataFrame([sample_customer])

# Apply the same categorical preprocessing that was applied to the training
# data, then encode with the encoder fitted for each column.
for col in cat_cols:
    le = label_encoders[col]
    # Training values were cast to str and stripped before encoding, so the
    # inference value is normalised the same way.
    val = str(sample_df.at[0, col]).strip()
    # Values outside a column's closed vocabulary collapse to "Other".
    if col in allowed_categories and val not in allowed_categories[col]:
        val = "Other"
    # LabelEncoder.transform rejects labels it was not fitted on (this can
    # include the "Other" fallback when no training row needed it). Fail
    # with a message that names the offending column and value.
    if val not in le.classes_:
        raise ValueError(
            f"Unseen category {val!r} for column {col!r}; "
            f"known categories: {list(le.classes_)}"
        )
    sample_df[col] = le.transform([val])[0]

# Scale numeric columns with the scaler fitted on the training data
sample_df[num_cols] = scaler.transform(sample_df[num_cols])

# Ensure same feature order as used for training
sample_df = sample_df[FEATURES]

# Predict
prediction = rf.predict(sample_df)
print("Predicted Loan Status:", prediction[0])


def calculate_review_score(customer):
    """Return a heuristic review score between 0 and 100 for an applicant.

    The score is the sum of five components: CIBIL score (max 40), monthly
    income (max 25), loan affordability (max 20), age (10 or 5) and
    employment status (5 or 0). Each component is clamped to its own range,
    so a zero or negative income no longer raises ``ZeroDivisionError`` and
    negative inputs cannot drag the total below zero.

    Args:
        customer: Mapping with the keys ``cibil_score``, ``monthly_income``,
            ``loan_amount_requested``, ``applicant_age`` and
            ``employment_status``.

    Returns:
        The total score, rounded to two decimals.

    Raises:
        KeyError: If a required key is missing from ``customer``.
    """
    # CIBIL (max 40)
    cibil_points = min(max(customer['cibil_score'], 0) / 900 * 40, 40)

    # Income (max 25)
    monthly_income = customer['monthly_income']
    income_points = min(max(monthly_income, 0) / 100000 * 25, 25)

    # Loan affordability (max 20): requested amount relative to one year of
    # income. With no positive income the ratio is undefined, so the
    # applicant earns no affordability points.
    annual_income = monthly_income * 12
    if annual_income > 0:
        loan_ratio = customer['loan_amount_requested'] / annual_income
        affordability_points = min(max(0, 20 - loan_ratio * 20), 20)
    else:
        affordability_points = 0

    # Age stability (max 10)
    age_points = 10 if 23 <= customer['applicant_age'] <= 45 else 5

    # Employment (max 5)
    stable_jobs = ("Salaried", "Self-Employed", "Business Owner")
    employment_points = 5 if customer['employment_status'] in stable_jobs else 0

    score = (
        cibil_points
        + income_points
        + affordability_points
        + age_points
        + employment_points
    )
    return round(min(score, 100), 2)

def calculate_fraud_risk(customer):
    """Return a rule-based fraud risk percentage (0-100) for an applicant.

    Four independent red flags each add a fixed number of points:
    a CIBIL score below 650 (30), a requested amount above ten times the
    monthly income (30), an employment status outside Salaried /
    Self-Employed / Business Owner (20) and an age below 22 (20).

    Args:
        customer: Mapping with the keys ``cibil_score``,
            ``loan_amount_requested``, ``monthly_income``,
            ``employment_status`` and ``applicant_age``.

    Returns:
        The summed points of all triggered flags, capped at 100.
    """
    steady_jobs = ["Salaried", "Self-Employed", "Business Owner"]
    # (flag triggered?, points added when it is)
    weighted_flags = (
        (customer['cibil_score'] < 650, 30),
        (customer['loan_amount_requested'] > customer['monthly_income'] * 10, 30),
        (customer['employment_status'] not in steady_jobs, 20),
        (customer['applicant_age'] < 22, 20),
    )
    total = sum(points for triggered, points in weighted_flags if triggered)
    return min(total, 100)

# Rule-based scores for the example applicant (independent of the model).
review_score = calculate_review_score(sample_customer)
fraud_risk = calculate_fraud_risk(sample_customer)

print("Customer Review Score:", review_score, "/ 100")
print("Fraud Risk:", fraud_risk, "%")


# Persist everything needed to reproduce the preprocessing and prediction
# elsewhere: the fitted model, the scaler, the per-column label encoders and
# the training feature order. Files are written in the order listed.
artifacts = {
    "model.pkl": rf,
    "scaler.pkl": scaler,
    "label_encoders.pkl": label_encoders,
    "features.pkl": FEATURES,
}
for filename, artifact in artifacts.items():
    with open(filename, "wb") as f:
        pickle.dump(artifact, f)

print(" Model, scaler, encoders & features saved successfully")


Accuracy: 0.9794

Confusion Matrix:
 [[8176    0    0    0]
 [   0 1618    0    0]
 [  85   24    0    0]
 [  77   20    0    0]]

Classification Report:
                          precision    recall  f1-score   support

               Approved       0.98      1.00      0.99      8176
               Declined       0.97      1.00      0.99      1618
  Fraudulent - Detected       0.00      0.00      0.00       109
Fraudulent - Undetected       0.00      0.00      0.00        97

               accuracy                           0.98     10000
              macro avg       0.49      0.50      0.49     10000
           weighted avg       0.96      0.98      0.97     10000

Predicted Loan Status: Declined
Customer Review Score: 33.5 / 100
Fraud Risk: 100 %


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


✅ Model, scaler, encoders & features saved successfully
