In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [31]:
# Load the data
df = pd.read_csv("C:\\Users\\user\\Downloads\\Train.csv")
df = pd.DataFrame(df)
df


Unnamed: 0,ID,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Total_Income
0,74768,LP002231,1,1,0,1,0,8328,0.000000,17,363,1,2,1,6000
1,79428,LP001448,1,1,0,0,0,150,3857.458782,188,370,1,1,0,6000
2,70497,LP002231,0,0,0,0,0,4989,314.472511,17,348,1,0,0,6000
3,87480,LP001385,1,1,0,0,0,150,0.000000,232,359,1,1,1,3750
4,33964,LP002231,1,1,1,0,0,8059,0.000000,17,372,1,0,1,3750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5893,65288,LP002560,0,1,0,0,0,1297,3293.124489,17,373,1,1,1,3750
5894,86963,LP002585,1,1,0,0,0,7358,0.000000,17,365,1,2,1,3547
5895,69407,LP002175,1,1,2,0,0,150,0.000000,170,353,1,1,1,3750
5896,82002,LP002560,1,0,0,0,0,2230,0.000000,222,367,1,2,0,3750


In [41]:

# Fill missing values
df['Credit_History'] = df['Credit_History'].fillna(0)
df['LoanAmount'] = df['LoanAmount'].fillna(df['LoanAmount'].median())
df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].mode()[0])


# Fill categorical missing values
df['Gender'] = df['Gender'].fillna(df['Gender'].mode()[0])
df['Married'] = df['Married'].fillna(df['Married'].mode()[0])
df['Dependents'] = df['Dependents'].fillna(df['Dependents'].mode()[0])
df['Self_Employed'] = df['Self_Employed'].fillna(df['Self_Employed'].mode()[0])

# Feature engineering
df['Total_Income'] = df['ApplicantIncome'] + df['CoapplicantIncome']
df['Income_to_Loan_Ratio'] = df['Total_Income'] / df['LoanAmount']


# Encode categorical variables
cat_cols = df.select_dtypes(include='object').columns
le = LabelEncoder()
for col in cat_cols:
    df[col] = le.fit_transform(df[col])

# Define features and target
X = df.drop('Loan_Status', axis=1)
y = df['Loan_Status']


In [43]:

# Drop Loan_ID
df.drop(['Loan_ID'], axis=1, inplace=True)

In [45]:
print(df.columns)

Index(['ID', 'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status',
       'Total_Income', 'Income_to_Loan_Ratio'],
      dtype='object')


In [47]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)


In [49]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.8398305084745763
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       187
           1       0.84      1.00      0.91       993

    accuracy                           0.84      1180
   macro avg       0.42      0.50      0.46      1180
weighted avg       0.71      0.84      0.77      1180

Confusion Matrix:
 [[  0 187]
 [  2 991]]


In [51]:
# Simulated Default Risk based on heuristic
df['Default_Risk'] = np.where((df['Credit_History'] == 0) | (df['LoanAmount'] > df['Total_Income'] * 0.4), 1, 0)


In [53]:
# Normalize relevant metrics
scaler = StandardScaler()
df['Credit_Score'] = (
    0.4 * scaler.fit_transform(df[['Credit_History']]) +
    0.3 * scaler.fit_transform(df[['Income_to_Loan_Ratio']]) +
    0.3 * scaler.fit_transform(df[['Total_Income']])
)

# Scale to 0–100
df['Credit_Score'] = (df['Credit_Score'] - df['Credit_Score'].min()) / (df['Credit_Score'].max() - df['Credit_Score'].min()) * 100


In [61]:
# Example model
success_features = ['Credit_Score', 'Education', 'Total_Income']
success_model = RandomForestClassifier()
success_model.fit(df[success_features], df['Loan_Status'])  # `Successful_Business` = 0/1
