In [None]:
import numpy as np
import pandas as pd

# Seed NumPy's global generator so the synthetic applicants are reproducible.
np.random.seed(42)

n = 1000  # number of applicants

# Draw each feature in the same order as the columns below; the order matters
# because every draw advances the shared global random stream.
ages = np.random.randint(21, 65, n)
annual_incomes = np.random.normal(60000, 15000, n).astype(int)
credit_scores = np.random.randint(300, 850, n)
loan_amounts = np.random.normal(20000, 8000, n).astype(int)
loan_terms = np.random.choice([36, 60, 120, 180], n)
employment_years = np.random.randint(0, 30, n)
existing_loans = np.random.randint(0, 5, n)

data = pd.DataFrame({
    "Applicant_ID": range(1, n + 1),
    "Age": ages,
    "Annual_Income": annual_incomes,
    "Credit_Score": credit_scores,
    "Loan_Amount": loan_amounts,
    "Loan_Term_Months": loan_terms,
    "Employment_Years": employment_years,
    "Existing_Loans": existing_loans,
})

# Target variable: an applicant is flagged as a default (1) when at least one
# of the three rule-based conditions holds, otherwise 0.
low_credit = data["Credit_Score"] < 600
low_income = data["Annual_Income"] < 40000
large_loan = data["Loan_Amount"] > 30000
data["Default"] = (low_credit | low_income | large_loan).astype(int)

data.head()


Unnamed: 0,Applicant_ID,Age,Annual_Income,Credit_Score,Loan_Amount,Loan_Term_Months,Employment_Years,Existing_Loans,Default
0,1,59,46727,683,28892,60,22,4,0
1,2,49,64304,744,35477,180,21,4,1
2,3,35,62477,724,20557,36,18,1,0
3,4,63,68534,460,14881,120,29,1,1
4,5,28,46877,477,11980,180,25,1,1


In [None]:
# Structural overview: info() prints column dtypes and non-null counts itself.
data.info()

# Summary statistics. A notebook only renders the value of a cell's LAST
# expression, so a bare `data.describe()` in the middle of the cell would be
# computed and thrown away. display() (injected by the IPython/Jupyter kernel)
# renders it explicitly.
display(data.describe())

# Class balance of the target; rendered as the cell's final expression.
data["Default"].value_counts()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Applicant_ID      1000 non-null   int64
 1   Age               1000 non-null   int64
 2   Annual_Income     1000 non-null   int64
 3   Credit_Score      1000 non-null   int64
 4   Loan_Amount       1000 non-null   int64
 5   Loan_Term_Months  1000 non-null   int64
 6   Employment_Years  1000 non-null   int64
 7   Existing_Loans    1000 non-null   int64
 8   Default           1000 non-null   int64
dtypes: int64(9)
memory usage: 70.4 KB


Unnamed: 0_level_0,count
Default,Unnamed: 1_level_1
1,617
0,383


In [None]:
# Synthetic safety check: replace any negative monetary amount with its
# magnitude. Note that abs() flips the sign of a negative value; it does not
# drop the row.
for money_col in ("Loan_Amount", "Annual_Income"):
    data[money_col] = data[money_col].abs()

# Count missing values per column
data.isna().sum()


Unnamed: 0,0
Applicant_ID,0
Age,0
Annual_Income,0
Credit_Score,0
Loan_Amount,0
Loan_Term_Months,0
Employment_Years,0
Existing_Loans,0
Default,0


In [None]:
# Make the cell safe to re-run: remove dummy columns left over from a previous
# execution, otherwise get_dummies would append a second copy of each one.
# errors="ignore" makes this a no-op on the first run.
risk_labels = ["Poor", "Fair", "Good", "Excellent"]
data = data.drop(
    columns=[f"Credit_Risk_Level_{label}" for label in risk_labels],
    errors="ignore",
)

# Loan to Income Ratio
data["Loan_to_Income"] = data["Loan_Amount"] / data["Annual_Income"]

# Credit Risk Category
# pd.cut bins are right-closed by default, i.e. (300, 580], (580, 670], ...
# Without include_lowest=True a score of exactly 300 belongs to no bin and
# becomes NaN; with it the first bin is closed on the left and 300 is "Poor".
data["Credit_Risk_Level"] = pd.cut(
    data["Credit_Score"],
    bins=[300, 580, 670, 740, 850],
    labels=risk_labels,
    include_lowest=True,
)

# Encode categorical variable. drop_first=True omits the first level ("Poor"),
# which therefore acts as the baseline: a row with all three dummies False.
data = pd.get_dummies(data, columns=["Credit_Risk_Level"], drop_first=True)

data.head()


Unnamed: 0,Applicant_ID,Age,Annual_Income,Credit_Score,Loan_Amount,Loan_Term_Months,Employment_Years,Existing_Loans,Default,Loan_to_Income,Credit_Risk_Level_Fair,Credit_Risk_Level_Good,Credit_Risk_Level_Excellent
0,1,59,46727,683,28892,60,22,4,0,0.618315,False,True,False
1,2,49,64304,744,35477,180,21,4,1,0.551708,False,False,True
2,3,35,62477,724,20557,36,18,1,0,0.329033,False,True,False
3,4,63,68534,460,14881,120,29,1,1,0.217133,False,False,False
4,5,28,46877,477,11980,180,25,1,1,0.255562,False,False,False


In [None]:
from sklearn.model_selection import train_test_split

# Feature matrix: every column except the identifier and the target.
# Risk_Score and Risk_Category are model outputs that later cells add to
# `data`; listing them here (errors="ignore" covers the first run, when they
# do not exist yet) keeps them out of the features if this cell is re-run
# after scoring, which would otherwise leak predictions into training.
X = data.drop(
    columns=["Applicant_ID", "Default", "Risk_Score", "Risk_Category"],
    errors="ignore",
)
y = data["Default"]

# Hold out 30% of the rows for evaluation. stratify=y keeps the proportion of
# defaults the same in both splits; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)



In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# The raw features sit on very different scales (incomes and loan amounts in
# the tens of thousands next to ratios below 1). Fitting on them directly hit
# the iteration limit and emitted a convergence warning, so the coefficients
# were not fully optimised. Standardising the features first fixes that.
#
# The scaler lives inside the pipeline, so it is fitted on the training data
# only and re-applied automatically by model.predict / model.predict_proba.
# The fitted classifier itself is the pipeline's last step: model[-1].
model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
model.fit(X_train, y_train)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
from sklearn.metrics import confusion_matrix, roc_auc_score, classification_report

# Score the hold-out set: probability of the positive class (column 1 of
# predict_proba) and the hard class labels.
y_prob = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# Compute every metric first ...
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)

# ... then print them: confusion matrix, per-class report, AUC.
print("Confusion Matrix:\n", cm)
print("\nClassification Report:\n", report)
print("AUC Score:", auc)


Confusion Matrix:
 [[105  10]
 [ 24 161]]

Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.91      0.86       115
           1       0.94      0.87      0.90       185

    accuracy                           0.89       300
   macro avg       0.88      0.89      0.88       300
weighted avg       0.89      0.89      0.89       300

AUC Score: 0.9549706227967097


In [None]:
# Predicted probability of the positive class for every applicant in X,
# rescaled from [0, 1] to a 0-100 score.
default_probability = model.predict_proba(X)[:, 1]
data["Risk_Score"] = default_probability * 100

data.loc[:, ["Applicant_ID", "Risk_Score"]].head()


Unnamed: 0,Applicant_ID,Risk_Score
0,1,45.197585
1,2,77.162241
2,3,8.091334
3,4,96.214647
4,5,95.104244


In [None]:
def risk_category(score, low_threshold=30, high_threshold=60):
    """Map a numeric risk score to a risk band label.

    Parameters
    ----------
    score : float
        Risk score to classify.
    low_threshold : float, default 30
        Scores strictly below this value are "Low Risk".
    high_threshold : float, default 60
        Scores of at least ``low_threshold`` and strictly below this value
        are "Medium Risk"; everything else is "High Risk".

    Returns
    -------
    str
        One of "Low Risk", "Medium Risk" or "High Risk".

    Raises
    ------
    ValueError
        If ``low_threshold`` is greater than ``high_threshold``.
    """
    if low_threshold > high_threshold:
        raise ValueError(
            "low_threshold must not exceed high_threshold "
            f"(got {low_threshold} > {high_threshold})"
        )

    if score < low_threshold:
        return "Low Risk"
    if score < high_threshold:
        return "Medium Risk"
    # Reached for scores >= high_threshold, and also for NaN, because every
    # comparison with NaN is False.
    return "High Risk"

# Label every applicant by passing each Risk_Score through risk_category.
data["Risk_Category"] = data["Risk_Score"].map(risk_category)

data.loc[:, ["Applicant_ID", "Risk_Score", "Risk_Category"]].head()


Unnamed: 0,Applicant_ID,Risk_Score,Risk_Category
0,1,45.197585,Medium Risk
1,2,77.162241,High Risk
2,3,8.091334,Low Risk
3,4,96.214647,High Risk
4,5,95.104244,High Risk


In [None]:
# Write the current contents of `data` to a CSV file in the working directory;
# index=False leaves the DataFrame's row index out of the file.
data.to_csv(path_or_buf="Credit_Risk_Analysis_Output.csv", index=False)


In [None]:
# Offer the exported CSV as a browser download when running in Google Colab.
# The `google.colab` package only exists inside Colab, so an unconditional
# import would crash the notebook anywhere else. Outside Colab the download is
# skipped and the file is left where to_csv wrote it.
try:
    from google.colab import files
except ImportError:
    print(
        "google.colab is not available; skipping browser download of "
        "Credit_Risk_Analysis_Output.csv."
    )
else:
    files.download("Credit_Risk_Analysis_Output.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>