<a href="https://colab.research.google.com/github/SamipLC/credit-risk-classifier/blob/main/Credit_Risk_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [6]:
# 1. Build a synthetic, imbalanced binary-classification dataset and label its
#    columns with credit-risk-style names. The names are cosmetic only:
#    make_classification emits abstract numeric features, so a column such as
#    "age" holds synthetic values (negatives included), not real ages.
feature_names = [
    "age", "annual_income", "loan_amount",
    "interest_rate", "credit_score", "employment_length_years",
    "installment", "debt_to_income_ratio", "delinquency_2yrs",
    "num_open_credit_lines", "num_credit_inquiries", "months_since_last_delinq",
    "public_records", "revol_utilization", "total_accounts",
]

X, y = make_classification(
    n_samples=2000,
    n_features=len(feature_names),  # 15: one synthetic feature per column name
    n_informative=8,
    n_redundant=4,
    n_clusters_per_class=2,
    weights=[0.7, 0.3],  # approximately 70% class 0 / 30% class 1 (defaults)
    class_sep=1.0,
    random_state=42,  # fixed seed so the generated data is reproducible
)

# Combine the features and the target into one frame; "default" is the label.
df = pd.DataFrame(X, columns=feature_names).assign(default=y)

df.head()



Unnamed: 0,age,annual_income,loan_amount,interest_rate,credit_score,employment_length_years,installment,debt_to_income_ratio,delinquency_2yrs,num_open_credit_lines,num_credit_inquiries,months_since_last_delinq,public_records,revol_utilization,total_accounts,default
0,3.696613,2.134596,1.270818,-2.209895,-1.486397,1.961306,0.244802,0.683735,-1.031246,-2.169115,-3.395477,0.115676,-4.238422,-0.765778,0.735264,0
1,2.734003,0.752464,0.106088,4.682087,2.825153,3.56587,1.261244,0.967867,-1.609492,-0.313692,0.197696,0.492499,5.537762,0.990556,0.081233,1
2,2.154938,0.289082,-0.280156,0.66094,0.759213,-0.158531,0.991693,0.429934,1.305848,-0.940869,-2.119106,0.974129,-0.484545,-0.041938,-0.358548,0
3,-2.519354,-2.022528,-4.105725,1.401406,5.412698,1.84234,2.17729,2.163686,-3.031622,0.574962,1.483204,-0.304442,1.068336,0.236615,-0.231392,0
4,1.631874,0.998121,-0.753432,3.254482,-0.53488,1.33343,0.37512,1.229184,1.325968,-0.661087,0.858638,-2.469877,0.680492,0.330138,2.159694,0


In [7]:
# 2. Model: standardize every feature, then fit a logistic regression.
#    The liblinear solver supports both the L1 and L2 penalties searched below.
pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(solver="liblinear", random_state=42)),
    ]
)

# 3. Search space: 5 regularization strengths x 2 penalty types = 10 candidates.
#    The "clf__" prefix routes each parameter to the pipeline's "clf" step.
param_grid = dict(
    clf__C=[0.01, 0.1, 1, 10, 100],
    clf__penalty=["l1", "l2"],
)

# 4. Exhaustive grid search, scored by accuracy under 5-fold cross-validation
#    and parallelized across all available cores (n_jobs=-1).
grid = GridSearchCV(pipeline, param_grid, scoring="accuracy", cv=5, n_jobs=-1)
grid.fit(df[feature_names], df["default"])

# Report the winning combination and its mean cross-validated accuracy.
print("Best params:", grid.best_params_)
print("CV accuracy:", grid.best_score_.round(3))


Best params: {'clf__C': 100, 'clf__penalty': 'l2'}
CV accuracy: 0.785


In [8]:
# Tuned pipeline, refit on the full dataset by GridSearchCV; kept for later use.
best_model = grid.best_estimator_

# Nested cross-validation.
# Scoring `best_model` with the same data and the same 5 folds that selected
# its hyperparameters merely reproduces grid.best_score_ and is optimistically
# biased, because every fold already influenced the selection. Passing the
# grid search itself (cross_val_score clones it, so the fitted `grid` is not
# modified) repeats the tuning inside each outer training fold, so every outer
# test fold is unseen by both the fitting and the hyperparameter search.
# Cost: the whole grid search runs once per outer fold (5x the original work).
cv_scores = cross_val_score(
    grid,
    df[feature_names],
    df["default"],
    scoring="accuracy",
    cv=5,
)
print(f"5-fold CV accuracy: {cv_scores.mean():.3f} ± {cv_scores.std():.3f}")


5-fold CV accuracy: 0.785 ± 0.025
