<a href="https://colab.research.google.com/github/Shreyaaaaaa-09/Credit-Risk-Analysis/blob/main/RiskAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install optuna
!pip install lightgbm
!pip install imblearn
!pip install tensorflow

Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.1-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.9-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.2.1-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.6/383.6 kB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.1-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.6/233.6 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.9-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: M

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras import layers, models
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix
from imblearn.over_sampling import ADASYN
import lightgbm as lgb
import joblib
import optuna
import matplotlib.pyplot as plt
import s
# Load dataset
df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data", sep=" ", names=[
    "status", "duration", "credit_history", "purpose", "credit_amount", "savings", "employment",
    "installment_rate", "personal_status", "other_debtors", "residence_since", "property", "age",
    "other_installment", "housing", "existing_credits", "job", "people_liable", "telephone",
    "foreign_worker", "risk"
])

# Convert target variable
df['risk'] = df['risk'].map({1: 1, 2: 0})

# Encode categorical variables
categorical_cols = df.select_dtypes(include=['object']).columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Standard scaling
numerical_cols = ['duration', 'credit_amount', 'age']
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# Split dataset
X = df.drop(columns=['risk'])
y = df['risk']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle missing values
X_train.fillna(X_train.median(), inplace=True)
X_test.fillna(X_test.median(), inplace=True)

# Define GAN
def build_generator(input_dim):
    model = models.Sequential([
        layers.Input(shape=(input_dim,)),
        layers.Dense(128, activation='relu'),
        layers.Dense(256, activation='relu'),
        layers.Dense(X_train.shape[1], activation='sigmoid')
    ])
    return model

def build_discriminator(input_dim):
    model = models.Sequential([
        layers.Input(shape=(input_dim,)),
        layers.Dense(256, activation='relu'),
        layers.Dense(128, activation='relu'),
        layers.Dense(1, activation='sigmoid')
    ])
    return model

latent_dim = 15  # Increased latent space
generator = build_generator(latent_dim)
discriminator = build_discriminator(X_train.shape[1])
discriminator.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
discriminator.trainable = False

gan_input = layers.Input(shape=(latent_dim,))
x = generator(gan_input)
gan_output = discriminator(x)
gan = models.Model(gan_input, gan_output)
gan.compile(optimizer='adam', loss='binary_crossentropy')

def train_gan(generator, discriminator, gan, X_train, epochs=100, batch_size=64):
    batch_count = max(1, X_train.shape[0] // batch_size)
    for epoch in range(epochs):
        for _ in range(batch_count):
            noise = np.random.normal(0, 1, (batch_size, latent_dim))
            gen_data = generator.predict(noise)
            real_data = X_train.sample(n=batch_size, replace=True).values
            d_loss_real = discriminator.train_on_batch(real_data, np.ones((batch_size, 1)))
            d_loss_fake = discriminator.train_on_batch(gen_data, np.zeros((batch_size, 1)))
            d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)
            g_loss = gan.train_on_batch(noise, np.ones((batch_size, 1)))
        print(f"Epoch {epoch+1}/{epochs}, D Loss: {d_loss[0]}, G Loss: {g_loss}")

train_gan(generator, discriminator, gan, X_train)

# Generate new features for training
z_new = np.random.normal(0, 1, (X_train.shape[0], latent_dim))
generated_features = generator.predict(z_new)

X_train_final = np.hstack((X_train, generated_features))
X_test_final = np.hstack((X_test, generator.predict(np.random.normal(0, 1, (X_test.shape[0], latent_dim)))))

# Train LightGBM with ADASYN for better balance
adasyn = ADASYN()
X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train_final, y_train)

def objective(trial):
    params = {
        "boosting_type": "gbdt",
        "num_leaves": trial.suggest_int("num_leaves", 20, 100),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0)
    }
    model = lgb.LGBMClassifier(**params, class_weight='balanced', random_state=42)
    model.fit(X_train_resampled, y_train_resampled)
    preds = model.predict(X_test_final)
    return accuracy_score(y_test, preds)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)

best_params = study.best_params
lgbm_model = lgb.LGBMClassifier(**best_params, class_weight='balanced', random_state=42)
lgbm_model.fit(X_train_resampled, y_train_resampled)

# Save model
joblib.dump(lgbm_model, "lgbm_credit_risk_model.pkl")

# Predictions
y_pred = lgbm_model.predict(X_test_final)
y_prob = lgbm_model.predict_proba(X_test_final)[:, 1]

print(f"🔥 Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"🔥 AUC-ROC: {roc_auc_score(y_test, y_prob):.4f}")
print(f"🔥 Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step 




[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step 
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 98ms/step
Epoch 1/100, D Loss: 0.6554499864578247, G Loss: 0.6148056387901306
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 91ms/step 
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms

[I 2025-02-22 14:58:19,300] A new study created in memory with name: no-name-3da81056-edef-470d-b2da-45231c037d06


[LightGBM] [Info] Number of positive: 559, number of negative: 522
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000879 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6429
[LightGBM] [Info] Number of data points in the train set: 1081, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2025-02-22 14:58:19,995] Trial 0 finished with value: 0.785 and parameters: {'num_leaves': 72, 'learning_rate': 0.08363643565681295, 'max_depth': 9, 'n_estimators': 410, 'colsample_bytree': 0.6691972320250499, 'subsample': 0.872075445814879}. Best is trial 0 with value: 0.785.
[I 2025-02-22 14:58:20,120] Trial 1 finished with value: 0.82 and parameters: {'num_leaves': 91, 'learning_rate': 0.08157610828188222, 'max_depth': 5, 'n_estimators': 133, 'colsample_bytree': 0.5551735524880075, 'subsample': 0.6664689716913033}. Best is trial 1 with value: 0.82.


[LightGBM] [Info] Number of positive: 559, number of negative: 522
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000183 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6429
[LightGBM] [Info] Number of data points in the train set: 1081, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Number of positive: 559, number of negative: 522
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000250 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6429
[LightGBM] [Info] Number of data points in the train set: 1081, number of used features: 40
[LightGBM] [Info] [binary:Boos

[I 2025-02-22 14:58:20,462] Trial 2 finished with value: 0.805 and parameters: {'num_leaves': 60, 'learning_rate': 0.08773716691379124, 'max_depth': 4, 'n_estimators': 464, 'colsample_bytree': 0.9422897817610806, 'subsample': 0.7137011135149693}. Best is trial 1 with value: 0.82.


[LightGBM] [Info] Number of positive: 559, number of negative: 522
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000586 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6429
[LightGBM] [Info] Number of data points in the train set: 1081, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000






[I 2025-02-22 14:58:20,950] Trial 3 finished with value: 0.79 and parameters: {'num_leaves': 86, 'learning_rate': 0.09461401199332406, 'max_depth': 6, 'n_estimators': 493, 'colsample_bytree': 0.62184408959544, 'subsample': 0.9588008334659315}. Best is trial 1 with value: 0.82.


[LightGBM] [Info] Number of positive: 559, number of negative: 522
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000536 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6429
[LightGBM] [Info] Number of data points in the train set: 1081, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000






[I 2025-02-22 14:58:21,369] Trial 4 finished with value: 0.795 and parameters: {'num_leaves': 71, 'learning_rate': 0.01926123655582511, 'max_depth': 6, 'n_estimators': 351, 'colsample_bytree': 0.7150986766850169, 'subsample': 0.6797068706297622}. Best is trial 1 with value: 0.82.


[LightGBM] [Info] Number of positive: 559, number of negative: 522
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000243 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6429
[LightGBM] [Info] Number of data points in the train set: 1081, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000






[I 2025-02-22 14:58:21,676] Trial 5 finished with value: 0.8 and parameters: {'num_leaves': 97, 'learning_rate': 0.08269590774933945, 'max_depth': 4, 'n_estimators': 472, 'colsample_bytree': 0.7656634531895441, 'subsample': 0.87314647875566}. Best is trial 1 with value: 0.82.


[LightGBM] [Info] Number of positive: 559, number of negative: 522
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000245 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6429
[LightGBM] [Info] Number of data points in the train set: 1081, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000






[I 2025-02-22 14:58:22,351] Trial 6 finished with value: 0.825 and parameters: {'num_leaves': 93, 'learning_rate': 0.036854406222625265, 'max_depth': 7, 'n_estimators': 464, 'colsample_bytree': 0.8988256290285062, 'subsample': 0.7838357242142968}. Best is trial 6 with value: 0.825.


[LightGBM] [Info] Number of positive: 559, number of negative: 522
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000568 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6429
[LightGBM] [Info] Number of data points in the train set: 1081, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2025-02-22 14:58:22,521] Trial 7 finished with value: 0.79 and parameters: {'num_leaves': 50, 'learning_rate': 0.04871179065111524, 'max_depth': 3, 'n_estimators': 334, 'colsample_bytree': 0.8669409060934883, 'subsample': 0.8564679225592795}. Best is trial 6 with value: 0.825.


[LightGBM] [Info] Number of positive: 559, number of negative: 522
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000604 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6429
[LightGBM] [Info] Number of data points in the train set: 1081, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2025-02-22 14:58:22,690] Trial 8 finished with value: 0.78 and parameters: {'num_leaves': 79, 'learning_rate': 0.0818088364954856, 'max_depth': 8, 'n_estimators': 102, 'colsample_bytree': 0.6940390864388186, 'subsample': 0.9470755164751926}. Best is trial 6 with value: 0.825.


[LightGBM] [Info] Number of positive: 559, number of negative: 522
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000559 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6429
[LightGBM] [Info] Number of data points in the train set: 1081, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2025-02-22 14:58:23,058] Trial 9 finished with value: 0.805 and parameters: {'num_leaves': 65, 'learning_rate': 0.026968222124026102, 'max_depth': 5, 'n_estimators': 365, 'colsample_bytree': 0.8646991018419017, 'subsample': 0.9788171956501213}. Best is trial 6 with value: 0.825.


[LightGBM] [Info] Number of positive: 559, number of negative: 522
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000238 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6429
[LightGBM] [Info] Number of data points in the train set: 1081, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2025-02-22 14:58:23,481] Trial 10 finished with value: 0.775 and parameters: {'num_leaves': 22, 'learning_rate': 0.04695274937829079, 'max_depth': 12, 'n_estimators': 235, 'colsample_bytree': 0.9914705213770822, 'subsample': 0.5331063277222122}. Best is trial 6 with value: 0.825.


[LightGBM] [Info] Number of positive: 559, number of negative: 522
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000605 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6429
[LightGBM] [Info] Number of data points in the train set: 1081, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000






[I 2025-02-22 14:58:23,849] Trial 11 finished with value: 0.795 and parameters: {'num_leaves': 98, 'learning_rate': 0.06545926544336171, 'max_depth': 12, 'n_estimators': 230, 'colsample_bytree': 0.5365730188617828, 'subsample': 0.6115810270578804}. Best is trial 6 with value: 0.825.


[LightGBM] [Info] Number of positive: 559, number of negative: 522
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000547 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6429
[LightGBM] [Info] Number of data points in the train set: 1081, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2025-02-22 14:58:24,107] Trial 12 finished with value: 0.825 and parameters: {'num_leaves': 44, 'learning_rate': 0.03767991658693459, 'max_depth': 15, 'n_estimators': 148, 'colsample_bytree': 0.5301919862354042, 'subsample': 0.7846574752728929}. Best is trial 6 with value: 0.825.






[LightGBM] [Info] Number of positive: 559, number of negative: 522
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000823 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6429
[LightGBM] [Info] Number of data points in the train set: 1081, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2025-02-22 14:58:24,660] Trial 13 finished with value: 0.805 and parameters: {'num_leaves': 43, 'learning_rate': 0.03215098520313769, 'max_depth': 15, 'n_estimators': 239, 'colsample_bytree': 0.7984931108492488, 'subsample': 0.7822272941921601}. Best is trial 6 with value: 0.825.


[LightGBM] [Info] Number of positive: 559, number of negative: 522
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000726 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6429
[LightGBM] [Info] Number of data points in the train set: 1081, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2025-02-22 14:58:24,989] Trial 14 finished with value: 0.8 and parameters: {'num_leaves': 27, 'learning_rate': 0.036129270588374424, 'max_depth': 15, 'n_estimators': 165, 'colsample_bytree': 0.8752393142189858, 'subsample': 0.7835202329791743}. Best is trial 6 with value: 0.825.


[LightGBM] [Info] Number of positive: 559, number of negative: 522
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000204 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6429
[LightGBM] [Info] Number of data points in the train set: 1081, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2025-02-22 14:58:25,519] Trial 15 finished with value: 0.805 and parameters: {'num_leaves': 41, 'learning_rate': 0.06753375600598044, 'max_depth': 12, 'n_estimators': 289, 'colsample_bytree': 0.6039787447084517, 'subsample': 0.8202276501707519}. Best is trial 6 with value: 0.825.


[LightGBM] [Info] Number of positive: 559, number of negative: 522
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000529 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6429
[LightGBM] [Info] Number of data points in the train set: 1081, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2025-02-22 14:58:25,770] Trial 16 finished with value: 0.83 and parameters: {'num_leaves': 54, 'learning_rate': 0.012074557597671981, 'max_depth': 8, 'n_estimators': 178, 'colsample_bytree': 0.5036704104809567, 'subsample': 0.731804054951061}. Best is trial 16 with value: 0.83.


[LightGBM] [Info] Number of positive: 559, number of negative: 522
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000589 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6429
[LightGBM] [Info] Number of data points in the train set: 1081, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000






[I 2025-02-22 14:58:26,503] Trial 17 finished with value: 0.79 and parameters: {'num_leaves': 55, 'learning_rate': 0.011999161647060061, 'max_depth': 8, 'n_estimators': 412, 'colsample_bytree': 0.8105122429809407, 'subsample': 0.592832469426196}. Best is trial 16 with value: 0.83.


[LightGBM] [Info] Number of positive: 559, number of negative: 522
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000652 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6429
[LightGBM] [Info] Number of data points in the train set: 1081, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2025-02-22 14:58:26,932] Trial 18 finished with value: 0.815 and parameters: {'num_leaves': 34, 'learning_rate': 0.013396469086545685, 'max_depth': 10, 'n_estimators': 192, 'colsample_bytree': 0.9352165655834248, 'subsample': 0.737315510260541}. Best is trial 16 with value: 0.83.






[LightGBM] [Info] Number of positive: 559, number of negative: 522
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000495 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6429
[LightGBM] [Info] Number of data points in the train set: 1081, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2025-02-22 14:58:27,260] Trial 19 finished with value: 0.815 and parameters: {'num_leaves': 81, 'learning_rate': 0.02295096817957521, 'max_depth': 7, 'n_estimators': 275, 'colsample_bytree': 0.5030357380479816, 'subsample': 0.637918209496368}. Best is trial 16 with value: 0.83.


[LightGBM] [Info] Number of positive: 559, number of negative: 522
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000499 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6429
[LightGBM] [Info] Number of data points in the train set: 1081, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




🔥 Accuracy: 0.8300
🔥 AUC-ROC: 0.8431
🔥 Confusion Matrix:
[[ 36  23]
 [ 11 130]]




In [None]:
# 12. Visualize Confusion Matrix (Optional)
plt.figure(figsize=(6, 4))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Bad Risk (0)', 'Good Risk (1)'], yticklabels=['Bad Risk (0)', 'Good Risk (1)'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()


NameError: name 'plt' is not defined

In [None]:
import joblib
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
import numpy as np

lgbm_model = joblib.load("lgbm_credit_risk_model.pkl")



# Sample customer details
test_customers = [
    {
        "duration": 24, "credit_amount": 5000, "installment_rate": 4, "residence_since": 2,
        "age": 30, "existing_credits": 1, "people_liable": 1,
        "status": "A12", "credit_history": "A32", "purpose": "A41",
        "savings": "A62", "employment": "A73", "personal_status": "A92",
        "other_debtors": "A102", "property": "A122", "other_installment": "A143",
        "housing": "A152", "job": "A173", "telephone": "A192", "foreign_worker": "A202"
    }
]

# Convert to DataFrame
test_customers_df = pd.DataFrame(test_customers)

# Apply label encoding (must use the same encoders from training)
for col in ["status", "credit_history", "purpose", "savings", "employment", "personal_status",
            "other_debtors", "property", "other_installment", "housing", "job", "telephone", "foreign_worker"]:
    le = LabelEncoder()
    test_customers_df[col] = le.fit_transform(test_customers_df[col])

# Scale numerical columns
scaler = StandardScaler()
numerical_cols = ["duration", "credit_amount", "installment_rate", "residence_since",
                  "age", "existing_credits", "people_liable"]
test_customers_df[numerical_cols] = scaler.fit_transform(test_customers_df[numerical_cols])



latent_dim = 15  # Use the same latent space size as before
z_test = np.random.normal(0, 1, (test_customers_df.shape[0], latent_dim))
generated_features = generator.predict(z_test)

# Combine with test data
test_final_df = np.hstack((test_customers_df, generated_features))

loan_prediction = lgbm_model.predict(test_final_df)
loan_probability = lgbm_model.predict_proba(test_final_df)[:, 1]

print(f"🔥 Loan Approved? {'YES' if loan_prediction[0] == 1 else 'NO'}")
print(f"🔥 Approval Probability: {loan_probability[0]:.4f}")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 164ms/step
🔥 Loan Approved? YES
🔥 Approval Probability: 0.5071




In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Generate synthetic test data
np.random.seed(42)
n_samples = 100  # Number of test samples

test_data = {
    "duration": np.random.randint(6, 72, n_samples),
    "credit_amount": np.random.randint(1000, 20000, n_samples),
    "installment_rate": np.random.randint(1, 5, n_samples),
    "residence_since": np.random.randint(1, 5, n_samples),
    "age": np.random.randint(18, 75, n_samples),
    "existing_credits": np.random.randint(1, 4, n_samples),
    "people_liable": np.random.randint(1, 3, n_samples),
    "status": np.random.choice(["A11", "A12", "A13", "A14"], n_samples),
    "credit_history": np.random.choice(["A30", "A31", "A32", "A33", "A34"], n_samples),
    "purpose": np.random.choice(["A40", "A41", "A42", "A43", "A44", "A45", "A46", "A47", "A48", "A49"], n_samples),
    "savings": np.random.choice(["A61", "A62", "A63", "A64", "A65"], n_samples),
    "employment": np.random.choice(["A71", "A72", "A73", "A74", "A75"], n_samples),
    "personal_status": np.random.choice(["A91", "A92", "A93", "A94"], n_samples),
    "other_debtors": np.random.choice(["A101", "A102", "A103"], n_samples),
    "property": np.random.choice(["A121", "A122", "A123", "A124"], n_samples),
    "other_installment": np.random.choice(["A141", "A142", "A143"], n_samples),
    "housing": np.random.choice(["A151", "A152", "A153"], n_samples),
    "job": np.random.choice(["A171", "A172", "A173", "A174"], n_samples),
    "telephone": np.random.choice(["A191", "A192"], n_samples),
    "foreign_worker": np.random.choice(["A201", "A202"], n_samples),
}

test_df = pd.DataFrame(test_data)

# Label encode categorical features
categorical_cols = ["status", "credit_history", "purpose", "savings", "employment", "personal_status", "other_debtors", "property", "other_installment", "housing", "job", "telephone", "foreign_worker"]
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    test_df[col] = le.fit_transform(test_df[col])
    label_encoders[col] = le

# Standard scale numerical features
numerical_cols = ["duration", "credit_amount", "installment_rate", "residence_since", "age", "existing_credits", "people_liable"]
scaler = StandardScaler()
test_df[numerical_cols] = scaler.fit_transform(test_df[numerical_cols])

# Save dataset to CSV
test_df.to_csv("credit_risk_test_data.csv", index=False)

print("🔥 Test dataset generated and saved as 'credit_risk_test_data.csv'")


🔥 Test dataset generated and saved as 'credit_risk_test_data.csv'


In [None]:
test_df = pd.read_csv("credit_risk_test_data.csv")

print("🔥 Test data loaded successfully!")

lgbm_model = joblib.load("lgbm_credit_risk_model.pkl")

print("🔥 Model loaded successfully!")



latent_dim = 15  # Same as training
z_test = np.random.normal(0, 1, (test_df.shape[0], latent_dim))

# Generate new features using the trained generator
generated_features = generator.predict(z_test)

# Combine with original test data
test_final_df = np.hstack((test_df, generated_features))

print("🔥 Synthetic GAN features added to test data!")

# Predict loan approval
loan_predictions = lgbm_model.predict(test_final_df)
loan_probabilities = lgbm_model.predict_proba(test_final_df)[:, 1]

# Add results to dataframe
test_df["Loan_Approved"] = ["YES" if pred == 1 else "NO" for pred in loan_predictions]
test_df["Approval_Probability"] = loan_probabilities

# Save results
test_df.to_csv("credit_risk_predictions.csv", index=False)

print("🔥 Predictions complete! Results saved to 'credit_risk_predictions.csv'")



🔥 Test data loaded successfully!
🔥 Model loaded successfully!
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step
🔥 Synthetic GAN features added to test data!
🔥 Predictions complete! Results saved to 'credit_risk_predictions.csv'




In [None]:
print(test_df[["Loan_Approved", "Approval_Probability"]].head(20))


   Loan_Approved  Approval_Probability
0             NO              0.217346
1             NO              0.479417
2            YES              0.504013
3            YES              0.623528
4            YES              0.741297
5             NO              0.391666
6             NO              0.276017
7             NO              0.454371
8            YES              0.707943
9            YES              0.500577
10           YES              0.528196
11            NO              0.400218
12            NO              0.441399
13            NO              0.400870
14            NO              0.460360
15           YES              0.710044
16            NO              0.328920
17           YES              0.516864
18            NO              0.365048
19            NO              0.431753
