In [None]:
from IPython import get_ipython
from IPython.display import display
# %%
!pip install optuna torch numpy pandas scikit-learn
# %%
import os
import torch
import optuna
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.preprocessing import StandardScaler, LabelEncoder

Here's a short summary of the model's progress and current status:

* **Initial State (Poor):** The model performed very poorly, with an extremely high MSE (ranging from over 100 million to 2.8 billion) and scattered prediction plots, indicating no meaningful learning.
* **Key Improvement:** Scaling the target variable (Borrowing Capacity) was introduced.
* **Current State (Excellent):**
    * **Drastic MSE Reduction:** Scaled MSE dropped to a very low value (e.g., $0.0035$), and the RMSE on the original scale is now a reasonable \$5,251.
    * **Strong Predictive Power:** Plots show predictions tightly clustered around actual values, demonstrating a clear and accurate linear relationship.
    * **Healthy Residuals:** Residuals are randomly distributed around zero, indicating no systematic errors.
* **Conclusion:** The model is now performing effectively on the synthetic dataset.

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
import optuna
from torch.optim.lr_scheduler import ReduceLROnPlateau
import os
import matplotlib.pyplot as plt

# Fix the global NumPy and PyTorch seeds so the synthetic data and the model
# initialisation are repeatable.
np.random.seed(42)
torch.manual_seed(42)

num_samples = 5000

# Category vocabularies for the string-valued columns.
residence_types = ["Own", "Rent", "Mortgage"]
marital_statuses = ["Single", "Married", "Divorced", "Widowed"]
cities = ["CityA", "CityB", "CityC", "CityD", "CityE"]
states = ["State1", "State2", "State3", "State4"]

# Keyword arguments are evaluated left to right, so the random draws happen in
# exactly this column order; the order also fixes the feature order of X later.
data = dict(
    Residence_type=np.random.choice(residence_types, num_samples),
    Monthly_income=np.random.normal(6000, 2000, num_samples).clip(2500, 20000),
    Previous_loan=np.random.randint(0, 3, num_samples),
    Marital_Status=np.random.choice(marital_statuses, num_samples),
    Number_of_dependency=np.random.randint(0, 7, num_samples),
    Credit_Score=np.random.normal(700, 50, num_samples).clip(500, 850),
    Employment_Years=np.random.normal(8, 4, num_samples).clip(0, 30),
    City=np.random.choice(cities, num_samples),
    State=np.random.choice(states, num_samples),
)

df = pd.DataFrame(data)

# Synthetic target: a base amount plus income, residence, credit-score
# (linear and quadratic), prior-loan, tenure and dependant terms, with
# Gaussian noise added last. The result is clipped to [10000, 150000].
# Marital_Status, City and State do not enter the formula.
df['Borrowing_capacity'] = (
    20000
    + df['Monthly_income'] * 5
    + df['Residence_type'].map({'Own': 5000, 'Rent': -2000, 'Mortgage': 2000}).fillna(0)
    + (df['Credit_Score'] - 600) * 50
    + (df['Credit_Score'] - 700)**2 * 0.1
    - df['Previous_loan'] * 3000
    + np.log1p(df['Employment_Years']) * 1000
    - df['Number_of_dependency'] * 1500
    + np.random.normal(0, 5000, num_samples)
).clip(10000, 150000)

# Integer-encode the string-valued columns so the frame can be converted to a
# numeric array. Every fitted encoder is kept in `label_encoders`, keyed by
# column name, so the same category -> integer mapping can be re-applied to
# new rows (or inverted) after training; `le` alone only ever holds the
# encoder of the last column.
categorical_features = ["Residence_type", "Marital_Status", "City", "State"]
label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Feature matrix (all columns except the target) and target vector.
X = df.drop("Borrowing_capacity", axis=1).values
y = df["Borrowing_capacity"].values

# 80/20 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise features using statistics from the training split only, then
# apply the same transform to the validation split.
scaler_X = StandardScaler()
X_train = scaler_X.fit_transform(X_train)
X_val = scaler_X.transform(X_val)

# Min-max scale the target, again fitted on the training split only. The
# scalers operate on 2-D input, hence the reshape to a single column.
# Predictions made in this scaled space must be mapped back with
# scaler_y.inverse_transform before being compared with y_val.
scaler_y = MinMaxScaler()
y_train_scaled = scaler_y.fit_transform(y_train.reshape(-1, 1))
y_val_scaled = scaler_y.transform(y_val.reshape(-1, 1))

# float32 tensors for PyTorch; the targets keep their (n, 1) column shape so
# they line up with the model's single-output predictions.
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_scaled, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val_scaled, dtype=torch.float32)

class BorrowingCapacityModel(nn.Module):
    """Fully connected regressor with three hidden stages and a scalar output.

    Every hidden stage is Linear -> BatchNorm1d -> ReLU -> Dropout; the last
    layer is a plain Linear mapping ``hidden_dim3`` to a single value. All
    layers live in ``self.net`` (an ``nn.Sequential``), in that order.

    Args:
        input_dim: Number of input features.
        hidden_dim1: Width of the first hidden stage.
        hidden_dim2: Width of the second hidden stage.
        hidden_dim3: Width of the third hidden stage.
        dropout_rate: Dropout probability used after each hidden stage.
    """

    def __init__(self, input_dim, hidden_dim1, hidden_dim2, hidden_dim3, dropout_rate):
        super().__init__()
        widths = (input_dim, hidden_dim1, hidden_dim2, hidden_dim3)

        # Layers are created strictly in forward order, so parameter
        # initialisation and the Sequential indices stay stable.
        stages = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stages += [
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.ReLU(),
                nn.Dropout(dropout_rate),
            ]
        stages.append(nn.Linear(hidden_dim3, 1))

        self.net = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through the stacked layers and return the network output."""
        return self.net(x)

def train_with_early_stopping(model, optimizer, criterion, X_train, y_train, X_val, y_val, max_epochs, patience, trial=None, checkpoint_dir="checkpoints"):
    """Train ``model`` with full-batch steps, stopping early on a validation plateau.

    Each epoch takes a single optimizer step on the whole of
    ``X_train``/``y_train``, then evaluates ``criterion`` on
    ``X_val``/``y_val`` with the model in eval mode and gradients disabled.
    A ``ReduceLROnPlateau`` scheduler (factor 0.5, patience 20) is stepped on
    the validation loss. Whenever the validation loss improves, the weights
    are snapshotted and written to
    ``<checkpoint_dir>/best_model_trial_<trial.number or 'final'>.pth``.

    Args:
        model: Network to train; updated in place.
        optimizer: Optimizer already bound to ``model``'s parameters.
        criterion: Loss callable returning a scalar tensor.
        X_train, y_train: Training inputs and targets.
        X_val, y_val: Validation inputs and targets.
        max_epochs: Upper bound on the number of epochs.
        patience: Stop after this many consecutive epochs without a new best
            validation loss.
        trial: Optional Optuna trial. When given, the validation loss is
            reported every epoch and the trial may be pruned. The report is
            skipped on the epoch that triggers early stopping.
        checkpoint_dir: Directory for the best-weights checkpoint; created if
            it does not exist.

    Returns:
        ``(model, best_loss)``: the model with the best weights loaded (if
        any epoch improved) and the best validation loss as a Python float.
        ``best_loss`` is ``inf`` if no epoch improved, e.g. ``max_epochs == 0``.

    Raises:
        optuna.exceptions.TrialPruned: If ``trial`` asks to be pruned.
    """
    best_loss = float('inf')
    epochs_without_improvement = 0
    best_model_state = None

    os.makedirs(checkpoint_dir, exist_ok=True)
    checkpoint_path = os.path.join(checkpoint_dir, f"best_model_trial_{trial.number if trial is not None else 'final'}.pth")

    # `verbose` is deliberately not passed: it is a deprecated scheduler
    # argument, and silent operation is the default.
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=20)

    for epoch in range(max_epochs):
        model.train()
        optimizer.zero_grad()
        outputs = model(X_train)
        loss = criterion(outputs, y_train)
        loss.backward()
        optimizer.step()

        model.eval()
        with torch.no_grad():
            # Keep the loss as a plain float so best_loss never mixes tensor
            # and float types.
            val_loss = criterion(model(X_val), y_val).item()

        scheduler.step(val_loss)

        if val_loss < best_loss:
            best_loss = val_loss
            epochs_without_improvement = 0
            # state_dict() hands back references to the live tensors, which
            # later optimizer steps overwrite in place. Clone them so the
            # snapshot really is the best epoch, not the most recent one.
            best_model_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}
            torch.save(best_model_state, checkpoint_path)
        else:
            epochs_without_improvement += 1

        if epochs_without_improvement >= patience:
            break

        if trial is not None:
            trial.report(val_loss, epoch)
            if trial.should_prune():
                raise optuna.exceptions.TrialPruned()

    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    return model, best_loss

def objective(trial):
    """Optuna objective: train one candidate network and return its best validation loss.

    Samples the three hidden widths, the learning rate (log scale), the
    dropout rate and the epoch budget from ``trial``, builds a
    ``BorrowingCapacityModel`` with an Adam optimizer and MSE loss, and trains
    it on the module-level train/validation tensors with early stopping
    (patience 75). The trial is passed through so it can be pruned.
    """
    # Hyper-parameters are requested in a fixed order.
    hidden_sizes = [
        trial.suggest_int("hidden_dim1", 64, 256),
        trial.suggest_int("hidden_dim2", 32, 128),
        trial.suggest_int("hidden_dim3", 16, 64),
    ]
    learning_rate = trial.suggest_float("lr", 1e-4, 1e-2, log=True)
    dropout = trial.suggest_float("dropout_rate", 0.1, 0.4)
    epoch_budget = trial.suggest_int("epochs", 1000, 3000)

    net = BorrowingCapacityModel(X_train_tensor.shape[1], *hidden_sizes, dropout)
    loss_fn = nn.MSELoss()
    adam = optim.Adam(net.parameters(), lr=learning_rate)

    _, best_val_loss = train_with_early_stopping(
        net,
        adam,
        loss_fn,
        X_train_tensor,
        y_train_tensor,
        X_val_tensor,
        y_val_tensor,
        max_epochs=epoch_budget,
        patience=75,
        trial=trial,
    )
    return best_val_loss

# Hyper-parameter search over `objective` (150 trials, minimising the scaled
# validation MSE). The sampler is seeded explicitly: the NumPy/PyTorch seeds
# set earlier do not reach Optuna's sampler, which keeps its own random state,
# so without a seed every run would explore a different sequence of trials.
# TPESampler is the sampler Optuna uses by default for single-objective
# studies, so only the seeding changes. Trials must run sequentially
# (the default) for the sequence to be repeatable.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=150)

# Report the winning trial.
print("Best trial:")
print(f"  Value: {study.best_trial.value}")
print("  Params: ")
for key, value in study.best_trial.params.items():
    print(f"    {key}: {value}")

# Rebuild the network from the winning hyper-parameters and retrain it with
# the same early-stopping routine, checkpointing into its own directory.
best_params = study.best_trial.params
input_dim = X_train_tensor.shape[1]
final_model = BorrowingCapacityModel(
    input_dim,
    *(best_params[key] for key in ("hidden_dim1", "hidden_dim2", "hidden_dim3", "dropout_rate"))
)
criterion = nn.MSELoss()
optimizer = optim.Adam(final_model.parameters(), lr=best_params["lr"])

final_model, final_val_loss_scaled = train_with_early_stopping(
    final_model,
    optimizer,
    criterion,
    X_train_tensor,
    y_train_tensor,
    X_val_tensor,
    y_val_tensor,
    max_epochs=best_params["epochs"],
    patience=75,
    checkpoint_dir="final_model_checkpoint",
)

# Predict on the validation split in eval mode, then undo the target scaling
# so the error can be reported in the original units.
final_model.eval()
with torch.no_grad():
    val_predictions_scaled = final_model(X_val_tensor).numpy()
val_predictions_original = scaler_y.inverse_transform(val_predictions_scaled)

final_val_mse_original_scale = np.mean(np.square(y_val - val_predictions_original.ravel()))

print(f"\nFinal Validation MSE (Scaled): {final_val_loss_scaled}")
print(f"Final Validation MSE (Original Scale): {final_val_mse_original_scale}")
print(f"Final Validation RMSE (Original Scale): {np.sqrt(final_val_mse_original_scale)}")

# Persist the trained weights.
torch.save(final_model.state_dict(), "final_borrowing_capacity_model_simplified.pth")

# Actual vs. predicted on the validation split; the dashed red diagonal marks
# perfect predictions.
fig_fit, ax_fit = plt.subplots(figsize=(10, 7))
ax_fit.scatter(y_val, val_predictions_original, alpha=0.6)
ax_fit.plot([y_val.min(), y_val.max()], [y_val.min(), y_val.max()], 'r--', lw=2)
ax_fit.set_xlabel("Actual Borrowing Capacity")
ax_fit.set_ylabel("Predicted Borrowing Capacity")
ax_fit.set_title("Actual vs. Predicted Borrowing Capacity (Simplified Model)")
ax_fit.grid(True)
plt.show()

# Residuals (actual minus predicted) against the predictions; the dashed red
# line marks zero error.
residuals = y_val - val_predictions_original.flatten()
fig_res, ax_res = plt.subplots(figsize=(10, 7))
ax_res.scatter(val_predictions_original, residuals, alpha=0.6)
ax_res.axhline(y=0, color='r', linestyle='--')
ax_res.set_xlabel("Predicted Borrowing Capacity")
ax_res.set_ylabel("Residuals (Actual - Predicted)")
ax_res.set_title("Residual Plot (Simplified Model)")
ax_res.grid(True)
plt.show()