In [1]:
scale_columns = [
    "action_num_actual",
    "gender",
    "age",
    "amount",
    "num_loan",
    "duration",
    "year_ratio",
    "diff_city",
    "marriage",
    "kids",
    "month_in",
    "housing",
    "edu",
    "motivation",
    "installment",
    "installment_timestep",
    "state_cum_overduelength",
    "remaining_debt",
    "state_capital",
    "state_interests",
    "state_penalty",
    "y_installment",
    "y_installment_timestep",
    "y_state_cum_overduelength",
    "y_remaining_debt",
    "y_state_capital",
    "y_state_interests",
    "y_state_penalty",
]

In [2]:
features = [
    "action_num_actual",
    "gender",
    "age",
    "amount",
    "num_loan",
    "duration",
    "year_ratio",
    "diff_city",
    "marriage",
    "kids",
    "month_in",
    "housing",
    "edu",
    "motivation",
    "installment",
    "installment_timestep",
    "state_cum_overduelength",
    "remaining_debt",
    "state_capital",
    "state_interests",
    "state_penalty",
]


targets = [
    # "y_installment",
    # "y_installment_timestep",
    # "y_state_cum_overduelength",
    # "y_remaining_debt",
    # "y_state_capital",
    # "y_state_interests",
    # "y_state_penalty",
    # "installment_done",
    # "loan_done",
    "recovery_rate_weighted",
]

In [3]:
from torch.utils.data import Dataset, DataLoader
import torch
import pandas as pd
from sklearn.preprocessing import StandardScaler


class LoanSimDataset(Dataset):
    def __init__(
        self,
        csv_file_path,
        scale_columns,
        feature_columns,
        target_columns,
        group="train",
    ):
        # -------------------------------------------------------
        # read the csv file
        data = pd.read_csv(csv_file_path)
        # choose train or test data for RLsimulator
        # rlsim_data = data.loc[(data["group"] == group)]
        rlsim_train_data = data.loc[data["group"] == "train"]
        rlsim_test_data = data.loc[
            data["group"] == "test"]
        # -------------------------------------------------------
        # scale the data
        scaler = StandardScaler()
        # we use train data to fit the scaler
        train_scaled = scaler.fit_transform(
            rlsim_train_data[scale_columns]
        )
        if group == "train":
            # rlsim_data = rlsim_train_data
            train_scaled_df = pd.DataFrame(train_scaled, columns=scale_columns)
            train_scaled_df_full = pd.concat(
                [
                    train_scaled_df.reset_index(drop=True),
                    rlsim_train_data[
                        ["installment_done", "loan_done", "recovery_rate_weighted", "loan_id"]
                    ].reset_index(drop=True),
                ],
                axis=1,
            )
            rlsim_data = train_scaled_df_full
        else:
            test_scaled = scaler.transform(rlsim_test_data[scale_columns])
            test_scaled_df = pd.DataFrame(test_scaled, columns=scale_columns)
            test_scaled_df_full = pd.concat(
                [
                    test_scaled_df.reset_index(drop=True),
                    rlsim_test_data[
                        ["installment_done", "loan_done", "recovery_rate_weighted", "loan_id"]
                    ].reset_index(drop=True),
                ],
                axis=1,
            )
            rlsim_data = test_scaled_df_full
        # print(rlsim_data.head(14))
        # -------------------------------------------------------
        # group the data by loan_id
        self.grouped_data = rlsim_data.groupby("loan_id")
        # -------------------------------------------------------
        self.features_columns = feature_columns
        self.target_columns = target_columns
        self.sequences = self.create_sequences()

    def create_sequences(self):
        sequences = []
        for loan_id, group in self.grouped_data:
            # print(loan_id)
            features = group[self.features_columns].values
            targets = group[self.target_columns].values
            sequences.append((features, targets))
        return sequences

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        sequence, target = self.sequences[idx]
        return torch.tensor(sequence, dtype=torch.float32), torch.tensor(
            target, dtype=torch.float32
        )

In [4]:
from torch.nn.utils.rnn import pad_sequence


def collate_fn(batch):
    features, targets = zip(*batch)  # unzip the batch and get the features and targets
    # using pad_sequence to pad sequences to the same length
    padded_features = pad_sequence(features, batch_first=True, padding_value=0)
    padded_targets = pad_sequence(targets, batch_first=True, padding_value=0)
    # get the actual lengths of the sequences
    lengths = torch.tensor([len(f) for f in features])
    return padded_features, padded_targets, lengths

In [5]:
dataset = LoanSimDataset(
    csv_file_path="./Res/simulator_data.csv",
    scale_columns = scale_columns,
    feature_columns = features,
    target_columns = targets,
    group="train",
)

dataloader = DataLoader(dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

In [6]:
# from torch.nn.utils.rnn import pack_padded_sequence

# counter = 0

# for inputs, targets, lengths in dataloader:
#     # pack the padded
#     packed_inputs = pack_padded_sequence(
#         inputs, lengths, batch_first=True, enforce_sorted=False
#     )
#     packed_targets = pack_padded_sequence(
#         targets, lengths, batch_first=True, enforce_sorted=False
#     )
#     print('---- Features ----')
#     print(packed_inputs.data)
#     print('---- Targets ----')
#     print(packed_targets.data)
#     # print(inputs)
#     # print(targets)
#     print('--------------------------------------------------------')
#     counter += 1
#     if counter == 4:
#         break

In [7]:
device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
print(f"Using {device} device")

Using cuda device


# Notes of LSTM

$
\begin{array}{ll} \\
    i_t = \sigma(W_{ii} x_t + b_{ii} + W_{hi} h_{t-1} + b_{hi}) \\
    f_t = \sigma(W_{if} x_t + b_{if} + W_{hf} h_{t-1} + b_{hf}) \\
    g_t = \tanh(W_{ig} x_t + b_{ig} + W_{hg} h_{t-1} + b_{hg}) \\
    o_t = \sigma(W_{io} x_t + b_{io} + W_{ho} h_{t-1} + b_{ho}) \\
    c_t = f_t \odot c_{t-1} + i_t \odot g_t \\
    h_t = o_t \odot \tanh(c_t) \\
\end{array}
$


In [8]:
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.mlp = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, output_size),
            nn.ReLU(),
        )

    def forward(self, x, lengths):  # Accepts raw input and lengths
        # Pack sequence
        packed_x = pack_padded_sequence(
            x, lengths, batch_first=True, enforce_sorted=False
        )
        packed_out, _ = self.lstm(packed_x)
        lstm_out, _ = pad_packed_sequence(packed_out, batch_first=True)
        # Pass through MLP
        out = self.mlp(lstm_out)
        return out

In [9]:
model = LSTMModel(input_size=21, hidden_size=5, num_layers=1, output_size=1)
print("Model Structure:")
print(model)

Model Structure:
LSTMModel(
  (lstm): LSTM(21, 5, batch_first=True)
  (mlp): Sequential(
    (0): Linear(in_features=5, out_features=5, bias=True)
    (1): ReLU()
    (2): Linear(in_features=5, out_features=5, bias=True)
    (3): ReLU()
    (4): Linear(in_features=5, out_features=1, bias=True)
    (5): ReLU()
  )
)


In [None]:
from torch import optim

# initialize the model, loss function and optimizer
input_size = len(features)  # num of features
hidden_size = 64  # size of hidden layer state
output_size = len(targets)  # number of targets


model = LSTMModel(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=1,
    output_size=output_size,
)

criterion = nn.MSELoss()  # regression question, so use MSE as the Loss function
optimizer = optim.Adam(model.parameters(), lr=0.001)


# training process
# train function (remove packing before passing inputs to the model)
def train(model, dataloader, criterion, optimizer, num_epochs):
    # Move model to the correct device (CPU or GPU)
    model.to(device)

    for epoch in range(num_epochs):
        running_loss = 0.0
        for inputs, targets, lengths in dataloader:
            optimizer.zero_grad()

            # Move inputs and targets to the correct device
            inputs, targets = inputs.to(device), targets.to(device)

            # Forward pass (now we pass raw inputs)
            outputs = model(inputs, lengths)

            # Calculate the loss
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(dataloader)}")


# train the model
train(model, dataloader, criterion, optimizer, num_epochs=10)

Epoch 1/10, Loss: 0.002179307300937172
Epoch 2/10, Loss: 0.00178769360057658
Epoch 3/10, Loss: 0.0017329893869048516
Epoch 4/10, Loss: 0.0016797294340402894
Epoch 5/10, Loss: 0.0016442554486423675
Epoch 6/10, Loss: 0.0016176066729917228
Epoch 7/10, Loss: 0.001597766231980688
Epoch 8/10, Loss: 0.0015830958009060776
Epoch 9/10, Loss: 0.0015672068462032368
Epoch 10/10, Loss: 0.001559529093430349
