In [1]:
# Columns that occur in `scale_columns` twice: once under their own name and
# once more under a "y_" prefix.
_paired_columns = [
    "installment",
    "installment_timestep",
    "state_cum_overduelength",
    "remaining_debt",
    "state_capital",
    "state_interests",
    "state_penalty",
]

# Full, ordered list of columns to standardize: the unpaired columns first,
# then the paired columns, then their "y_"-prefixed counterparts.
scale_columns = [
    "action_num_actual",
    "gender",
    "age",
    "amount",
    "num_loan",
    "duration",
    "year_ratio",
    "diff_city",
    "marriage",
    "kids",
    "month_in",
    "housing",
    "edu",
    "motivation",
    *_paired_columns,
    *(f"y_{name}" for name in _paired_columns),
]

In [2]:
# Input columns that have no "y_"-prefixed counterpart among the targets.
_unpaired_columns = [
    "action_num_actual",
    "gender",
    "age",
    "amount",
    "num_loan",
    "duration",
    "year_ratio",
    "diff_city",
    "marriage",
    "kids",
    "month_in",
    "housing",
    "edu",
    "motivation",
]

# Input columns that reappear among the targets under a "y_" prefix.
_paired_columns = [
    "installment",
    "installment_timestep",
    "state_cum_overduelength",
    "remaining_debt",
    "state_capital",
    "state_interests",
    "state_penalty",
]

# Model inputs, in column order.
features = _unpaired_columns + _paired_columns

# Model outputs: the "y_" counterpart of every paired column, followed by three
# columns that have no counterpart among the inputs.
targets = [f"y_{name}" for name in _paired_columns] + [
    "installment_done",
    "loan_done",
    "recovery_rate_weighted",
]

In [3]:
from torch.utils.data import Dataset, DataLoader
import torch
import pandas as pd
from sklearn.preprocessing import StandardScaler


class LoanSimDataset(Dataset):
    """Per-loan sequence dataset built from a simulator CSV.

    The CSV is read with pandas and rows are selected by the value of its
    ``group`` column. A ``StandardScaler`` is always fitted on the rows whose
    ``group`` is ``"train"`` and then applied to the requested split, so every
    split is standardized with the training statistics. The resulting frame is
    grouped by ``loan_id``; each dataset item is one loan's rows returned as a
    ``(features, targets)`` pair of ``float32`` tensors with one row per CSV
    row of that loan.

    Args:
        csv_file_path: Anything ``pandas.read_csv`` accepts as its first
            argument (typically a path). The file must contain the columns
            ``group`` and ``loan_id``, every name in ``scale_columns`` and the
            unscaled columns listed in ``_PASSTHROUGH_COLUMNS``.
        scale_columns: Names of the columns to standardize.
        feature_columns: Names of the columns returned as the feature tensor.
            Each must be a scaled or a passthrough column.
        target_columns: Names of the columns returned as the target tensor.
            Each must be a scaled or a passthrough column.
        group: Value of the ``group`` column selecting the split to load.
            Defaults to ``"train"``.

    Raises:
        ValueError: If no row of the CSV has the requested ``group`` value.
    """

    # Columns carried over unscaled next to the standardized ones. "loan_id"
    # is required because it is the key the rows are grouped by.
    _PASSTHROUGH_COLUMNS = [
        "installment_done",
        "loan_done",
        "recovery_rate_weighted",
        "loan_id",
    ]

    def __init__(
        self,
        csv_file_path,
        scale_columns,
        feature_columns,
        target_columns,
        group="train",
    ):
        data = pd.read_csv(csv_file_path)

        # The training rows are needed for every split: they define the
        # scaling statistics.
        train_rows = data.loc[data["group"] == "train"]
        if group == "train":
            split_rows = train_rows
        else:
            split_rows = data.loc[data["group"] == group]

        # Fail loudly on an unknown or misspelled split name instead of
        # quietly handing back rows from a different split.
        if split_rows.empty:
            raise ValueError(
                f"no rows with group == {group!r}; "
                f"groups present in the file: {list(data['group'].unique())}"
            )

        # Fit on the training rows only, then apply to the requested split.
        scaler = StandardScaler()
        scaler.fit(train_rows[scale_columns])
        rlsim_data = self._scale_rows(scaler, split_rows, scale_columns)

        # One group per loan; create_sequences turns each into a sequence.
        self.grouped_data = rlsim_data.groupby("loan_id")
        self.features_columns = feature_columns
        self.target_columns = target_columns
        self.sequences = self.create_sequences()

    @classmethod
    def _scale_rows(cls, scaler, rows, scale_columns):
        """Return ``rows`` with ``scale_columns`` standardized by ``scaler``.

        The result has a fresh 0..n-1 index and holds the scaled columns
        followed by the unscaled passthrough columns.
        """
        scaled = pd.DataFrame(
            scaler.transform(rows[scale_columns]), columns=scale_columns
        )
        # Drop the original CSV index so both parts align row by row.
        carried = rows[cls._PASSTHROUGH_COLUMNS].reset_index(drop=True)
        return pd.concat([scaled, carried], axis=1)

    def create_sequences(self):
        """Build one ``(features, targets)`` pair of arrays per loan.

        Returns:
            A list with one tuple per ``loan_id`` group, in the order the
            pandas groupby yields them. Each tuple holds the group's feature
            columns and target columns as arrays.
        """
        return [
            (rows[self.features_columns].values, rows[self.target_columns].values)
            for _, rows in self.grouped_data
        ]

    def __len__(self):
        """Return the number of loans (sequences) in the dataset."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return the ``idx``-th loan as ``(features, targets)`` float32 tensors."""
        sequence, target = self.sequences[idx]
        return torch.tensor(sequence, dtype=torch.float32), torch.tensor(
            target, dtype=torch.float32
        )

In [4]:
from torch.nn.utils.rnn import pad_sequence


def collate_fn(batch):
    """Pad a batch of variable-length ``(features, targets)`` pairs.

    Args:
        batch: Sequence of ``(features, targets)`` tensor pairs, one per sample.

    Returns:
        A ``(padded_features, padded_targets, lengths)`` tuple. Both padded
        tensors are batch-first and zero-padded to the longest sequence in the
        batch; ``lengths`` holds each sample's unpadded feature length.
    """
    feature_seqs, target_seqs = zip(*batch)
    # Record the true lengths so the padding can be told apart later on.
    seq_lengths = torch.tensor(list(map(len, feature_seqs)))
    padded_features, padded_targets = (
        pad_sequence(seqs, batch_first=True, padding_value=0)
        for seqs in (feature_seqs, target_seqs)
    )
    return padded_features, padded_targets, seq_lengths

In [5]:
# Build the training split from the simulator CSV using the column lists
# defined in the cells above.
dataset = LoanSimDataset(
    csv_file_path="./Res/simulator_data.csv",
    scale_columns=scale_columns,
    feature_columns=features,
    target_columns=targets,
    group="train",
)

# One loan per batch, in dataset order; collate_fn pads each batch and returns
# the true sequence lengths alongside it.
dataloader = DataLoader(
    dataset,
    batch_size=1,
    shuffle=False,
    collate_fn=collate_fn,
)

In [6]:
from torch.nn.utils.rnn import pack_padded_sequence

# Number of batches to print before stopping the preview.
MAX_PREVIEW_BATCHES = 4

counter = 0

# The batch targets are bound to `batch_targets`, not `targets`, so this loop
# does not overwrite the module-level `targets` column list that the dataset
# cell passes as `target_columns` (re-running that cell would otherwise fail).
for inputs, batch_targets, lengths in dataloader:
    # Pack the padded batch so that only the real (unpadded) timesteps remain;
    # enforce_sorted=False lets the batch come in any length order.
    packed_inputs = pack_padded_sequence(
        inputs, lengths, batch_first=True, enforce_sorted=False
    )
    packed_targets = pack_padded_sequence(
        batch_targets, lengths, batch_first=True, enforce_sorted=False
    )
    print('---- Features ----')
    print(packed_inputs.data)
    print('---- Targets ----')
    print(packed_targets.data)
    print('--------------------------------------------------------')
    counter += 1
    if counter == MAX_PREVIEW_BATCHES:
        break

---- Features ----
tensor([[-0.9373,  0.5497,  0.1249,  0.3590, -0.2112, -0.9555, -0.7942, -0.4098,
         -0.0082, -0.0293, -0.5255,  0.5704,  0.9974, -1.0046, -1.4363, -0.9879,
         -0.6120,  0.8437, -0.7338, -0.8503, -0.4944],
        [-0.9373,  0.5497,  0.1249,  0.3590, -0.2112, -0.9555, -0.7942, -0.4098,
         -0.0082, -0.0293, -0.5255,  0.5704,  0.9974, -1.0046, -0.9200, -0.9879,
         -0.6120,  0.8437,  0.0290, -0.3791, -0.4944],
        [-0.9373,  0.5497,  0.1249,  0.3590, -0.2112, -0.9555, -0.7942, -0.4098,
         -0.0082, -0.0293, -0.5255,  0.5704,  0.9974, -1.0046, -0.4037, -0.9879,
         -0.6120,  0.8437,  0.7918,  0.0921, -0.4944],
        [-0.9373,  0.5497,  0.1249,  0.3590, -0.2112, -0.9555, -0.7942, -0.4098,
         -0.0082, -0.0293, -0.5255,  0.5704,  0.9974, -1.0046,  0.1126, -0.9879,
         -0.6120,  0.8437,  1.5546,  0.5632, -0.4944],
        [-0.9373,  0.5497,  0.1249,  0.3590, -0.2112, -0.9555, -0.7942, -0.4098,
         -0.0082, -0.0293, -0.52