# Dataset and DataLoader in PyTorch

Example: `CustomDataset`

In [None]:
import torch
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    """Minimal dataset that pairs each sample with the label at the same index."""

    def __init__(self, data, labels):
        """
        Keep references to the samples and labels for index-based access.

        Args:
        - data: indexable collection of samples (list, NumPy array, etc.)
        - labels: indexable collection of labels, read with the same
          indices as `data`
        """
        self.labels = labels
        self.data = data

    def __len__(self):
        """number of samples, taken from `data`"""
        return len(self.data)

    def __getitem__(self, idx):
        """return the (sample, label) pair stored at position `idx`"""
        return self.data[idx], self.labels[idx]

In [None]:
from torch.utils.data import DataLoader

# toy inputs: four two-element samples and one label per sample
data = [[1, 2], [3, 4], [5, 6], [7, 8]]
labels = [0, 1, 0, 1]

# wrap the raw lists so they can be read as (sample, label) pairs
dataset = CustomDataset(data, labels)

# serve the dataset two samples at a time, in shuffled order
# NOTE(review): the samples are plain Python lists, so the default collate
# function presumably returns one tensor per list position instead of a single
# (batch, 2) tensor -- confirm against the printed output
dataloader = DataLoader(dataset, shuffle=True, batch_size=2)

# walk through every batch and print its index, samples and labels
for batch_idx, (batch_data, batch_labels) in enumerate(dataloader):
    print(
        f"Batch {batch_idx}:",
        f"Data: {batch_data}",
        f"Labels: {batch_labels}",
        sep="\n",
    )

## Using Dataset and DataLoader with the RL Simulator Data

In [1]:
import pandas as pd
# read the simulator table from the CSV file under ./Res
data_simulator_full = pd.read_csv(filepath_or_buffer='./Res/simulator_data.csv')
# bare expression so the notebook renders the loaded frame
data_simulator_full

Unnamed: 0,group,loan_id,action_num_actual,gender,age,amount,num_loan,duration,year_ratio,diff_city,...,y_installment,y_installment_timestep,y_state_cum_overduelength,y_remaining_debt,y_state_capital,y_state_interests,y_state_penalty,installment_done,loan_done,recovery_rate_weighted
0,train,/+6C2lDDYJgzzCXpn96AFA==,0,1,18,2000,1,6,16,40421.53,...,2,1,2,2000.000000,666.666667,53.333333,3.333333,1,0,0.205350
1,train,/+6C2lDDYJgzzCXpn96AFA==,1,1,18,2000,1,6,16,40421.53,...,2,2,7,2000.000000,666.666667,53.333333,11.666667,0,0,0.000000
2,train,/+6C2lDDYJgzzCXpn96AFA==,2,1,18,2000,1,6,16,40421.53,...,2,3,7,2000.000000,666.666667,53.333333,11.666667,0,0,0.000000
3,train,/+6C2lDDYJgzzCXpn96AFA==,3,1,18,2000,1,6,16,40421.53,...,2,4,8,2000.000000,666.666667,53.333333,13.333333,0,0,0.000000
4,train,/+6C2lDDYJgzzCXpn96AFA==,4,1,18,2000,1,6,16,40421.53,...,3,1,0,1666.666667,666.666667,53.333333,0.000000,1,0,0.177079
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200009,test,ZWocy2KIKqJbgKZVoXZIFA==,0,1,30,2900,1,5,18,0.00,...,5,1,2,1740.000000,1740.000000,130.500000,8.700000,1,0,0.000000
200010,test,ZWocy2KIKqJbgKZVoXZIFA==,1,1,30,2900,1,5,18,0.00,...,5,2,-15,1740.000000,1740.000000,130.500000,-37.700000,0,0,0.000000
200011,test,ZWocy2KIKqJbgKZVoXZIFA==,0,1,30,2900,1,5,18,0.00,...,5,3,24,1740.000000,1740.000000,130.500000,78.300000,0,0,0.000000
200012,test,ZWocy2KIKqJbgKZVoXZIFA==,0,1,30,2900,1,5,18,0.00,...,5,4,-1,1740.000000,1740.000000,130.500000,8.700000,0,0,0.000000


In [None]:
# show the column names of the loaded simulator dataframe
data_simulator_full.columns

In [None]:
# keep only the rows whose `group` column equals 'train'
simulator_data_train = data_simulator_full.loc[
    data_simulator_full['group'].eq('train')
]
# bare expression so the notebook renders the training rows
simulator_data_train

In [None]:
# keep only the rows whose `group` column equals 'test'
simulator_data_test = data_simulator_full.loc[
    data_simulator_full['group'].eq('test')
]
# bare expression so the notebook renders the test rows
simulator_data_test

In [None]:
from sklearn.preprocessing import StandardScaler

# columns that will be handed to the scaler
# (e.g. 'age', 'amount', 'num_loan', 'duration' etc.)
numeric_columns = [
    "action_num_actual",
    "gender", "age", "amount", "num_loan", "duration", "year_ratio",
    "diff_city", "marriage", "kids", "month_in", "housing", "edu",
    "motivation",
    "installment", "installment_timestep", "state_cum_overduelength",
    "remaining_debt", "state_capital", "state_interests", "state_penalty",
    # columns carrying the "y_" prefix
    "y_installment", "y_installment_timestep", "y_state_cum_overduelength",
    "y_remaining_debt", "y_state_capital", "y_state_interests",
    "y_state_penalty",
]

# the scaler is only created here; nothing is fitted in this cell
scaler = StandardScaler()

# unscaled selections of both splits, restricted to the columns listed above
train_no_scaled = simulator_data_train.loc[:, numeric_columns]
test_no_scaled = simulator_data_test.loc[:, numeric_columns]

In [None]:
# fit the scaler on the training columns and standardise them in one step
train_scaled = scaler.fit_transform(train_no_scaled)
train_scaled_df = pd.DataFrame(train_scaled, columns=numeric_columns)

# columns carried over without scaling; the index is reset so the rows line up
# positionally with the freshly built scaled frame
train_unscaled_part = simulator_data_train[
    ["installment_done", "loan_done", "recovery_rate_weighted", "loan_id"]
].reset_index(drop=True)

# scaled columns first, unscaled columns appended to the right
train_scaled_df_full = pd.concat(
    [train_scaled_df.reset_index(drop=True), train_unscaled_part],
    axis=1,
)
train_scaled_df_full

In [None]:
# transform only: the scaler is applied to the test columns without refitting
test_scaled = scaler.transform(test_no_scaled)
test_scaled_df = pd.DataFrame(test_scaled, columns=numeric_columns)

# columns carried over without scaling; the index is reset so the rows line up
# positionally with the freshly built scaled frame
test_unscaled_part = simulator_data_test[
    ["installment_done", "loan_done", "recovery_rate_weighted", "loan_id"]
].reset_index(drop=True)

# scaled columns first, unscaled columns appended to the right
test_scaled_df_full = pd.concat(
    [test_scaled_df.reset_index(drop=True), test_unscaled_part],
    axis=1,
)
test_scaled_df_full

In [None]:
# input columns: the action column followed by the non-"y_" columns
features = [
    "action_num_actual",
    "gender", "age", "amount", "num_loan", "duration", "year_ratio",
    "diff_city", "marriage", "kids", "month_in", "housing", "edu",
    "motivation",
    "installment", "installment_timestep", "state_cum_overduelength",
    "remaining_debt", "state_capital", "state_interests", "state_penalty",
]

# output columns: the "y_"-prefixed columns plus the two *_done flags and
# the weighted recovery rate
targets = [
    "y_installment", "y_installment_timestep", "y_state_cum_overduelength",
    "y_remaining_debt", "y_state_capital", "y_state_interests",
    "y_state_penalty",
    "installment_done", "loan_done", "recovery_rate_weighted",
]

In [None]:
import numpy as np


# slice each split into a feature frame and a target frame; `loan_id` is kept
# as the first column of every slice
rlsim_train_features = train_scaled_df_full.loc[:, ["loan_id", *features]]
rlsim_train_targets = train_scaled_df_full.loc[:, ["loan_id", *targets]]
rlsim_test_features = test_scaled_df_full.loc[:, ["loan_id", *features]]
rlsim_test_targets = test_scaled_df_full.loc[:, ["loan_id", *targets]]

In [None]:
def _sequences_by_loan(feature_groups, target_df):
    """
    Collect one feature array and one target array per loan.

    Args:
    - feature_groups: feature frame already grouped by "loan_id"
    - target_df: frame holding "loan_id" plus the `targets` columns

    Returns:
    - (sequences, target_sequences): two lists of equal length; entry i of
      each list holds the rows of the i-th loan_id in groupby iteration order

    Raises:
    - KeyError: a loan_id of `feature_groups` does not occur in `target_df`
    """
    # group the targets once; the previous per-loan boolean mask re-scanned
    # the whole target frame for every single loan
    target_groups = target_df.groupby("loan_id")

    sequences = []
    target_sequences = []
    for loan_id, group in feature_groups:
        # feature rows of this loan (the loan_id column is left out)
        sequences.append(group[features].values)
        # target rows of the same loan (the loan_id column is left out)
        target_sequences.append(target_groups.get_group(loan_id)[targets].values)
    return sequences, target_sequences


train_groups = rlsim_train_features.groupby("loan_id")
test_groups = rlsim_test_features.groupby("loan_id")

# build the per-loan sequences for BOTH splits; the test lists used to be
# declared but were never filled
train_sequences, train_targets = _sequences_by_loan(train_groups, rlsim_train_targets)
test_sequences, test_targets = _sequences_by_loan(test_groups, rlsim_test_targets)

In [None]:
# bare expression so the notebook renders the collected per-loan target arrays
train_targets

In [None]:
# rlsim_train_features = train_scaled_df_full[features].values
# rlsim_train_targets = train_scaled_df_full[targets].values
# rlsim_test_features = test_scaled_df_full[features].values
# rlsim_test_targets = test_scaled_df_full[targets].values

In [None]:
class LoanSimDataset(Dataset):
    """Dataset that pairs entry i of `features` with entry i of `targets`."""

    def __init__(self, features, targets):
        """
        Keep references to both collections for index-based access.

        Args:
        - features: indexable collection of input samples
        - targets: indexable collection of targets, read with the same
          indices as `features`
        """
        self.targets = targets
        self.features = features

    def __len__(self):
        """size of the dataset, taken from `features`"""
        return len(self.features)

    def __getitem__(self, idx):
        """return the (features, targets) pair stored at position `idx`"""
        feature_item = self.features[idx]
        target_item = self.targets[idx]
        return feature_item, target_item

In [None]:
# Create the per-row datasets from plain numeric arrays.
# The rlsim_* objects are DataFrames that still carry the string `loan_id`
# column. Indexing a DataFrame with an integer looks up a column label, so
# passing the frames to LoanSimDataset directly makes `dataset[i]` raise
# KeyError as soon as a DataLoader requests a sample. Selecting the
# feature/target columns and converting them to NumPy gives positional row
# access and leaves the non-numeric id column out of the batches.
train_dataset = LoanSimDataset(
    rlsim_train_features[features].to_numpy(),
    rlsim_train_targets[targets].to_numpy(),
)
test_dataset = LoanSimDataset(
    rlsim_test_features[features].to_numpy(),
    rlsim_test_targets[targets].to_numpy(),
)

# create the DataLoader instances: 1000 rows per batch, shuffling only the
# training data
train_loader = DataLoader(train_dataset, batch_size=1000, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1000, shuffle=False)

In [None]:
# rebind train_dataset to the per-loan sequences (one entry per loan_id)
# NOTE: a train_loader created before this line still wraps the previous
# dataset object; it is not updated by this assignment
train_dataset = LoanSimDataset(train_sequences, train_targets)

In [None]:
# iterate the dataset object itself (not a DataLoader), so every step yields
# one (features, targets) item of the dataset rather than a collated batch
for batch_idx, (batch_data, batch_labels) in enumerate(train_dataset):
    print(
        f"Batch {batch_idx}:",
        f"Data: {batch_data}",
        f"Labels: {batch_labels}",
        sep="\n",
    )