# Dataset and DataLoader in PyTorch

Example: `CustomDataset`

In [45]:
import torch
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    """Map-style dataset that pairs each sample with its label by position.

    ``data[i]`` and ``labels[i]`` together form the i-th item.
    """

    def __init__(self, data, labels):
        """Store the samples and their labels.

        Args:
            data: indexable, sized collection of samples
                (list, NumPy array, etc.).
            labels: indexable, sized collection of labels, one per sample
                (list, NumPy array, etc.).

        Raises:
            ValueError: if ``data`` and ``labels`` differ in length.
        """
        # __len__ reports len(data) while __getitem__ indexes both containers,
        # so a mismatch would otherwise only surface later as an IndexError
        # (or silently ignore surplus labels).
        if len(data) != len(labels):
            raise ValueError(
                "data and labels must have the same length, "
                f"got {len(data)} and {len(labels)}"
            )
        self.data = data
        self.labels = labels

    def __len__(self):
        """Return the number of samples in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(sample, label)`` pair stored at position ``idx``."""
        return self.data[idx], self.labels[idx]

In [46]:
from torch.utils.data import DataLoader

# Toy data: four 2-element samples, each with a binary label.
data = [[1, 2], [3, 4], [5, 6], [7, 8]]
labels = [0, 1, 0, 1]

# Wrap the samples and labels in a CustomDataset.
dataset = CustomDataset(data, labels)

# Serve the dataset in shuffled mini-batches of two items.
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Walk through the batches and print their contents.
# NOTE: each sample is a plain Python list, and the recorded output shows
# batch_data arriving as a list of per-position tensors rather than as a
# single 2-D tensor.
for batch_idx, batch in enumerate(dataloader):
    batch_data, batch_labels = batch
    print(
        f"Batch {batch_idx}:",
        f"Data: {batch_data}",
        f"Labels: {batch_labels}",
        sep="\n",
    )

Batch 0:
Data: [tensor([7, 5]), tensor([8, 6])]
Labels: tensor([1, 0])
Batch 1:
Data: [tensor([1, 3]), tensor([2, 4])]
Labels: tensor([0, 1])


## Applying Dataset and DataLoader to the RL simulator data

In [47]:
import pandas as pd
# Load the complete simulator table from disk.
simulator_csv_path = './Res/simulator_data.csv'
data_simulator_full = pd.read_csv(simulator_csv_path)
# Bare expression so the notebook renders the frame as the cell output.
data_simulator_full

Unnamed: 0,group,loan_id,action_num_actual,gender,age,amount,num_loan,duration,year_ratio,diff_city,...,y_installment,y_installment_timestep,y_state_cum_overduelength,y_remaining_debt,y_state_capital,y_state_interests,y_state_penalty,installment_done,loan_done,recovery_rate_weighted
0,train,/+6C2lDDYJgzzCXpn96AFA==,0,1,18,2000,1,6,16,40421.53,...,2,1,2,2000.000000,666.666667,53.333333,3.333333,1,0,0.205350
1,train,/+6C2lDDYJgzzCXpn96AFA==,1,1,18,2000,1,6,16,40421.53,...,2,2,7,2000.000000,666.666667,53.333333,11.666667,0,0,0.000000
2,train,/+6C2lDDYJgzzCXpn96AFA==,2,1,18,2000,1,6,16,40421.53,...,2,3,7,2000.000000,666.666667,53.333333,11.666667,0,0,0.000000
3,train,/+6C2lDDYJgzzCXpn96AFA==,3,1,18,2000,1,6,16,40421.53,...,2,4,8,2000.000000,666.666667,53.333333,13.333333,0,0,0.000000
4,train,/+6C2lDDYJgzzCXpn96AFA==,4,1,18,2000,1,6,16,40421.53,...,3,1,0,1666.666667,666.666667,53.333333,0.000000,1,0,0.177079
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200009,test,ZWocy2KIKqJbgKZVoXZIFA==,0,1,30,2900,1,5,18,0.00,...,5,1,2,1740.000000,1740.000000,130.500000,8.700000,1,0,0.000000
200010,test,ZWocy2KIKqJbgKZVoXZIFA==,1,1,30,2900,1,5,18,0.00,...,5,2,-15,1740.000000,1740.000000,130.500000,-37.700000,0,0,0.000000
200011,test,ZWocy2KIKqJbgKZVoXZIFA==,0,1,30,2900,1,5,18,0.00,...,5,3,24,1740.000000,1740.000000,130.500000,78.300000,0,0,0.000000
200012,test,ZWocy2KIKqJbgKZVoXZIFA==,0,1,30,2900,1,5,18,0.00,...,5,4,-1,1740.000000,1740.000000,130.500000,8.700000,0,0,0.000000


In [48]:
# Show the column names of the full simulator table
data_simulator_full.columns

Index(['group', 'loan_id', 'action_num_actual', 'gender', 'age', 'amount',
       'num_loan', 'duration', 'year_ratio', 'diff_city', 'marriage', 'kids',
       'month_in', 'housing', 'edu', 'motivation', 'installment',
       'installment_timestep', 'state_cum_overduelength', 'remaining_debt',
       'state_capital', 'state_interests', 'state_penalty', 'y_installment',
       'y_installment_timestep', 'y_state_cum_overduelength',
       'y_remaining_debt', 'y_state_capital', 'y_state_interests',
       'y_state_penalty', 'installment_done', 'loan_done',
       'recovery_rate_weighted'],
      dtype='object')

In [49]:
# Keep only the rows whose 'group' column marks them as training data.
is_train_row = data_simulator_full['group'] == 'train'
simulator_data_train = data_simulator_full.loc[is_train_row]
# Bare expression so the notebook displays the selected rows.
simulator_data_train

Unnamed: 0,group,loan_id,action_num_actual,gender,age,amount,num_loan,duration,year_ratio,diff_city,...,y_installment,y_installment_timestep,y_state_cum_overduelength,y_remaining_debt,y_state_capital,y_state_interests,y_state_penalty,installment_done,loan_done,recovery_rate_weighted
0,train,/+6C2lDDYJgzzCXpn96AFA==,0,1,18,2000,1,6,16,40421.53,...,2,1,2,2000.000000,666.666667,53.333333,3.333333,1,0,0.205350
1,train,/+6C2lDDYJgzzCXpn96AFA==,1,1,18,2000,1,6,16,40421.53,...,2,2,7,2000.000000,666.666667,53.333333,11.666667,0,0,0.000000
2,train,/+6C2lDDYJgzzCXpn96AFA==,2,1,18,2000,1,6,16,40421.53,...,2,3,7,2000.000000,666.666667,53.333333,11.666667,0,0,0.000000
3,train,/+6C2lDDYJgzzCXpn96AFA==,3,1,18,2000,1,6,16,40421.53,...,2,4,8,2000.000000,666.666667,53.333333,13.333333,0,0,0.000000
4,train,/+6C2lDDYJgzzCXpn96AFA==,4,1,18,2000,1,6,16,40421.53,...,3,1,0,1666.666667,666.666667,53.333333,0.000000,1,0,0.177079
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180220,train,zZidoz4jexrnwPrrefjGlw==,1,1,21,3000,1,5,20,105465.48,...,2,2,0,2400.000000,600.000000,50.000000,0.000000,0,0,0.000000
180221,train,zZidoz4jexrnwPrrefjGlw==,2,1,21,3000,1,5,20,105465.48,...,3,1,0,2400.000000,1200.000000,100.000000,0.000000,1,0,0.000000
180222,train,zZidoz4jexrnwPrrefjGlw==,0,1,21,3000,1,5,20,105465.48,...,4,1,0,2400.000000,1800.000000,150.000000,0.000000,1,0,0.000000
180223,train,zZidoz4jexrnwPrrefjGlw==,0,1,21,3000,1,5,20,105465.48,...,5,1,0,2400.000000,2400.000000,200.000000,0.000000,1,0,0.351620


In [50]:
# Keep only the rows whose 'group' column marks them as test data.
is_test_row = data_simulator_full['group'] == 'test'
simulator_data_test = data_simulator_full.loc[is_test_row]
# Bare expression so the notebook displays the selected rows.
simulator_data_test

Unnamed: 0,group,loan_id,action_num_actual,gender,age,amount,num_loan,duration,year_ratio,diff_city,...,y_installment,y_installment_timestep,y_state_cum_overduelength,y_remaining_debt,y_state_capital,y_state_interests,y_state_penalty,installment_done,loan_done,recovery_rate_weighted
180225,test,/+dnR8Gjp0/XWZfHIN19fA==,0,1,27,1000,1,5,14,-4557.94,...,2,1,3,1000.0,400.0,23.333333,3.0,1,0,0.305151
180226,test,/+dnR8Gjp0/XWZfHIN19fA==,1,1,27,1000,1,5,14,-4557.94,...,2,2,9,1000.0,400.0,23.333333,9.0,0,0,0.000000
180227,test,/+dnR8Gjp0/XWZfHIN19fA==,2,1,27,1000,1,5,14,-4557.94,...,2,3,15,1000.0,400.0,23.333333,15.0,0,0,0.000000
180228,test,/+dnR8Gjp0/XWZfHIN19fA==,0,1,27,1000,1,5,14,-4557.94,...,2,4,21,1000.0,400.0,23.333333,21.0,0,0,0.000000
180229,test,/+dnR8Gjp0/XWZfHIN19fA==,0,1,27,1000,1,5,14,-4557.94,...,3,1,6,1000.0,600.0,35.000000,9.0,1,0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200009,test,ZWocy2KIKqJbgKZVoXZIFA==,0,1,30,2900,1,5,18,0.00,...,5,1,2,1740.0,1740.0,130.500000,8.7,1,0,0.000000
200010,test,ZWocy2KIKqJbgKZVoXZIFA==,1,1,30,2900,1,5,18,0.00,...,5,2,-15,1740.0,1740.0,130.500000,-37.7,0,0,0.000000
200011,test,ZWocy2KIKqJbgKZVoXZIFA==,0,1,30,2900,1,5,18,0.00,...,5,3,24,1740.0,1740.0,130.500000,78.3,0,0,0.000000
200012,test,ZWocy2KIKqJbgKZVoXZIFA==,0,1,30,2900,1,5,18,0.00,...,5,4,-1,1740.0,1740.0,130.500000,8.7,0,0,0.000000


In [51]:
from sklearn.preprocessing import StandardScaler

# Columns to be standardized. The list is assembled from three pieces, in
# this order:
#   1. the action column plus the per-loan attribute columns
#      (presumably borrower/loan descriptors; verify against the data),
#   2. the current-step state columns,
#   3. the "y_"-prefixed counterpart of each state column.
attribute_columns = [
    "action_num_actual",
    "gender",
    "age",
    "amount",
    "num_loan",
    "duration",
    "year_ratio",
    "diff_city",
    "marriage",
    "kids",
    "month_in",
    "housing",
    "edu",
    "motivation",
]
state_columns = [
    "installment",
    "installment_timestep",
    "state_cum_overduelength",
    "remaining_debt",
    "state_capital",
    "state_interests",
    "state_penalty",
]
numeric_columns = [
    *attribute_columns,
    *state_columns,
    *(f"y_{column}" for column in state_columns),
]

# The scaler is only constructed here; no fitting happens in this cell.
scaler = StandardScaler()

# Unscaled subsets of each split, restricted to the columns listed above.
train_no_scaled = simulator_data_train[numeric_columns]
test_no_scaled = simulator_data_test[numeric_columns]

In [52]:
# Fit the scaler on the training subset and standardize it in the same call.
train_scaled = scaler.fit_transform(train_no_scaled)
train_scaled_df = pd.DataFrame(train_scaled, columns=numeric_columns)

# These three columns are taken from the original training rows without
# scaling.
train_unscaled_part = simulator_data_train[
    ["installment_done", "loan_done", "recovery_rate_weighted"]
].reset_index(drop=True)

# Both pieces carry a fresh 0..n-1 index, so the column-wise concat lines the
# rows up by position.
train_scaled_df_full = pd.concat(
    [train_scaled_df.reset_index(drop=True), train_unscaled_part],
    axis=1,
)
train_scaled_df_full

Unnamed: 0,action_num_actual,gender,age,amount,num_loan,duration,year_ratio,diff_city,marriage,kids,...,y_installment,y_installment_timestep,y_state_cum_overduelength,y_remaining_debt,y_state_capital,y_state_interests,y_state_penalty,installment_done,loan_done,recovery_rate_weighted
0,-0.937299,0.549748,-1.789592,-1.430495,-0.21118,-0.228040,0.102006,0.731038,-0.00816,-0.029339,...,-1.130985,-1.054981,-0.417120,-0.423009,-0.739843,-0.682376,-0.424225,1,0,0.205350
1,-0.171089,0.549748,-1.789592,-1.430495,-0.21118,-0.228040,0.102006,0.731038,-0.00816,-0.029339,...,-1.130985,-0.200708,0.084661,-0.423009,-0.739843,-0.682376,-0.232703,0,0,0.000000
2,0.595120,0.549748,-1.789592,-1.430495,-0.21118,-0.228040,0.102006,0.731038,-0.00816,-0.029339,...,-1.130985,0.653566,0.084661,-0.423009,-0.739843,-0.682376,-0.232703,0,0,0.000000
3,1.361330,0.549748,-1.789592,-1.430495,-0.21118,-0.228040,0.102006,0.731038,-0.00816,-0.029339,...,-1.130985,1.507840,0.185017,-0.423009,-0.739843,-0.682376,-0.194398,0,0,0.000000
4,2.127539,0.549748,-1.789592,-1.430495,-0.21118,-0.228040,0.102006,0.731038,-0.00816,-0.029339,...,-0.600429,-1.054981,-0.617832,-0.861173,-0.739843,-0.682376,-0.500833,1,0,0.177079
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180220,-0.171089,0.549748,-1.267448,0.359035,-0.21118,-0.955485,1.894390,2.566761,-0.00816,-0.029339,...,-1.130985,-0.200708,-0.617832,0.102787,-0.821504,-0.725801,-0.500833,0,0,0.000000
180221,0.595120,0.549748,-1.267448,0.359035,-0.21118,-0.955485,1.894390,2.566761,-0.00816,-0.029339,...,-0.600429,-1.054981,-0.617832,0.102787,-0.086554,-0.074425,-0.500833,1,0,0.000000
180222,-0.937299,0.549748,-1.267448,0.359035,-0.21118,-0.955485,1.894390,2.566761,-0.00816,-0.029339,...,-0.069872,-1.054981,-0.617832,0.102787,0.648396,0.576951,-0.500833,1,0,0.000000
180223,-0.937299,0.549748,-1.267448,0.359035,-0.21118,-0.955485,1.894390,2.566761,-0.00816,-0.029339,...,0.460684,-1.054981,-0.617832,0.102787,1.383345,1.228327,-0.500833,1,0,0.351620


In [53]:
# Apply the scaler to the test subset with transform only, so nothing is
# fitted on the test data.
test_scaled = scaler.transform(test_no_scaled)
test_scaled_df = pd.DataFrame(test_scaled, columns=numeric_columns)

# These three columns are taken from the original test rows without scaling.
test_unscaled_part = simulator_data_test[
    ["installment_done", "loan_done", "recovery_rate_weighted"]
].reset_index(drop=True)

# Both pieces carry a fresh 0..n-1 index, so the column-wise concat lines the
# rows up by position.
test_scaled_df_full = pd.concat(
    [test_scaled_df.reset_index(drop=True), test_unscaled_part],
    axis=1,
)
test_scaled_df_full

Unnamed: 0,action_num_actual,gender,age,amount,num_loan,duration,year_ratio,diff_city,marriage,kids,...,y_installment,y_installment_timestep,y_state_cum_overduelength,y_remaining_debt,y_state_capital,y_state_interests,y_state_penalty,installment_done,loan_done,recovery_rate_weighted
0,-0.937299,0.549748,-0.223159,-3.220025,-0.21118,-0.955485,-0.794186,-0.538409,-0.00816,-0.029339,...,-1.130985,-1.054981,-0.316764,-1.737500,-1.066487,-1.073201,-0.431885,1,0,0.305151
1,-0.171089,0.549748,-0.223159,-3.220025,-0.21118,-0.955485,-0.794186,-0.538409,-0.00816,-0.029339,...,-1.130985,-0.200708,0.285374,-1.737500,-1.066487,-1.073201,-0.293990,0,0,0.000000
2,0.595120,0.549748,-0.223159,-3.220025,-0.21118,-0.955485,-0.794186,-0.538409,-0.00816,-0.029339,...,-1.130985,0.653566,0.887511,-1.737500,-1.066487,-1.073201,-0.156094,0,0,0.000000
3,-0.937299,0.549748,-0.223159,-3.220025,-0.21118,-0.955485,-0.794186,-0.538409,-0.00816,-0.029339,...,-1.130985,1.507840,1.489648,-1.737500,-1.066487,-1.073201,-0.018198,0,0,0.000000
4,-0.937299,0.549748,-0.223159,-3.220025,-0.21118,-0.955485,-0.794186,-0.538409,-0.00816,-0.029339,...,-0.600429,-1.054981,-0.015695,-1.737500,-0.821504,-0.921214,-0.293990,1,0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19784,-0.937299,0.549748,0.298985,0.180082,-0.21118,-0.955485,0.998198,-0.409771,-0.00816,-0.029339,...,0.460684,-1.054981,-0.417120,-0.764777,0.574901,0.322915,-0.300885,1,0,0.000000
19785,-0.171089,0.549748,0.298985,0.180082,-0.21118,-0.955485,0.998198,-0.409771,-0.00816,-0.029339,...,0.460684,-0.200708,-2.123175,-0.764777,0.574901,0.322915,-1.367278,0,0,0.000000
19786,-0.937299,0.549748,0.298985,0.180082,-0.21118,-0.955485,0.998198,-0.409771,-0.00816,-0.029339,...,0.460684,0.653566,1.790717,-0.764777,0.574901,0.322915,1.298705,0,0,0.000000
19787,-0.937299,0.549748,0.298985,0.180082,-0.21118,-0.955485,0.998198,-0.409771,-0.00816,-0.029339,...,0.460684,1.507840,-0.718188,-0.764777,0.574901,0.322915,-0.300885,0,0,0.000000


In [54]:
# Current-step state columns. Each one also has a "y_"-prefixed counterpart
# that is used as a target below.
state_feature_names = [
    "installment",
    "installment_timestep",
    "state_cum_overduelength",
    "remaining_debt",
    "state_capital",
    "state_interests",
    "state_penalty",
]

# Model inputs: the action and per-loan attribute columns followed by the
# current-step state columns.
features = [
    "action_num_actual",
    "gender",
    "age",
    "amount",
    "num_loan",
    "duration",
    "year_ratio",
    "diff_city",
    "marriage",
    "kids",
    "month_in",
    "housing",
    "edu",
    "motivation",
    *state_feature_names,
]

# Model outputs: the "y_" counterpart of every state column, then the two
# "done" columns and the weighted recovery rate.
targets = [f"y_{name}" for name in state_feature_names]
targets += ["installment_done", "loan_done", "recovery_rate_weighted"]

In [55]:
# Pull the feature and target columns out of each split as NumPy arrays.
rlsim_train_features, rlsim_train_targets = (
    train_scaled_df_full[columns].to_numpy() for columns in (features, targets)
)
rlsim_test_features, rlsim_test_targets = (
    test_scaled_df_full[columns].to_numpy() for columns in (features, targets)
)

In [56]:
class LoanSimDataset(Dataset):
    """Map-style dataset pairing each feature row with its target row.

    ``features[i]`` and ``targets[i]`` together form the i-th item.
    """

    def __init__(self, features, targets):
        """Store the feature and target collections.

        Args:
            features: indexable, sized collection of feature rows.
            targets: indexable, sized collection of target rows, one per
                feature row.

        Raises:
            ValueError: if ``features`` and ``targets`` differ in length.
        """
        # __len__ reports len(features) while __getitem__ indexes both
        # containers, so a mismatch would otherwise only surface mid-iteration
        # as an IndexError (or silently ignore surplus target rows).
        if len(features) != len(targets):
            raise ValueError(
                "features and targets must have the same length, "
                f"got {len(features)} and {len(targets)}"
            )
        self.features = features
        self.targets = targets

    def __len__(self):
        """Return the number of (features, targets) pairs."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return the ``(features, targets)`` pair stored at position ``idx``."""
        return self.features[idx], self.targets[idx]

In [57]:
# Wrap each split's feature/target arrays in a LoanSimDataset.
train_dataset, test_dataset = (
    LoanSimDataset(rlsim_train_features, rlsim_train_targets),
    LoanSimDataset(rlsim_test_features, rlsim_test_targets),
)

# Create the DataLoaders: both use batches of 1000 items, and only the
# training loader shuffles.
loader_options = {"batch_size": 1000}
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

In [58]:
# Report the shape of every batch drawn from the test loader.
for batch_idx, batch in enumerate(test_loader):
    batch_data, batch_labels = batch
    print(f"Batch {batch_idx}:")
    print(f"Data: {batch_data.shape}")
    print(f"Labels: {batch_labels.shape}")

Batch 0:
Data: torch.Size([1000, 21])
Labels: torch.Size([1000, 10])
Batch 1:
Data: torch.Size([1000, 21])
Labels: torch.Size([1000, 10])
Batch 2:
Data: torch.Size([1000, 21])
Labels: torch.Size([1000, 10])
Batch 3:
Data: torch.Size([1000, 21])
Labels: torch.Size([1000, 10])
Batch 4:
Data: torch.Size([1000, 21])
Labels: torch.Size([1000, 10])
Batch 5:
Data: torch.Size([1000, 21])
Labels: torch.Size([1000, 10])
Batch 6:
Data: torch.Size([1000, 21])
Labels: torch.Size([1000, 10])
Batch 7:
Data: torch.Size([1000, 21])
Labels: torch.Size([1000, 10])
Batch 8:
Data: torch.Size([1000, 21])
Labels: torch.Size([1000, 10])
Batch 9:
Data: torch.Size([1000, 21])
Labels: torch.Size([1000, 10])
Batch 10:
Data: torch.Size([1000, 21])
Labels: torch.Size([1000, 10])
Batch 11:
Data: torch.Size([1000, 21])
Labels: torch.Size([1000, 10])
Batch 12:
Data: torch.Size([1000, 21])
Labels: torch.Size([1000, 10])
Batch 13:
Data: torch.Size([1000, 21])
Labels: torch.Size([1000, 10])
Batch 14:
Data: torch.Size([10