In [1]:
import pandas as pd
import numpy as np
from train3 import CompoundDataset
from torch.utils.data import DataLoader, SubsetRandomSampler, WeightedRandomSampler
from sklearn.model_selection import train_test_split

In [3]:
data_dir = '/DATA2'
X_filename = 'X_pretrain'
y_filename = 'y_pretrain'
eval_name = 'val'

X_data_eval = pd.read_csv(f'{data_dir}/{X_filename}_{eval_name}.csv', index_col=0)
y_data_eval = pd.read_csv(f'{data_dir}/{y_filename}_{eval_name}.csv', index_col=0)

In [20]:
import torch
def create_dataloaders2(torch_dataset, batch_size, holdout_frac=0, shuffle=True, set_name='train'):
    
    if holdout_frac == 0:
        return {set_name: DataLoader(torch_dataset, batch_size=batch_size, shuffle=shuffle)}

    train_size = int((1-holdout_frac) * len(torch_dataset))
    holdout_size = len(torch_dataset) - train_size

    train_dataset, holdout_dataset = torch.utils.data.random_split(torch_dataset, [train_size, holdout_size])

    return {set_name: DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle),
            f'{set_name}_holdout': DataLoader(holdout_dataset, batch_size=batch_size, shuffle=False)}

In [38]:
def create_dataloaders(torch_dataset, batch_size, holdout_frac=0, shuffle=True, set_name='train', stratify=None):
    if holdout_frac == 0:
        return {set_name: DataLoader(torch_dataset, batch_size=batch_size, shuffle=shuffle)}

    # Get the targets of your dataset if it's available
    if stratify is not None:
        targets = [item[stratify] for item in torch_dataset]
    else:
        targets = None

    # Split the indices of your dataset
    indices = list(range(len(torch_dataset)))
    train_indices, holdout_indices = train_test_split(indices, test_size=holdout_frac, stratify=targets)

    # Create samplers
    train_sampler = SubsetRandomSampler(train_indices)
    holdout_sampler = SubsetRandomSampler(holdout_indices)

    return {set_name: DataLoader(torch_dataset, batch_size=batch_size, sampler=train_sampler),
            f'{set_name}_holdout': DataLoader(torch_dataset, batch_size=batch_size, sampler=holdout_sampler)}

# but the legnth of the dataloader is not the same as the length of the dataset?


In [4]:
y_head_cols= ['is Pediatric','Cohort Label ENC', 'is Female']
y_adv_cols= ['Study ID ENC']

In [43]:
eval_dataset = CompoundDataset(X_data_eval,y_data_eval[y_head_cols], y_data_eval[y_adv_cols])
dl_dict = create_dataloaders(eval_dataset, 32, holdout_frac=0.2, shuffle=True, set_name='eval',stratify=2)

In [44]:
len(eval_dataset)

2542

In [46]:
len(dl_dict['eval_holdout'])

16

In [45]:
len(dl_dict['eval'])

64

In [41]:
len(dl_dict['eval'])

80

In [29]:
eval_dataset.y_head

tensor([[1., 3., 1.],
        [1., 2., nan],
        [1., 2., nan],
        ...,
        [0., 0., nan],
        [0., 0., nan],
        [0., 0., nan]])

In [30]:
targets = [item[1] for item in eval_dataset]

In [35]:
targets[2][torch.isnan(targets[2])]


tensor([nan])

In [36]:
targets = [item[1] for item in eval_dataset]
# convert nan to -1
for item in targets:
    item[torch.isnan(item)] = -1

indices = list(range(len(eval_dataset)))
train_indices, holdout_indices = train_test_split(indices, test_size=0.2, stratify=targets)

In [37]:
train_indices

[1374,
 1566,
 1112,
 1607,
 1356,
 1513,
 1519,
 1452,
 124,
 1308,
 951,
 1727,
 1152,
 985,
 1721,
 2490,
 1196,
 1133,
 2111,
 636,
 387,
 1399,
 1550,
 751,
 476,
 702,
 717,
 1437,
 1808,
 1774,
 681,
 2040,
 571,
 1782,
 1035,
 910,
 242,
 282,
 1775,
 37,
 901,
 2154,
 1325,
 1632,
 469,
 290,
 716,
 863,
 390,
 1364,
 60,
 374,
 2311,
 823,
 2056,
 497,
 1989,
 1565,
 1871,
 15,
 1846,
 661,
 884,
 2320,
 1185,
 147,
 550,
 1638,
 2068,
 822,
 361,
 81,
 1854,
 1878,
 1177,
 2324,
 1537,
 1761,
 1397,
 321,
 1818,
 87,
 1743,
 2528,
 1612,
 1422,
 866,
 555,
 627,
 1995,
 1645,
 32,
 1772,
 1683,
 1903,
 1367,
 1553,
 1318,
 99,
 1230,
 871,
 2119,
 250,
 1851,
 2442,
 1220,
 1733,
 451,
 1460,
 264,
 1850,
 2441,
 1287,
 875,
 23,
 2476,
 1996,
 1659,
 2494,
 770,
 2139,
 630,
 607,
 650,
 1525,
 1232,
 1555,
 262,
 405,
 63,
 166,
 2053,
 966,
 2058,
 1505,
 2071,
 2318,
 560,
 1703,
 1225,
 697,
 1915,
 1677,
 1732,
 207,
 1706,
 1549,
 2257,
 1434,
 1128,
 2259,
 513,
 208

In [21]:
dl = DataLoader(eval_dataset[train_indices], batch_size=32, shuffle=True)

In [25]:
eval2_dataset = eval_dataset[train_indices]

In [27]:
type(eval_dataset)

train3.CompoundDataset

In [26]:
type(eval2_dataset)

tuple

In [24]:
len(dl)

1

In [23]:
for data in dl:
    X, y_head, y_adv, other = data
    print(X.shape)
    break

RuntimeError: stack expects each tensor to be equal size, but got [2033, 3] at entry 0 and [2033, 1] at entry 1

In [19]:
eval_dataset[train_indices]

(tensor([[ 0.0697,  0.0697,  0.0697,  ...,  0.8696,  0.0697,  0.0697],
         [ 1.7929,  1.7929,  1.7929,  ...,  1.7929,  1.7929,  1.7929],
         [-1.3188, -1.3188, -1.3188,  ...,  0.0595, -1.3188, -1.3188],
         ...,
         [-0.8660, -1.0555, -0.3842,  ..., -1.0555, -1.0555, -1.0555],
         [-1.0225, -1.0225, -1.0225,  ..., -1.0225, -1.0225, -1.0225],
         [-1.5796, -0.9991, -0.8102,  ..., -0.0470, -1.0233, -0.2595]]),
 tensor([[1., 2., nan],
         [0., 1., nan],
         [1., 3., 1.],
         ...,
         [1., 3., nan],
         [0., 1., 0.],
         [1., 2., nan]]),
 tensor([[ 2.],
         [14.],
         [18.],
         ...,
         [12.],
         [10.],
         [13.]]),
 tensor([[0.],
         [0.],
         [0.],
         ...,
         [0.],
         [0.],
         [0.]]))

In [17]:
len(train_indices)

2033

In [14]:

targets = [item[2] for item in eval_dataset]

In [13]:
eval_dataset[0]

(tensor([6.1827, 6.1827, 6.1827,  ..., 2.5858, 6.1827, 6.1827]),
 tensor([1., 3., 1.]),
 tensor([18.]),
 tensor([0.]))

In [None]:
eval_dataset = CompoundDataset(X_data_eval,y_data_eval[y_head_cols], y_data_eval[y_adv_cols])

train_loader_dct = create_dataloaders(train_dataset, batch_size, holdout_frac, set_name=train_name)
eval_loader_dct = create_dataloaders(eval_dataset, batch_size, set_name = eval_name)