In [1]:
# general package
import numpy as np
# deep learning package
import torch
import torchvision.models as models
import torchvision.transforms as T
import torch.nn as nn
import torch.nn.functional as F
from torchvision.datasets import STL10
from torch.utils.data import DataLoader
import torchvision.transforms as T

In [2]:
# self defined modules
import DataLoader_tensor as dl
import SCDataset as ds

### Generate Dummy Data

In [3]:
# Create dummy data
# Set the parameters
n = 2180  # Total number of data points
p = 2000   # Number of features
M = 130   # Total number of groups

# Create the first array (n x p)
count_matrix = np.random.rand(n, p)  # Filling with random numbers for illustration

# Create the second array (n x 1)
# Ensure each group has at least two members
lineage = np.repeat(np.arange(1, M + 1), repeats=np.ceil(n / M))[:n]
np.random.shuffle(lineage)  # Shuffle to randomize group allocation
lineage = lineage.reshape(n, 1)

# print("Data Array (n x p):\n", count_matrix)
# print("Group Array (n x 1):\n", lineage)


In [4]:
count_matrix.shape, lineage.shape


((2180, 2000), (2180, 1))

### Generate Batches

In [5]:
# step 1 generate designed batches
batchsize = 3
DLoader = dl.SClineage_DataLoader(count_matrix,lineage,batch_size=batchsize)
batch_all, num_batch = DLoader.batch_generator()
# step 2 generate real dataloader
sc_dataset = ds.SCDataset(batches=batch_all)
data_loader = torch.utils.data.DataLoader(dataset=sc_dataset, batch_size=batchsize, shuffle=False)

In [6]:
DLoader.avail_lineages

{}

### Comparsion between two batches generated by scDataLoader and torch.DataLoader

In [7]:
num_batches = 0
for _ in data_loader:
    num_batches += 1

print(f"Total number of batches generated by scDataLoader: {num_batch}")
print(f"Total number of batches generated by torch.DataLoader: {num_batches}")

Total number of batches generated by scDataLoader: 385
Total number of batches generated by torch.DataLoader: 385


In [8]:
for i, batch in enumerate(data_loader):
    print(f"Batch generated by torch.DataLoader:")
    print(f"Batch {i+1}/{len(data_loader)}:")
    # Assuming each batch is a tuple of two tensors
    for j, cell_pair in enumerate(batch):
        cell_1, cell_2,  cell_3 = cell_pair  # Unpack the tuple
        print(f"  Sample {j+1} in batch:")
        print(f"    Cell 1: {cell_1}")
        print(f"    Cell 2: {cell_2}")
        print(f"    Cell 3: {cell_3}")
    
    # Optionally, break after a few batches to not overload the output
    if i == 0:  # Adjust this number based on how many batches you want to inspect
        break


Batch generated by torch.DataLoader:
Batch 1/385:
  Sample 1 in batch:
    Cell 1: tensor([0.5608, 0.1979, 0.1988,  ..., 0.8523, 0.8901, 0.8406])
    Cell 2: tensor([0.5125, 0.7715, 0.2251,  ..., 0.8581, 0.7422, 0.7733])
    Cell 3: tensor([0.3265, 0.1084, 0.5360,  ..., 0.8087, 0.1469, 0.9803])
  Sample 2 in batch:
    Cell 1: tensor([0.1547, 0.4102, 0.0926,  ..., 0.7680, 0.4136, 0.2097])
    Cell 2: tensor([0.7236, 0.3997, 0.6978,  ..., 0.0862, 0.8388, 0.7590])
    Cell 3: tensor([0.6757, 0.4919, 0.5903,  ..., 0.1644, 0.0920, 0.6560])


In [9]:
print(f"Batch generated by scDataLoader:")
print(f"Batch 1/31:")
batch_all[1]

Batch generated by scDataLoader:
Batch 1/31:


[(tensor([0.5608, 0.1979, 0.1988,  ..., 0.8523, 0.8901, 0.8406]),
  tensor([0.1547, 0.4102, 0.0926,  ..., 0.7680, 0.4136, 0.2097])),
 (tensor([0.5125, 0.7715, 0.2251,  ..., 0.8581, 0.7422, 0.7733]),
  tensor([0.7236, 0.3997, 0.6978,  ..., 0.0862, 0.8388, 0.7590])),
 (tensor([0.3265, 0.1084, 0.5360,  ..., 0.8087, 0.1469, 0.9803]),
  tensor([0.6757, 0.4919, 0.5903,  ..., 0.1644, 0.0920, 0.6560]))]