In [1]:
import pandas as pd
def load_data():
    """Read the Tiny ImageNet train and validation splits into DataFrames.

    Both parquet files live in the ``zh-plus/tiny-imagenet`` dataset and are
    read through an ``hf://`` URL (presumably this needs network access and a
    Hugging Face filesystem backend -- not visible from this notebook).

    Returns:
        tuple: ``(train_df, val_df)``, each a ``pd.DataFrame``.
    """
    repo_root = "hf://datasets/zh-plus/tiny-imagenet/"
    parquet_paths = {
        'train': 'data/train-00000-of-00001-1359597a978bc4fa.parquet',
        'valid': 'data/valid-00000-of-00001-70d52db3c749a935.parquet',
    }
    # Train is read first, then validation, matching the returned order.
    train_df, val_df = (
        pd.read_parquet(repo_root + parquet_paths[split_name])
        for split_name in ('train', 'valid')
    )
    return train_df, val_df

In [2]:
# Load both splits once. Later cells add a column to train_df and drop rows
# from it in place, so re-run this cell to get the untouched frame back.
train_df, val_df= load_data()

In [3]:
train_df.head()

Unnamed: 0,image,label
0,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,0
1,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,0
2,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,0
3,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,0
4,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,0


In [4]:
from PIL import Image
import io
import numpy as np
def apply_fn(image_dict, mode=None):
    """Decode one encoded image record into a NumPy array.

    Args:
        image_dict: Mapping with a ``'bytes'`` entry holding the encoded image
            file contents.
        mode: Optional PIL mode name (for example ``"RGB"``) to convert the
            image to before building the array. ``None`` (the default) keeps
            the image's native mode, which is the original behaviour; in this
            notebook that produces some 2-D ``(64, 64)`` arrays (presumably
            grayscale files).

    Returns:
        np.ndarray: The decoded pixel data.
    """
    # The context manager closes PIL's image handle as soon as the pixels have
    # been copied out, instead of leaving it open until garbage collection.
    with Image.open(io.BytesIO(image_dict['bytes'])) as img:
        decoded = img if mode is None else img.convert(mode)
        # np.array() forces the lazy decode and copies the pixels, so the
        # result stays valid after the image is closed.
        return np.array(decoded)

In [5]:
def check_image_array_shapes(df, expected_shape: tuple = (64, 64, 3)) -> pd.DataFrame:
    """Drop, in place, every row whose ``image_array`` has the wrong shape.

    Args:
        df: DataFrame with an ``image_array`` column of NumPy arrays. It is
            modified in place (offending rows are removed); the index is not
            reset.
        expected_shape: Shape every array must have. Defaults to
            ``(64, 64, 3)``.

    Returns:
        pd.DataFrame: The same ``df`` object, after the rows were dropped.
    """
    bad_positions = []
    for position, img_array in enumerate(df['image_array']):
        if img_array.shape != expected_shape:
            print(f'Assertion failed at index {position}: Expected shape {expected_shape}, but got {img_array.shape}')
            bad_positions.append(position)

    if bad_positions:
        # enumerate() yields row *positions*, while DataFrame.drop(index=...)
        # expects index *labels*. Translate before dropping so frames whose
        # index is not 0..n-1 are handled correctly.
        # NOTE: with duplicated index labels, drop() removes every row that
        # shares a label with an offending row.
        df.drop(index=df.index[bad_positions], inplace=True)
        print(f"Dropped {len(bad_positions)} rows with mismatched shapes.")
    return df

def clean_df(df):
    """Decode the ``image`` column and keep only rows with the expected shape.

    Side effects on the caller's ``df``: an ``image_array`` column is added,
    and ``check_image_array_shapes`` removes mismatching rows from it.

    Returns:
        A copy of the validated frame with a fresh index (``reset_index()``
        keeps the previous index as a column), or ``None`` if validation or
        the reset raised -- in that case the error is only printed.
    """
    df['image_array'] = df['image'].apply(apply_fn)
    try:
        validated = check_image_array_shapes(df)
        return validated.reset_index()
    except Exception as err:
        # NOTE(review): errors are printed and swallowed, so callers must be
        # ready for a None result.
        print(err)
        return None
    

In [6]:
train_df.columns

Index(['image', 'label'], dtype='object')

In [7]:
# The return value of clean_df is discarded here. train_df still changes,
# because clean_df adds the `image_array` column to the frame it is given and
# the shape check drops mismatching rows from that same frame in place.
# NOTE(review): since the reset-index copy is thrown away, train_df keeps its
# original index labels, with gaps where rows were dropped.
clean_df(train_df)
train_df.columns

Assertion failed at index 1854: Expected shape (64, 64, 3), but got (64, 64)
Assertion failed at index 1973: Expected shape (64, 64, 3), but got (64, 64)
Assertion failed at index 2218: Expected shape (64, 64, 3), but got (64, 64)
Assertion failed at index 2360: Expected shape (64, 64, 3), but got (64, 64)
Assertion failed at index 2406: Expected shape (64, 64, 3), but got (64, 64)
Assertion failed at index 2491: Expected shape (64, 64, 3), but got (64, 64)
Assertion failed at index 2683: Expected shape (64, 64, 3), but got (64, 64)
Assertion failed at index 3141: Expected shape (64, 64, 3), but got (64, 64)
Assertion failed at index 3192: Expected shape (64, 64, 3), but got (64, 64)
Assertion failed at index 3357: Expected shape (64, 64, 3), but got (64, 64)
Assertion failed at index 3371: Expected shape (64, 64, 3), but got (64, 64)
Assertion failed at index 3406: Expected shape (64, 64, 3), but got (64, 64)
Assertion failed at index 3416: Expected shape (64, 64, 3), but got (64, 64)

Index(['image', 'label', 'image_array'], dtype='object')

In [8]:
train_df.shape

(98179, 3)

In [9]:
train_df.head()

Unnamed: 0,image,label,image_array
0,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,0,"[[[255, 136, 193], [255, 138, 192], [249, 146,..."
1,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,0,"[[[20, 33, 7], [19, 32, 6], [23, 31, 10], [28,..."
2,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,0,"[[[120, 124, 127], [89, 93, 96], [80, 84, 87],..."
3,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,0,"[[[144, 170, 245], [121, 147, 221], [140, 166,..."
4,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,0,"[[[144, 145, 147], [141, 142, 144], [140, 140,..."


In [10]:
import torch
# Turn the first remaining image into a tensor and move the last axis to the
# front (permute(2, 0, 1)). `.iloc[0]` selects by position: train_df's index
# is not reset after rows are dropped, so the label lookup `image_array[0]`
# would raise KeyError if the row labelled 0 had been removed.
test_data = torch.tensor(train_df.image_array.iloc[0]).permute(2,0,1)

In [11]:
# Overwrite the real sample with a random-normal tensor of shape (2, 3, 64, 64)
# -- presumably a stand-in batch of two 3-channel 64x64 images.
# NOTE(review): no seed is set, so the values differ on every run.
test_data = torch.randn(2,3,64,64)

In [12]:
from torchvision.transforms import v2
# Random augmentation pipeline: flips, a rotation of up to 30 degrees, and
# colour jitter, applied in that order.
transform_op = v2.Compose(
    [
        v2.RandomHorizontalFlip(p=0.5),
        v2.RandomVerticalFlip(p=0.5),
        v2.RandomRotation(degrees=30),
        v2.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    ]
)

# Smoke test on the sample batch; the displayed shape shows whether the
# pipeline preserves the input dimensions.
img = transform_op(test_data)
img.shape


torch.Size([2, 3, 64, 64])

In [13]:
#import numpy as np
# Run the random pipeline five times over the same batch and collect the
# results.
output = [transform_op(test_data) for _ in range(5)]

print(output)
# Stack the five results along a new leading axis, then fold that axis into
# the batch axis and report the resulting shape.
print(np.reshape(np.stack(output, axis=0), (-1, 3, 64, 64)).shape)

[tensor([[[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],

         [[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],

         [[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]],


        [[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
       

In [14]:
from data_preprocess import TinyImageDataset
# Build the project's dataset from the cleaned frame. TinyImageDataset lives in
# data_preprocess and is not shown in this notebook; judging by its printed log
# it appears to hold 5 variants per row (5 * 98179 = 490895 samples) -- confirm
# against the module.
train_dataset = TinyImageDataset(train_df)

Start
Reaches here
Image array shape before augementation: (5, 98179, 64, 64, 3)
Image array shape after augementation: (490895, 64, 64, 3)
The shape of image_labels: torch.Size([490895])


In [15]:
len(train_dataset)

490895

In [16]:
# def test_dataset(index) -> None:
#     image_array = train_dataset[index][0]
#     image_label = train_dataset[index][1]
#     print(f'Shape of image_array: {image_array.shape}')
#     print(f'Image Label: {image_label}')



In [17]:
# test_dataset(1)

In [18]:
# from torch.utils.data import DataLoader
# train_dataloader = DataLoader(train_dataset, batch_size = 4, shuffle = True)

In [19]:
# image_array, label = next(iter(train_dataloader))

In [20]:
# image_array.shape

In [21]:
# image_array.permute(0,-1,1,2).shape

In [22]:
# label

In [23]:
# Toy labels 1..7 arranged as a column vector of shape (7, 1).
label = torch.arange(1, 8).reshape(-1, 1)
label.shape

torch.Size([7, 1])

In [24]:
label

tensor([[1],
        [2],
        [3],
        [4],
        [5],
        [6],
        [7]])

In [25]:
# Repeat every label 5 times back-to-back: broadcast the (7, 1) column to
# (7, 5), then flatten row by row into a 1-D tensor of 35 entries.
label.expand(-1, 5).reshape(-1)

tensor([1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5,
        5, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7])