<a href="https://colab.research.google.com/github/SKam23/10315-Final-Project/blob/main/10315_Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import importlib
import importlib.util

import pandas as pd
import torch
import torchvision.transforms as transforms
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import ToTensor

if importlib.util.find_spec("kaggle") is None:
    !pip install -q kaggle

In [None]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the `jessicali9530/celeba-dataset` archive with the Kaggle CLI; the
# recorded output shows it being saved as celeba-dataset.zip.
# NOTE(review): presumably authenticates with the kaggle.json token copied to
# ~/.kaggle/ in the previous cell -- confirm.
!kaggle datasets download -d jessicali9530/celeba-dataset

Downloading celeba-dataset.zip to /content
100% 1.33G/1.33G [00:09<00:00, 217MB/s]
100% 1.33G/1.33G [00:09<00:00, 156MB/s]


In [None]:
!unzip -n celeba-dataset.zip

Archive:  celeba-dataset.zip


In [None]:
# The four CelebA metadata tables, read in this order:
#   partition_df  <- list_eval_partition.csv
#       Recommended train/validation/test split. Images 1-162770 are training,
#       162771-182637 are validation, 182638-202599 are testing.
#   bbox_df       <- list_bbox_celeba.csv
#       One bounding box per image: "x_1"/"y_1" is the upper-left corner,
#       "width"/"height" give the box size.
#   landmarks_df  <- list_landmarks_align_celeba.csv
#       Coordinates of 5 landmarks per image: left eye, right eye, nose,
#       left mouth, right mouth.
#   attr_df       <- list_attr_celeba.csv
#       40 attribute labels per image; "1" means positive, "-1" negative.
partition_df, bbox_df, landmarks_df, attr_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        'list_eval_partition.csv',
        'list_bbox_celeba.csv',
        'list_landmarks_align_celeba.csv',
        'list_attr_celeba.csv',
    )
)


In [None]:
# Join the four per-image tables into one frame keyed on 'image_id'
# (default inner join, applied left to right).
merged_df = (
    partition_df
    .merge(bbox_df, on='image_id')
    .merge(landmarks_df, on='image_id')
    .merge(attr_df, on='image_id')
)

# Slice the merged frame by the 'partition' code: 0 -> train, 1 -> val, 2 -> test.
train_df = merged_df.loc[merged_df['partition'].eq(0)]
val_df = merged_df.loc[merged_df['partition'].eq(1)]
test_df = merged_df.loc[merged_df['partition'].eq(2)]


In [None]:
class CelebADataset(Dataset):
    """Map-style dataset pairing each image with an age-bracket class label.

    Indexing returns ``(image, label)``: the image file named by the row's
    ``image_id`` after ``transform`` has been applied, and an integer label in
    ``0..5`` derived from the row's ``age`` value (see ``_age_to_label``).

    NOTE(review): the CSV descriptions in this notebook's loading cell mention
    only partition, bounding-box, landmark and +/-1 attribute columns. Confirm
    that ``df`` really carries a numeric ``age`` column; if it does not,
    ``__getitem__`` raises ``KeyError``.

    Args:
        df: DataFrame with at least ``image_id`` and ``age`` columns. Rows are
            addressed by position (``iloc``), so the index need not be reset.
        image_dir: Directory that contains the image files. The default is the
            path this class previously hard-coded.
        transform: Callable applied to the opened PIL image. Defaults to
            ``ToTensor()``.
    """

    # Exclusive upper bounds of the first five age brackets; anything at or
    # above the last bound falls into the final bracket.
    AGE_BIN_UPPER_BOUNDS = (20, 30, 40, 50, 60)

    def __init__(self, df, image_dir="celeba/img_align_celeba", transform=None):
        self.df = df
        self.image_dir = image_dir
        # Build the default lazily so a caller-supplied transform is never
        # replaced and no shared default instance is created at def-time.
        self.transform = transform if transform is not None else ToTensor()

    def __len__(self):
        """Return the number of rows (and therefore samples) in ``df``."""
        return len(self.df)

    @staticmethod
    def _age_to_label(age):
        """Map an age to its bracket index.

        ``<20 -> 0``, ``<30 -> 1``, ``<40 -> 2``, ``<50 -> 3``, ``<60 -> 4``,
        everything else (including values that compare false, e.g. NaN) -> 5.
        """
        for label, upper_bound in enumerate(CelebADataset.AGE_BIN_UPPER_BOUNDS):
            if age < upper_bound:
                return label
        return len(CelebADataset.AGE_BIN_UPPER_BOUNDS)

    def __getitem__(self, idx):
        """Return ``(transformed_image, label)`` for the row at position ``idx``."""
        row = self.df.iloc[idx]
        # Image.open is lazy and holds the file open; read the pixels and run
        # the transform inside the with-block so the handle is released
        # deterministically instead of whenever the object is collected.
        with Image.open(f"{self.image_dir}/{row['image_id']}") as img:
            img.load()
            image = self.transform(img)
        return image, self._age_to_label(row['age'])



In [None]:
# Wrap each split's DataFrame in a dataset ...
train_dataset = CelebADataset(train_df)
val_dataset = CelebADataset(val_df)
test_dataset = CelebADataset(test_df)

# ... then batch each dataset in groups of 64. Only the training loader
# shuffles; validation and test keep the DataFrame row order.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=64)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=64)