# Building a custom Dataset with PyTorch

In [55]:
import os
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import json
from PIL import Image

Define your dataset class by subclassing `torch.utils.data.Dataset`.

In [202]:
class COCO2017(Dataset):
    """COCO 2017 dataset.

    Reads a COCO-style annotation file, groups the annotated category ids
    by image, and serves one sample per image.
    """

    def __init__(self, json_file, root_dir, transform=None):
        """
        Args:
            json_file (string): Path to the json file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.

        Raises:
            ValueError: If ``json_file`` is None.
        """
        if json_file is None:
            raise ValueError("json_file must be the path to an annotation file")

        with open(json_file, 'r') as coco_file:
            annotations = json.load(coco_file)['annotations']

        # Collect the category ids of every annotation under its image id.
        # All annotations in the file are used, however many there are.
        categories_by_image = {}
        for annotation in annotations:
            # Image ids are left-padded with zeros to 12 characters; this
            # string is later used as the image file name (without ".jpg").
            image_unique_id = str(annotation['image_id']).zfill(12)
            # Category ids are stored as strings, one list per image.
            categories_by_image.setdefault(image_unique_id, []).append(
                str(annotation['category_id']))

        # One row per image, ordered by image id.
        image_ids = sorted(categories_by_image)
        dataset = pd.DataFrame({
            'ImageID': image_ids,
            'Categories': [categories_by_image[key] for key in image_ids],
        })
        print(len(dataset))

        # Column 0 holds the padded image id, column 1 the category list.
        self.landmarks_frame = dataset
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        """Return the number of distinct images that have annotations."""
        return len(self.landmarks_frame)

    def __getitem__(self, idx):
        """Return the sample at position ``idx``.

        Args:
            idx (int or torch.Tensor): Row position in the grouped table.

        Returns:
            dict: ``'image'`` is the image opened from
            ``root_dir/<ImageID>.jpg`` and ``'landmarks'`` is a numpy array
            built from that row's category list. If a transform was given,
            its return value is returned instead.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()
        img_name = os.path.join(self.root_dir,
                                self.landmarks_frame.iloc[idx, 0] + ".jpg")
        image = Image.open(img_name)
        # Every column after the image id (here: the category list).
        landmarks = self.landmarks_frame.iloc[idx, 1:]
        landmarks = np.array([landmarks])
        sample = {'image': image, 'landmarks': landmarks}

        # Any falsy transform (None, False) leaves the sample untouched.
        if self.transform:
            sample = self.transform(sample)

        return sample

Create an instance of your dataset class.

In [203]:
# Build the dataset from the validation annotation file and image folder.
# No transform is applied, so samples are returned exactly as loaded.
coco_dataset = COCO2017(json_file='../obj_detect_pytorch/dataset/stuff_val2017.json',
                        root_dir='../obj_detect_pytorch/dataset/val2017/',
                        transform=None)

5000


In [210]:
# Show the first three samples: position, image size and category array.
for position in range(3):
    entry = coco_dataset[position]
    picture = entry['image']
    print(position, picture.size, entry['landmarks'])

0 (640, 426) [[['98' '102' '105' '118' '119' '123' '130' '132' '142' '152' '156'
   '165' '172' '181' '183']]]
1 (586, 640) [[['124' '183']]]
2 (640, 483) [[['93' '128' '129' '183']]]


array([[['93', '128', '129', '183']]], dtype=object)