# 0. Making unstructured data structured

In [11]:
from pathlib import Path 
import kaggle
import os

DATA_PATH = Path("../data")
POKEMON_PATH = DATA_PATH / "pokemon"

# Download the dataset only when the extracted folder is missing, so that
# re-running the notebook does not hit the Kaggle API again.
if POKEMON_PATH.is_dir():
    print("Skipping download")
else:
    print("Downloading Data")
    DATA_PATH.mkdir(parents=True, exist_ok=True)
    # SECURITY: Kaggle credentials must never be written into the notebook.
    # Provide them outside of the code instead, either through the
    # KAGGLE_USERNAME / KAGGLE_KEY environment variables (set before the kernel
    # starts) or through ~/.kaggle/kaggle.json. Any key that was previously
    # committed here should be revoked and regenerated on kaggle.com.
    # NOTE(review): the kaggle package appears to authenticate when it is
    # imported, so assigning os.environ after `import kaggle` had no effect
    # anyway — confirm against the installed kaggle version.
    kaggle.api.dataset_download_files(
        'truthisneverlinear/pokemon-generations-image-dataset/2',
        path=DATA_PATH,
        unzip=True,
    )
        

Downloading Data


In [12]:
def dir_walkthrough(dir_path):
    """Print one summary line per directory found under ``dir_path``.

    Every directory visited by ``os.walk`` (including ``dir_path`` itself) is
    reported with the number of its direct sub-directories and direct files.
    Nothing is returned.
    """
    for current_dir, sub_dirs, files in os.walk(dir_path):
        n_dirs = len(sub_dirs)
        n_files = len(files)
        print(f"There are {n_dirs} directories and {n_files} images in {current_dir}.")

In [13]:
# Inspect the folder layout of the freshly downloaded dataset before flattening it.
dir_walkthrough(POKEMON_PATH)

There are 5 directories and 0 images in ..\data\pokemon.
There are 0 directories and 202 images in ..\data\pokemon\conquest.
There are 3 directories and 942 images in ..\data\pokemon\icons.
There are 0 directories and 5 images in ..\data\pokemon\icons\female.
There are 1 directories and 737 images in ..\data\pokemon\icons\old.
There are 0 directories and 3 images in ..\data\pokemon\icons\old\female.
There are 0 directories and 80 images in ..\data\pokemon\icons\right.
There are 13 directories and 0 images in ..\data\pokemon\main-sprites.
There are 3 directories and 754 images in ..\data\pokemon\main-sprites\black-white.
There are 2 directories and 754 images in ..\data\pokemon\main-sprites\black-white\back.
There are 0 directories and 88 images in ..\data\pokemon\main-sprites\black-white\back\female.
There are 1 directories and 753 images in ..\data\pokemon\main-sprites\black-white\back\shiny.
There are 0 directories and 88 images in ..\data\pokemon\main-sprites\black-white\back\shiny\

In [14]:
import shutil
import os
import random

def _unique_destination(target_dir, file_name):
    """Return a path inside ``target_dir`` for ``file_name`` that does not exist yet."""
    candidate = target_dir / file_name
    if not candidate.exists():
        return candidate
    # splitext keeps multi-dot stems intact and copes with extension-less names.
    stem, suffix = os.path.splitext(file_name)
    counter = 1
    while True:
        candidate = target_dir / f"{stem}-{counter}{suffix}"
        if not candidate.exists():
            return candidate
        counter += 1


def dir_walkthrough_copy(path):
    """Flatten a directory tree by moving every nested file into ``path``.

    Despite the name, files are *moved*, not copied. Files that already live
    directly in ``path`` are left untouched, so calling the function again
    does not rename them. When a file name is already taken in ``path`` the
    moved file gets a deterministic ``-<n>`` suffix before its extension
    (``25.png`` -> ``25-1.png``, ``25-2.png``, ...), so nothing is overwritten
    and the leading number of the name is preserved.

    Args:
        path: Root directory to flatten (``str`` or ``pathlib.Path``).

    Raises:
        OSError: If a file cannot be moved; errors are no longer swallowed.
    """
    target_dir = Path(path)

    for dirpath, dirnames, filenames in os.walk(target_dir):
        # Files in the root are already where they belong; trying to move them
        # onto themselves used to trigger the rename-with-random-suffix path.
        if Path(dirpath) == target_dir:
            continue
        print(f"Moving {len(filenames)} images in {dirpath}.")
        for file_name in filenames:
            destination = _unique_destination(target_dir, file_name)
            shutil.move(os.path.join(dirpath, file_name), str(destination))
                

In [15]:
# Destructive step: moves every nested image up into POKEMON_PATH itself.
dir_walkthrough_copy(POKEMON_PATH)

Moving 0 images in ..\data\pokemon.
Moving 202 images in ..\data\pokemon\conquest.
Moving 942 images in ..\data\pokemon\icons.
Moving 5 images in ..\data\pokemon\icons\female.
Moving 737 images in ..\data\pokemon\icons\old.
Moving 3 images in ..\data\pokemon\icons\old\female.
Moving 80 images in ..\data\pokemon\icons\right.
Moving 0 images in ..\data\pokemon\main-sprites.
Moving 754 images in ..\data\pokemon\main-sprites\black-white.
Moving 754 images in ..\data\pokemon\main-sprites\black-white\back.
Moving 88 images in ..\data\pokemon\main-sprites\black-white\back\female.
Moving 753 images in ..\data\pokemon\main-sprites\black-white\back\shiny.
Moving 88 images in ..\data\pokemon\main-sprites\black-white\back\shiny\female.
Moving 93 images in ..\data\pokemon\main-sprites\black-white\female.
Moving 753 images in ..\data\pokemon\main-sprites\black-white\shiny.
Moving 93 images in ..\data\pokemon\main-sprites\black-white\shiny\female.
Moving 277 images in ..\data\pokemon\main-sprites\cry

In [16]:
def finish_preprocessing(source_path):
    """Collect the flattened PNG files into ``<source_path>/raw`` and clean up.

    Steps, in order:
      1. Create ``raw`` inside ``source_path`` if it does not exist yet.
      2. Move every ``*.png`` located directly in ``source_path`` into ``raw``.
      3. Delete every ``*.gif`` located directly in ``source_path``.
      4. Delete every entry in ``raw`` whose name starts with ``0``.

    Args:
        source_path: Directory holding the flattened images (``str`` or
            ``pathlib.Path``).

    Raises:
        shutil.Error: If ``raw`` already holds a file with the same name as a
            PNG being moved.
    """
    source_dir = Path(source_path)
    target_dir = source_dir / "raw"
    if target_dir.is_dir():
        print("RAW already created, adding")
    else:
        target_dir.mkdir(parents=True, exist_ok=True)

    # The globs are materialised first so the directory is not modified while
    # it is still being listed.
    print("Moving PNG's to /raw")
    for png_path in list(source_dir.glob("*.png")):
        shutil.move(str(png_path), str(target_dir))

    print("Removing GIF's")
    for gif_path in list(source_dir.glob("*.gif")):
        os.remove(gif_path)

    print("Removing 0 class")
    # NOTE(review): "0*" matches every name that starts with the character
    # "0", which would include zero-padded ids such as "007.png" if the
    # dataset used them — confirm the naming scheme.
    for zero_path in list(target_dir.glob("0*")):
        os.remove(zero_path)

    ### TODO: DELETE LAST non-CLASSES, DELETE EMPTY FOLDERS
    

In [17]:
# Destructive step: moves the PNGs into POKEMON_PATH / "raw" and deletes the GIFs.
finish_preprocessing(POKEMON_PATH)

Removing GIF's
Moving PNG's to /raw
Removing 0 class


In [18]:
# Number of PNG images that ended up in the raw folder.
sum(1 for _ in (POKEMON_PATH / "raw").glob("*.png"))

39180

## 0.1 Preparing classes

In [19]:
import pandas as pd

# Load the class table, keep the first row per "#" value and index the frame
# by that column.
pokemon_data = (
    pd.read_csv(DATA_PATH / 'Pokemon.txt')
    .drop_duplicates(subset=["#"], keep="first")
    .set_index(["#"], drop=True)
)

In [20]:
def get_classes(df: pd.DataFrame):
    """Derive the class list and its lookup table from the ``Name`` column.

    Args:
        df: Frame with a ``Name`` column.

    Returns:
        A pair ``(names, index_by_name)``: the distinct values of ``Name`` in
        order of first appearance, and a dict mapping each of those values to
        its position in ``names``.
    """
    names = df["Name"].unique()
    index_by_name = dict(zip(names, range(len(names))))
    return names, index_by_name

In [21]:
# Distinct class names and the name -> integer label lookup derived from them.
classes, classes_to_idx = get_classes(pokemon_data)

# 1. Structured Images to DataLoaders

## 1.1 Data Class

In [22]:
from torch.utils.data import Dataset
from typing import Tuple, Dict, List
import torch
import re

class PokemonData(Dataset):
    """Dataset over the ``*.png`` files of one directory, labelled by file name.

    The label of an image is derived from the first run of digits in its file
    name. That number is looked up in the index of ``classes_df`` to obtain
    the class name, which is then translated to an integer through
    ``classes_to_idx``.

    NOTE(review): this assumes ``classes_df`` is indexed by the number that
    prefixes each file name (the notebook sets the "#" column as index) —
    verify against the CSV. The number was previously used as a *position* in
    ``classes``, which shifts every name by one when numbering starts at 1 and
    raises ``IndexError`` for the highest number.

    NOTE(review): ``Image`` is expected to be ``PIL.Image`` and must be
    imported elsewhere in the notebook; no import is visible next to this
    class.

    Args:
        targ_dir: Directory containing the PNG files.
        classes_df: Frame with a ``Name`` column, indexed by image number.
        transform: Optional callable applied to each loaded image.
    """

    def __init__(self,
                 targ_dir: str,
                 classes_df: pd.DataFrame,
                 transform=None):
        super().__init__()
        # Sorted so that an index refers to the same file on every run.
        self.paths = sorted(Path(targ_dir).glob('*.png'))
        self.transform = transform
        self.classes, self.classes_to_idx = get_classes(classes_df)
        # Index label of classes_df -> class name.
        self.number_to_name = classes_df["Name"].to_dict()

    def load_image(self, index: int) -> "Image.Image":
        """Open the image at ``index`` and return it as RGB on a white background."""
        image_path = self.paths[index]
        # The context manager closes the file handle once the pixels are read.
        with Image.open(image_path) as source:
            image = source.convert("RGBA")
        # The RGBA image doubles as the mask, so transparent pixels turn white.
        background = Image.new('RGBA', image.size, 'white')
        return Image.composite(image, background, image).convert("RGB")

    def __len__(self) -> int:
        """Return the number of image files found."""
        return len(self.paths)

    def __getitem__(self, index: int) -> Tuple[torch.Tensor, int]:
        """Return ``(image, class_idx)`` for the file at ``index``.

        Raises:
            ValueError: If the file name contains no digits before its first dot.
            KeyError: If the parsed number is not present in ``classes_df``.
        """
        img = self.load_image(index)
        file_name = self.paths[index].name
        match = re.search(r"\d+", file_name.split('.')[0])
        if match is None:
            raise ValueError(f"Cannot read a class number from file name {file_name!r}")
        number = int(match.group())
        if number not in self.number_to_name:
            raise KeyError(f"No class with number {number} (file {file_name!r})")
        class_name = self.number_to_name[number]
        class_idx = self.classes_to_idx[class_name]
        if self.transform is not None:
            return self.transform(img), class_idx
        return img, class_idx

## 1.2 Transforms

In [23]:
from torchvision.transforms import v2

# Preprocessing applied to every image returned by the dataset: convert to a
# torchvision image tensor, resize to 128x128, then cast to float32.
image_transformer = v2.Compose([
    v2.ToImage(),
    # v2.ToDtype(torch.uint8, scale=True),
    v2.Resize(size=(128,128)),
    # scale=True asks ToDtype to rescale the values while casting.
    v2.ToDtype(torch.float32, scale=True)
])

In [24]:
# Dataset over all images in POKEMON_PATH / "raw", labelled through pokemon_data.
raw_data = PokemonData(targ_dir= POKEMON_PATH / "raw",
                            transform=image_transformer,
                            classes_df=pokemon_data)

## 1.3 DataLoaders

In [25]:
# in dataloaders
import os
from torch.utils.data import DataLoader

BATCH_SIZE = 32
NUM_WORKERS = 0  # 0 loads batches in the main process

# Pinned host memory only helps when batches are copied to a CUDA device;
# requesting it on a CPU-only machine just produces a warning.
pokemon_dataloader = DataLoader(
    dataset=raw_data,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=NUM_WORKERS,
    pin_memory=torch.cuda.is_available(),
)