# Anime Screenshot Pipeline
https://github.com/cyber-meow/anime_screenshot_pipeline

Colab version of: Semi-automatic pipeline to extract image training set from anime for generative model training.

<details>
  <summary><big>Feature</big></summary>
<ul>
  <li>Can be found at cyber-meow github repo <a href='https://github.com/cyber-meow/anime_screenshot_pipeline#table-of-contents' target='_blank'>README.md</a></li>
</ul>
</details>

<details>
  <summary><big>Limitation</big></summary>
<ul>
  <li>Not all code tested</li>
  <li>Not all steps from the GitHub repo are implemented</li>
  <li>Since this notebook combines many steps, dependency or package conflicts may occur</li>
  <li>The English spelling and grammar may be poor (English is not my primary language :#). Feel free to correct it and open a Pull Request!</li>
</ul>
</details>

<details>
  <summary><big>Credits</big></summary>
<ul>
  <li>Author</li>
  <ul>
    <li><a href='https://github.com/TheSkinnyRat' target='_blank'>TheSkinnyRat</a></li>
  </ul>

  <li>Base Code Repo</li>
  <ul>
    <li><a href='https://github.com/cyber-meow/anime_screenshot_pipeline' target='_blank'>cyber-meow github repo</a></li>
  </ul>

  <li>Colab Template and Reference</li>
  <ul>
    <li><a href='https://github.com/Linaqruf/kohya-trainer' target='_blank'>Linaqruf/kohya-trainer</a></li>
  </ul>

  <li>Code Assistant</li>
  <ul>
    <li><a href='https://chat.openai.com/' target='_blank'>OpenAI ChatGPT</a></li>
  </ul>

  <li>Original cyber-meow repo credits</li>
  <ul>
    <li>This is a collection of many resources found on the internet (credit to the original authors), and some python code written by myself and ChatGPT.</li>
  </ul>
</ul>
</details>

<details>
  <summary><big>What's new?</big></summary>
<ul>
  <li>(02/26/23):</li>
  <ul>
    <li>Initial First Release 🎉</li>
  </ul>
</ul>
</details>

| Notebook Name | Link | Repo |
| --- | --- | --- |
| [Anime Screenshot Pipeline](https://github.com/TheSkinnyRat/anime_screenshot_pipeline_colab/blob/main/anime_screenshot_pipeline.ipynb) | [![](https://img.shields.io/static/v1?message=Open%20in%20Colab&logo=googlecolab&labelColor=5c5c5c&color=0f80c1&label=%20&style=for-the-badge)](https://colab.research.google.com/github/TheSkinnyRat/anime_screenshot_pipeline_colab/blob/main/anime_screenshot_pipeline.ipynb) | [![](https://img.shields.io/static/v1?message=Github&logo=github&labelColor=5c5c5c&color=0f80c1&label=%20&style=for-the-badge)](https://github.com/TheSkinnyRat/anime_screenshot_pipeline_colab) |


In [None]:
# Print the NVIDIA driver/GPU status of the current runtime.
!nvidia-smi

# [0] Drive Settings (Optional)

In [None]:
#@title ## [0.1] Mount Drive
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#@title ## [0.2] Open Special `File Explorer` for Colab
#@markdown This will work in real-time even when you run other cells
# Install gdown (used by the Google Drive download branches in later cells)
# and the imjoy-elfinder file browser.
!pip -q install --upgrade gdown imjoy-elfinder

import threading
from google.colab import output
from imjoy_elfinder.app import main
# Restore any variables previously saved with `%store`.
%store -r

# Run the file browser in a background thread, rooted at /content and
# listening on port 8765, so this cell returns and other cells can still run.
thread = threading.Thread(target=main, args=[["--root-dir=/content", "--port=8765"]])
thread.start()

open_in_new_tab = True #@param {type:"boolean"}

# Expose the kernel-side port either as a separate window or as an iframe
# embedded in this cell's output.
if open_in_new_tab:
  output.serve_kernel_port_as_window(8765)
else:
  output.serve_kernel_port_as_iframe(8765, height='500')


# [1] Repository Settings

In [None]:
#@title ## [1.1] Clone Repository
#@markdown Clone Anime Screenshot Pipeline from GitHub.
import os
import zipfile
import shutil

root_dir = "/content" #@param {type: "string"}
repo_url = "https://github.com/cyber-meow/anime_screenshot_pipeline" #@param {type: "string"}
repo_dir = os.path.join(root_dir,"anime_screenshot_pipeline")

def clone_repo(url):
  os.chdir(root_dir)
  !git clone {repo_url}

clone_repo(repo_url)

# [2] Data Acquisition

You have 2 options for acquiring your dataset (.mp4 video): 
1. Uploading it to Colab's local files.
2. Locating your dataset from `Google Drive` or `HuggingFace`.

In [None]:
#@title ## [2.1] Locating Data Directory
#@markdown Define location of your data (.mp4 video). This cell will also create a folder based on your input.
import os
# Restore any variables previously saved with `%store`.
%store -r

mp4_data_dir = "/content/mp4_data" #@param {'type' : 'string'}
# Persist the chosen directory so cells that run `%store -r` can restore it.
%store mp4_data_dir

# Create the directory up front so videos can be uploaded or unzipped into it.
os.makedirs(mp4_data_dir, exist_ok = True)
print(f"Your mp4 data directory : {mp4_data_dir}")

In [None]:
#@title ## [2.2] Unzip Mp4 Video Data
print('installing apt dependencies')
!apt-get install -y unzip aria2 > /dev/null
import os
import shutil
from pathlib import Path
from IPython.utils import capture
%store -r

#@markdown Specify this section if your mp4 data is in a `zip` file and has been uploaded somewhere. This will download your dataset and automatically extract it to the `mp4_data_dir` if the `unzip_to` is empty. 
#@markdown > Get **your** huggingface `WRITE/READ` token [here](https://huggingface.co/settings/tokens)
zipfile_url = "https://huggingface.co/datasets/TheSkinnyRat/majo_no_tabitabi/resolve/main/majo_no_tabitabi-720p.zip" #@param {'type': 'string'}
zipfile_name = "zipfile.zip"
unzip_to = "" #@param {'type': 'string'}

hf_token = 'hf_XXXXXXXXXXXXXXXXXXXXXXXXXXXXX' #@param {'type': 'string'}
user_header = f"\"Authorization: Bearer {hf_token}\""

if unzip_to:
  os.makedirs(unzip_to, exist_ok=True)
else:
  unzip_to = mp4_data_dir

def download_dataset(url):
  if url.startswith("/content"):
    !unzip -j -o {url} -d "{mp4_data_dir}"
  elif url.startswith("https://drive.google.com"):
    os.chdir(root_dir)
    !gdown --fuzzy {url}
  elif url.startswith("https://huggingface.co/"):
    if '/blob/' in url:
      url = url.replace('/blob/', '/resolve/')
    !aria2c --console-log-level=error --summary-interval=10 --header={user_header} -c -x 16 -k 1M -s 16 -d {root_dir} -o {zipfile_name} {url}
  else:
    !aria2c --console-log-level=error --summary-interval=10 -c -x 16 -k 1M -s 16 -d {root_dir} -o {zipfile_name} {url}

download_dataset(zipfile_url)

os.chdir(root_dir)

if not zipfile_url.startswith("/content"):
  !unzip -j -o "{root_dir}/{zipfile_name}" -d "{unzip_to}"
  os.remove(f"{root_dir}/{zipfile_name}")

# [3] [#](https://github.com/cyber-meow/anime_screenshot_pipeline#frame-extraction) Frame Extraction

Extract 5000~10000 frames per episode of 24 minutes

In [None]:
#@title ## [3.1] Begin Frame Extraction
#@markdown Define your Frame Extraction directory output
fe_dest_dir = "/content/fe_output" #@param {'type': 'string'}
#@markdown **ATTENTION:** Rename your mp4 video files to follow the pattern below to avoid unwanted errors\
#@markdown Your mp4 file pattern. Example, this pattern will extract frame for following files
#@markdown - majo_no_tabitabi_01.mp4
#@markdown - majo_no_tabitabi_02.mp4
#@markdown - majo_no_tabitabi_03.mp4
#@markdown - ...
fe_pattern = "majo_no_tabitabi_*.mp4" #@param {'type': 'string'}

# Run the repository's extract_frames.py from the repo root, passing the mp4
# directory as source, `fe_dest_dir` as destination and `fe_pattern` as the
# file pattern.
os.chdir(repo_dir)
!python extract_frames.py --src_dir "{mp4_data_dir}" \
--dst_dir "{fe_dest_dir}" \
--prefix series_episode \
--pattern "{fe_pattern}"

# [4] [#](https://github.com/cyber-meow/anime_screenshot_pipeline#similar-image-removal) Similar Image Removal

Reduce dataset size by a factor of 10 by removing similar images

In [None]:
#@title ## [4.1] Begin Removal
!pip -q install --upgrade fiftyone
import numpy as np
import fiftyone as fo
import fiftyone.zoo as foz

# Load the frame-extraction output directory as a FiftyOne image dataset.
dataset_dir = fe_dest_dir
dataset = fo.Dataset.from_dir(dataset_dir, dataset_type=fo.types.ImageDirectory)

# Embed the images with the "mobilenet-v2-imagenet-torch" zoo model; the
# embeddings are compared further down to find near-duplicate frames.
model = foz.load_zoo_model("mobilenet-v2-imagenet-torch")
embeddings = dataset.compute_embeddings(model)

print(embeddings.shape)

from tqdm import tqdm

def mark_duplicate(subdataset, similarity_matrix, thresh=0.985):
    """Tag near-duplicate samples and decide which ones to drop.

    Every sample gets a `max_similarity` field. Walking the samples in order,
    the first sample of a group of look-alikes is kept (and tagged
    "has_duplicates"); every sample whose similarity to it exceeds `thresh`
    is scheduled for removal (and tagged "duplicate" when it is reached).

    Args:
        subdataset: the samples to inspect, iterated in the same order as the
            rows of `similarity_matrix`.
        similarity_matrix: square pairwise similarity matrix of the samples.
        thresh: similarity above which two samples count as duplicates.

    Returns:
        A tuple `(samples_to_remove, samples_to_keep)` of sets of sample ids.
    """
    size = len(similarity_matrix)
    # Subtract the identity so that a sample's similarity with itself (1 on
    # the diagonal of a cosine-similarity matrix) never counts as a match.
    sims = similarity_matrix - np.identity(size)

    sample_ids = [entry.id for entry in subdataset.select_fields(["id"])]
    samples_to_remove = set()
    samples_to_keep = set()

    # First pass: store each sample's highest similarity to any other sample.
    for row, sample in enumerate(subdataset):
        sample["max_similarity"] = sims[row].max()
        sample.save()

    # Second pass: keep the first sample of each group, mark the rest.
    for row, sample in tqdm(enumerate(subdataset)):
        if sample.id in samples_to_remove:
            sample.tags.append("duplicate")
            sample.save()
            continue

        samples_to_keep.add(sample.id)

        matches = np.where(sims[row] > thresh)[0]
        # This sample is the one we keep, so all its matches are dropped.
        samples_to_remove.update(sample_ids[match] for match in matches)

        if matches.size:
            sample.tags.append("has_duplicates")
            sample.save()

    return samples_to_remove, samples_to_keep

from sklearn.metrics.pairwise import cosine_similarity

# Upper bound on how many samples are compared against each other at once.
max_compare_size = 10000
thresh = 0.985

samples_to_remove = set()
samples_to_keep = set()

# Process the embeddings in consecutive chunks of at most `max_compare_size`.
# Similarities are computed per chunk, so duplicates are only searched for
# among samples of the same chunk.
for k in range(0, len(embeddings), max_compare_size):
    end = min(k + max_compare_size, len(embeddings))
    similarity_matrix = cosine_similarity(embeddings[k:end])
    samples_to_remove_sub, samples_to_keep_sub = mark_duplicate(
        dataset[k:end], similarity_matrix, thresh)
    samples_to_remove = samples_to_remove | samples_to_remove_sub
    samples_to_keep = samples_to_keep | samples_to_keep_sub

# Optionally open the FiftyOne app on the tagged dataset.
sir_visualize_dataset = False #@param {'type' : 'boolean'}
if sir_visualize_dataset:
  session = fo.launch_app(dataset)

import os
# Delete the duplicate image files from disk first, then drop the
# corresponding samples from the dataset.
for sample_id in tqdm(samples_to_remove):
    os.remove(dataset[sample_id].filepath)
dataset.delete_samples(list(samples_to_remove))

# [5] [#](https://github.com/cyber-meow/anime_screenshot_pipeline#face-detection-and-cropping) Face Detection and Cropping

In [None]:
#@title ## [5.1] Add face information to metadata

# ERROR ISSUE: https://github.com/hysts/anime-face-detector/issues/13#issuecomment-1419694747
!pip -q install numpy scipy numba --upgrade

!pip -q install openmim>=0.2.1
#!mim install mmcv-full>=1.6.1
!pip -q install mmcv-full==1.7.0 -f https://download.openmmlab.com/mmcv/dist/cu116/torch1.13/index.html
!pip -q install mmdet>=2.25.1
!pip -q install mmpose>=0.28.1
!pip -q install -U moviepy>=1.0.3
#!git clone https://github.com/hysts/anime-face-detector
!pip -q install anime-face-detector

os.chdir(repo_dir)
!python detect_faces.py --src_dir "{fe_dest_dir}"

In [None]:
#@title ## [5.2] Crop out the maximum square for each face
#@markdown Indicates the minimum number of faces that should be contained in the original image for the cropping to take place. Note that the cropped images are store in the same folders as the input images and no cropping is performed for face that is too large
fdc_min_face_number = 2 #@param {type :"integer"}

# Run the repository's crop_faces.py on the extracted frames, passing the
# minimum face count chosen above.
os.chdir(repo_dir)
!python crop_faces.py --src_dir "{fe_dest_dir}" --min_face_number {fdc_min_face_number}

# [6] [#](https://github.com/cyber-meow/anime_screenshot_pipeline#automatic-tagging) Automatic Tagging

In [None]:
#@title ## [6.1] Tag your images with an off-the-shelf tagger
# Install the tagger's dependencies (TensorFlow pinned below 2.11).
!pip -q install "tensorflow<2.11"
!pip -q install huggingface-hub

# Run the repository's WD14 tagger script from its `tagger` folder; the
# caption extension passed here is ".tags".
os.chdir(os.path.join(repo_dir, "tagger"))
!python tag_images_by_wd14_tagger.py --batch_size 16 --caption_extension ".tags" "{fe_dest_dir}"

In [None]:
#@title ## [6.2] Save tag information into metadata
# Run the repository's augment_metadata.py on the frame directory with tags
# enabled and "aniscreen" as the general description.
os.chdir(repo_dir)
!python augment_metadata.py --use_tags --general_description "aniscreen" --src_dir "{fe_dest_dir}"

In [None]:
#@title ## [6.3] Data Cleansing
#@markdown This will delete the dataset and their metadata if `n_people==0` in the `.json` file metadata
import os
import json

# Define the directory path
at_cleansing_path = fe_dest_dir

# Traverse the directory tree and drop every sample whose metadata reports
# zero people: the image, its .json metadata and its .tags caption.
for root, dirs, files in os.walk(at_cleansing_path):
    for file in files:
        # Only .json metadata files drive the clean-up
        if not file.endswith('.json'):
            continue
        file_path = os.path.join(root, file)
        with open(file_path, 'r', encoding='utf-8') as f:
            json_data = json.load(f)
        # Use .get() so metadata without an 'n_people' field is left untouched
        # instead of aborting the whole clean-up with a KeyError.
        if not isinstance(json_data, dict) or json_data.get('n_people') != 0:
            continue
        base_path = os.path.splitext(file_path)[0]
        # Image first, then the metadata itself, then the tagger output.
        for stale_path in (base_path + '.png', file_path, base_path + '.png.tags'):
            if os.path.exists(stale_path):
                os.remove(stale_path)

print('Done! Data Cleaned')

# [7] [#](https://github.com/cyber-meow/anime_screenshot_pipeline#character-classification-with-few-shot-learning) Character Classification with Few-Shot Learning
Train your own model for series-specific concepts

## [7.1] Dataset Preparation
This is a simple classification task. To begin, just create a directory `training_data_original` and organize it as follows:
```
├── ./elaina_(majo_no_tabitabi)
├── ./saya_(majo_no_tabitabi)
├── ./character3
├── ./character4
...
└── ./ood
```
Put a few images in each folder that try to capture different variations of the character. In my test 10~20 is good enough. Zip the whole `training_data_original` folder and upload to hugging face or somewhere else, or you can directly upload to colab notebook using file manager.
> Note: [Original Instruction](https://github.com/cyber-meow/anime_screenshot_pipeline#dataset-preparation)

> If you already have a trained `.ckpt` file, you can skip steps `[7.1]` and `[7.2]` and go to `[7.3-C]` to define and locate your `.ckpt` file

In [None]:
os.chdir(os.path.join(repo_dir, "classifier_training"))
!pip -q install -r requirements.txt
os.chdir(os.path.join(repo_dir, "classifier_training/models"))
!pip -q install -e .

cc_zipfile_url = "https://huggingface.co/datasets/TheSkinnyRat/majo_no_tabitabi/resolve/main/training_data_original.zip" #@param {'type': 'string'}
cc_zipfile_name = "zipfile.zip"
cc_unzip_to = "" #@param {'type': 'string'}
#@markdown If you want to split into training and test set you can check this.\
#@markdown It does a 70%/30% train/test split. 
#@markdown > Note: Not tested yet
cc_split_training_and_test = False #@param {'type': 'boolean'}

user_header = f"\"Authorization: Bearer {hf_token}\""

if cc_unzip_to:
  os.makedirs(cc_unzip_to, exist_ok=True)
else:
  cc_unzip_to = "/content/"

def download_dataset(url):
  if url.startswith("/content"):
    !unzip -j -o {url} -d "/content/"
  elif url.startswith("https://drive.google.com"):
    os.chdir(root_dir)
    !gdown --fuzzy {url}
  elif url.startswith("https://huggingface.co/"):
    if '/blob/' in url:
      url = url.replace('/blob/', '/resolve/')
    !aria2c --console-log-level=error --summary-interval=10 --header={user_header} -c -x 16 -k 1M -s 16 -d {root_dir} -o {zipfile_name} {url}
  else:
    !aria2c --console-log-level=error --summary-interval=10 -c -x 16 -k 1M -s 16 -d {root_dir} -o {zipfile_name} {url}

download_dataset(cc_zipfile_url)

os.chdir(root_dir)

if not zipfile_url.startswith("/content"):
  !unzip -o "{root_dir}/{cc_zipfile_name}" -d "{cc_unzip_to}"
  os.remove(f"{root_dir}/{cc_zipfile_name}")

cc_src_dir = os.path.join(cc_unzip_to, "training_data_original")
cc_dataset_dir = os.path.join(cc_unzip_to, "training_dataset")

# replace filename contain space with underscore and lowercase
[os.rename(root + os.sep + file, root + os.sep + file.lower().replace(" ", "_")) for root, _, files in os.walk(cc_src_dir) for file in files]

os.chdir(os.path.join(repo_dir, "classifier_dataset_preparation"))
!python crop_and_make_dataset.py --src_dir "{cc_src_dir}" --dst_dir "{cc_dataset_dir}/data"

os.chdir(os.path.join(repo_dir, "classifier_dataset_preparation"))
!python make_data_dic_imagenetsyle.py "{cc_dataset_dir}"

if cc_split_training_and_test:
  os.chdir(os.path.join(repo_dir, "classifier_dataset_preparation"))
  cc_dataset_labels = os.path.join(cc_dataset_dir, "labels.csv")
  !python data_split.py "{cc_dataset_labels}" 0.7 0.3

  Preparing metadata (setup.py) ... [?25l[?25hdone
[0m
Download Results:
gid   |stat|avg speed  |path/URI
3746b0|[1;32mOK[0m  |    32MiB/s|/content/zipfile.zip

Status Legend:
(OK):download completed.
Archive:  /content/zipfile.zip
  inflating: /content/training_data_original/class1/1 (1).png  
  inflating: /content/training_data_original/class1/1 (12).png  
  inflating: /content/training_data_original/class1/1 (13).png  
  inflating: /content/training_data_original/class1/1 (14).png  
  inflating: /content/training_data_original/class1/1 (15).png  
  inflating: /content/training_data_original/class1/1 (16).png  
  inflating: /content/training_data_original/class1/1 (17).png  
  inflating: /content/training_data_original/class1/1 (18).png  
  inflating: /content/training_data_original/class1/1 (19).png  
  inflating: /content/training_data_original/class1/1 (20).png  
  inflating: /content/training_data_original/class1/1 (21).png  
  inflating: /content/training_data_original/clas

In [None]:
#@title ## [7.2] Begin Training
!apt-get install -y wget > /dev/null
# Location of the pretrained checkpoint that training starts from ...
cc_checkpoint_dir = os.path.join(root_dir, "training_model")
cc_checkpoint_path = os.path.join(root_dir, "training_model/danbooruFaces_L_16_image128_batch16_SGDlr0.001_ptTrue_seed0_warmupCosine_interTrue_mmFalse_textLenNone_maskNoneconstanttagtokenizingshufFalse_lastEpoch.ckpt")
# ... and the directory / glob pattern for the checkpoints produced by training.
cc_trained_checkpoint_dir = os.path.join(root_dir, "trained_model")
cc_trained_checkpoint_path = os.path.join(root_dir, "trained_model/*.ckpt")

# Download the pretrained checkpoint only when it is not already present.
if not os.path.exists(cc_checkpoint_path):
  if not os.path.exists(cc_checkpoint_dir):
    os.makedirs(cc_checkpoint_dir, exist_ok=True)
  # wget saves into the current directory, so switch to the checkpoint folder.
  os.chdir(cc_checkpoint_dir)
  !wget https://huggingface.co/TheSkinnyRat/public-backup/resolve/main/danbooruFaces_L_16_image128_batch16_SGDlr0.001_ptTrue_seed0_warmupCosine_interTrue_mmFalse_textLenNone_maskNoneconstanttagtokenizingshufFalse_lastEpoch.ckpt
#@markdown Note that testing is disabled by default. Check this if you want to use test.\
#@markdown Validation will then be performed every 5 epochs (default).
cc_training_test_set = False #@param {'type': 'boolean'}

#@markdown > **WARNING:** For reasons I have not figured out, wandb will show an interactive prompt like:
#@markdown ```
#@markdown wandb: (1) Create a W&B account
#@markdown wandb: (2) Use an existing W&B account
#@markdown wandb: (3) Don't visualize my results
#@markdown wandb: Enter your choice: Traceback (most recent call last): ...
#@markdown ```
#@markdown I usually choose `(3) Don't visualize my results` by typing `3` and pressing `Enter` at the prompt in the cell output, unless you want to experiment with W&B.

!pip install keras==2.10.0
# working commit https://github.com/arogozhnikov/einops/commit/f569905f8ba2f55393164262c5e4200a8cdb57ab
!pip install git+https://github.com/arogozhnikov/einops.git

os.chdir(os.path.join(repo_dir, "classifier_training"))
if not cc_training_test_set:
  !python train.py --transfer_learning --model_name L_16 --interm_features_fc \
  --batch_size=8 --no_epochs 40 --dataset_path "{cc_dataset_dir}" \
  --results_dir "{cc_trained_checkpoint_dir}" \
  --checkpoint_path "{cc_checkpoint_path}"
else:
  !python train.py --transfer_learning --model_name L_16 --interm_features_fc \
  --batch_size=8 --no_epochs 40 --dataset_path "{cc_dataset_dir}" \
  --results_dir "{cc_trained_checkpoint_dir}" \
  --checkpoint_path "{cc_checkpoint_path}"
  --use_test_set

In [None]:
#@title ## [7.3-C] Define and Locate Model (`.ckpt` model required)
!apt-get install -y wget > /dev/null
cc_model_url = "example.com/model.ckpt" #@param {'type': 'string'}
cc_trained_checkpoint_dir = os.path.join(root_dir, "trained_model")
cc_trained_checkpoint_path = os.path.join(root_dir, "trained_model/*.ckpt")

os.chdir(cc_trained_checkpoint_dir)
!wget "{cc_model_url}"

print("Done! Model Located")

In [None]:
#@title ## [7.4] Inference
#@markdown Threshold of confidence to classify as character
cc_cls_thresh = 0.25 #@param {type:"slider", min:0.1, max:1, step:0.01}
#@markdown Threshold of confidence to add a tag
cc_tagger_thresh = 0.35 #@param {type:"slider", min:0.1, max:1, step:0.01}
#@markdown At this point we finally get a model to classify the characters of the series!\
#@markdown We then write the character information into the metadata file if such information is not present.\
#@markdown To overwrite in all the cases check this
cc_inference_overwrite = False #@param {type:"boolean"}
import glob
cc_trained_checkpoint_path = glob.glob('/content/trained_model/*.ckpt')[0]

os.chdir(repo_dir)
if not cc_inference_overwrite:
  !python classify_characters.py --dataset_path "{cc_dataset_dir}" \
  --checkpoint_path "{cc_trained_checkpoint_path}" \
  --cls_thresh "{cc_cls_thresh}" --tagger_thresh "{cc_tagger_thresh}"\
  --src_dir "{fe_dest_dir}"
else:
  !python classify_characters.py --dataset_path "{cc_dataset_dir}" \
  --checkpoint_path "{cc_trained_checkpoint_path}" \
  --cls_thresh "{cc_cls_thresh}" --tagger_thresh "{cc_tagger_thresh}"\
  --src_dir "{fe_dest_dir}" \
  --overwrite


# [8] [#](https://github.com/cyber-meow/anime_screenshot_pipeline#folder-arrangement) Folder Arrangement

In [None]:
#@title ## [8.1] Only keep images with faces and resize
#@markdown The following command can be run before cropping and tagging to eliminate images with no faces.
#@markdown - With `min_face_number` and `max_face_number` it only saves images whose face number is within this range to `dst_dir`.
#@markdown - `max_image_size` makes sure that saved images are resized so that both its width and height are smaller than the provided value.
#@markdown - Check `move_file` if you want to move file to destination directory instead of creating new ones. `max_image_size` is ignored in this case.
fa_dest_dir = "/content/fe_output_arranged" #@param {'type': 'string'}
fa_min_face = 1 #@param {'type': 'integer'}
fa_max_face = 10 #@param {'type': 'integer'}
fa_max_image_size = 1024 #@param {'type': 'integer'}
fa_move_file = False #@param {'type': 'boolean'}

os.chdir(repo_dir)
if not fa_move_file:
  !python arrange_folder.py --min_face_number {fa_min_face} --max_face_number {fa_max_face} \
  --keep_src_structure --format '' --max_image_size {fa_max_image_size} \
  --src_dir "{fe_dest_dir}" --dst_dir "{fa_dest_dir}"
else:
  !python arrange_folder.py --min_face_number {fa_min_face} --max_face_number {fa_max_face} \
  --keep_src_structure --format '' --max_image_size {fa_max_image_size} \
  --src_dir "{fe_dest_dir}" --dst_dir "{fa_dest_dir}"
  --move_file

In [None]:
#@title ## [8.2] Arrange the folder in hierarchy using metadata
#@markdown The folder structure itself is specified by the argument `format`. Different levels of folders are separated by `/`. Accepted folder types are `n_characters`, `n_faces`, `n_people`, `character`, and `fh_ratio`.
#@markdown - `n_characters`: This creates folders using the number of characters. Passing argument `max_character_number` = `6` puts all the scenes with more than 6 characters into the folder `6+_characters`.
#@markdown - `character`: This creates folders with sorted character names split by `+`. To avoid creating a specific folder for character combination that shows up too few times, we pass the argument `min_image_per_combination` so that images of all the character combinations with fewer than a certain number of images are saved in `.../character_others`.
#@markdown - `fh_ratio`: This creates folders according to the maximum face height ratio.

fa_heirarchy_dest_dir = "/content/fe_output_arranged_hierarchy" #@param {'type': 'string'}
fa_format = "n_characters/character/fh_ratio" #@param {'type': 'string'}
fa_max_character_number = 6 #@param {'type': 'integer'}
fa_min_image_per_combination = 10 #@param {'type': 'integer'}

# Run the repository's arrange_folder.py a second time, now with
# `--move_file`, taking the output of step [8.1] (`fa_dest_dir`) as source and
# laying it out according to `fa_format`.
os.chdir(repo_dir)
!python arrange_folder.py \
--move_file --format "{fa_format}" \
--max_character_number {fa_max_character_number} --min_image_per_combination {fa_min_image_per_combination} \
--src_dir "{fa_dest_dir}" --dst_dir "{fa_heirarchy_dest_dir}"

# [9] Create `.txt` File (Optional)
 Create `.txt` file next to every image with json metadata

In [None]:
#@title ## [9.1] Begin Create `.txt` File
import os
import json

#@markdown Define the directory containing the image files and their corresponding JSON files
ctf_dir_path = '/content/fe_output_arranged_hierarchy' #@param {type: 'string'}

def ctf_build_caption(json_data):
    """Build the caption text "characters, general, tags" from image metadata.

    Underscores are replaced by spaces. Missing or empty fields are skipped,
    so the caption never starts with or contains an empty ", " segment.
    """
    characters = ', '.join(json_data.get('characters', [])).replace('_', ' ')
    general = json_data.get('general', '').replace('_', ' ')
    tags = ', '.join(json_data.get('tags', [])).replace('_', ' ')
    return ', '.join(part for part in (characters, general, tags) if part)

def ctf_write_captions(dir_path):
    """Write a .txt caption next to every image under `dir_path` that has a .json file.

    Returns the number of .txt files written.
    """
    written = 0
    # Iterate through all files and subdirectories in the directory
    for root, dirs, files in os.walk(dir_path):
        for file in files:
            # Only image files (.png, .jpg or .jpeg) get a caption
            if not file.endswith(('.png', '.jpg', '.jpeg')):
                continue
            # splitext() removes only the extension, so names that contain
            # dots (e.g. "ep.01.png") still map to "ep.01.json".
            stem = os.path.splitext(file)[0]
            json_file = os.path.join(root, stem + '.json')
            if not os.path.exists(json_file):
                continue
            with open(json_file, 'r', encoding='utf-8') as f:
                json_data = json.load(f)
            txt_file = os.path.join(root, stem + '.txt')
            with open(txt_file, 'w', encoding='utf-8') as f:
                f.write(ctf_build_caption(json_data))
            written += 1
    return written

ctf_write_captions(ctf_dir_path)

print('Done! .txt File Created')


# [10] HuggingFace Upload (Optional)
 Upload your dataset to hugging face

In [None]:
#@title ### [10.1] Upload Config
!pip -q install --upgrade huggingface-hub
from huggingface_hub import login
from huggingface_hub import HfApi
from huggingface_hub.utils import validate_repo_id, HfHubHTTPError

login(hf_token, add_to_git_credential=True)

api = HfApi()
user = api.whoami(hf_token)

#@markdown Fill this if you want to upload to your organization, or just leave it empty.

orgs_name = "" #@param{type:"string"}

#@markdown If your model/dataset repo didn't exist, it will automatically create your repo.
dataset_name = "majo_no_tabitabi" #@param{type:"string"}
make_this_model_private = True #@param{type:"boolean"}

if orgs_name == "":
  datasets_repo = user['name']+"/"+dataset_name.strip()
else:
  datasets_repo = orgs_name+"/"+dataset_name.strip()

if dataset_name != "":
  try:
      validate_repo_id(datasets_repo)
      api.create_repo(repo_id=datasets_repo,
                      repo_type="dataset",
                      private=make_this_model_private)
      print("Dataset Repo didn't exists, creating repo")
      print("Dataset Repo",datasets_repo,"created!\n")

  except HfHubHTTPError as e:
      print(f"Dataset repo: {datasets_repo} exists, skipping create repo\n")


In [None]:
#@title ### [10.2] Zip Dataset
hu_dest_dir = "/content/fe_output_arranged_hierarchy" #@param {'type': 'string'}
hu_zip_file_name = "final_dataset_ep1" #@param {'type': 'string'}
# The archive is always written to /content/<name>.zip.
hu_zip_path = "/content/"+hu_zip_file_name+".zip"

# Run 7z from inside the dataset directory and add its contents (`./`) to the
# archive.
os.chdir(hu_dest_dir)
!7z a "{hu_zip_path}" ./ -y
print("Done!")

In [None]:
#@title ### [10.3] Begin Upload
from huggingface_hub import HfApi
from pathlib import Path
import shutil
import zipfile
import os

api = HfApi()

#@markdown This will upload the zip archive created in step `[10.2]` to your datasets repo

#@markdown  Other Information
commit_message = "" #@param {type :"string"}
# The file to upload is the archive produced by the zip step.
temp_dataset = hu_zip_path

# Fall back to a generated commit message when the form field is left empty.
if not commit_message:
  commit_message = f"feat: upload {hu_zip_file_name}.zip"

def upload_dataset(dataset_paths):
  """Upload the file at `dataset_paths` to the `datasets_repo` dataset repo.

  The file is stored in the repo as `{hu_zip_file_name}.zip` and committed
  with `commit_message`.
  """
  file_in_repo = f"{hu_zip_file_name}.zip"

  print(f"Uploading {file_in_repo} to https://huggingface.co/datasets/{datasets_repo}")
  print("Please wait...")

  api.upload_file(
      path_or_fileobj=dataset_paths,
      path_in_repo=file_in_repo,
      repo_id=datasets_repo,
      repo_type="dataset",
      commit_message=commit_message,
  )
  print(f"Upload success, located at https://huggingface.co/datasets/{datasets_repo}/blob/main/{file_in_repo}\n")

def zip_file(tmp_folders):
    """Zip every file below `tmp_folders` into the archive at `temp_dataset`.

    Entries are stored with paths relative to `tmp_folders`, so the archive
    does not reproduce the directory prefix of the machine it was made on.
    """
    archive_path = os.path.abspath(temp_dataset)
    with zipfile.ZipFile(temp_dataset, 'w') as archive:
        for folder, _, files in os.walk(tmp_folders):
            for file in files:
                file_path = os.path.join(folder, file)
                # Never add the archive to itself when it lives inside the
                # folder being zipped.
                if os.path.abspath(file_path) == archive_path:
                    continue
                archive.write(file_path, arcname=os.path.relpath(file_path, tmp_folders))

def upload():
  """Upload the archive at `temp_dataset`, then delete the local copy."""
  # zip_file(fe_dest_dir)
  upload_dataset(temp_dataset)
  # Reached only when upload_dataset() returned without raising.
  os.remove(temp_dataset)

upload()