# Первичная настройка

## Настройка путей

In [4]:
from pathlib import Path


# All notebook artefacts live under one data root, relative to the notebook.
data_dir = Path("..") / "data"

# Raw dataset dumps (CSV), downloaded image files, and the merged label table.
raw_dir = data_dir / "raw"
images_dir = data_dir / "images"
preprocessed_path = data_dir / "preprocessed" / "image_only.csv"

## Проверка существования

In [5]:
import pandas as pd


# Resume from the previously saved label table when there is one; otherwise
# start from an empty frame with the same two columns.
if preprocessed_path.exists():
    main_df = pd.read_csv(preprocessed_path)
else:
    main_df = pd.DataFrame(columns=["image_name", "blocked"])

# parents=True: on a fresh checkout "../data" itself may not exist yet, and
# mkdir(exist_ok=True) alone raises FileNotFoundError in that case.
# raw_dir is created here too because the download cell writes its CSV there.
for directory in (raw_dir, images_dir):
    directory.mkdir(parents=True, exist_ok=True)

# Загрузка данных

## hearmeneigh/e621-rising-v3-curated

### Скачивание датасета

In [6]:
import csv
from itertools import islice

from tqdm import tqdm
from datasets import load_dataset


max_rows = 30000
dataset_name = "hearmeneigh/e621-rising-v3-curated"
# Append ".csv" explicitly: Path.with_suffix() would replace everything after
# a dot in the dataset name instead of appending to it.
dataset_path = Path(raw_dir, f"{dataset_name.split('/')[-1]}.csv")
# open(..., "w") does not create missing directories.
dataset_path.parent.mkdir(parents=True, exist_ok=True)


# streaming=True lets us iterate lazily instead of materialising the split.
dataset = load_dataset(dataset_name, split="train", streaming=True)
with open(dataset_path, "w", encoding="utf-8", newline="") as f:
    writer = None
    # The context manager closes the progress bar even if a row fails.
    with tqdm(total=max_rows, desc="Сохранение записей", unit="row") as progress:
        # islice stops after max_rows examples without pulling an extra one
        # from the stream just to discover that the limit was reached.
        for example in islice(dataset, max_rows):
            if writer is None:
                # The CSV header is taken from the keys of the first example.
                writer = csv.DictWriter(f, fieldnames=list(example.keys()))
                writer.writeheader()

            # NOTE: csv writes non-string values via str(), so the `image`
            # column ends up holding only the object's repr (see the preview),
            # not pixel data. The images are fetched later from `url`.
            writer.writerow(example)
            progress.update(1)

# Read back the file that was just written rather than a hard-coded path, so
# changing dataset_name cannot silently load a stale CSV.
temp_df = pd.read_csv(dataset_path)
temp_df.head(5)

Resolving data files:   0%|          | 0/54 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/54 [00:00<?, ?it/s]

Сохранение записей:   2%|▏         | 500/30000 [00:51<51:03,  9.63row/s]
Сохранение записей: 100%|██████████| 30000/30000 [10:13<00:00, 48.92row/s]


Unnamed: 0,source_id,source,image,tags,url,text,selector
0,3724100,e621,<PIL.JpegImagePlugin.JpegImageFile image mode=...,"['nude', 'erection', 'mismatched_animal_penis'...",https://static1.e621.net/data/9b/9b/9b9b3a2a14...,nude erection mismatched_animal_penis digital_...,tier-1
1,2323551,e621,<PIL.JpegImagePlugin.JpegImageFile image mode=...,"['brown_fur', 'mammal_humanoid', 'midriff', 't...",https://static1.e621.net/data/b6/73/b673f9bd71...,brown_fur mammal_humanoid midriff topwear solo...,tier-2
2,3858728,e621,<PIL.JpegImagePlugin.JpegImageFile image mode=...,"['favorites_below_1000', 'anthro', 'solo', 'wh...",https://static1.e621.net/data/4c/a6/4ca60e5ac9...,favorites_below_1000 anthro solo white_backgro...,tier-3
3,3972702,e621,<PIL.JpegImagePlugin.JpegImageFile image mode=...,"['open_mouth', 'eyebrows', 'white_body', 'favo...",https://static1.e621.net/data/93/98/9398b7dfb4...,open_mouth eyebrows white_body favorites_above...,tier-2
4,2535548,e621,<PIL.JpegImagePlugin.JpegImageFile image mode=...,"['restraints', 'tail', 'shackles', 'chair', 's...",https://static1.e621.net/data/9e/14/9e1452967b...,restraints tail shackles chair score_above_500...,tier-1


### Загрузка изображений

In [7]:
from vk_mod.data import download_image
from tqdm import tqdm


processed_data = []
# (row index, reason) for every row that did not yield a labelled image.
failed_rows = []

rows_to_fetch = temp_df[:max_rows]
# total is the real number of rows: the frame may hold fewer than max_rows.
for index, row in tqdm(rows_to_fetch.iterrows(), total=len(rows_to_fetch)):
    tags = row["tags"]
    if not isinstance(tags, str):
        # An empty CSV field is read back as NaN; `in` on a float raises
        # TypeError, and without tags there is no label to assign anyway.
        failed_rows.append((index, "missing tags"))
        continue
    # `tags` is the text form of the tag list, so this is a substring check.
    is_explicit = 1 if "rating_explicit" in tags else 0

    filename = f"e621_{index}.png"
    file_path = Path(images_dir, filename)
    try:
        download_image(row["url"], file_path)
    except Exception as e:
        # Best effort: keep going, but remember what was skipped and why
        # instead of silently discarding the error.
        failed_rows.append((index, repr(e)))
        continue

    processed_data.append({
        'image_name': filename,
        'blocked': is_explicit
    })

if failed_rows:
    print(f"Пропущено строк: {len(failed_rows)} из {len(rows_to_fetch)}")
    display(pd.DataFrame(failed_rows, columns=["index", "reason"]).head())

# NOTE: this rebinds temp_df from the raw table to the label table, so this
# cell cannot be re-run on its own without re-running the cell that reads the CSV.
# Explicit columns keep the frame well-formed even when nothing was downloaded.
temp_df = pd.DataFrame(processed_data, columns=["image_name", "blocked"])
# sample(5) raises ValueError on frames with fewer than 5 rows.
display(temp_df.sample(min(5, len(temp_df))))
display(temp_df[["blocked"]].value_counts().reset_index(name="count"))

100%|██████████| 30000/30000 [21:45:33<00:00,  2.61s/it]   


Unnamed: 0,image_name,blocked
3541,e621_3732.png,1
19575,e621_20654.png,1
6821,e621_7195.png,1
12558,e621_13243.png,1
12540,e621_13225.png,1


Unnamed: 0,blocked,count
0,1,22727
1,0,5716


### Объединение с основным датасетом

In [17]:
# Leave empty frames out of the concat: an empty, object-typed frame degrades
# the dtype of `blocked`, and newer pandas versions warn about concatenating
# empty entries.
frames = [frame for frame in (main_df, temp_df) if not frame.empty]
if frames:
    # ignore_index=True gives the combined table one continuous 0..n-1 index
    # instead of repeating each source frame's own row labels.
    main_df = pd.concat(frames, ignore_index=True)
display(main_df)

Unnamed: 0,image_name,blocked
0,e621_0.png,1
1,e621_1.png,1
2,e621_2.png,1
3,e621_3.png,0
4,e621_4.png,1
...,...,...
28438,e621_29994.png,1
28439,e621_29996.png,1
28440,e621_29997.png,0
28441,e621_29998.png,1


# Очистка данных

In [18]:
# Cleaning pipeline, applied in order:
#   1. drop a stray "Unnamed: 0" column if one is present,
#   2. remove fully identical rows,
#   3. remove rows missing the file name or the label,
#   4. store the label as an integer.
main_df = (
    main_df
    .drop(columns="Unnamed: 0", errors="ignore")
    .drop_duplicates()
    .dropna(subset=["image_name", "blocked"])
    .assign(blocked=lambda frame: frame["blocked"].astype(int))
)

# Class balance after cleaning, followed by the final table shape.
blocked_counts = main_df[["blocked"]].value_counts().reset_index(name="count")
display(blocked_counts)
print(main_df.shape)

Unnamed: 0,blocked,count
0,1,22727
1,0,5716


(28443, 2)


# Сохранение

In [20]:
# parents=True so the save also works when "../data" does not exist yet;
# mkdir(exist_ok=True) alone raises FileNotFoundError for a missing parent.
preprocessed_path.parent.mkdir(parents=True, exist_ok=True)
# index=False: the row index carries no information, and writing it would
# come back as an "Unnamed: 0" column on the next read.
main_df.to_csv(preprocessed_path, index=False)