# Первичная настройка путей

In [6]:
from pathlib import Path


# All data locations hang off one shared root, relative to the notebook's
# working directory.
data_root = Path("..") / "data"

# Folder holding the raw dataset dumps.
raw_data_path = data_root / "raw"
# CSV with the (image_name, blocked) label table.
preprocessed_file = data_root / "preprocessed" / "image_only.csv"
# Folder that receives the downloaded image files.
images_dir = data_root / "images"

In [7]:
import pandas as pd
# Resume from the previously saved label table when it exists; otherwise start
# from an empty table that already carries the expected columns.
if preprocessed_file.exists():
    main_df = pd.read_csv(preprocessed_file)
else:
    main_df = pd.DataFrame(columns=["image_name", "blocked"])

# parents=True: on a fresh checkout the parent "../data" directory may not
# exist yet, and a plain mkdir() raises FileNotFoundError in that case.
images_dir.mkdir(parents=True, exist_ok=True)

# Загрузка данных

## hearmeneigh/e621-rising-v3-curated

In [None]:
from datasets import load_dataset
import csv
from tqdm import tqdm

# Upper bound on the number of rows copied from the streamed dataset.
max_records = 279296
raw_dataset_path = raw_data_path / "e621-rising-v3-curated.csv"

# open(..., "w") does not create missing parent directories, so make sure the
# raw-data folder exists before writing into it.
raw_data_path.mkdir(parents=True, exist_ok=True)

dataset = load_dataset("hearmeneigh/e621-rising-v3-curated", split="train", streaming=True)

# newline="" lets the csv module control line endings itself.
with open(raw_dataset_path, "w", encoding="utf-8", newline="") as f:
    # Using the progress bar as a context manager guarantees it is closed even
    # when the stream raises part-way through.
    with tqdm(total=max_records, desc="Сохранение записей", unit="row") as progress:
        writer = None
        for i, example in enumerate(dataset):
            if i >= max_records:
                break
            if writer is None:
                # The CSV header is taken from the keys of the first record.
                writer = csv.DictWriter(f, fieldnames=list(example.keys()))
                writer.writeheader()

            # Non-text values are written as their str() form; the `image`
            # column therefore ends up as an object repr, not as image data.
            writer.writerow(example)
            progress.update(1)

# Read back exactly the file written above instead of repeating its path.
temp_df = pd.read_csv(raw_dataset_path)
temp_df.head(5)

Сохранение записей: 100%|██████████| 279296/279296 [1:38:13<00:00, 63.24row/s]  

Unnamed: 0,source_id,source,image,tags,url,text,selector
0,3724100,e621,<PIL.JpegImagePlugin.JpegImageFile image mode=...,"['nude', 'erection', 'mismatched_animal_penis'...",https://static1.e621.net/data/9b/9b/9b9b3a2a14...,nude erection mismatched_animal_penis digital_...,tier-1
1,2323551,e621,<PIL.JpegImagePlugin.JpegImageFile image mode=...,"['brown_fur', 'mammal_humanoid', 'midriff', 't...",https://static1.e621.net/data/b6/73/b673f9bd71...,brown_fur mammal_humanoid midriff topwear solo...,tier-2
2,3858728,e621,<PIL.JpegImagePlugin.JpegImageFile image mode=...,"['favorites_below_1000', 'anthro', 'solo', 'wh...",https://static1.e621.net/data/4c/a6/4ca60e5ac9...,favorites_below_1000 anthro solo white_backgro...,tier-3
3,3972702,e621,<PIL.JpegImagePlugin.JpegImageFile image mode=...,"['open_mouth', 'eyebrows', 'white_body', 'favo...",https://static1.e621.net/data/93/98/9398b7dfb4...,open_mouth eyebrows white_body favorites_above...,tier-2
4,2535548,e621,<PIL.JpegImagePlugin.JpegImageFile image mode=...,"['restraints', 'tail', 'shackles', 'chair', 's...",https://static1.e621.net/data/9e/14/9e1452967b...,restraints tail shackles chair score_above_500...,tier-1


In [42]:
import requests
from PIL import Image
from io import BytesIO
from tqdm import tqdm

# NOTE(review): None switches off Pillow's decompression-bomb size check. The
# images below come from remote URLs, so consider a finite limit instead.
Image.MAX_IMAGE_PIXELS = None

max_items = 20000
processed_data = []
# Rows that could not be fetched, decoded or saved, kept for inspection
# instead of being dropped silently.
failed_downloads = []

# Fail loudly if this cell is re-run after `temp_df` was already replaced by
# the processed table at the bottom of the cell.
missing_columns = {"tags", "url"} - set(temp_df.columns)
if missing_columns:
    raise KeyError(f"temp_df is missing required columns: {sorted(missing_columns)}")

# Slice once so the progress-bar total matches the rows actually iterated,
# even when the frame holds fewer than max_items rows.
rows_to_fetch = temp_df[:max_items]

# One Session lets requests reuse connections across the downloads.
with requests.Session() as session:
    for index, row in tqdm(rows_to_fetch.iterrows(), total=len(rows_to_fetch)):
        try:
            # `tags` is read back from CSV as text, so this is a substring test.
            is_explicit = 1 if "rating_explicit" in row['tags'] else 0

            response = session.get(row['url'], timeout=3)
            # Surface HTTP error statuses directly rather than relying on the
            # image decoder to choke on an error page.
            response.raise_for_status()

            image = Image.open(BytesIO(response.content))
            if image.mode != 'RGB':
                image = image.convert('RGB')

            filename = f"e621v3{index}.jpg"
            image.save(Path(images_dir, filename))
        except Exception as e:
            # Best effort: one bad row must not abort a multi-hour run, but the
            # failure is recorded so it can be reviewed afterwards.
            failed_downloads.append({'url': row['url'], 'error': repr(e)})
            continue

        processed_data.append({
            'image_name': filename,
            'blocked': is_explicit
        })

print(f"Сохранено изображений: {len(processed_data)}, пропущено: {len(failed_downloads)}")

# Explicit columns keep the frame well-formed even when nothing was downloaded.
# NOTE: this rebinds `temp_df`, replacing the raw table read from the CSV.
temp_df = pd.DataFrame(processed_data, columns=["image_name", "blocked"])
# sample() raises when asked for more rows than exist, so cap the request.
display(temp_df.sample(min(5, len(temp_df))))
display(temp_df[["blocked"]].value_counts().reset_index(name="count"))

100%|██████████| 20000/20000 [11:12:36<00:00,  2.02s/it]  


Unnamed: 0,image_name,blocked
14771,e621v315594.jpg,1
16250,e621v317152.jpg,1
9316,e621v39831.jpg,1
17642,e621v318610.jpg,0
13920,e621v314707.jpg,1


Unnamed: 0,blocked,count
0,1,15145
1,0,3808


In [None]:
# Append the freshly labelled rows to the accumulated table.
# ignore_index=True renumbers the rows; without it both inputs keep their own
# 0..n labels and the combined frame ends up with duplicate index values.
main_df = pd.concat([main_df, temp_df], ignore_index=True)
display(main_df)

Unnamed: 0,image_name,blocked
0,e621v30.jpg,1
1,e621v31.jpg,1
2,e621v32.jpg,1
3,e621v33.jpg,0
4,e621v34.jpg,1
...,...,...
3548,e621v33742.jpg,1
3549,e621v33743.jpg,0
3550,e621v33744.jpg,1
3551,e621v33745.jpg,0


# Очистка данных

In [None]:
# Cleaning pipeline, in order:
#   1. drop the stray "Unnamed: 0" column if one is present,
#   2. remove fully duplicated rows,
#   3. discard rows missing either the image name or the label,
#   4. store the label as an integer.
main_df = (
    main_df
    .drop(columns="Unnamed: 0", errors="ignore")
    .drop_duplicates()
    .dropna(subset=["image_name", "blocked"])
    .astype({"blocked": int})
)

In [None]:
# Show how many rows fall into each `blocked` class, then the table dimensions.
blocked_counts = main_df[["blocked"]].value_counts().reset_index(name="count")
display(blocked_counts)
print(main_df.shape)

Unnamed: 0,blocked,count
0,1,2859
1,0,694


(3553, 2)


# Сохранение

In [None]:
# Make sure the destination folder exists, then write the label table without
# the index column.
output_dir = preprocessed_file.parent
output_dir.mkdir(exist_ok=True)
main_df.to_csv(path_or_buf=preprocessed_file, index=False)