# Settings

In [1]:
# Hugging Face Hub repository id that the finished dataset is pushed to.
HUGGING_FACE_DATASET_NAME = 'Vampyrian/buitine_technika'
# Per-category image-count threshold used when filtering out rare categories.
MIN_COUNT = 5
# Maximum number of rows (images) kept per category.
LIMIT_COUNT = 500
# Images wider than this many pixels are downscaled to this width before saving.
MAX_IMAGE_WIDTH = 300

# Update parquet from DB

In [2]:
import mysql.connector
import dotenv
import os
import pandas as pd

In [3]:
# Load variables from the .env file one directory above the working directory.
# override=True lets values from the file replace variables that are already
# set in the process environment.
dotenv.load_dotenv(dotenv_path="../.env", override=True)

True

In [4]:
# Open the MySQL connection used to export the training data. Every setting is
# read from the environment.
conn = None
try:
    conn = mysql.connector.connect(
        host=os.getenv("MYSQL_HOST"),
        user=os.getenv("MYSQL_USER"),
        password=os.getenv("MYSQL_PASSWORD"),
        # Environment values are strings (or None when unset): convert to int
        # explicitly and fall back to the standard MySQL port.
        port=int(os.getenv("MYSQL_PORT") or 3306),
        database=os.getenv("MYSQL_DATABASE"),
    )
    print("Database connection successful!")
except mysql.connector.Error as e:
    print(e)
    # Stop here: the following cells need a live connection, and continuing
    # with conn = None only produces a less helpful error further down.
    raise

Database connection successful!


In [5]:
# Export every confirmed training row that has an image path, together with the
# id and name of its category. No columns are selected from `listings`; as an
# inner join it only restricts the result to rows that have a matching listing.
# To export a subset of categories, append a range condition to the WHERE
# clause, e.g. `AND categories.id BETWEEN 1240 AND 1329`.
sql = """
SELECT category_training_data.image_path, categories.id AS category_id, categories.name AS category_name
FROM category_training_data
JOIN listings ON category_training_data.listing_id = listings.id
JOIN categories ON category_training_data.category_id = categories.id
WHERE category_training_data.is_confirmed_category = 1 and category_training_data.image_path is not null;
"""

try:
    data = pd.read_sql(sql, conn)
finally:
    # This is the only query the notebook runs, so release the connection
    # instead of keeping it open for the rest of the (long) session.
    # Re-run the connection cell before executing this cell again.
    if conn is not None:
        conn.close()

  data = pd.read_sql(sql, conn)


In [6]:
# Folder that receives the exported parquet file.
image_folder = "../image"
# exist_ok=True creates the folder only when it is missing, without a separate
# exists() check that another process could invalidate in between.
os.makedirs(image_folder, exist_ok=True)

# Destination of the parquet export of the query result.
path = os.path.join(image_folder, "images_url.parquet")

In [7]:
# Persist the query result as a parquet file at `path`; index=False leaves the
# DataFrame's row index out of the file.
data.to_parquet(path, index=False)

# Preprocess data

In [8]:
# Build one label per row in the form "<category_id>_<category_name>".
data['category'] = data['category_id'].astype(str).str.cat(data['category_name'], sep='_')

In [9]:
# The id and name are now combined in `category`, so drop the source columns.
# Rebinding instead of inplace=True, together with errors='ignore', keeps the
# cell re-runnable: a second execution is a no-op rather than a KeyError.
data = data.drop(columns=['category_id', 'category_name'], errors='ignore')

# Filter out categories that have fewer than a minimum number of images

In [10]:
data

Unnamed: 0,image_path,category
0,training_data/f9188f66-3701-4ae7-a69e-25050f9c...,3_Laisvų rankų įranga
1,training_data/b3187f7a-ef16-4c45-9738-6d6345f8...,3_Laisvų rankų įranga
2,training_data/388503a8-5452-47ce-80f8-50bc311b...,3_Laisvų rankų įranga
3,training_data/0526ea1f-c62d-4c81-a106-d44dd4df...,3_Laisvų rankų įranga
4,training_data/56aa6e74-8db5-46ff-af99-8c8c67e6...,3_Laisvų rankų įranga
...,...,...
324694,training_data/adbdc39e-7a1b-494a-9669-ef405ac3...,12_Ekrano plėvelės
324695,training_data/fc2d4c5b-31d2-4452-b564-472e40bd...,4_Dėklai telefonams
324696,training_data/ef574785-1d1b-4584-9156-f94ea8d9...,4_Dėklai telefonams
324697,training_data/a270c431-44dc-48d5-a1b7-e40fff2d...,4_Dėklai telefonams


In [11]:
# Number of rows (images) available for each category label
value_counts = data['category'].value_counts()

# Categories that reach the minimum are kept. The comparison is inclusive:
# MIN_COUNT is the smallest acceptable count, so a category with exactly
# MIN_COUNT images must not be discarded (a strict `>` would drop it).
categories_to_keep = value_counts[value_counts >= MIN_COUNT].index

# Keep only the rows whose category passed the threshold
filtered_data = data[data['category'].isin(categories_to_keep)]

In [12]:
filtered_data

Unnamed: 0,image_path,category
0,training_data/f9188f66-3701-4ae7-a69e-25050f9c...,3_Laisvų rankų įranga
1,training_data/b3187f7a-ef16-4c45-9738-6d6345f8...,3_Laisvų rankų įranga
2,training_data/388503a8-5452-47ce-80f8-50bc311b...,3_Laisvų rankų įranga
3,training_data/0526ea1f-c62d-4c81-a106-d44dd4df...,3_Laisvų rankų įranga
4,training_data/56aa6e74-8db5-46ff-af99-8c8c67e6...,3_Laisvų rankų įranga
...,...,...
324694,training_data/adbdc39e-7a1b-494a-9669-ef405ac3...,12_Ekrano plėvelės
324695,training_data/fc2d4c5b-31d2-4452-b564-472e40bd...,4_Dėklai telefonams
324696,training_data/ef574785-1d1b-4584-9156-f94ea8d9...,4_Dėklai telefonams
324697,training_data/a270c431-44dc-48d5-a1b7-e40fff2d...,4_Dėklai telefonams


# Limit images per category

In [13]:
# Keep at most LIMIT_COUNT rows per category. groupby(...).head() takes the
# first rows of each group and returns them as a subset of the original frame,
# with the original index and row order.
limited_data = filtered_data.groupby('category').head(LIMIT_COUNT)

In [14]:
limited_data

Unnamed: 0,image_path,category
0,training_data/f9188f66-3701-4ae7-a69e-25050f9c...,3_Laisvų rankų įranga
1,training_data/b3187f7a-ef16-4c45-9738-6d6345f8...,3_Laisvų rankų įranga
2,training_data/388503a8-5452-47ce-80f8-50bc311b...,3_Laisvų rankų įranga
3,training_data/0526ea1f-c62d-4c81-a106-d44dd4df...,3_Laisvų rankų įranga
4,training_data/56aa6e74-8db5-46ff-af99-8c8c67e6...,3_Laisvų rankų įranga
...,...,...
324694,training_data/adbdc39e-7a1b-494a-9669-ef405ac3...,12_Ekrano plėvelės
324695,training_data/fc2d4c5b-31d2-4452-b564-472e40bd...,4_Dėklai telefonams
324696,training_data/ef574785-1d1b-4584-9156-f94ea8d9...,4_Dėklai telefonams
324697,training_data/a270c431-44dc-48d5-a1b7-e40fff2d...,4_Dėklai telefonams


# Download images from S3 to disk

In [21]:
import requests
from tqdm import tqdm  # For visualizing progress
from PIL import Image, ImageFile
from PIL.Image import DecompressionBombError
from io import BytesIO

# Let Pillow load image files whose data is truncated instead of raising an error.
ImageFile.LOAD_TRUNCATED_IMAGES = True

In [22]:
# Base URL that each row's image_path is appended to when downloading.
# os.getenv returns None when S3_URL is not set.
host_url = os.getenv("S3_URL")

In [None]:
# Root folder for the downloaded images: one sub-folder per category.
output_folder = "../image/downloaded_images"
os.makedirs(output_folder, exist_ok=True)

# Fail early with a clear message instead of a TypeError on the first
# `host_url + image_path` concatenation.
if not host_url:
    raise RuntimeError("S3_URL is not set; cannot build image download URLs")


def _prepare_for_jpeg(image, max_width):
    """Return a version of `image` that can be written as a JPEG file.

    Args:
        image: an opened PIL image.
        max_width: images wider than this many pixels are scaled down to this
            width with the aspect ratio preserved; narrower images keep
            their size.

    Returns:
        A PIL image in mode "RGB" or "L" that is at most `max_width` wide.
    """
    # JPEG has no palette or alpha channel, so modes such as P or LA cannot be
    # saved; converting only RGBA left those images failing at save time.
    # Everything except RGB and grayscale is normalised to RGB, and this is
    # done before resizing so the interpolation runs on real colour values.
    if image.mode not in ("RGB", "L"):
        image = image.convert("RGB")

    width, height = image.size
    if width > max_width:
        # Clamp to 1 px: for extremely wide images the scaled height truncates
        # to 0, and resize() then raises a ValueError that none of the handlers
        # in the download loop catches.
        new_height = max(1, int(max_width * (height / width)))
        image = image.resize((max_width, new_height))
    return image


# A single session for the whole run lets HTTP connections be reused instead of
# opening a new one for every image.
with requests.Session() as session:
    for index, row in tqdm(limited_data.iterrows(), total=limited_data.shape[0]):
        image_url = host_url + row['image_path']
        category = row['category']  # Use category for organization

        category_folder = os.path.join(output_folder, category)
        os.makedirs(category_folder, exist_ok=True)

        # The file is named after the row's index label.
        image_path = os.path.join(category_folder, f"{index}.jpg")

        try:
            response = session.get(image_url, timeout=10)
            if response.status_code != 200:
                print(f"Failed to download {image_url}, from category {category}: Status code {response.status_code}")
                continue

            with Image.open(BytesIO(response.content)) as image:
                _prepare_for_jpeg(image, MAX_IMAGE_WIDTH).save(image_path, format="JPEG")
        except requests.RequestException as e:
            print(f"Error downloading {image_url} from category {category}: {e}")
        except DecompressionBombError as e:
            print(f"Decompress bomb error while processing the image {image_url} from category {category}: {e}")
        except OSError as e:
            print(f"Error while processing the image {image_url} from category {category}: {e}")

  0%|          | 17/323953 [00:04<19:47:25,  4.55it/s]

Failed to download https://kainoteka-public.s3.eu-central-1.amazonaws.com/training_data/2882947f-83cf-4eed-aa40-c47d34ece61d, from category 3_Laisvų rankų įranga: Status code 403


  0%|          | 26/323953 [00:06<18:40:13,  4.82it/s]

Failed to download https://kainoteka-public.s3.eu-central-1.amazonaws.com/training_data/ccfc5b4e-81c0-43f6-9ed4-904cbb687962, from category 3_Laisvų rankų įranga: Status code 403


  0%|          | 29/323953 [00:06<17:12:50,  5.23it/s]

Failed to download https://kainoteka-public.s3.eu-central-1.amazonaws.com/training_data/d1feb352-4103-4d8b-8db7-5ee01da31f15, from category 3_Laisvų rankų įranga: Status code 403


  0%|          | 33/323953 [00:07<18:03:27,  4.98it/s]

Failed to download https://kainoteka-public.s3.eu-central-1.amazonaws.com/training_data/364da44e-53f6-4351-bcbb-22352f67473b, from category 3_Laisvų rankų įranga: Status code 403


  0%|          | 36/323953 [00:08<16:37:56,  5.41it/s]

Failed to download https://kainoteka-public.s3.eu-central-1.amazonaws.com/training_data/d2c646ba-9852-46c1-9304-f26cf37fd5d1, from category 3_Laisvų rankų įranga: Status code 403
Failed to download https://kainoteka-public.s3.eu-central-1.amazonaws.com/training_data/57802bae-d5d9-46c0-a36a-933a810dd650, from category 3_Laisvų rankų įranga: Status code 403


  0%|          | 39/323953 [00:09<18:16:04,  4.93it/s]

Failed to download https://kainoteka-public.s3.eu-central-1.amazonaws.com/training_data/d218f9d5-e6d7-4983-9193-4680b1c3e6f8, from category 3_Laisvų rankų įranga: Status code 403


  0%|          | 59/323953 [00:13<17:42:29,  5.08it/s]

Failed to download https://kainoteka-public.s3.eu-central-1.amazonaws.com/training_data/c96a944a-2d5c-4d03-a9f9-9b236330f59e, from category 3_Laisvų rankų įranga: Status code 403
Failed to download https://kainoteka-public.s3.eu-central-1.amazonaws.com/training_data/81c688ea-70eb-494d-babf-8c3552fb8ee4, from category 3_Laisvų rankų įranga: Status code 403


  0%|          | 61/323953 [00:13<19:04:31,  4.72it/s]

# Create dataset on Hugging Face

In [26]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [27]:
# Build a dataset from the downloaded folder tree with the "imagefolder"
# loader; the result has `image` and `label` features (see the output below).
# NOTE(review): labels presumably come from the category sub-folder names — confirm.
dataset = load_dataset("imagefolder", data_dir=output_folder)

Downloading data: 100%|██████████| 6693/6693 [00:00<00:00, 1445916.90files/s]
Generating train split: 6693 examples [00:00, 33574.58 examples/s]


In [28]:
dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 6693
    })
})

In [29]:
# Upload the dataset to the Hugging Face Hub repository named by
# HUGGING_FACE_DATASET_NAME.
# NOTE(review): presumably requires Hugging Face credentials to be configured
# beforehand — confirm.
dataset.push_to_hub(HUGGING_FACE_DATASET_NAME)

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]
Map:   0%|          | 0/3347 [00:00<?, ? examples/s][A
Map:  21%|██        | 700/3347 [00:00<00:00, 5432.46 examples/s][A
Map:  42%|████▏     | 1400/3347 [00:00<00:00, 5608.23 examples/s][A
Map:  63%|██████▎   | 2100/3347 [00:00<00:00, 5602.44 examples/s][A
Map: 100%|██████████| 3347/3347 [00:00<00:00, 5382.52 examples/s][A

Creating parquet from Arrow format: 100%|██████████| 34/34 [00:00<00:00, 352.13ba/s]
Uploading the dataset shards:  50%|█████     | 1/2 [00:17<00:17, 17.79s/it]
Map:   0%|          | 0/3346 [00:00<?, ? examples/s][A
Map:  18%|█▊        | 600/3346 [00:00<00:00, 4307.23 examples/s][A
Map:  36%|███▌      | 1200/3346 [00:00<00:00, 4803.15 examples/s][A
Map:  57%|█████▋    | 1900/3346 [00:00<00:00, 5321.10 examples/s][A
Map:  75%|███████▍  | 2500/3346 [00:00<00:00, 5387.14 examples/s][A
Map: 100%|██████████| 3346/3346 [00:00<00:00, 5067.67 examples/s][A

Creating parquet from Arrow format:   

CommitInfo(commit_url='https://huggingface.co/datasets/Vampyrian/buitine_technika/commit/4913a0dc3e4611bb77dbd9c2824edaeef4f4e127', commit_message='Upload dataset', commit_description='', oid='4913a0dc3e4611bb77dbd9c2824edaeef4f4e127', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Vampyrian/buitine_technika', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Vampyrian/buitine_technika'), pr_revision=None, pr_num=None)