# Settings

In [None]:
# Threshold on the number of images per category, used when rare categories
# are filtered out of the dataframe.
MIN_COUNT = 5
# Upper bound on the number of rows (images) kept per category.
LIMIT_COUNT = 500
# Images wider than this many pixels are downscaled to this width before saving.
MAX_IMAGE_WIDTH = 224
# Root folder for the downloaded images (one sub-folder per category); the same
# folder is later loaded as an "imagefolder" dataset.
OUTPUT_FOLDER = "../image/downloaded_images"

In [None]:
# Hugging Face dataset repo id -> ID of the root category whose whole subtree
# is exported into that dataset. Keeping the pairs in one mapping (instead of
# commenting blocks in and out) guarantees a dataset name can never be combined
# with another dataset's category ID.
# NOTE(review): 'baldai-ir-namu-intejeras' looks misspelled but is presumably
# the published repo id -- do not "fix" it here without renaming the repo.
DATASET_PARENT_CATEGORY_IDS = {
    'Vampyrian/telefonai': 1,
    'Vampyrian/kompiuterine-technika': 16,
    'Vampyrian/vaizdo-ir-garso-technika': 107,
    'Vampyrian/foto-ir-video': 166,
    'Vampyrian/sodui-ir-namams': 181,
    'Vampyrian/automobiliu-prekes': 814,
    'Vampyrian/buitine-technika-ir-elektronika': 1240,
    'Vampyrian/grozis-ir-sveikata': 1327,
    'Vampyrian/vaiku-prekes': 1625,
    'Vampyrian/sportas-laisvalaikis': 1719,
    'Vampyrian/biuro-ir-kanceliarines-prekes': 2470,
    'Vampyrian/apranga-ir-avalyne': 2568,
    'Vampyrian/baldai-ir-namu-intejeras': 2781,
    'Vampyrian/gyvunu-prekes': 2936,
    'Vampyrian/maistas-ir-gerimai': 2995,
    'Vampyrian/statybines-prekes': 3494,
    'Vampyrian/erotines-prekes': 3602,
}

# Change only this line to build a different dataset. The name is the repo the
# finished dataset is pushed to; the matching category ID is looked up so the
# two always stay in sync (a typo raises KeyError instead of exporting the
# wrong category tree).
DATASET_NAME = 'Vampyrian/erotines-prekes'
PARENT_CATEGORY_ID = DATASET_PARENT_CATEGORY_IDS[DATASET_NAME]


# Update parquet from DB

In [None]:
from sqlalchemy import create_engine
import dotenv
import os
import pandas as pd

In [None]:
# Load the variables defined in ../.env into the process environment;
# override=True makes the file's values replace any that are already set.
dotenv.load_dotenv(dotenv_path="../.env", override=True)

In [None]:
# Assemble the SQLAlchemy URL for the MySQL database (pymysql driver) from the
# MYSQL_* environment variables, then create the engine from it.
# NOTE(review): the values are inserted verbatim, so a password containing
# characters such as '@', ':' or '/' would need to be URL-encoded -- confirm.
_connection_url = (
    f"mysql+pymysql://{os.getenv('MYSQL_USER')}:{os.getenv('MYSQL_PASSWORD')}"
    f"@{os.getenv('MYSQL_HOST')}:{os.getenv('MYSQL_PORT')}/{os.getenv('MYSQL_DATABASE')}"
)
engine = create_engine(_connection_url)

# Open one connection and release it right away so that bad credentials or an
# unreachable host surface here instead of in the later query.
with engine.connect() as connection:
    print("Database connection successful!")

In [None]:
# Select the image path plus category id/name of every training row that
#   * has a confirmed category (is_confirmed_category = 1),
#   * has a non-NULL image_path, and
#   * belongs to PARENT_CATEGORY_ID or any category below it -- the recursive
#     CTE starts at that id and repeatedly follows categories.category_id
#     (the parent link) down the tree.
# The inner join on listings also drops rows whose listing_id has no match.
# NOTE(review): PARENT_CATEGORY_ID is interpolated directly into the SQL text;
# acceptable for a notebook constant, but it must remain a plain integer.
query = f"""
SELECT category_training_data.image_path, categories.id AS category_id, categories.name AS category_name
FROM category_training_data
JOIN listings ON category_training_data.listing_id = listings.id
JOIN categories ON category_training_data.category_id = categories.id
WHERE category_training_data.is_confirmed_category = 1
AND category_training_data.image_path IS NOT NULL
AND category_training_data.category_id IN (
    WITH RECURSIVE category_tree AS (
    SELECT id, name, category_id
    FROM categories
    WHERE id = {PARENT_CATEGORY_ID} -- The ID of the parent category
    UNION ALL
    SELECT c.id, c.name, c.category_id
    FROM categories c
    INNER JOIN category_tree ct ON c.category_id = ct.id)
    SELECT id FROM category_tree);
"""

In [None]:
# Run the query through the engine and load the whole result into a DataFrame
# with the columns image_path, category_id and category_name.
data = pd.read_sql(query, engine)

In [None]:
# Preview the raw query result.
data

In [None]:
# Folder that receives the parquet snapshot of the query result.
image_folder = "../image"
# exist_ok=True replaces the separate "exists?" check: it is a no-op when the
# folder is already there and cannot fail if the folder appears between a
# check and the create call.
os.makedirs(image_folder, exist_ok=True)

# Full path of the parquet file the DataFrame is written to.
path = os.path.join(image_folder, "images_url.parquet")

In [None]:
# Persist the query result as parquet; the DataFrame index is not written.
data.to_parquet(path, index=False)

# Preprocess data

In [None]:
# Build one label per row of the form "<category_id>_<category_name>"; this
# label is later used as the name of the per-category image folder.
data['category'] = data['category_id'].astype(str) + '_' + data['category_name']

In [None]:
# The id and name are now folded into 'category', so drop the source columns
# (in place -- `data` itself is modified).
data.drop(columns=['category_id', 'category_name'], inplace=True)

# Filter out categories that have fewer than a minimum number of images

In [None]:
# Preview the frame after preprocessing (image_path and category columns).
data

In [None]:
# Count how many rows (images) each category has
value_counts = data['category'].value_counts()

# Categories with at least MIN_COUNT images are kept. MIN_COUNT is the minimum
# allowed count, so a category with exactly MIN_COUNT images must survive;
# only categories with fewer images are dropped.
categories_to_keep = value_counts[value_counts >= MIN_COUNT].index

# Filter the DataFrame to keep only rows that belong to a kept category
filtered_data = data[data['category'].isin(categories_to_keep)]

In [None]:
# Preview the rows that remain after removing rare categories.
filtered_data

# Limit images per category

In [None]:
# Keep at most LIMIT_COUNT rows per category. head() takes the first rows of
# each group in their existing order -- it is not a random sample -- and the
# result keeps the original row order and index labels.
limited_data = filtered_data.groupby('category').head(LIMIT_COUNT)

In [None]:
# Preview the per-category limited rows.
limited_data

In [None]:
# Order the rows by category label so each category's rows are contiguous;
# the index labels of the rows are unchanged.
sorted_limited_data = limited_data.sort_values(by='category')

In [None]:
# Preview the rows in category order.
sorted_limited_data

# Download the images listed in the parquet to disk

In [None]:
import requests
from tqdm import tqdm  # For visualizing progress
from PIL import Image, ImageFile
from PIL.Image import DecompressionBombError
from io import BytesIO

# Tell Pillow to load images whose file data is cut short instead of raising,
# so incomplete downloads/uploads are still decoded as far as possible.
ImageFile.LOAD_TRUNCATED_IMAGES = True

In [None]:
# Base URL that is prepended to each row's image_path to form the download
# URL. Read from the S3_URL environment variable; os.getenv returns None when
# the variable is not set.
host_url = os.getenv("S3_URL")

In [None]:
def _fit_to_max_width(image, max_width):
    """Return *image* scaled down to *max_width*, keeping its aspect ratio.

    Images that are already narrow enough are returned unchanged.
    """
    original_width, original_height = image.size
    if original_width <= max_width:
        return image

    aspect_ratio = original_height / original_width
    # Clamp to 1px: int() yields 0 for extremely wide, flat images and
    # resize() rejects a zero height with an (uncaught) ValueError.
    new_height = max(1, int(max_width * aspect_ratio))
    return image.resize((max_width, new_height))


def _fetch_and_save_image(image_url, image_path, category):
    """Download one image, shrink it to MAX_IMAGE_WIDTH and store it as JPEG.

    Args:
        image_url: Full URL of the source image.
        image_path: Destination file path of the JPEG.
        category: Category label, used only in the log messages.

    Returns:
        True when the file was written, False when the download or the image
        processing failed (the failure is printed and otherwise ignored so a
        single bad image does not stop the whole run).
    """
    try:
        response = requests.get(image_url, timeout=10)
        if response.status_code != 200:
            print(f"Failed to download {image_url}, from category {category}: Status code {response.status_code}")
            return False

        image = Image.open(BytesIO(response.content))

        # Convert before resizing: Pillow always uses nearest-neighbour
        # resampling for palette ("P") and 1-bit images, which gives visibly
        # worse thumbnails than resizing the RGB version. JPEG also cannot
        # store modes other than RGB / L (e.g. RGBA, P).
        if image.mode not in ("RGB", "L"):
            image = image.convert('RGB')

        resized_image = _fit_to_max_width(image, MAX_IMAGE_WIDTH)

        # Do not carry the source file's EXIF block over into the saved image.
        resized_image.info.pop("exif", None)

        # Write to a temporary name and rename afterwards, so a file that
        # exists under its final name is always complete. The resume check in
        # the loop below relies on that.
        partial_path = f"{image_path}.part"
        try:
            resized_image.save(partial_path, format="JPEG")
            os.replace(partial_path, image_path)
        finally:
            if os.path.exists(partial_path):
                os.remove(partial_path)
        return True
    except requests.RequestException as e:
        print(f"Error downloading {image_url} from category {category}: {e}")
    except DecompressionBombError as e:
        print(f"Decompress bomb error while processing the image {image_url} from category {category}: {e}")
    except OSError as e:
        print(f"Error while processing the image {image_url} from category {category}: {e}")
    return False


# Fail with a clear message instead of "NoneType + str" on the first row.
if not host_url:
    raise RuntimeError("S3_URL is not set; cannot build the image download URLs")

os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Iterate the category-sorted frame: it is the one the progress total is taken
# from, and it writes one category folder after the other.
# NOTE(review): a category name containing a path separator would create
# nested folders here -- verify against the data.
for index, row in tqdm(sorted_limited_data.iterrows(), total=sorted_limited_data.shape[0]):
    category = row['category']  # Use category for organization
    category_folder = os.path.join(OUTPUT_FOLDER, category)

    # The DataFrame index label is unique per row, so it serves as file name.
    image_path = os.path.join(category_folder, f"{index}.jpg")

    # Resume support: skip exactly the images that are already on disk. A
    # folder-level check would also skip the category that was only partly
    # downloaded when a previous run was interrupted.
    if os.path.exists(image_path):
        continue

    os.makedirs(category_folder, exist_ok=True)
    _fetch_and_save_image(host_url + row['image_path'], image_path, category)

# Create dataset on Hugging Face

In [None]:
from datasets import load_dataset

In [None]:
# Build a dataset from the downloaded images using the "imagefolder" loader,
# pointing it at the folder that contains one sub-folder per category.
dataset = load_dataset("imagefolder", data_dir=OUTPUT_FOLDER)

In [None]:
# Show the dataset summary before publishing.
dataset

In [None]:
# Publish the dataset to the Hugging Face Hub under the repo id DATASET_NAME.
dataset.push_to_hub(DATASET_NAME)