**Imports**

In [None]:
# Standard library imports
import os
import json
import glob
import random

import pandas as pd  # Data manipulation
from tqdm import tqdm  # Progress bars

**Function: retrieve_english_value**

In [None]:
def retrieve_english_value(data_field):
    """Retrieve English text from a field with language identifiers

    Args:
        data_field: Either a plain value, or a list of dicts shaped like
            ``{'language_tag': 'en_US', 'value': 'Chair'}`` where the
            ``language_tag`` key may be absent.

    Returns:
        - ``None`` when ``data_field`` is falsy (missing, empty list, '').
        - ``data_field`` itself when it is not a list.
        - Otherwise the ``value`` of the first entry whose ``language_tag``
          starts with ``'en'``, wherever it sits in the list.
        - If there is no English entry, the ``value`` of the first entry
          that carries no language tag at all.
        - ``None`` when only non-English tagged entries are present.
    """
    if not data_field:
        return None  # No data provided (also covers the empty list)

    if not isinstance(data_field, list):
        return data_field  # Already a primitive value

    untagged_value = None
    found_untagged = False

    # Single pass over every entry: an English value may appear after
    # entries in other languages, so the first entry must not decide alone.
    for record in data_field:
        if not isinstance(record, dict) or 'value' not in record:
            continue

        tag = record.get('language_tag')
        # isinstance guard: a null/non-string tag must not raise here.
        if isinstance(tag, str) and tag.startswith('en'):
            return record['value']

        # Remember the first untagged value as a fallback
        if tag is None and not found_untagged:
            untagged_value = record['value']
            found_untagged = True

    return untagged_value

**Function: gather_english_keywords**

In [None]:
def gather_english_keywords(keyword_collection):
    """Gather and deduplicate English keywords into a string

    Args:
        keyword_collection: List of dicts with a ``value`` key and an
            optional ``language_tag`` key.

    Returns:
        A ``', '``-joined string of the stripped, lower-cased keywords whose
        language tag is absent/null or starts with ``'en'``, with duplicates
        removed and first-seen order kept. ``None`` when the input is not a
        non-empty list or no usable keyword remains.
    """
    if not keyword_collection or not isinstance(keyword_collection, list):
        return None

    # A dict keeps insertion order, so it deduplicates while preserving
    # the order in which keywords were first seen.
    unique_keywords = {}
    for item in keyword_collection:
        if not isinstance(item, dict):
            continue

        value = item.get('value')
        if not isinstance(value, str):
            continue  # Non-text values cannot be normalised; skip them

        tag = item.get('language_tag')
        # Keep entries with no tag; reject tagged entries that are not English
        if tag is not None and not (isinstance(tag, str) and tag.startswith('en')):
            continue

        keyword = value.strip().lower()
        if keyword:  # Drop whitespace-only keywords
            unique_keywords.setdefault(keyword, None)

    # Join to single comma-separated string
    return ', '.join(unique_keywords) if unique_keywords else None

**Main Function: create_curated_csv**

In [None]:
def create_curated_csv(source_dir, target_csv, image_data_csv,
                        limit_per_type=10000, filter_empty=False):
    """Create a curated and balanced CSV from metadata files

    Reads the image CSV, scans every ``listings_*.json`` file in
    ``source_dir`` line by line (one JSON object per line), keeps listings
    whose ``main_image_id`` appears in the image CSV, drops incomplete or
    non-ASCII records, caps each category at ``limit_per_type`` rows and
    writes the result to ``target_csv``.

    Args:
        source_dir (str): Directory with JSON metadata files
        target_csv (str): Destination path for the output CSV
        image_data_csv (str): Path to the image metadata CSV; must contain
            ``image_id`` and ``path`` columns
        limit_per_type (int): Max entries per product type
        filter_empty (bool): Remove entries with missing values

    Returns:
        dict: Counts describing the run, with keys ``processed_image_count``,
        ``metadata_found``, ``pre_balance_size``, ``post_balance_size`` and
        ``unique_categories``.
    """
    text_fields = ('title', 'category', 'shade', 'tags')
    # NOTE(review): 'tags' is not ASCII-checked, matching the previous
    # behaviour -- confirm whether that omission is intentional.
    ascii_fields = ('title', 'category', 'shade')
    output_columns = ['filepath', 'image_id', 'title', 'category', 'shade', 'tags']
    # Image IDs that are always excluded from the dataset
    excluded_image_ids = frozenset({'518Dk4FOzZL', '719hoe+OvIL', '71Qbh8wmhnL'})

    # Load image metadata
    print("Reading image data CSV...")
    img_dataframe = pd.read_csv(image_data_csv)
    # Only the path is needed per image, so map id -> path directly instead
    # of materialising a full row Series per image via iterrows().
    img_paths = dict(zip(img_dataframe['image_id'], img_dataframe['path']))
    print(f"Loaded {len(img_dataframe)} image records")

    # Parse JSON files and extract curated fields
    product_details = {}
    skipped_entries = 0
    json_list = sorted(glob.glob(os.path.join(source_dir, "listings_*.json")))
    print(f"Discovered {len(json_list)} JSON files for processing")

    for json_path in tqdm(json_list, desc="Parsing JSON files"):
        with open(json_path, 'r', encoding='utf-8') as file:
            for entry in file:
                entry = entry.strip()
                if not entry:
                    continue  # Blank line, nothing to parse
                try:
                    item_info = json.loads(entry)
                    key_image_id = item_info.get("main_image_id")
                    # Only include if image exists
                    if not key_image_id or key_image_id not in img_paths:
                        continue

                    product_details[key_image_id] = {
                        'title': retrieve_english_value(item_info.get('item_name')),
                        'category': retrieve_english_value(item_info.get('product_type')),
                        'shade': retrieve_english_value(item_info.get('color')),
                        'tags': gather_english_keywords(item_info.get('item_keywords')),
                    }
                except (ValueError, AttributeError, TypeError, KeyError):
                    # Best effort: skip malformed entries (invalid JSON is a
                    # ValueError subclass; the rest cover unexpected shapes)
                    # but keep count so data loss is visible.
                    skipped_entries += 1

    print(f"Retrieved metadata for {len(product_details)} images")
    if skipped_entries:
        print(f"Skipped {skipped_entries} malformed entries")

    # Build initial records by joining image info with metadata
    curated_records = []
    for img_id, img_path in tqdm(img_paths.items(), desc="Building initial dataset"):
        details = product_details.get(img_id)
        # Skip images without metadata and blacklisted IDs
        if details is None or img_id in excluded_image_ids:
            continue
        # Skip incomplete records
        if not all(details.get(field) for field in text_fields):
            continue
        # Ensure ASCII-only text
        if not all(isinstance(details[field], str) and details[field].isascii()
                   for field in ascii_fields):
            continue

        curated_records.append({
            'filepath': img_path,
            'image_id': img_id,
            'title': details['title'].lower(),
            'category': details['category'].lower(),
            'shade': details['shade'].lower(),
            'tags': details['tags'].lower(),
        })

    # Convert to DataFrame and optional filtering. Explicit columns keep the
    # frame usable (sortable, groupable) even when no record qualified.
    base_dataframe = pd.DataFrame(curated_records, columns=output_columns)
    if filter_empty:
        base_dataframe = base_dataframe.replace('', pd.NA).dropna()
    base_dataframe = base_dataframe.sort_values('filepath').reset_index(drop=True)

    print(f"Dataset size before balancing: {len(base_dataframe)}")
    print("\nTop 10 categories before balancing:")
    print(base_dataframe['category'].value_counts().head(10))

    # Balance categories by sampling (fixed seed keeps the sample reproducible)
    balanced_collection = []
    for _, type_group in tqdm(base_dataframe.groupby('category'), desc="Balancing categories"):
        if len(type_group) > limit_per_type:
            type_group = type_group.sample(limit_per_type, random_state=42)
        balanced_collection.append(type_group)

    if balanced_collection:
        final_dataframe = pd.concat(balanced_collection)
    else:
        # pd.concat raises on an empty list; fall back to the empty frame
        final_dataframe = base_dataframe.copy()
    final_dataframe = final_dataframe.sort_values('filepath').reset_index(drop=True)
    final_dataframe.to_csv(target_csv, index=False)

    # Output summary
    print("\nTop 10 categories after balancing:")
    print(final_dataframe['category'].value_counts().head(10))
    print(f"\nData saved to: {target_csv}")

    return {
        'processed_image_count': len(img_dataframe),
        'metadata_found': len(product_details),
        'pre_balance_size': len(base_dataframe),
        'post_balance_size': len(final_dataframe),
        'unique_categories': final_dataframe['category'].nunique()
    }

**Helper Function: run_processing**

In [None]:
def run_processing():
    """Set up the Kaggle input/output paths, build the balanced CSV and print a summary"""
    # Inputs: listing metadata directory and the image index CSV
    metadata_location = "/kaggle/input/vrproject2/abo-listings/listings/metadata"
    img_source_file = "/kaggle/input/vrproject2/abo-images-small/images/metadata/images.csv"
    # Output: the balanced dataset CSV
    output_file = "/kaggle/working/abo-images-small/images/small/balanced_dataset.csv"

    type_cap = 11000  # Maximum samples per category
    remove_blanks = True  # Drop incomplete records

    # The destination folder must exist before the CSV is written
    destination_dir = os.path.dirname(output_file)
    os.makedirs(destination_dir, exist_ok=True)

    # Run the pipeline and keep its statistics
    results = create_curated_csv(
        source_dir=metadata_location,
        target_csv=output_file,
        image_data_csv=img_source_file,
        limit_per_type=type_cap,
        filter_empty=remove_blanks,
    )

    # Report the statistics
    print("\nSummary of Processing:")
    summary_lines = [
        f"Images processed: {results['processed_image_count']}",
        f"Images with metadata: {results['metadata_found']}",
        f"Dataset size pre-balance: {results['pre_balance_size']}",
        f"Dataset size post-balance: {results['post_balance_size']}",
        f"Unique category count: {results['unique_categories']}",
    ]
    for line in summary_lines:
        print(line)

**Main Execution**

In [None]:
# Entry point: run the pipeline only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    run_processing()

In [None]:
# !cd /kaggle/working
# !ls

In [None]:
# !zip -r file.zip /kaggle/working
# !ls