In [1]:
import os, json, random, math, glob, re, shutil
import pandas as pd
from PIL import Image

### Clean the CSV and convert its rows to JSON label files

In [3]:
# Load the raw label table and replace missing cells with empty strings.
df = pd.read_csv(r'C:\Users\syune\OneDrive\Desktop\Wine_Data/new_labels.csv').fillna('')

In [4]:
# Keep only the columns that end up in the label files. The explicit .copy()
# makes df an independent frame rather than a slice of the loaded table, so the
# later column assignment (df["Type"] = ...) does not raise pandas'
# SettingWithCopyWarning.
df = df[["WineID", 'WineName', 'Type', 'Country', 'WineryName']].copy()
df

Unnamed: 0,WineID,WineName,Type,Country,WineryName
0,100001,Espumante Moscatel,Sparkling,Brazil,Casa Perini
1,100002,Ancellotta,Red,Brazil,Casa Perini
2,100003,Cabernet Sauvignon,Red,Brazil,Castellamare
3,100005,Maison de Ville Cabernet-Merlot,Red,Brazil,Aurora
4,100007,Do Lugar Moscatel Espumantes,Sparkling,Brazil,Dal Pizzol
...,...,...,...,...,...
1002,199408,Petite Arvine,White,Switzerland,Niklaus Wittwer
1003,199481,St. Laurent Reserve,Red,Austria,Hundsdorfer
1004,199533,Mariengarten Chardonnay,White,Austria,Muster-Gamlitz
1005,199885,Grüner Veltliner Federspiel Ried Kreuzberg,White,Austria,Josef Fischer


In [5]:
# Distinct values of the Type column, used to spot spelling variants.
unique_names = pd.unique(df["Type"])
unique_names

array(['Sparkling', 'Red', 'White', 'Dessert/Port', 'Dessert', 'Rosé'],
      dtype=object)

In [6]:
# Fold the variant spellings of wine types into a single label each.
type_aliases = {"Dessert/Port": "Dessert", "Rosé": "Rose"}
df["Type"] = df["Type"].replace(type_aliases)

In [7]:
# Switch to the field names used as keys in the JSON label files.
column_aliases = {"WineName": "Name", "WineryName": "Brand", "Country": "Region"}
df = df.rename(columns=column_aliases)

In [9]:
# Save the cleaned label table as a spreadsheet.
excel_path = r'C:\Users\syune\OneDrive\Desktop\Wine_Data/new_labels.xlsx'
df.to_excel(excel_path)

In [8]:
# Directory that receives one JSON label file per row of df.
output_json_dir = r'C:\Users\syune\OneDrive\Desktop\Wine_Data/new_dataset/labels'

In [21]:
# Write one JSON label file per row, named "<WineID>.json".
# Create the label directory first: open(..., 'w') does not create missing
# parent folders, so a fresh run would otherwise fail on the first row.
os.makedirs(output_json_dir, exist_ok=True)

for index, row in df.iterrows():
    image_id = str(row['WineID'])  # convert int to string
    # Write JSON
    data = {
        "Name": row.get("Name", ""),
        "Brand": row.get("Brand", ""),
        "Type": row.get("Type", ""),
        "Region": row.get("Region", "")
    }
    json_filename = image_id + ".json"
    json_path = os.path.join(output_json_dir, json_filename)
    # ensure_ascii=False keeps accented names readable in the file,
    # which is why the file is opened explicitly as UTF-8.
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

In [23]:
## Change names to avoid overlaps
def rename_and_move_files(image_folder, json_folder, output_image_folder, output_json_folder, offset=17000000):
    """Copy numbered ``.jpeg`` images and their JSON labels under shifted numeric names.

    Every file in ``image_folder`` whose full name is ``<digits>.jpeg`` is copied to
    ``output_image_folder`` as ``<digits + offset>.jpeg``. If a label with the same
    stem exists in ``json_folder`` it is copied to ``output_json_folder`` under the
    matching new name. Despite the function's name, sources are copied, not moved.

    Args:
        image_folder: Directory scanned for ``<digits>.jpeg`` files.
        json_folder: Directory holding the ``<digits>.json`` labels.
        output_image_folder: Destination for renamed images (created if missing).
        output_json_folder: Destination for renamed labels (created if missing).
        offset: Integer added to each numeric file name.

    Raises:
        FileNotFoundError: If ``image_folder`` does not exist.
    """
    os.makedirs(output_image_folder, exist_ok=True)
    os.makedirs(output_json_folder, exist_ok=True)
    for filename in os.listdir(image_folder):
        # fullmatch: re.match only anchors the start, so names such as
        # "12.jpeg.bak" used to slip through and were copied as images.
        match = re.fullmatch(r"(\d+)\.jpeg", filename)
        if not match:
            continue
        stem = match.group(1)
        old_number = int(stem)
        new_number = old_number + offset
        new_name = f"{new_number}.jpeg"
        # Copy and rename image
        src_image = os.path.join(image_folder, filename)
        dst_image = os.path.join(output_image_folder, new_name)
        shutil.copyfile(src_image, dst_image)
        # Copy and rename JSON (if exists). Look the label up by the literal
        # stem first so zero-padded names ("007.jpeg" -> "007.json") are found;
        # the int-normalised name is kept as a fallback for compatibility.
        new_json = os.path.join(output_json_folder, f"{new_number}.json")
        for json_stem in dict.fromkeys((stem, str(old_number))):
            old_json = os.path.join(json_folder, f"{json_stem}.json")
            if os.path.exists(old_json):
                shutil.copyfile(old_json, new_json)
                break
# Shift every file number by the offset so the new files cannot clash with
# existing names, copying images and labels into the cleaned dataset.
copy_job = {
    "image_folder": r'C:\Users\syune\OneDrive\Desktop\Wine_Data/new_dataset/images',
    "json_folder": r'C:\Users\syune\OneDrive\Desktop\Wine_Data/new_dataset/labels',
    "output_image_folder": r'C:\Users\syune\OneDrive\Desktop\Wine_Data/cleaned_dataset/images',
    "output_json_folder": r'C:\Users\syune\OneDrive\Desktop\Wine_Data/cleaned_dataset/labels',
    "offset": 17000000,
}
rename_and_move_files(**copy_job)

### Group data into folders by wine type

In [None]:
labels_dir = r"C:\Users\syune\OneDrive\Desktop/Wine_Data/cleaned_dataset/labels"   # folder with JSON files
images_dir = r"C:\Users\syune\OneDrive\Desktop/Wine_Data/cleaned_dataset/images"   # folder with images
output_base = r"C:\Users\syune\OneDrive\Desktop\Wine_Data/Data"

# Make sure output folders exist
os.makedirs(output_base, exist_ok=True)
# Move every JSON label (and its image, if one is found) into
# <output_base>/<wine type>/labels and <output_base>/<wine type>/images.
for json_file in os.listdir(labels_dir):
    if not json_file.lower().endswith(".json"):
        continue
    json_path = os.path.join(labels_dir, json_file)
    # Read JSON file
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    # Get the wine type. A missing key, a JSON null and an empty or blank
    # string all fall back to "Unknown": an empty type used to produce the
    # segment "/labels", which makes os.path.join escape output_base.
    wine_type = str(data.get("Type") or "Unknown").strip() or "Unknown"

    # Normalize folder name; path separators inside the type (e.g.
    # "Dessert/Port") are flattened so they cannot create nested folders.
    type_folder_name = (
        wine_type.lower().replace(" ", "_").replace("/", "_").replace("\\", "_")
    )
    # Create destination folders
    labels_dest_dir = os.path.join(output_base, type_folder_name, "labels")
    images_dest_dir = os.path.join(output_base, type_folder_name, "images")
    os.makedirs(labels_dest_dir, exist_ok=True)
    os.makedirs(images_dest_dir, exist_ok=True)
    # Move JSON
    shutil.move(json_path, os.path.join(labels_dest_dir, json_file))
    # Move corresponding image (assuming same filename but different extension);
    # only the first extension that exists is moved.
    image_name_base = os.path.splitext(json_file)[0]
    for ext in [".jpg", ".jpeg", ".png", ".webp"]:
        image_path = os.path.join(images_dir, image_name_base + ext)
        if os.path.exists(image_path):
            shutil.move(image_path, os.path.join(images_dest_dir, image_name_base + ext))
            break

### Clean Data

In [2]:
base_dir = r"C:\Users\syune\OneDrive\Desktop\Wine_Data/Data"

# Images below this size are deleted by the loop below.
MIN_IMAGE_BYTES = 7 * 1024  # 7 KB

# get all folder names inside Data
folders = [f for f in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, f))]

# now build full image/json dirs
image_dirs = [os.path.join(base_dir, f, "images") for f in folders]
# Label files live in "labels" in every other cell of this notebook; the
# previous "jsons" sub-folder name did not match that layout.
json_dirs = [os.path.join(base_dir, f, "labels") for f in folders]

# Delete undersized image files from every type folder
for image_dir in image_dirs:
    # A type folder without an images directory would make os.listdir raise.
    if not os.path.isdir(image_dir):
        continue
    for img_name in os.listdir(image_dir):
        img_path = os.path.join(image_dir, img_name)
        # Only regular files are candidates: os.remove fails on directories.
        if os.path.isfile(img_path) and os.path.getsize(img_path) < MIN_IMAGE_BYTES:
            os.remove(img_path)

In [3]:
folders = [f for f in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, f))]

# Dry run by default: orphan labels are only reported. Set to True to delete
# them. (The log used to say "Removed" even though nothing was deleted.)
DELETE_ORPHAN_JSONS = False

# Report, per type folder, whether each JSON label has a matching image.
for folder in folders:
    images_dir = os.path.join(base_dir, folder, "images")
    jsons_dir = os.path.join(base_dir, folder, "labels")

    if not os.path.exists(images_dir) or not os.path.exists(jsons_dir):
        continue

    for json_file in os.listdir(jsons_dir):
        # Anything that is not a label file is ignored.
        if not json_file.lower().endswith(".json"):
            continue

        base_name = os.path.splitext(json_file)[0]

        # '.webp' is included because the grouping step moves .webp images too;
        # without it those labels would be reported as orphans.
        possible_images = [base_name + ext for ext in ['.jpg', '.jpeg', '.JPG', '.JPEG', '.png', '.webp']]

        image_exists = any(os.path.exists(os.path.join(images_dir, img)) for img in possible_images)

        print(f"[{folder}] Checking JSON: {json_file} -> Image exists? {image_exists}")

        if not image_exists:
            if DELETE_ORPHAN_JSONS:
                os.remove(os.path.join(jsons_dir, json_file))
                print(f"[{folder}] Removed JSON without image: {json_file}")
            else:
                print(f"[{folder}] JSON without image (dry run, not removed): {json_file}")

[Champagne] Checking JSON: 100503.json -> Image exists? True
[Champagne] Checking JSON: 106583.json -> Image exists? False
[Champagne] Removed JSON without image: 106583.json
[Champagne] Checking JSON: 111294.json -> Image exists? True
[Champagne] Checking JSON: 12619.json -> Image exists? True
[Champagne] Checking JSON: 135798.json -> Image exists? True
[Champagne] Checking JSON: 13824.json -> Image exists? True
[Champagne] Checking JSON: 145169.json -> Image exists? True
[Champagne] Checking JSON: 14636.json -> Image exists? True
[Champagne] Checking JSON: 14641.json -> Image exists? True
[Champagne] Checking JSON: 14846.json -> Image exists? True
[Champagne] Checking JSON: 155341.json -> Image exists? True
[Champagne] Checking JSON: 158550.json -> Image exists? True
[Champagne] Checking JSON: 162883.json -> Image exists? True
[Champagne] Checking JSON: 167486.json -> Image exists? True
[Champagne] Checking JSON: 173161.json -> Image exists? True
[Champagne] Checking JSON: 17595.json

In [4]:
base_dir = r"C:\Users\syune\OneDrive\Desktop\Wine_Data/Data"

# Every sub-directory of base_dir is treated as one wine type.
folders = [entry for entry in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, entry))]

# Matching "labels" directory for each type folder.
json_dirs = [os.path.join(base_dir, entry, "labels") for entry in folders]

# One dict per label file; turned into a DataFrame at the end.
records = []

for folder, json_dir in zip(folders, json_dirs):
    # Type folders without a labels directory contribute nothing.
    if not os.path.exists(json_dir):
        continue

    for json_file in os.listdir(json_dir):
        if not json_file.endswith(".json"):
            continue

        json_path = os.path.join(json_dir, json_file)

        try:
            with open(json_path, "r", encoding="utf-8") as f:
                data = json.load(f)

            # Country is the last comma-separated part of Region, or the whole
            # Region when it has no comma; it stays None for a missing/empty Region.
            region = data.get("Region", None)
            if region and "," in region:
                country = region.split(",")[-1].strip()
            elif region:
                country = region.strip()
            else:
                country = None

            # First existing image that shares the label's base name, else None.
            image_dir = os.path.join(base_dir, folder, "images")
            image_filename = os.path.splitext(json_file)[0]
            candidate_paths = (
                os.path.join(image_dir, image_filename + ext)
                for ext in [".jpg", ".jpeg", ".png"]
            )
            image_path = next(
                (candidate for candidate in candidate_paths if os.path.exists(candidate)),
                None,
            )

            records.append({
                "image_path": image_path,
                "file_name": json_file,
                "Name": data.get("Name", None),
                "Brand": data.get("Brand", None),
                "Type": folder,  # taken from the folder name, not from the JSON
                "Country": country
            })
        except Exception as e:
            print(f"Error reading {json_path}: {e}")

# convert to DataFrame
df_json = pd.DataFrame(records)

df_json.head()


Unnamed: 0,image_path,file_name,Name,Brand,Type,Country
0,C:\Users\syune\OneDrive\Desktop\Wine_Data/Data...,100503.json,Bollinger Brut Rosé Champagne,Bollinger,Champagne,France
1,,106583.json,Charles Ellner Séduction Brut Champagne 2007,Charles Ellne,Champagne,France
2,C:\Users\syune\OneDrive\Desktop\Wine_Data/Data...,111294.json,Moët & Chandon Brut Imperial Champagne,Moet & Chandon,Champagne,France
3,C:\Users\syune\OneDrive\Desktop\Wine_Data/Data...,12619.json,Vilmart et Cie Grande Réserve 1er Cru Champagne,Vintages Front Line Release,Champagne,France
4,C:\Users\syune\OneDrive\Desktop\Wine_Data/Data...,135798.json,Perrier-Jouët Belle Epoque Brut Champagne 2016,Vintages Classic Catalogue,Champagne,France


In [5]:
# Export the assembled label/image table to a spreadsheet.
wine_table_path = r'C:\Users\syune\OneDrive\Desktop\Wine_Data/wine_data.xlsx'
df_json.to_excel(wine_table_path)