In [26]:
import pandas as pd
import numpy as np
import shutil
import os

%pip install tabulate
from tabulate import tabulate

/Users/omkar/.zshenv:.:1: no such file or directory: /Users/omkar/.cargo/env
Note: you may need to restart the kernel to use updated packages.


In [3]:
# Do not truncate long text columns (e.g. product descriptions) when a frame is displayed.
pd.set_option("display.max_colwidth", None)

# Root folder of the dataset; the image folders below are derived from it.
DATASET_DIR = "dataset"
IMAGES_DIR = DATASET_DIR + "/images"
PROCESSED_IMAGES_DIR = DATASET_DIR + "/processed"

In [4]:
def format_article_number(num):
    """Render ``num`` zero-padded on the left to a minimum width of 10 characters."""
    return format(num, "010")
def format_product_code(num):
    """Render ``num`` zero-padded on the left to a minimum width of 7 characters."""
    return format(num, "07")

# Quick check of the padding helpers: one value that already has the full
# width and one that needs a leading zero, for each formatter.
print(
    format_article_number(1120129001),
    format_article_number(120129001),
    format_product_code(176209),
    format_product_code(1176209),
    sep="\n",
)

1120129001
0120129001
0176209
1176209


In [5]:
# Article metadata table; the lookup helpers below read it as a notebook-level global.
articles = pd.read_csv(DATASET_DIR + "/articles.csv")
# articles.head()

In [10]:
def getProductVariants(product_code, limit: int = None):
    """Return the rows of ``articles`` whose ``product_code`` equals ``product_code``.

    Args:
        product_code: Value compared for equality against the ``product_code``
            column of the notebook-level ``articles`` frame.
        limit: Maximum number of matching rows to return, taken from the top
            via ``DataFrame.head``. ``None`` (the default) returns every match.

    Returns:
        A DataFrame with the matching rows; it is empty when nothing matches.
    """
    productVariants = articles.loc[articles['product_code'] == product_code]
    # Test against None instead of truthiness: an explicit limit of 0 must
    # yield zero rows rather than silently falling through to "no limit".
    if limit is not None:
        return productVariants.head(limit)
    return productVariants

# getProductVariants(176209)
# Preview only the first two variants of product 176209.
getProductVariants(176209, limit=2)

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
99,176209023,176209,Mr Harrington w/hood,308,Hoodie,Garment Upper body,1010016,Solid,9,Black,...,Jacket Street,F,Menswear,3,Menswear,31,Mens Outerwear,1007,Outdoor,"Short, padded jacket with a jersey-lined hood and stand-up collar with a drawstring. Zip down the front, flap side pockets with a press-stud, one inner pocket, and ribbing at the cuffs and hem. Quilted lining."
100,176209025,176209,Mr Harrington w/hood,308,Hoodie,Garment Upper body,1010016,Solid,19,Greenish Khaki,...,Jacket Street,F,Menswear,3,Menswear,31,Mens Outerwear,1007,Outdoor,"Short, padded jacket with a jersey-lined hood and stand-up collar with a drawstring. Zip down the front, flap side pockets with a press-stud, one inner pocket, and ribbing at the cuffs and hem. Quilted lining."


In [18]:
def getArticlesWithCaptions(product_code, limit: int = None):
    """Build caption records for the variants of one product.

    Args:
        product_code: Product code forwarded to ``getProductVariants``.
        limit: Optional row limit forwarded to ``getProductVariants``.

    Returns:
        A ``(productArticlesDf, productInfo)`` tuple:

        * ``productArticlesDf`` has one row per variant with the columns
          ``article_id`` (via ``format_article_number``), ``product_code``
          (via ``format_product_code``), ``prod_name`` and ``caption``.
          The caption is ``"<colour_group_name> <product_type_name>, <detail_desc>"``;
          when ``detail_desc`` is missing the trailing ``", <detail_desc>"``
          part is left out.
        * ``productInfo`` is a Series with selected descriptive columns taken
          from the first variant row.

    Raises:
        IndexError: If no variant rows are found for ``product_code``.
    """
    productVariants = getProductVariants(product_code, limit)
    if productVariants.empty:
        # Previously this surfaced as an opaque positional-indexer IndexError
        # from ``iloc[0]``; keep the exception type but explain the cause.
        raise IndexError(f"No articles found for product code {product_code!r}.")

    productArticlesList = []
    for row in productVariants.to_dict("records"):
        caption = f"{row['colour_group_name']} {row['product_type_name']}"
        description = row['detail_desc']
        # A missing description would otherwise be interpolated as the
        # literal text "nan" and end up in the caption.
        if pd.notna(description):
            caption = f"{caption}, {description}"
        productArticlesList.append({
            'article_id': format_article_number(row['article_id']),
            'product_code': format_product_code(row['product_code']),
            'prod_name': row['prod_name'],
            'caption': caption,
        })

    productArticlesDf = pd.DataFrame(productArticlesList)
    productInfo = productVariants.iloc[0][["prod_name", "product_type_name", "product_group_name", "graphical_appearance_name", "colour_group_name", "index_name", "section_name", "garment_group_name"]]
    return productArticlesDf, productInfo

# productArticlesDf, productInfo = getArticlesWithCaptions(176209, 2)
# print(productInfo)
# print(productArticlesDf)

prod_name                    Mr Harrington w/hood
product_type_name                          Hoodie
product_group_name             Garment Upper body
graphical_appearance_name                   Solid
colour_group_name                           Black
index_name                               Menswear
section_name                       Mens Outerwear
garment_group_name                        Outdoor
Name: 99, dtype: object
   article_id product_code             prod_name   
0  0176209023      0176209  Mr Harrington w/hood  \
1  0176209025      0176209  Mr Harrington w/hood   

                                                                                                                                                                                                                                    caption  
0           Black Hoodie, Short, padded jacket with a jersey-lined hood and stand-up collar with a drawstring. Zip down the front, flap side pockets with a press-stud, one inner poc

In [30]:
def preprocessImages(product_code, limit: int = None):
    """Copy a product's article images next to per-image caption files.

    For every article returned by ``getArticlesWithCaptions`` the image
    ``{IMAGES_DIR}/<product_code>/<article_id>.jpg`` is copied to
    ``{PROCESSED_IMAGES_DIR}/<product_code>/<article_id>.jpg`` and the
    article's caption is written to a ``.txt`` file with the same stem.
    Progress and a short "training config" summary (destination folder,
    instance prompt, class prompt) are printed.

    Args:
        product_code: Product code forwarded to ``getArticlesWithCaptions``
            and formatted with ``format_product_code`` for the folder name.
        limit: Optional article limit forwarded to ``getArticlesWithCaptions``.

    Raises:
        FileNotFoundError: If a source image does not exist. No caption file
            is written for that article.
    """
    articlesWithCaptions, productInfo = getArticlesWithCaptions(product_code, limit)
    processedDestination = f"{PROCESSED_IMAGES_DIR}/{format_product_code(product_code)}"
    productInstancePrompt = f"photo of {productInfo['prod_name']} {productInfo['product_type_name']}"
    productClassPrompt = f"photo of {productInfo['product_type_name']}"

    os.makedirs(processedDestination, exist_ok=True)
    print(f"Processed destination: '{processedDestination}' created successfully/already exists.")

    for _, article in articlesWithCaptions.iterrows():
        processedStem = f"{PROCESSED_IMAGES_DIR}/{article['product_code']}/{article['article_id']}"
        # Copy the image before writing the caption so that a missing source
        # image does not leave behind a caption file without its image.
        shutil.copyfile(
            f"{IMAGES_DIR}/{article['product_code']}/{article['article_id']}.jpg",
            f"{processedStem}.jpg",
        )
        # Context manager closes the file even if the write fails; UTF-8 keeps
        # non-ASCII caption text independent of the platform default encoding.
        with open(f"{processedStem}.txt", "w", encoding="utf-8") as captionFile:
            captionFile.write(article['caption'])
    print(f"Processed captions for {len(articlesWithCaptions)} {product_code} images !")

    print("\n------------------   Training config   ------------------\n")
    print(f"Path for training: \t{processedDestination}")
    print(f"Instance prompt: \t{productInstancePrompt}")
    print(f"Class prompt: \t\t{productClassPrompt}")
    print("\n-------------------------------------------------------\n")


# Process the first two variants of product 176209.
preprocessImages(176209, limit=2)

Processed destination: 'dataset/processed/0176209' created successfully/already exists.
Processed captions for 2 176209 images !

------------------   Training config   ------------------

Path for training: 	dataset/processed/0176209
Instance prompt: 	photo of Mr Harrington w/hood Hoodie
Class prompt: 		photo of Hoodie

-------------------------------------------------------

