In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Locations of the three dataset splits, all under the same dataset folder
train_dir, val_dir, test_dir = (
    f'../../data/dataset/{split}' for split in ('train', 'validation', 'test')
)

In [None]:
# Load the article metadata table. With the default dtype inference the
# article_id column is read as int64 (see the output further down), which is
# presumably why a '0' is re-attached to the ids later in this notebook.
articles = pd.read_csv('../../data/raw/articles.csv')

In [4]:
# Preview the first rows to check which columns were loaded
articles.head()

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


In [5]:
# Load images function
def load_images(folder_path, id_length=10):
    """Collect image ids and class labels from a class-per-folder directory tree.

    The expected layout is ``folder_path/<class name>/<image file>``. No image
    data is read: the id of an image is derived from its file name and its
    label is the name of the folder that contains it.

    Args:
        folder_path: Directory whose sub-directories are the class folders.
        id_length: Number of trailing characters of the file name (extension
            removed) to keep as the id. Defaults to 10, the id width this
            function has always produced. Pass ``None`` to keep the whole
            file name without its extension.

    Returns:
        A tuple ``(ids, labels)`` of two NumPy arrays of equal length, ordered
        by class folder name and then by file name.
    """
    ids = []
    labels = []

    # Sort so the result does not depend on the OS-specific listing order.
    for class_folder in sorted(os.listdir(folder_path)):
        class_path = os.path.join(folder_path, class_folder)

        # Skip hidden entries ('.DS_Store', '.ipynb_checkpoints', ...) and any
        # stray file sitting next to the class folders.
        if class_folder.startswith('.') or not os.path.isdir(class_path):
            continue

        for file in sorted(os.listdir(class_path)):
            # Hidden files inside a class folder are not images either.
            if file.startswith('.'):
                continue

            # Strip the extension whatever its length instead of slicing the
            # path at fixed offsets, which only worked for 3-letter extensions.
            stem = os.path.splitext(file)[0]
            img_id = stem if id_length is None else stem[-id_length:]
            ids.append(img_id)
            labels.append(class_folder)
    return np.array(ids), np.array(labels)

In [6]:
# Gather the image ids and class labels of each split (train / validation / test)
train_ids, train_labels = load_images(folder_path=train_dir)
val_ids, val_labels = load_images(folder_path=val_dir)
test_ids, test_labels = load_images(folder_path=test_dir)

In [7]:
# Stack the splits end to end (train, then validation, then test) so that the
# ids and labels of the whole dataset live in one flat array each.
ids_arr = np.concatenate((train_ids, val_ids, test_ids), axis=None)
labels_arr = np.concatenate((train_labels, val_labels, test_labels), axis=None)

# Both arrays are expected to report the same length.
print(f'{ids_arr.shape=}', f'{labels_arr.shape=}', sep='\n')

ids_arr.shape=(6000,)
labels_arr.shape=(6000,)


In [8]:
# Inspect the raw ids: they are stored as int64, so they print without a leading zero
articles['article_id']

0         108775015
1         108775044
2         108775051
3         110065001
4         110065002
            ...    
105537    953450001
105538    953763001
105539    956217002
105540    957375001
105541    959461001
Name: article_id, Length: 105542, dtype: int64

In [9]:
# The ids taken from the image file names are 10-character strings with a
# leading zero, while article_id was parsed as an integer. Left-pad to 10
# characters so both sides compare equal.
# zfill is used instead of "'0' + ..." because it leaves already padded values
# untouched, so re-running this cell cannot produce ids such as '00108775015'.
articles['article_id'] = articles['article_id'].astype(str).str.zfill(10)
articles['article_id']

0         0108775015
1         0108775044
2         0108775051
3         0110065001
4         0110065002
             ...    
105537    0953450001
105538    0953763001
105539    0956217002
105540    0957375001
105541    0959461001
Name: article_id, Length: 105542, dtype: object

In [10]:
# Keep only the catalogue rows whose id belongs to one of the collected images
dataset_articles = articles.loc[lambda frame: frame['article_id'].isin(ids_arr)]

In [11]:
# Sanity check: the row count should equal the number of image ids collected above
dataset_articles.shape

(6000, 25)

In [12]:
# Columns to keep from the articles table, paired with the shorter name each
# one gets in the metadata frame (original name -> new name).
relevant_columns = dict([
    ('article_id', 'id'),
    ('product_type_name', 'type'),
    ('product_group_name', 'group'),
    ('graphical_appearance_name', 'pattern'),
    ('colour_group_name', 'group_color'),
    ('perceived_colour_master_name', 'perceived_color'),
    ('index_name', 'index'),
    ('section_name', 'section'),
    ('garment_group_name', 'garment_group'),
])

In [13]:
# Project the filtered articles onto the chosen columns, then apply the short names
metadata = (
    dataset_articles
    .loc[:, list(relevant_columns)]
    .rename(columns=relevant_columns)
)

In [14]:
# Preview the selected, renamed columns
metadata.head()

Unnamed: 0,id,type,group,pattern,group_color,perceived_color,index,section,garment_group
43,145872051,Top,Garment Upper body,Melange,Black,Black,Sport,Men H&M Sport,Jersey Fancy
55,150959011,Trousers,Garment Lower body,Solid,Black,Black,Sport,Men H&M Sport,Jersey Fancy
56,150959013,Trousers,Garment Lower body,Solid,Dark Grey,Grey,Sport,Men H&M Sport,Jersey Fancy
69,156289011,Trousers,Garment Lower body,Denim,Blue,Blue,Children Sizes 92-140,Kids Boy,Trousers Denim
70,156610001,Trousers,Garment Lower body,Solid,Black,Black,Sport,Men H&M Sport,Jersey Fancy


In [15]:
# Distinct pattern names present in the dataset, most frequent first
patterns = metadata.value_counts(subset='pattern').index
patterns

Index(['Solid', 'All over pattern', 'Stripe', 'Melange', 'Front print',
       'Denim', 'Placement print', 'Sequin', 'Check', 'Embroidery',
       'Colour blocking', 'Application/3D', 'Jacquard', 'Lace',
       'Other structure', 'Dot', 'Glittering/Metallic', 'Treatment',
       'Mixed solid/pattern', 'Slub', 'Chambray', 'Contrast', 'Mesh',
       'Metallic', 'Neps', 'Other pattern', 'Argyle', 'Transparent'],
      dtype='object', name='pattern')

In [16]:
# Distinct colour-group names present in the dataset, most frequent first.
# Kept under its own name so that `patterns` continues to hold pattern names
# instead of being overwritten with colour values.
group_colors = metadata.value_counts('group_color').index
group_colors

Index(['Black', 'White', 'Dark Blue', 'Light Pink', 'Light Beige', 'Off White',
       'Beige', 'Red', 'Grey', 'Blue', 'Greenish Khaki', 'Light Blue',
       'Dark Green', 'Pink', 'Dark Red', 'Dark Grey', 'Yellow', 'Light Yellow',
       'Light Orange', 'Yellowish Brown', 'Light Grey', 'Dark Beige',
       'Dark Orange', 'Light Green', 'Green', 'Light Turquoise', 'Dark Pink',
       'Orange', 'Light Purple', 'Dark Yellow', 'Dark Turquoise', 'Turquoise',
       'Greyish Beige', 'Other Pink', 'Light Red', 'Other Yellow',
       'Other Orange', 'Other Blue', 'Dark Purple', 'Other Green', 'Purple',
       'Silver', 'Other Red', 'Other', 'Bronze/Copper'],
      dtype='object', name='group_color')

In [17]:
# Product types grouped by the part of the body they cover; used to decide
# which items can logically be combined.
upper_body = [
    'T-shirt',
    'Top',
    'Blouse',
    'Sweater',
]
lower_body = [
    'Trousers',
]
one_piece = [
    'Dress',
]

In [18]:
def categorize_item(product_type):
    """Return the body-area category of a product type.

    The type is looked up, in this order, in the module-level lists
    ``upper_body``, ``lower_body`` and ``one_piece``; the first list that
    contains it decides the result. Types found in none of them are
    reported as ``'other'``.
    """
    groups = (
        ('upper_body', upper_body),
        ('lower_body', lower_body),
        ('one_piece', one_piece),
    )
    for label, members in groups:
        if product_type in members:
            return label
    return 'other'

In [19]:
# Derive the body-area category of every item from its product type
metadata['category'] = metadata['type'].map(categorize_item)

In [20]:
# Simplify patterns for compatibility logic
solid_patterns = ['Solid', 'Melange', 'Chambray', 'Plain']
# 'Stripe' is listed here because it occurs in the dataset but belonged to no
# group before, which made every striped item come out as 'unknown'.
printed_patterns = ['All over pattern', 'Front print', 'Placement print',
                    'Sequin', 'Embroidery', 'Application/3D', 'Jacquard',
                    'Lace', 'Dot', 'Mixed solid/pattern', 'Contrast',
                    'Argyle', 'Transparent', 'Stripe']
structured_patterns = ['Denim', 'Check', 'Colour blocking',
                       'Glittering/Metallic', 'Mesh', 'Metallic',
                       'Neps', 'Slub', 'Other structure', 'Treatment']
other_patterns = ['Other pattern']

def simplify_pattern(pattern):
    """Collapse a detailed pattern name into a coarse pattern family.

    Args:
        pattern: Pattern name as found in the ``pattern`` column.

    Returns:
        ``'solid'``, ``'printed'``, ``'structured'`` or ``'other'`` depending
        on which of the module-level pattern lists contains ``pattern``
        (checked in that order), or ``'unknown'`` when none of them does.
    """
    families = (
        ('solid', solid_patterns),
        ('printed', printed_patterns),
        ('structured', structured_patterns),
        ('other', other_patterns),
    )
    for family, members in families:
        if pattern in members:
            return family
    # Anything that reaches this point is a pattern name the lists do not cover.
    return 'unknown'


In [21]:
# Attach the coarse pattern family of every item as a new column
metadata['simplified_pattern'] = metadata['pattern'].map(simplify_pattern)

In [22]:
# Simplify color groups for compatibility logic
# 'Light Beige', 'Dark Pink', 'Light Red' and 'Yellowish Brown' occur in the
# dataset but were missing from every list, so they used to end up as 'other'.
# NOTE(review): 'Yellowish Brown' is filed under warm colours here; move it to
# the neutral list if brown tones should count as neutral.
neutral_colors = ['Black', 'White', 'Grey', 'Beige', 'Off White', 'Light Grey', 'Dark Grey', 'Greyish Beige', 'Dark Beige', 'Light Beige']
warm_colors = ['Red', 'Pink', 'Orange', 'Yellow', 'Dark Red', 'Light Pink', 'Light Orange', 'Light Yellow', 'Dark Orange', 'Dark Yellow', 'Other Red', 'Other Pink', 'Other Orange', 'Other Yellow', 'Dark Pink', 'Light Red', 'Yellowish Brown']
cool_colors = ['Blue', 'Dark Blue', 'Light Blue', 'Turquoise', 'Light Turquoise', 'Dark Turquoise', 'Green', 'Dark Green', 'Light Green', 'Greenish Khaki', 'Other Blue', 'Other Green']
purple_colors = ['Purple', 'Light Purple', 'Dark Purple']
metallic_colors = ['Silver', 'Bronze/Copper']

def categorize_color(color):
    """Collapse a colour-group name into a coarse colour family.

    Args:
        color: Colour-group name as found in the ``group_color`` column.

    Returns:
        ``'neutral'``, ``'warm'``, ``'cool'``, ``'purple'`` or ``'metallic'``
        depending on which of the module-level colour lists contains
        ``color`` (checked in that order), or ``'other'`` when none does.
    """
    families = (
        ('neutral', neutral_colors),
        ('warm', warm_colors),
        ('cool', cool_colors),
        ('purple', purple_colors),
        ('metallic', metallic_colors),
    )
    for family, members in families:
        if color in members:
            return family
    # Catch-all for the literal 'Other' group and for names not listed above.
    return 'other'


In [23]:
# Attach the coarse colour family of every item as a new column.
# NOTE(review): 'color_gategory' looks like a typo for 'color_category'. The
# name is kept as is because it becomes a column of the saved CSV and may be
# read under this spelling elsewhere - confirm before renaming.
metadata['color_gategory'] = metadata['group_color'].map(categorize_color)

In [24]:
# Show a heading followed by the first rows, now including the derived columns
print('Metadata with Derived Columns:', metadata.head(), sep='\n')

Metadata with Derived Columns:
            id      type               group  pattern group_color  \
43  0145872051       Top  Garment Upper body  Melange       Black   
55  0150959011  Trousers  Garment Lower body    Solid       Black   
56  0150959013  Trousers  Garment Lower body    Solid   Dark Grey   
69  0156289011  Trousers  Garment Lower body    Denim        Blue   
70  0156610001  Trousers  Garment Lower body    Solid       Black   

   perceived_color                  index        section   garment_group  \
43           Black                  Sport  Men H&M Sport    Jersey Fancy   
55           Black                  Sport  Men H&M Sport    Jersey Fancy   
56            Grey                  Sport  Men H&M Sport    Jersey Fancy   
69            Blue  Children Sizes 92-140       Kids Boy  Trousers Denim   
70           Black                  Sport  Men H&M Sport    Jersey Fancy   

      category simplified_pattern color_gategory  
43  upper_body              solid        neutr

In [None]:
# Save the processed metadata to a new CSV file
metadata_file = '../../data/compatibility/metadata.csv'
# to_csv does not create missing folders, so make sure the target directory
# exists before writing (a no-op when it is already there).
os.makedirs(os.path.dirname(metadata_file), exist_ok=True)
# index=False: the row index inherited from the articles table is not written.
metadata.to_csv(metadata_file, index=False)
print(f'Processed metadata saved to {metadata_file}')

Processed metadata saved to ../data/compatibility/metadata.csv
