In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
from enum import IntEnum
from dataclasses import dataclass
from enum import Enum
import copy
from colorthief import ColorThief
import os

In [3]:
df = pd.read_parquet('../datasets/cleaned/polyvore_v1.parquet')

In [4]:
@dataclass
class WearType(Enum):
    accessoire = 1.0
    innerWear = 2.0
    outerWear = 3.0
    bottomWear = 4.0
    shoes = 5.0

In [5]:
def add_bg_removed_path_to_df(imagePath):
    raw, images, number, file = imagePath.split("/")
    endnumber, end = file.split(".")
    
    return f"datasets/bg_removed/{number}/{endnumber}.png"

def add_images256x256_path_to_df(imagePath):
    raw, images, number, file = imagePath.split("/")
    
    return f"datasets/images256x256/{number}/{file}"

## Add the Path to the BG-Removed and 256x256 Images to the DataFrame

In [17]:
df['imagePath 256x256'] = df['imagePath'].apply(add_images256x256_path_to_df)
df['imagePath BG-Removed'] = df['imagePath'].apply(add_bg_removed_path_to_df)

# Extract most dominant color

In [7]:
def get_palette(img_path, color_count, quality):
    parent_directory = os.path.dirname(os.getcwd())
    full_path = parent_directory + "/" + img_path
    print(full_path)
    thief = ColorThief(full_path)
    palette = thief.get_palette(color_count=color_count, quality=quality)
    return palette

In [8]:
small_df = df.head(10)

In [9]:
small_df["Dominant Color"] = small_df["imagePath BG-Removed"].apply(lambda x: get_palette(x, 3, 1))

/Users/romanzberg/projects/DSPRO2-outfit-recommendation/datasets/bg_removed/120161271/1.png


FileNotFoundError: [Errno 2] No such file or directory: '/Users/romanzberg/projects/DSPRO2-outfit-recommendation/datasets/bg_removed/120161271/1.png'

In [10]:
split = pd.DataFrame(small_df['Dominant Color'].to_list(), columns= ['1st Dominant Color', '2nd Dominant Color', '3rd Dominant Color', '4th Dominant Color'])

KeyError: 'Dominant Color'

In [11]:
small_df.reset_index(drop=True, inplace=True)
split.reset_index(drop=True, inplace=True)
small_df = pd.concat([small_df, split], axis=1)

NameError: name 'split' is not defined

In [12]:
small_df

Unnamed: 0,outfitName,outfitViews,wearType,type,outfitLikes,date,set_id,outfitDesc,clothingItemIndex,clothingItemName,clothingItemPrice,clothingItemLikes,clothingItemCategoryId,imagePath,imagePath 256x256,imagePath BG-Removed
0,Being a Vans shoe model with Luke. Idk about t...,188,2.0,top,9,Two years,120161271,A fashion look from April 2014 featuring destr...,1,nirvana distressed t-shirt,10.0,1290,21,raw/images/120161271/1.jpg,datasets/images256x256/120161271/1.jpg,datasets/bg_removed/120161271/1.png
1,Being a Vans shoe model with Luke. Idk about t...,188,4.0,pants,9,Two years,120161271,A fashion look from April 2014 featuring destr...,2,rag bone rock w/ black skinny jeans,235.0,17406,237,raw/images/120161271/2.jpg,datasets/images256x256/120161271/2.jpg,datasets/bg_removed/120161271/2.png
2,These Chanel bags is a bad habit .x,562,2.0,top,32,Two years,143656996,12.19.14,1,monki singlet,16.0,20094,104,raw/images/143656996/1.jpg,datasets/images256x256/143656996/1.jpg,datasets/bg_removed/143656996/1.png
3,These Chanel bags is a bad habit .x,562,4.0,pants,32,Two years,143656996,12.19.14,3,topshop moto joni high rise skinny jeans,65.0,23324,237,raw/images/143656996/3.jpg,datasets/images256x256/143656996/3.jpg,datasets/bg_removed/143656996/3.png
4,These Chanel bags is a bad habit .x,562,1.0,bag,32,Two years,143656996,12.19.14,5,pre-owned chanel shoulder bag,2450.0,4489,37,raw/images/143656996/5.jpg,datasets/images256x256/143656996/5.jpg,datasets/bg_removed/143656996/5.png
5,These Chanel bags is a bad habit .x,562,1.0,hats,32,Two years,143656996,12.19.14,6,rag bone floppy brim fedora,195.0,1833,55,raw/images/143656996/6.jpg,datasets/images256x256/143656996/6.jpg,datasets/bg_removed/143656996/6.png
6,the people i choose never choose me and my hea...,1580,2.0,top,395,9 days,216470135,A fashion look from February 2017 by exco feat...,1,isabel marant alpaca blend jumper,615.0,268,19,raw/images/216470135/1.jpg,datasets/images256x256/216470135/1.jpg,datasets/bg_removed/216470135/1.png
7,the people i choose never choose me and my hea...,1580,4.0,skirt,395,9 days,216470135,A fashion look from February 2017 by exco feat...,2,yoins light blue gradient color hole denim skirt,22.0,331,9,raw/images/216470135/2.jpg,datasets/images256x256/216470135/2.jpg,datasets/bg_removed/216470135/2.png
8,the people i choose never choose me and my hea...,1580,5.0,shoes,395,9 days,216470135,A fashion look from February 2017 by exco feat...,3,alice light blue shoes flats leather sandals,189.0,2634,41,raw/images/216470135/3.jpg,datasets/images256x256/216470135/3.jpg,datasets/bg_removed/216470135/3.png
9,Be happy!,591,2.0,top,233,14 days,216220312,A fashion look from February 2017 by rasa-j fe...,1,oasis shadow bird knit pink,47.0,4874,19,raw/images/216220312/1.jpg,datasets/images256x256/216220312/1.jpg,datasets/bg_removed/216220312/1.png


## Add the Colorname based on the RGB Value

In [17]:
from scipy.spatial import KDTree
from webcolors import css3_hex_to_names, hex_to_rgb

def convert_rgb_to_names(rgb_tuple):
    
    # a dictionary of all the hex and their respective names in css3
    css3_db = css3_hex_to_names
    names = []
    rgb_values = []
    for color_hex, color_name in css3_db.items():
        names.append(color_name)
        rgb_values.append(hex_to_rgb(color_hex))
    
    kdt_db = KDTree(rgb_values)
    distance, index = kdt_db.query(rgb_tuple)
    return f'closest match: {names[index]}'
print(convert_rgb_to_names((30, 29, 28)))

ImportError: cannot import name 'css3_hex_to_names' from 'webcolors' (/Users/luca/.local/share/virtualenvs/DSPRO2-outfit-recommendation-etrui0aa/lib/python3.10/site-packages/webcolors/__init__.py)

In [16]:
from scipy.spatial import KDTree
import webcolors
def convert_rgb_to_names(rgb_tuple):
    css3_db = webcolors.CSS3_HEX_TO_NAMES
    names = []
    rgb_values = []
    for color_hex, color_name in css3_db.items():
        names.append(color_name)
        rgb_values.append(webcolors.hex_to_rgb(color_hex))
    kdt_db = KDTree(rgb_values)
    distance, index = kdt_db.query(rgb_tuple)
    return f'closest match: {names[index]}'

In [19]:
import webcolors
webcolors.hex_to_name('(30, 29, 28)')

ValueError: "(30, 29, 28)" is not a valid hexadecimal color value.

# REAL CODE

In [18]:
df["Dominant Color"] = df["imagePath BG-Removed"].apply(lambda x: get_palette(x, 3, 1))

/Users/romanzberg/projects/DSPRO2-outfit-recommendation/datasets/bg_removed/120161271/1.png


FileNotFoundError: [Errno 2] No such file or directory: '/Users/romanzberg/projects/DSPRO2-outfit-recommendation/datasets/bg_removed/120161271/1.png'

In [None]:
df.head()

In [None]:
df["Ratio"] =  df["outfitLikes"] / df["outfitViews"]

In [None]:
df["Ratio"].describe()

In [None]:
df = df.drop(df[df["Ratio"] > 1.0].index)

In [None]:
df["Ratio_Log"] =  np.log10(df["outfitLikes"] / df["outfitViews"])

In [None]:
# I want to create a new dataframe with the unique indexes of the setID. So every setID should only appear once.

df_copy = copy.deepcopy(df)
setIDs = df_copy["set_id"].unique()

In [None]:
df.head()

In [None]:
def add_item_to_setID(setID, item, wearType):
    df_setID.loc[setID, f"{wearType} ClothingItemName"] = item["clothingItemName"]
    df_setID.loc[setID, f"{wearType} wearType"] = item["wearType"]
    df_setID.loc[setID, f"{wearType} ImagePath"] = item["imagePath"]
    df_setID.loc[setID, f"{wearType} ImagePath 256x256"] = item["imagePath 256x256"]
    df_setID.loc[setID, f"{wearType} ImagePath BG Removed"] = item["imagePath BG-Removed"]


In [None]:
df_setID = pd.DataFrame()
for set_id in setIDs:
    #Initialisatio
    items = df[df["set_id"] == set_id]
    df_items = pd.DataFrame(items)
    added_accessoire = False
    
    #Adding the basic information about the SetID
    df_setID.loc[set_id, "OutfitName"] = df[df["set_id"] == set_id]["outfitName"].values[0]
    df_setID.loc[set_id, "outfitDesc"] = df[df["set_id"] == set_id]["outfitDesc"].values[0]
    df_setID.loc[set_id, "outfitViews"] = df[df["set_id"] == set_id]["outfitViews"].values[0]
    df_setID.loc[set_id, "outfitLikes"] = df[df["set_id"] == set_id]["outfitLikes"].values[0]
    df_setID.loc[set_id, "Ratio"] = df[df["set_id"] == set_id]["Ratio"].values[0]
    
    #Adding the items to the setID
    for index, item in df_items.iterrows():
        if (item["wearType"] == 1.0) and added_accessoire:
            print("Sorry, es wurde bereits ein Accessoirces hinzugefügt")
            continue
        if (item["wearType"] == 5.0):
            add_item_to_setID(set_id, item, "Shoes")
        elif (item["wearType"] == 4.0):
            add_item_to_setID(set_id, item, "Bottomwear")
        elif (item["wearType"] == 2.0):
            add_item_to_setID(set_id, item, "Innerwear")   
        elif (item["wearType"] == 3.0):
            add_item_to_setID(set_id, item, "Outerwear")
        elif (item["wearType"] == 1.0):
            add_item_to_setID(set_id, item, "Accessoire")
            added_accessoire = True
        else:
            print(item["wearType"])
            raise ValueError("WearType not found")

# Data Quality Assessment

In [None]:
df[df["wearType"].isna() == True]

### Drop the Rows the a Ratio over 1.0. This is not possible

In [None]:
df_setID = df_setID.drop(df_setID[df_setID["Ratio"] > 1.0].index)

In [None]:
sns.histplot(df_setID["Ratio"])

In [None]:
sns.boxplot(x=df_setID["Ratio"])


In [None]:
df_setID["Ratio_Log"] =  abs(np.log(df_setID["Ratio"]))
df_setID["Ratio_Log10"] =  abs(np.log10(df_setID["Ratio"]))
print(df_setID["Ratio_Log"].skew())
print(df_setID["Ratio_Log10"].skew())

In [None]:
#Normalize this from 0 to 1 --> df_setID["Ratio_Log"] 
import sklearn.preprocessing

scaler = sklearn.preprocessing.MinMaxScaler()
df_setID["Ratio_Log_Normalized"] = scaler.fit_transform(df_setID[["Ratio_Log"]])

In [None]:
sns.histplot(df_setID["Ratio_Log_Normalized"])

# Add the three most dominant colors to the dataframe

### Add the path to the BG-Removed to the dataframe

In [1]:
df_setID.head()

NameError: name 'df_setID' is not defined