In [None]:
import glob
import os

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib import colors
from PIL import Image

In [None]:
# Load the cleaned style metadata table.
# NOTE(review): absolute, machine-specific path -- the notebook only runs where this file exists.
styles = pd.read_csv(
    filepath_or_buffer='C:\\Users\\WantaTyler\\MachineLearning-335\\datasets\\fashion-dataset\\styles_clean.csv'
)

In [None]:
# quick look at the first five rows of the loaded table
styles.head(n=5)

In [None]:
# begin with only the columns needed here: the image id plus gender and season
predictors = styles.loc[:, ['id', 'gender', 'season']]

In [None]:
def genderToAge(row):
    """Turn the ``gender`` entry of a row into a numeric age label.

    Labels:
        0 => any age          ('Unisex')
        1 => teen and lower   ('Boys', 'Girls')
        2 => adult and above  ('Men', 'Women')

    row: anything indexable with ``row['gender']`` (e.g. a DataFrame row).
    Returns the label, or None when the gender is none of the values above.
    """
    ageByGender = {
        'Unisex': 0,
        'Boys': 1,
        'Girls': 1,
        'Men': 2,
        'Women': 2,
    }
    # .get() without a default yields None for anything not in the table
    return ageByGender.get(row['gender'])

In [None]:
# compute an age label per row from its gender and attach it as a new 'age' column
# (the 'gender' column itself is left in place)
ageCol = predictors.apply(genderToAge, axis=1)
predictors = predictors.assign(age=ageCol.to_numpy())

In [None]:
def numericalizeSeason(row):
    """Turn the ``season`` entry of a row into a numeric code.

    Codes: 'Winter' => 1, 'Spring' => 2, 'Summer' => 3, 'Fall' => 4.
    The original author describes these as successive three-month blocks of
    the year (1 => first 3 months, 2 => April/May/June, and so on).

    row: anything indexable with ``row['season']`` (e.g. a DataFrame row).
    Returns the code, or 0 when the season is none of the four names above.
    """
    seasonCodes = {'Winter': 1, 'Spring': 2, 'Summer': 3, 'Fall': 4}
    return seasonCodes.get(row['season'], 0)

In [None]:
# encode every row's season as its numeric three-month block and store it
# in a new 'tripletsOfMonths' column, then preview the result
months = predictors.apply(numericalizeSeason, axis=1)
predictors = predictors.assign(tripletsOfMonths=months)
predictors.head(n=5)

In [None]:
def getMostCommon(colors):
    """Return the most frequent value in ``colors``, ignoring values of 255 and above.

    255 is excluded so that a white background does not win.

    colors: array-like of channel values; np.unique flattens it, so any shape works.
    Returns the most frequent value below 255 (on a tie, the smallest such value,
    because np.unique returns sorted values and np.argmax picks the first maximum),
    or 0 when there is no value below 255 at all (including empty input).
    """
    values, counts = np.unique(colors, return_counts=True)

    # filter values and counts with the same mask so the argmax position
    # refers to the array it is applied to
    belowWhite = values < 255
    candidates = values[belowWhite]

    # explicit emptiness check instead of bitwise ~ on a boolean
    if candidates.size == 0:
        return 0

    return candidates[np.argmax(counts[belowWhite])]

In [None]:
# add placeholder columns (all 0) for the most common red, green and blue value per image
predictors = predictors.assign(commonRed=0, commonGreen=0, commonBlue=0)

In [None]:
# Fill commonRed / commonGreen / commonBlue for every image in the dataset folder.
# The author notes this loop takes around 6 hours to finish; images are read one
# at a time because there is not enough RAM to hold all of them.
imagePaths = glob.glob('C:\\Users\\WantaTyler\\MachineLearning-335\\datasets\\fashion-dataset\\images\\*.jpg')
totalImages = len(imagePaths)  # real file count, used for the progress percentage

for counter, fileName in enumerate(imagePaths, start=1):

    # the image id is the file name without its extension; taking the last path
    # component keeps this working wherever the dataset folder lives
    baseName = os.path.basename(fileName)
    imageId = int(os.path.splitext(baseName)[0])

    # cv2.imread returns None (it does not raise) when a file cannot be read,
    # so check before converting rather than crashing hours into the run
    image = cv2.imread(fileName)
    if image is None:
        print(f'skipping unreadable image: {fileName}')
    else:
        # OpenCV loads BGR; convert to RGB so channel 0/1/2 are red/green/blue
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # find the most common value in each channel
        cRed = getMostCommon(image[:, :, 0])
        cGreen = getMostCommon(image[:, :, 1])
        cBlue = getMostCommon(image[:, :, 2])

        # write the three values into the row(s) whose id matches this image
        predictors.loc[predictors['id'] == imageId, ['commonRed', 'commonGreen', 'commonBlue']] = [cRed, cGreen, cBlue]

    # just used to see some progress
    if (counter % 1000) == 0:
        print(f'{counter/totalImages*100} % done')

In [None]:
# save the finished table to disk so the slow image loop never has to be run again
# (the row index is written as the first, unnamed column)
predictors.to_csv(path_or_buf='data_part1.csv', index=True)