In [None]:
# !pip install swifter
# !pip install tensorflow==2.0.0

In [None]:
import os # accessing directory structure

import matplotlib.image as mpimg
import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # statistical plotting (scatter plots, version check)
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler

In [None]:
# Root folder of the dataset; styles.csv and the images/ folder are read relative to it.
DATASET_PATH = "/kaggle/input/fashion-product-images-dataset/fashion-dataset/fashion-dataset/"
# Sanity check: list the folder content so a wrong path fails right here.
print(os.listdir(DATASET_PATH))

In [None]:
# Load the product metadata; malformed CSV lines are skipped instead of aborting the load.
df = pd.read_csv(DATASET_PATH + "styles.csv", on_bad_lines='skip')
# Image file name is "<id>.jpg". Built with a vectorised string operation
# rather than a row-wise apply, which is much slower on a large frame.
df['image'] = df['id'].astype(str) + ".jpg"
df = df.reset_index(drop=True)
df.head(10)

In [None]:
# Full path of every image file. The images are read from "<DATASET_PATH>/images/"
# everywhere else in this notebook (see img_path), so the extra 'fashion-dataset'
# path segment that used to be inserted here pointed at a different folder.
# NOTE(review): confirm the folder layout against the mounted dataset.
df['filename'] = df['image']\
.apply(lambda filename: os.path.join(DATASET_PATH, 'images', filename))
df.head(2)

In [None]:
import cv2
def plot_figures(figures, nrows = 1, ncols=1,figsize=(8, 8)):
    """Plot a dictionary of BGR images on a grid of subplots.

    Parameters
    ----------
    figures : <title, figure> dictionary; every figure is converted from BGR
        to RGB before being shown
    nrows : number of rows of subplots wanted in the figure
    ncols : number of columns of subplots wanted in the display
    figsize : size of the whole figure, forwarded to plt.subplots

    Raises
    ------
    IndexError
        If `figures` holds more entries than the grid has cells.
    """
    # squeeze=False always returns a 2-D array of axes, so the default 1x1 grid
    # (where plt.subplots would otherwise hand back a bare Axes) works as well.
    fig, axeslist = plt.subplots(ncols=ncols, nrows=nrows, figsize=figsize, squeeze=False)
    axes = axeslist.ravel()

    # Switch every cell off first so that unused cells stay blank.
    for ax in axes:
        ax.set_axis_off()

    for ind, (title, img) in enumerate(figures.items()):
        axes[ind].imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        axes[ind].set_title(title)
    plt.tight_layout() # optional
    
def img_path(img):
    """Return the path of image file `img` inside the dataset's images folder."""
    # os.path.join avoids the doubled separator that plain string concatenation
    # produced (DATASET_PATH already ends with "/").
    return os.path.join(DATASET_PATH, "images", img)

def load_image(img, resized_fac = 0.1):
    """Read a dataset image and return it scaled by `resized_fac`.

    Parameters
    ----------
    img : image file name, resolved through img_path()
    resized_fac : factor applied to both width and height

    Returns
    -------
    The resized image array as produced by cv2.resize.

    Raises
    ------
    FileNotFoundError
        If the file is missing or cannot be decoded.
    """
    path = img_path(img)
    pixels = cv2.imread(path)
    # cv2.imread signals failure by returning None instead of raising.
    if pixels is None:
        raise FileNotFoundError("Could not read image: {}".format(path))
    # shape is (rows, cols, channels), i.e. (height, width, channels)
    height, width = pixels.shape[:2]
    # cv2.resize expects the target size as (width, height)
    new_size = (int(width * resized_fac), int(height * resized_fac))
    return cv2.resize(pixels, new_size, interpolation = cv2.INTER_AREA)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# generation of a dictionary of (title, images)
# random_state pins the sample so the same six products are shown on every run
figures = {'im'+str(i): load_image(row.image) for i, row in df.sample(6, random_state=42).iterrows()}
# plot of the images in a figure, with 2 rows and 3 columns
plot_figures(figures, 2, 3)

**Final data**

In [None]:
# masterCategory count 
gr_data_masterCate = df.groupby('masterCategory').size()
gr_data_masterCate_sorted= gr_data_masterCate.sort_values()
gr_data_masterCate_sorted

In [None]:
plt.figure(figsize = (10, 4))
with plt.rc_context({"ytick.color":"darkgrey"}):
    plt.barh(gr_data_masterCate_sorted.index, gr_data_masterCate_sorted.values, color='pink')
for spine in plt.gca().spines.values():
    spine.set_visible(False)
plt.ylabel("$CATEGORIES$", size=15, color="darkgrey")
plt.xlabel("Number of Image", size=15, color="darkgrey")
plt.show()

In [None]:
# subCategory count 
gr_data_subCate = df.groupby('subCategory').size()
gr_data_subCate_sorted= gr_data_subCate.sort_values()
len(gr_data_subCate_sorted)

In [None]:
plt.figure(figsize = (10, 10))
with plt.rc_context({"ytick.color":"darkgrey"}):
    plt.barh(gr_data_subCate_sorted[-25:].index, gr_data_subCate_sorted[-25:].values, color='pink')
for spine in plt.gca().spines.values():
    spine.set_visible(False)
plt.ylabel("$CATEGORIES$", size=15, color="darkgrey")
plt.xlabel("Number of Image", size=15, color="darkgrey")
plt.show()

In [None]:
# articleType count 
gr_data_season_type = df.groupby('season').size()
gr_data_season_type_sorted = gr_data_season_type.sort_values()

In [None]:
plt.figure(figsize = (10, 10))
with plt.rc_context({"ytick.color":"darkgrey"}):
    plt.barh(gr_data_season_type_sorted[-25:].index, gr_data_season_type_sorted[-25:].values, color='pink')
for spine in plt.gca().spines.values():
    spine.set_visible(False)
plt.ylabel("$SEASON$", size=15, color="darkgrey")
plt.xlabel("Number of Image", size=15, color="darkgrey")
plt.show()

In [None]:
# articleType count 
gr_data_usage = df.groupby('usage').size()
gr_data_usage_sorted = gr_data_usage.sort_values()

In [None]:
plt.figure(figsize = (10, 10))
with plt.rc_context({"ytick.color":"darkgrey"}):
    plt.barh(gr_data_usage_sorted[-25:].index, gr_data_usage_sorted[-25:].values, color='pink')
for spine in plt.gca().spines.values():
    spine.set_visible(False)
plt.ylabel("$USAGE$", size=15, color="darkgrey")
plt.xlabel("Number of Image", size=15, color="darkgrey")
plt.show()

In [None]:
# articleType count 
gr_data_articleType = df.groupby('articleType').size()
gr_data_articleType_sorted = gr_data_articleType.sort_values()

In [None]:
plt.figure(figsize = (10, 10))
with plt.rc_context({"ytick.color":"darkgrey"}):
    plt.barh(gr_data_articleType_sorted[-25:].index, gr_data_articleType_sorted[-25:].values, color='pink')
for spine in plt.gca().spines.values():
    spine.set_visible(False)
plt.ylabel("$CATEGORIES$", size=15, color="darkgrey")
plt.xlabel("Number of Image", size=15, color="darkgrey")
plt.show()

In [None]:
# Labels of the last 15 entries of the ascending sub-category counts, in alphabetical order
categoricals = sorted(gr_data_subCate_sorted.index[-15:])

In [None]:
# Keep only the rows whose sub-category is one of the selected labels
is_selected_subcategory = df['subCategory'].isin(categoricals)
data_20 = df[is_selected_subcategory]

In [None]:
# data_20 = data_20[['subCategory', 'filename']]

In [None]:
# Display the filtered frame
data_20

In [None]:
# Number of rows per selected sub-category, largest first
data_20.groupby('subCategory').size().sort_values(ascending=False)

In [None]:
from sklearn.utils import resample, shuffle
from sklearn.model_selection import train_test_split

In [None]:
# Cap every selected sub-category at n_samples rows to reduce the class imbalance.
n_samples = 600
lst_df = []
for categorical in categoricals:
    df_class_tmp = data_20.loc[data_20['subCategory'] == categorical]
    if df_class_tmp.shape[0] < n_samples:
        # small class: keep every row
        df_resample_tmp = df_class_tmp
    else:
        # replace=False: sklearn's resample bootstraps (draws WITH replacement)
        # by default, which would put duplicated rows and duplicated index
        # labels into df_new and break the label -> position lookups later on.
        df_resample_tmp = resample(df_class_tmp, replace=False, n_samples=n_samples, random_state=42)
    lst_df.append(df_resample_tmp)
df_new = pd.concat(lst_df)

In [None]:
# Rows per sub-category after balancing, sorted ascending
cate = df_new.groupby('subCategory').size().sort_values()
top_cate = cate[-25:]

# plt.figure(figsize = (10, 10))
with plt.rc_context({"ytick.color":"darkgrey"}):
    plt.barh(top_cate.index, top_cate.values, color='pink')

# hide the frame around the bars
current_ax = plt.gca()
for border in current_ax.spines.values():
    border.set_visible(False)

plt.ylabel("$CATEGORIES$", size=15, color="darkgrey")
plt.xlabel("Number of Image", size=15, color="darkgrey")
plt.show()

 **Use the Model for Recommendation**

In [None]:
import tensorflow as tf
import keras
from keras import Model
from keras.applications.resnet import ResNet50
from keras.preprocessing import image
# from keras.applications.resnet50 import preprocess_input, decode_predictions
from keras.applications.densenet import preprocess_input, decode_predictions
from keras.layers import GlobalMaxPooling2D
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Dense, Flatten, Dropout, BatchNormalization
from tensorflow.keras.applications.vgg19 import VGG19
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.metrics.pairwise import pairwise_distances
import sklearn

tf.__version__


In [None]:
keras.__version__


In [None]:
sklearn.__version__


In [None]:
np.__version__


In [None]:
pd.__version__

In [None]:
seaborn.__version__

In [None]:
sfgdhhgf

In [None]:
# VGG19 convolutional base with ImageNet weights, without its classifier head,
# for 224x224 RGB inputs.
base_model = VGG19(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Freeze the pretrained layers.
for layer in base_model.layers:
    layer.trainable = False

# New head on top of the base: two dense blocks (each followed by batch
# normalisation and 40% dropout) and a 20-way softmax output.
# NOTE(review): the output layer has 20 units while `categoricals` selects 15
# sub-categories — confirm the intended number of classes.
# NOTE(review): no compile()/fit() call is visible in this notebook, so this head
# appears to keep its initial random weights when get_embedding uses the model —
# confirm that this is intended.
x = base_model.output
x = Flatten()(x)
x = Dense(4096, activation='leaky_relu')(x)
x = BatchNormalization()(x)
x = Dropout(0.4)(x)
x = Dense(1024, activation='sigmoid')(x)
x = BatchNormalization()(x)
x = Dropout(0.4)(x)
predictions = Dense(20, activation='softmax')(x)

# Full network: image in, softmax vector out.
model = Model(inputs=base_model.input, outputs=predictions)

model.summary()

In [None]:
# base_model = VGG19(weights='imagenet', 
#                       include_top=False, 
#                       input_shape = (224, 224, 3))
# base_model.trainable = False

# # Add Layer Embedding
# model = keras.Sequential([
#     base_model,
#     GlobalMaxPooling2D()
# ])

# model.summary()

In [None]:
# Persist the model under the path "model_vgg19_ch".
# NOTE(review): recent Keras releases expect a ".keras" or ".h5" suffix in the
# target path — confirm against the installed version.
model.save("model_vgg19_ch")

In [None]:
def get_embedding(model, img_name):
    """Return the flattened output of `model` for one dataset image.

    Parameters
    ----------
    model : object with a Keras-style ``predict``; the image is preprocessed
        the way the VGG19 network built in this notebook expects
    img_name : image file name, resolved through img_path()

    Returns
    -------
    1-D numpy array: ``model.predict`` output reshaped with ``reshape(-1)``.
    """
    # Local import on purpose: the module-level `preprocess_input` comes from
    # keras.applications.densenet, whose input scaling does not match the
    # VGG19 weights used by the model.
    from tensorflow.keras.applications.vgg19 import preprocess_input as vgg19_preprocess_input

    # Load and resize to the network's input size
    # (`image` is the module-level alias that provides load_img / img_to_array)
    img = image.load_img(img_path(img_name), target_size=(224, 224))
    # Image to float array
    x   = image.img_to_array(img)
    # Add the batch dimension: (1, 224, 224, 3)
    x   = np.expand_dims(x, axis=0)
    # VGG19-specific preprocessing
    x   = vgg19_preprocess_input(x)
    return model.predict(x).reshape(-1)

Get item Embedding

In [None]:
import keras.utils as image

In [None]:
# First rows of the balanced sample
df_new.head()

In [None]:
# Embedding of the first sampled product, as a quick check of get_embedding
emb = get_embedding(model, df_new.iloc[0].image)
emb.shape

In [None]:
# Show the same product (converted from BGR to RGB for matplotlib) and print
# the shape of its down-scaled image together with its embedding
img_array = load_image(df_new.iloc[0].image)
plt.imshow(cv2.cvtColor(img_array, cv2.COLOR_BGR2RGB))
print(img_array.shape)
print(emb)

In [None]:
# (rows, columns) of the balanced sample
df_new.shape

Get embeddings for all items in the dataset

In [None]:
%%time
#import swifter

# Parallel apply
df_sample      = df_new#.sample(10)
map_embeddings = df_sample['image'].apply(lambda img: get_embedding(model, img))
df_embs        = map_embeddings.apply(pd.Series)

print(df_embs.shape)
df_embs.head()

#### Compute Similarity Between Items

In [None]:
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise_distances.html
from sklearn.metrics.pairwise import pairwise_distances

# Compute the pairwise cosine distance matrix of the embeddings and turn it
# into a similarity matrix (similarity = 1 - distance)
cosine_sim = 1-pairwise_distances(df_embs, metric='cosine')
cosine_sim[:4, :4]

In [None]:
# Full similarity matrix
cosine_sim

In [None]:
# Embedding table (one row per item)
df_embs

In [None]:
# Metadata of the embedded items
df_sample

#### Recommend Similar Items

In [None]:
# Row position inside cosine_sim for every df_new index label
indices = pd.Series(range(len(df_new)), index=df_new.index)

def get_recommender(idx, df_new, top_n = 5):
    """Return the items most similar to `idx` by cosine similarity.

    Parameters
    ----------
    idx : index label of the query item (must be present in `indices`)
    df_new : unused; kept so existing calls keep working
    top_n : number of recommendations to return

    Returns
    -------
    (labels, scores) : pandas Index with the labels of the recommended items
        and a list with their similarity to the query, best match first.

    Raises
    ------
    KeyError
        If `idx` is not a label of `indices`.
    """
    # A label may occur more than once in `indices`; np.atleast_1d covers both
    # the scalar and the Series result of the lookup.
    query_pos = np.atleast_1d(indices.loc[idx])
    scores    = cosine_sim[query_pos[0]]
    # Stable sort on the negated scores = descending similarity with ties kept
    # in their original order.
    ranked    = np.argsort(-scores, kind='stable')
    # Drop the query item explicitly instead of assuming it is ranked first.
    ranked    = ranked[~np.isin(ranked, query_pos)][:top_n]

    return indices.index[ranked], list(scores[ranked])

# 9805 is the example id used in this notebook; fall back to the first sampled
# row when that id is not part of the sample.
example_idx = 9805 if 9805 in indices.index else indices.index[0]
get_recommender(example_idx, df_new, top_n = 5)

In [None]:
def show_reccomendations(image_id,x):
    """Show a reference product and its `x` most similar products.

    Parameters
    ----------
    image_id : index label of the reference product
    x : number of recommendations to display (one subplot column each)
    """
    # Recommendations (the similarity scores are not displayed)
    idx_rec, _ = get_recommender(image_id, df_new, top_n = x)

    # Reference image. image_id is an index label, so it is looked up with
    # .loc rather than by position.
    plt.imshow(cv2.cvtColor(load_image(df.loc[image_id].image), cv2.COLOR_BGR2RGB))

    # generation of a dictionary of (title, images)
    figures = {'im'+str(i): load_image(row.image) for i, row in df_new.loc[idx_rec].iterrows()}
    # plot of the recommended images in a single row with x columns
    plot_figures(figures, 1, x)

In [None]:
df_sample.index

In [None]:
# choose a random element from a list
from random import seed
from random import choice
# seed random number generator
seed(1)
# prepare a sequence
# make choices from the sequence
for _ in range(5):
 selection = choice(df_sample.index)
 print(selection)

In [None]:
show_reccomendations(9805,5)

In [None]:
show_reccomendations(choice(df_sample.index),6)

In [None]:
# Reference image plus five recommendations for another random product.
# Calls show_reccomendations instead of repeating its body inline.
idx_ref = choice(df_sample.index)
show_reccomendations(idx_ref, 5)

In [None]:
from sklearn.manifold import TSNE
import time
import seaborn as sns

In [None]:
df_new.head()

In [None]:
time_start = time.time()
tsne = TSNE(n_components=2, verbose=0, perplexity=40, n_iter=300)
tsne_results = tsne.fit_transform(df_embs)
print('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))

In [None]:
# Attach the two t-SNE coordinates to the metadata, one column per component
for component, column_name in enumerate(('tsne-2d-one', 'tsne-2d-two')):
    df_new[column_name] = tsne_results[:, component]

In [None]:
# t-SNE projection coloured by master category
fig_master, ax_master = plt.subplots(figsize=(16,10))
sns.scatterplot(data=df_new,
                x="tsne-2d-one",
                y="tsne-2d-two",
                hue="masterCategory",
                legend="full",
                alpha=0.8,
                ax=ax_master)

In [None]:
# t-SNE projection coloured by sub-category
fig_sub, ax_sub = plt.subplots(figsize=(16,10))
sns.scatterplot(data=df_new,
                x="tsne-2d-one",
                y="tsne-2d-two",
                hue="subCategory",
                legend="full",
                alpha=0.8,
                ax=ax_sub)

In [None]:
# Export a small preview, the embeddings and the full metadata.
# random_state makes the 10-row preview file identical on every run.
df_new.sample(10, random_state=42).to_csv('df_sample.csv')
df_embs.to_csv('embeddings.csv')
df_new.to_csv('metadados.csv')