In [None]:
# !pip install keras-tuner
# !pip install wandb

In [None]:
#ignore warnings
import warnings

warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import cv2
import random

import tensorflow as tf
# data generator 
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# wandb
import wandb
from wandb.keras import WandbCallback

DATA_PATH = '/kaggle/input/fashion-product-images-dataset/'

%matplotlib inline

In [None]:
images_df = pd.read_csv(
    os.path.join(DATA_PATH, 'fashion-dataset', 'images.csv')
)

In [None]:
images_df.head(2)

In [None]:
styles_df = pd.read_csv(
    os.path.join(DATA_PATH, 'fashion-dataset', 'styles.csv'),
    on_bad_lines='skip'
)

In [None]:
styles_df.head(2)

In [None]:
images_df.head(2)

In [None]:
# tạo id để merge với metadata df
images_df['id'] = images_df['filename']\
.apply(lambda filename: filename\
       .replace('.jpg', '')).astype(int)

In [None]:
images_df.head(2)

## Merging the two dataframes

In [None]:
data = styles_df.merge(images_df, on='id', how='left')
data.head(2)


In [None]:
# chuyển filename thành filepath
data['filename'] = data['filename']\
.apply(lambda filename: os.path.join(DATA_PATH, 'fashion-dataset', 'images', filename))
data.head(2)

## Removing Products for which images are not present

In [None]:
# lấy danh sách ảnh trong dataset
image_files = os.listdir(
    os.path.join(DATA_PATH, 'fashion-dataset', 'images')
)
print(len(image_files))

In [None]:
# Flag the rows whose image file is actually present in the dataset.
# Membership is tested against a set so each lookup is O(1) instead of
# scanning the whole directory listing once per row.
image_file_set = set(image_files)
data['file_found'] = data['id'].apply(lambda idx: '{}.jpg'.format(idx) in image_file_set)

In [None]:
data['file_found'].value_counts()

In [None]:
# xóa bỏ file không có ảnh trong dataset
data = data[data['file_found']].reset_index(drop=True)
print(data.shape)
data.head(2)

In [None]:
data.isnull().sum()

## Visualizations

In [None]:
# Visualize a few random images from the dataset
def dislay_image(image_files):
    """Plot up to nine randomly chosen images in a 3x3 grid.

    Args:
        image_files: sequence of image file names located under
            DATA_PATH/fashion-dataset/images.
    """
    # Sample instead of shuffling in place so the caller's list keeps its
    # order (other cells index into image_files by position).
    chosen = random.sample(list(image_files), min(9, len(image_files)))
    for idx, image_file in enumerate(chosen):
        image_path = os.path.join(DATA_PATH, 'fashion-dataset', 'images', image_file)
        image_arr = cv2.imread(image_path)
        if image_arr is None:
            # cv2.imread returns None for missing or unreadable files
            print('Could not read image: {}'.format(image_path))
            continue
        plt.subplot(3, 3, idx+1)
        # OpenCV loads BGR; matplotlib expects RGB
        plt.imshow(cv2.cvtColor(image_arr, cv2.COLOR_BGR2RGB))
        plt.axis("off")

dislay_image(image_files) 

In [None]:
# masterCategory count 
gr_data_masterCate = data.groupby('masterCategory').size()
gr_data_masterCate_sorted= gr_data_masterCate.sort_values()
gr_data_masterCate_sorted

In [None]:
plt.figure(figsize = (10, 4))
with plt.rc_context({"ytick.color":"darkgrey"}):
    plt.barh(gr_data_masterCate_sorted.index, gr_data_masterCate_sorted.values, color='lightblue')
for spine in plt.gca().spines.values():
    spine.set_visible(False)
plt.ylabel("$CATEGORIES$", size=15, color="darkgrey")
plt.xlabel("Number of Image", size=15, color="darkgrey")
plt.show()

In [None]:
# subCategory count 
gr_data_subCate = data.groupby('subCategory').size()
gr_data_subCate_sorted= gr_data_subCate.sort_values()

In [None]:
len(gr_data_subCate_sorted)

In [None]:
plt.figure(figsize = (10, 10))
with plt.rc_context({"ytick.color":"darkgrey"}):
    plt.barh(gr_data_subCate_sorted[-25:].index, gr_data_subCate_sorted[-25:].values, color='lightblue')
for spine in plt.gca().spines.values():
    spine.set_visible(False)
plt.ylabel("$CATEGORIES$", size=15, color="darkgrey")
plt.xlabel("Number of Image", size=15, color="darkgrey")
plt.show()

In [None]:
# articleType count 
gr_data_articleType = data.groupby('articleType').size()
gr_data_articleType_sorted = gr_data_articleType.sort_values()

In [None]:
plt.figure(figsize = (10, 10))
with plt.rc_context({"ytick.color":"darkgrey"}):
    plt.barh(gr_data_articleType_sorted[-25:].index, gr_data_articleType_sorted[-25:].values, color='pink')
for spine in plt.gca().spines.values():
    spine.set_visible(False)
plt.ylabel("$CATEGORIES$", size=15, color="darkgrey")
plt.xlabel("Number of Image", size=15, color="darkgrey")
plt.show()

## final data

In [None]:
# keep the 20 largest sub-categories

In [None]:
categoricals = sorted(list(gr_data_subCate_sorted.index[-20:]))

In [None]:
data_20 = data[data['subCategory'].isin(categoricals)]

In [None]:
data_20 = data_20[['subCategory', 'filename']]

In [None]:
data_20

In [None]:
data_20.groupby('subCategory').size().sort_values(ascending=False)

In [None]:
# take at most 600 images per category


In [None]:
from sklearn.utils import resample, shuffle
from sklearn.model_selection import train_test_split

In [None]:
# Cap every category at n_samples images; smaller categories are kept whole.
n_samples = 600
lst_df = []
for categorical in categoricals:
    df_class_tmp = data_20.loc[data_20['subCategory'] == categorical]
    if df_class_tmp.shape[0] < n_samples:
        df_resample_tmp = df_class_tmp
    else:
        # replace=False: sklearn's resample bootstraps (samples WITH
        # replacement) by default, which would duplicate images and let the
        # same file land in both the training and the evaluation splits.
        df_resample_tmp = resample(df_class_tmp, replace=False, n_samples=n_samples, random_state=42)
    lst_df.append(df_resample_tmp)
df = pd.concat(lst_df)

In [None]:
cate = df.groupby('subCategory').size().sort_values()

# plt.figure(figsize = (10, 10))
with plt.rc_context({"ytick.color":"darkgrey"}):
    plt.barh(cate[-25:].index, cate[-25:].values, color='pink')
for spine in plt.gca().spines.values():
    spine.set_visible(False)
plt.ylabel("$CATEGORIES$", size=15, color="darkgrey")
plt.xlabel("Number of Image", size=15, color="darkgrey")
plt.show()

In [None]:
df.shape

In [None]:
df = shuffle(df, random_state=42)
df = df.reset_index(drop=True)

In [None]:
df.rename({'subCategory':'categorical'}, axis=1, inplace=True)

In [None]:
# final data
data = df
data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
train_df, test_df = train_test_split(data,\
                                     test_size=0.2,\
                                     random_state=42,\
                                     stratify=data['categorical'])
valid_df, test_df = train_test_split(test_df,\
                                      test_size=0.5,\
                                      random_state=42,\
                                      stratify=test_df['categorical'])

In [None]:
train_df = train_df.reset_index(drop=True)
valid_df = valid_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [None]:
train_df

## Data Augmentation

In [None]:
# Training pipeline: rescale to [0, 1] plus light augmentation.
datagen = ImageDataGenerator(rescale=1/255.,
                            shear_range=0.2,
                            zoom_range=0.2,
                            horizontal_flip=True)

train_generator = datagen.flow_from_dataframe(dataframe=train_df,
                                             target_size=(224,224),
                                             x_col='filename',
                                             y_col='categorical',
                                             class_mode='categorical',
                                             batch_size=32,
                                             shuffle=True,
                                             seed=42)

# Evaluation pipelines: rescale only, no augmentation.
# shuffle=False keeps the batch order identical to the dataframe order, so
# predictions can be matched against valid_df / test_df rows and labels.
test_datagen = ImageDataGenerator(rescale=1/255.)
valid_generator = test_datagen.flow_from_dataframe(dataframe=valid_df,
                                             target_size=(224,224),
                                             x_col='filename',
                                             y_col='categorical',
                                             class_mode='categorical',
                                             batch_size=32,
                                             shuffle=False)

test_generator = test_datagen.flow_from_dataframe(
    dataframe=test_df,
    x_col='filename',
    y_col='categorical',
    target_size=(224, 224),
    batch_size=32,
    class_mode='categorical',
    shuffle=False
)

In [None]:
# tmp

In [None]:
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Dense, Flatten, Dropout, BatchNormalization
from tensorflow.keras.applications.vgg19 import VGG19
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

In [None]:
def build_model(name, weights_path=None, num_classes=20):
    """Build a VGG19-based classifier with a frozen convolutional base.

    The head is Flatten -> Dense(4096) -> BN -> Dropout -> Dense(1024, sigmoid)
    -> BN -> Dropout -> Dense(num_classes, softmax).

    Args:
        name: name given to the resulting Keras model.
        weights_path: optional path to weights loaded into the full model.
        num_classes: size of the softmax output (defaults to the 20
            sub-categories selected earlier in the notebook).

    Returns:
        The uncompiled Keras Model.
    """
    base_model = VGG19(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

    # Freeze the ImageNet-pretrained base; only the new head is trained.
    for layer in base_model.layers:
        layer.trainable = False

    x = base_model.output
    x = Flatten()(x)
    x = Dense(4096, activation='leaky_relu')(x)
    x = BatchNormalization()(x)
    x = Dropout(0.4)(x)
    # 1024-unit sigmoid layer; a later cell thresholds activations at 0.5 to
    # derive binary hash codes.
    x = Dense(1024, activation='sigmoid')(x)
    x = BatchNormalization()(x)
    x = Dropout(0.4)(x)
    predictions = Dense(num_classes, activation='softmax')(x)

    model = Model(name=name, inputs=base_model.input, outputs=predictions)
    if weights_path:
        model.load_weights(weights_path)
    return model

In [None]:
import time

In [None]:
# (stray token `dfghj` removed: it raised NameError and aborted Restart & Run All)

In [None]:
NAME = "vgg19-{}".format(int(time.time()))
model = build_model(NAME)
model.summary()

In [None]:
lr = 0.01
epochs = 10
## Initialize wandb project
wandb.init(project='CBIR-fashion product dataset',
          name = NAME,
          config={
              "learning_rate": lr,
              "Batch_normalization": True,
              # log the batch size the training generator actually uses
              "Batch_size" : train_generator.batch_size,
              # numeric, so runs can be filtered/sorted by dropout rate
              "Dropout" : 0.4,
              "architecture": "VGG19",
              "dataset": "fashion-product-images-dataset",
              "epochs": epochs,
              "data generator" : True
          }
          )

In [None]:
wandb_callback = WandbCallback()

# Name the checkpoint after the run (NAME) so the saved file can be matched
# to the model and the wandb run that produced it.
filepath = "{}_loss_opti.hdf5".format(NAME)

# Keep only the full model (architecture + weights) with the lowest val_loss.
checkpoint1 = tf.keras.callbacks.ModelCheckpoint(
    filepath,
    monitor="val_loss",
    verbose=1,
    save_best_only=True,
    save_weights_only=False,
    mode="auto",
    save_freq="epoch"
)


In [None]:
def recall(y_true, y_pred):
    """Batch-wise recall: true positives / actual positives.

    Args:
        y_true: one-hot ground-truth labels.
        y_pred: predicted class probabilities, same shape as y_true.

    Returns:
        Scalar tensor with the recall of the batch.
    """
    # The ground truth must be used as-is: replacing it with ones would count
    # every confident prediction as a true positive.
    y_true = tf.cast(y_true, y_pred.dtype)
    true_positives = tf.reduce_sum(tf.round(tf.clip_by_value(y_true * y_pred, 0, 1)))
    all_positives = tf.reduce_sum(tf.round(tf.clip_by_value(y_true, 0, 1)))

    # epsilon guards against division by zero when a batch has no positives
    return true_positives / (all_positives + tf.keras.backend.epsilon())

def precision(y_true, y_pred):
    """Batch-wise precision: true positives / predicted positives.

    Args:
        y_true: one-hot ground-truth labels.
        y_pred: predicted class probabilities, same shape as y_true.

    Returns:
        Scalar tensor with the precision of the batch.
    """
    # The ground truth must be used as-is: replacing it with ones would make
    # precision trivially equal to 1 whenever anything is predicted.
    y_true = tf.cast(y_true, y_pred.dtype)
    true_positives = tf.reduce_sum(tf.round(tf.clip_by_value(y_true * y_pred, 0, 1)))

    predicted_positives = tf.reduce_sum(tf.round(tf.clip_by_value(y_pred, 0, 1)))
    # epsilon guards against division by zero when nothing is predicted
    return true_positives / (predicted_positives + tf.keras.backend.epsilon())

def f1_score(y_true, y_pred):
    """Batch-wise F1: harmonic mean of precision and recall.

    Args:
        y_true: one-hot ground-truth labels.
        y_pred: predicted class probabilities, same shape as y_true.

    Returns:
        Scalar tensor with the F1 score of the batch.
    """
    # Call the metric functions under the names they are defined with in this
    # notebook (precision / recall).
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    return 2*((p*r)/(p+r+tf.keras.backend.epsilon()))

In [None]:
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(learning_rate=lr),
              metrics=['accuracy', precision, recall, f1_score])

In [None]:
# Validate on the held-out validation generator; validating on the training
# generator would make val_loss (which ModelCheckpoint monitors) meaningless.
# Model.fit accepts generators directly; fit_generator is deprecated.
history = model.fit(train_generator,
                    validation_data = valid_generator,
                    steps_per_epoch = train_generator.n//train_generator.batch_size,
                    validation_steps = valid_generator.n//valid_generator.batch_size,
                    epochs=epochs,
                    callbacks=[checkpoint1, wandb_callback])

In [None]:
filepath = "{}_loss_opti.hdf5".format("vgg19-{}".format(int(time.time())))
model.save(filepath)

In [None]:
# pip install google-colab

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# model.save(r"C:\Users\olha.burtso\Documents\study\cbir_vgg19_model.h5")

In [None]:
# from keras.models import load_model 
# model3 = load_model(r'C:\Users\olha.burtso\Downloads\model-best.h5')

In [None]:
# model3.summary()  # model3 is only created by the commented-out load_model cell above

In [None]:
sweep_id = wandb.run.sweep_id
print("Sweep ID:", sweep_id)

In [None]:
# import wandb
# api = wandb.Api()

# sweep = api.sweep("rec_diploma/CBIR-fashion product dataset/1699400325")
# runs = sorted(sweep.runs,
#   key=lambda run: run.summary.get("val_acc", 0), reverse=True)
# val_acc = runs[0].summary.get("val_acc", 0)
# print(f"Best run {runs[0].name} with {val_acc}% validation accuracy")

# runs[0].file("model.h5").download(replace=True)
# print("Best model saved to model-best.h5")

In [None]:
# Evaluate model2 on the test generator and report the first two returned values.
# NOTE(review): model2 is only created further down (build_model_vgg cell), so
# this cell fails on a fresh Restart & Run All — confirm the intended cell order.
score = model2.evaluate_generator(test_generator)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

In [None]:
from tensorflow.keras.models import load_model
# Checkpoint path is set here because the cell assigning filepath1 comes
# later in the notebook and would not have run yet on a fresh kernel.
filepath1 = 'vgg19-1699558789_loss_opti.hdf5'
# Presumably this checkpoint was saved from the model compiled with the custom
# metric functions above; they must be supplied to deserialize it.
best_model1 = load_model(
    filepath1,
    custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score},
)
# Later cells refer to the loaded model as `best_model`.
best_model = best_model1

In [None]:
filepath

In [None]:
filepath1 = 'vgg19-1699558789_loss_opti.hdf5'

In [None]:
best_model.summary()

In [None]:
score = best_model.evaluate_generator(test_generator)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

In [None]:
IMAGESIZE = 224
CHANNELS = 3
def image_preprocess(image_path):
    """Load an image and turn it into a single-item model input batch.

    Args:
        image_path: path of the image file to read.

    Returns:
        Float array of shape (1, IMAGESIZE, IMAGESIZE, CHANNELS), RGB channel
        order, values scaled to [0, 1].

    Raises:
        FileNotFoundError: if the file is missing or cannot be decoded.
    """
    image_orig = cv2.imread(image_path)
    if image_orig is None:
        # cv2.imread signals failure by returning None instead of raising
        raise FileNotFoundError('Could not read image: {}'.format(image_path))
    # OpenCV loads BGR; the training generators feed RGB
    image_arr = cv2.cvtColor(image_orig, cv2.COLOR_BGR2RGB)
    image_arr = cv2.resize(image_arr, (IMAGESIZE, IMAGESIZE))
    # same 1/255 rescale as the ImageDataGenerator used for training
    image_arr = image_arr/255.
    image_arr = image_arr.reshape(-1, IMAGESIZE, IMAGESIZE, CHANNELS)
    return image_arr

In [None]:
test_df.filename[1]

In [None]:
anchor_path = '/kaggle/input/fashion-product-images-dataset/fashion-dataset/images/50892.jpg'
pos_path = '/kaggle/input/fashion-product-images-dataset/fashion-dataset/images/45986.jpg'
neg_path = '/kaggle/input/fashion-product-images-dataset/fashion-dataset/images/29863.jpg'
image_arr1 = image_preprocess(anchor_path)
image_arr2 = image_preprocess(pos_path)
image_arr3 = image_preprocess(neg_path)
image_lst = [image_arr1, image_arr2, image_arr3]
for i, image_arr in enumerate(image_lst):
    plt.subplot(1, 3, i+1)
    plt.imshow(image_arr[0])
    plt.axis(False)
plt.show()

In [None]:
y_hat = model2.predict(image_arr)

In [None]:
img_features = y_hat

In [None]:
from PIL import Image
import os
import matplotlib.pyplot as plt
import numpy as np

from keras.applications import vgg16
from tensorflow.keras.preprocessing.image import load_img,img_to_array
from keras.models import Model
from keras.applications.imagenet_utils import preprocess_input


from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

In [None]:
importedImages = []

for f in image_files:
    filename = f
    image_arr = cv2.imread(os.path.join(DATA_PATH, 'fashion-dataset', 'images', filename))
    numpy_image = cv2.cvtColor(image_arr, cv2.COLOR_BGR2RGB)
    # Resize to the network input size before stacking: np.vstack needs every
    # image to have the same shape, and full-resolution copies of the whole
    # dataset would not fit in memory.
    numpy_image = cv2.resize(numpy_image, (224, 224))
    image_batch = np.expand_dims(numpy_image, axis=0)

    importedImages.append(image_batch)

# one array with a row per image, in image_files order
images = np.vstack(importedImages)

processed_imgs = preprocess_input(images.copy())

In [None]:
y_hat.argmax()

In [None]:
y_hat

In [None]:
categoricals[14]

In [None]:
best_model.layers

In [None]:
# Feature extractor for hashing: tap the 1024-unit sigmoid Dense layer.
# Counting from the end: [-1] softmax, [-2] Dropout, [-3] BatchNormalization,
# [-4] Dense(1024, sigmoid). The 0.5 threshold applied below only makes sense
# on sigmoid activations in [0, 1], not on the batch-normalized values.
CBIR_model = Model(inputs=best_model.input, outputs=best_model.layers[-4].output)

In [None]:
CBIR_model

In [None]:
prehashcode1 = CBIR_model.predict(image_arr1)
prehashcode2 = CBIR_model.predict(image_arr2)
prehashcode3 = CBIR_model.predict(image_arr3)

In [None]:
prehashcode1.shape

In [None]:
hashcode1 = np.where(prehashcode1<0.5, 0, 1)
hashcode2 = np.where(prehashcode2<0.5, 0, 1)
hashcode3 = np.where(prehashcode3<0.5, 0, 1)

In [None]:
hashcode1 = hashcode1.astype('bool')
hashcode2 = hashcode2.astype('bool')
hashcode3 = hashcode3.astype('bool')

In [None]:
# Hamming distance = number of positions where the two binary codes differ.
# (np.hamming builds a Hamming *window* for signal processing and is unrelated.)
hamming_distance = np.count_nonzero(hashcode1 != hashcode2)

In [None]:
hamming_dist = np.count_nonzero(hashcode1 != hashcode2)

In [None]:
hamming_dist

In [None]:
hamming_dist = np.count_nonzero(hashcode1 != hashcode3)
hamming_dist

In [None]:
hamming_dist = np.count_nonzero(hashcode2 != hashcode3)
hamming_dist

In [None]:
# NOTE(review): this repeats the build_model definition from earlier in the
# notebook; keep a single definition once the notebook is cleaned up.
def build_model(name, weights_path=None, num_classes=20):
    """Build a VGG19-based classifier with a frozen convolutional base.

    The head is Flatten -> Dense(4096) -> BN -> Dropout -> Dense(1024, sigmoid)
    -> BN -> Dropout -> Dense(num_classes, softmax).

    Args:
        name: name given to the resulting Keras model.
        weights_path: optional path to weights loaded into the full model.
        num_classes: size of the softmax output (defaults to the 20
            sub-categories selected earlier in the notebook).

    Returns:
        The uncompiled Keras Model.
    """
    base_model = VGG19(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

    # Freeze the ImageNet-pretrained base; only the new head is trained.
    for layer in base_model.layers:
        layer.trainable = False

    x = base_model.output
    x = Flatten()(x)
    x = Dense(4096, activation='leaky_relu')(x)
    x = BatchNormalization()(x)
    x = Dropout(0.4)(x)
    x = Dense(1024, activation='sigmoid')(x)
    x = BatchNormalization()(x)
    x = Dropout(0.4)(x)
    predictions = Dense(num_classes, activation='softmax')(x)

    model = Model(name=name, inputs=base_model.input, outputs=predictions)
    if weights_path:
        model.load_weights(weights_path)
    return model

In [None]:
def build_model_vgg(name, weights_path=None, num_classes=20):
    """Build a minimal classifier: frozen VGG19 base -> global max pool -> softmax.

    Args:
        name: name given to the resulting Keras model.
        weights_path: optional path to weights loaded into the full model.
        num_classes: size of the softmax output (defaults to 20).

    Returns:
        The uncompiled Keras Sequential model.
    """
    base_model = VGG19(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

    base_model.trainable = False

    # Use tf.keras throughout (tf is imported at the top of the notebook): the
    # bare `keras` module and GlobalMaxPooling2D are only imported in a later
    # cell, so referencing them here fails on a fresh kernel.
    model = tf.keras.Sequential([
        base_model,
        tf.keras.layers.GlobalMaxPooling2D(),
        tf.keras.layers.Dense(num_classes, activation='softmax')
    ], name=name)

    if weights_path:
        model.load_weights(weights_path)
    return model

In [None]:
import keras
from keras import Model
from keras.applications.resnet import ResNet50
from keras.preprocessing import image
# from keras.applications.resnet50 import preprocess_input, decode_predictions
from keras.applications.densenet import preprocess_input, decode_predictions
from keras.layers import GlobalMaxPooling2D

In [None]:
# Input Shape
# img_width, img_height, _ = 224, 224, 3 #load_image(df.iloc[0].image).shape

# # Pre-Trained Model
# base_model = ResNet50(weights='imagenet', 
#                       include_top=False, 
#                       input_shape = (img_width, img_height, 3))
# base_model.trainable = False

# # Add Layer Embedding
# model = keras.Sequential([
#     base_model,
#     GlobalMaxPooling2D()
# ])

# model.summary()

In [None]:
NAME2 = "vgg19_only2-{}".format(int(time.time()))
model2 = build_model_vgg(NAME2)
model2.summary()

In [None]:
lr = 0.01
epochs = 10
## Initialize wandb project
wandb.init(project='CBIR-fashion product dataset',
          name = NAME2,
          config={
              "learning_rate": lr,
              # model2 (build_model_vgg) has no BatchNormalization or Dropout layers
              "Batch_normalization": False,
              # log the batch size the training generator actually uses
              "Batch_size" : train_generator.batch_size,
              "Dropout" : 0.0,
              "architecture": "VGG19",
              "dataset": "fashion-product-images-dataset",
              "epochs": epochs,
              "data generator" : True
          }
          )

In [None]:
wandb_callback = WandbCallback()

filepath = "{}_loss_opti2.hdf5".format(NAME2)

checkpoint1 = tf.keras.callbacks.ModelCheckpoint(
    filepath,
    monitor="val_loss",
    verbose=1,
    save_best_only=True,
    save_weights_only=False,
    mode="auto",
    save_freq="epoch"
)


In [None]:
model2.compile(loss='categorical_crossentropy',
              optimizer=Adam(learning_rate=lr),
              metrics=['accuracy'])

In [None]:
# Validate on the held-out validation generator; validating on the training
# generator would make val_loss (which ModelCheckpoint monitors) meaningless.
# Model.fit accepts generators directly; fit_generator is deprecated.
history = model2.fit(train_generator,
                    validation_data = valid_generator,
                    steps_per_epoch = train_generator.n//train_generator.batch_size,
                    validation_steps = valid_generator.n//valid_generator.batch_size,
                    epochs=epochs,
                    callbacks=[checkpoint1, wandb_callback])

In [None]:
data

In [None]:
def img_path(img):
    """Return the full path of an image file name inside the dataset.

    Uses the same DATA_PATH/fashion-dataset/images layout as the rest of the
    notebook.
    """
    return os.path.join(DATA_PATH, 'fashion-dataset', 'images', img)

def load_image(img, resized_fac = 0.1):
    """Read a dataset image and shrink it by a constant factor.

    Args:
        img: image file name, resolved through img_path.
        resized_fac: scale factor applied to both width and height.

    Returns:
        The resized image as loaded by OpenCV (BGR channel order).

    Raises:
        FileNotFoundError: if the file is missing or cannot be decoded.
    """
    path = img_path(img)
    image = cv2.imread(path)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising
        raise FileNotFoundError('Could not read image: {}'.format(path))
    # ndarray shape is (rows, cols, channels) == (height, width, channels)
    height, width = image.shape[:2]
    # cv2.resize takes the target size as (width, height)
    resized = cv2.resize(image, (int(width*resized_fac), int(height*resized_fac)), interpolation = cv2.INTER_AREA)
    return resized

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# generation of a dictionary of (title, images)
# image_files is a plain list of file names, so sample it with random.sample
figures = {'im'+str(i): load_image(name) for i, name in enumerate(random.sample(image_files, 6))}
# plot of the images in a figure, with 2 rows and 3 columns
fig, axes = plt.subplots(2, 3)
for ax, (title, img) in zip(axes.ravel(), figures.items()):
    # OpenCV loads BGR; matplotlib expects RGB
    ax.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    ax.set_title(title)
    ax.axis("off")
plt.show()

In [None]:
# Visualize a few random images from the dataset
def dislay_image(image_files):
    """Plot up to nine randomly chosen images in a 3x3 grid.

    Args:
        image_files: sequence of image file names located under
            DATA_PATH/fashion-dataset/images.
    """
    # Sample instead of shuffling in place so the caller's list keeps its
    # order (other cells index into image_files by position).
    chosen = random.sample(list(image_files), min(9, len(image_files)))
    for idx, image_file in enumerate(chosen):
        image_path = os.path.join(DATA_PATH, 'fashion-dataset', 'images', image_file)
        image_arr = cv2.imread(image_path)
        if image_arr is None:
            # cv2.imread returns None for missing or unreadable files
            print('Could not read image: {}'.format(image_path))
            continue
        plt.subplot(3, 3, idx+1)
        # OpenCV loads BGR; matplotlib expects RGB
        plt.imshow(cv2.cvtColor(image_arr, cv2.COLOR_BGR2RGB))
        plt.axis("off")

dislay_image(image_files) 

In [None]:
def load_image(image_file):
    """Read one dataset image and return it as an RGB array.

    Args:
        image_file: image file name inside DATA_PATH/fashion-dataset/images.

    Returns:
        The image as an array in RGB channel order.
    """
    # A subscript expression is not a valid parameter, so the function takes
    # the file name itself and reads that file rather than a fixed index.
    image_arr = cv2.imread(os.path.join(DATA_PATH, 'fashion-dataset', 'images', image_file))
    image_arr = cv2.cvtColor(image_arr, cv2.COLOR_BGR2RGB)
    return image_arr

In [None]:
from sklearn.metrics.pairwise import pairwise_distances

# Compute the pairwise cosine-similarity matrix (1 - cosine distance).
# NOTE(review): df_embs is not defined anywhere in the visible notebook —
# presumably one embedding row per image; confirm where it should come from.
cosine_sim = 1-pairwise_distances(df_embs, metric='cosine')
cosine_sim[:4, :4]

In [None]:
indices = pd.Series(range(len(df)), index=df.index)
indices

# Return the items most similar to a query item, ranked by cosine similarity
def get_recommender(idx, df, top_n = 5):
    """Look up the top_n most similar items for the item labelled idx.

    Relies on the notebook-level `indices` (label -> row position) and
    `cosine_sim` (square similarity matrix).

    Args:
        idx: index label of the query item in `indices`.
        df: unused; kept so existing calls keep working.
        top_n: number of recommendations to return.

    Returns:
        Tuple (index labels of the recommended items, their similarity
        scores), ordered from most to least similar.
    """
    sim_idx = indices[idx]
    # Exclude the query by position rather than dropping the first ranked
    # entry: with tied scores the query is not guaranteed to rank first.
    candidates = [
        (pos, score)
        for pos, score in enumerate(cosine_sim[sim_idx])
        if pos != sim_idx
    ]
    ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:top_n]
    idx_rec = [pos for pos, _ in ranked]
    idx_sim = [score for _, score in ranked]

    return indices.iloc[idx_rec].index, idx_sim

get_recommender(2993, image_files, top_n = 5)

In [None]:
image_arr = cv2.imread(os.path.join(DATA_PATH, 'fashion-dataset', 'images', image_files[234]))
image_arr = cv2.cvtColor(image_arr, cv2.COLOR_BGR2RGB)

plt.imshow(image_arr)
plt.axis("off")

ResNet50

In [None]:
import keras
from keras import Model
from keras.applications.resnet import ResNet50
from keras.preprocessing import image
# from keras.applications.resnet50 import preprocess_input, decode_predictions
from keras.applications.densenet import preprocess_input, decode_predictions
from keras.layers import GlobalMaxPooling2D

In [None]:
def resNetModel(name, weights_path=None):
    """Build a frozen ResNet50 feature extractor with global max pooling.

    Args:
        name: name given to the resulting Keras model.
        weights_path: optional path to weights loaded into the full model.

    Returns:
        Keras Model mapping a (224, 224, 3) image to its pooled feature vector.
    """
    base_model = ResNet50(weights='imagenet', include_top=False, input_shape = (224, 224, 3))
    base_model.trainable = False
    # Pool the single output tensor (`.output`); `.outputs` is a list and is
    # not a valid input for a single-input layer.
    output = GlobalMaxPooling2D()(base_model.output)
    model = Model(inputs=base_model.input, outputs=output, name=name)
    if weights_path:
        model.load_weights(weights_path)
    model.summary()
    return model