### Create new folders with 20 classes instead of 500 classes

In [30]:
# # run this script after extracting the dataset into a folder called "archive"
# # it creates new folders with only a few classes

# import os
# import shutil
# import random
# from tqdm import tqdm
# import json


# def merge_folders(source_root,source_dir_list, target_dir):

#     """
#     Copy every file found under the folders in source_dir_list (relative to source_root) into target_dir
#     """
#     if not os.path.exists(target_dir):
#         os.mkdir(target_dir)
#     for source_dir in source_dir_list:
#         full_path_source = os.path.join(source_root, source_dir)
#         for root, dirs, files in os.walk(full_path_source):
#             for file in files:
#                 shutil.copy(os.path.join(root, file), os.path.join(target_dir, file))


# def merge_folders_from_dict(source_path, destination_path):
#     """"""

#     if not os.path.exists(destination_path):
#         os.mkdir(destination_path)

#     categories ={}
#     with open('final_classes.json', 'r') as dict_reader:
#         categories=json.load(dict_reader)

#     for k, v in categories.items():
#         target_dir = os.path.join(destination_path, k) 
#         #print(f'the target directory is: {target_dir}') 
#         merge_folders(source_path,v, target_dir)

# merge_folders_from_dict('./archive/vinted_train', './archive/vinted_train_merged_folder')
# merge_folders_from_dict('./archive/vinted_val', './archive/vinted_val_merged_folder')
# shutil.rmtree('./archive/vinted_train')
# shutil.rmtree('./archive/vinted_val')
# os.rename('./archive/vinted_train_merged_folder', './archive/vinted_train')
# os.rename('./archive/vinted_val_merged_folder', './archive/vinted_val')

In [31]:
# import os
# import shutil
# import random
# from tqdm import tqdm

# def make_train_val(folder, divider: int):
#     train_folder, val_folder = f"{folder}_train", f"{folder}_test_only"
#     # for every sub folder in folder
#     for label in tqdm(os.listdir(folder), desc="Making train and test sets"):
#         # if label is not a folder
#         if not os.path.isdir(os.path.join(folder, label)):
#             continue
#         # make train and val sub folders
#         os.makedirs(os.path.join(train_folder, label), exist_ok=True)
#         os.makedirs(os.path.join(val_folder, label), exist_ok=True)
#         # for every file in sub folder
#         filenames = os.listdir(os.path.join(folder, label))
#         random.shuffle(filenames)
#         modulo = min(len(filenames), divider) # val is 1/5th but should contain at least 1 element
#         for i in range(len(filenames)):
#             if i % modulo == 0:
#                 shutil.copy(os.path.join(folder, label, filenames[i]), os.path.join(val_folder, label, filenames[i]))
#             else:
#                 shutil.copy(os.path.join(folder, label, filenames[i]), os.path.join(train_folder, label, filenames[i]))

# os.rename("./archive/vinted_train", "./archive/vinted")
# make_train_val(folder="archive/vinted", divider=5)
# shutil.rmtree("./archive/vinted")

In [32]:
# # then run this script to remove all classes with not enough images
# # for example, if you have less than X images in the folder vinted_train you can remove it

# min_nr_images = 100

# for folder in ["./archive/vinted_train"]:
#     for label in tqdm(os.listdir(folder), desc=f"Removing classes with less than {min_nr_images} images"):
#         if not os.path.isdir(os.path.join(folder, label)):
#             continue
#         if len(os.listdir(os.path.join(folder, label))) < min_nr_images:
#             shutil.rmtree(os.path.join(folder, label))
#             if os.path.isdir(os.path.join("./archive/vinted_val", label)):
#                 shutil.rmtree(os.path.join("./archive/vinted_val", label))
#             if os.path.isdir(os.path.join("./archive/vinted_test_only", label)):
#                 shutil.rmtree(os.path.join("./archive/vinted_test_only", label))

In [33]:
import matplotlib.pyplot as plt
import numpy as np
import os
import PIL
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from keras.callbacks import ModelCheckpoint

### Using TensorFlow

In [34]:
# Image size (in pixels) requested from the dataset loaders, and the number
# of images per batch.
img_height, img_width = 100, 100
batch_size = 1024

In [35]:
# Load the training images from the Kaggle input folder, resized to
# (img_height, img_width) and grouped into batches of batch_size.
# NOTE(review): validation_split=0.2 with subset="training" keeps only 80% of
# this folder; the remaining 20% is not loaded anywhere in the visible code
# even though a separate vinted_val folder exists — confirm this is intended.
train_ds = tf.keras.utils.image_dataset_from_directory(
    directory="/kaggle/input/archive-vinted/archive/vinted_train",
    image_size=(img_height, img_width),
    batch_size=batch_size,
    validation_split=0.2,
    subset="training",
    seed=123,
)

In [36]:
# Load the validation images from their own dedicated folder, resized to
# (img_height, img_width) and grouped into batches of batch_size.
# No validation_split/subset is passed here: the folder already is the
# validation set, and asking for the "training" subset of a 0.2 split would
# silently discard 20% of the validation images.
val_ds = tf.keras.utils.image_dataset_from_directory(
  "/kaggle/input/archive-vinted/archive/vinted_val",
  seed=123,
  image_size=(img_height, img_width), batch_size=batch_size)

In [37]:
# Class label names as reported by the training dataset. They are captured
# here because train_ds is rebound to a cached/prefetched pipeline in a later
# cell.
# NOTE(review): presumably the rebound dataset no longer exposes class_names — confirm.
class_names = train_ds.class_names
#print(class_names)

In [38]:
# Input-pipeline tuning.
# tf.data.AUTOTUNE lets tf.data choose the prefetch buffer size dynamically
# at runtime instead of using a hand-picked constant.
AUTOTUNE = tf.data.AUTOTUNE

# Training pipeline: cache, shuffle with a buffer size of 1000, then prefetch.
train_ds = train_ds.cache()
train_ds = train_ds.shuffle(1000)
train_ds = train_ds.prefetch(buffer_size=AUTOTUNE)

# Validation pipeline: cached and prefetched only; it is not shuffled.
val_ds = val_ds.cache()
val_ds = val_ds.prefetch(buffer_size=AUTOTUNE)

In [39]:
# Stand-alone rescaling layer that multiplies pixel values by 1/255.
normalization_layer = layers.Rescaling(scale=1.0 / 255)

In [40]:
# Apply the rescaling layer to the images of every (images, labels) batch,
# passing the labels through unchanged.
normalized_ds = train_ds.map(
    lambda images, labels: (normalization_layer(images), labels)
)

# Pull a single batch and take its first image for inspection.
image_batch, labels_batch = next(iter(normalized_ds))
first_image = image_batch[0]

# Sanity check: the pixel values should now lie in `[0, 1]`.
print(np.min(first_image), np.max(first_image))

In [41]:
# One output unit per class name collected from the training dataset.
num_classes = len(class_names)

# Baseline CNN, assembled layer by layer: pixel rescaling, three
# convolution + max-pooling stages with 16/32/64 filters, then a dense head.
# The final Dense layer has no activation, so the model emits raw logits.
model = Sequential()
model.add(layers.Rescaling(1./255, input_shape=(img_height, img_width, 3)))
model.add(layers.Conv2D(16, 3, padding='same', activation='relu'))
model.add(layers.MaxPooling2D())
model.add(layers.Conv2D(32, 3, padding='same', activation='relu'))
model.add(layers.MaxPooling2D())
model.add(layers.Conv2D(64, 3, padding='same', activation='relu'))
model.add(layers.MaxPooling2D())
model.add(layers.Flatten())
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(num_classes))

In [42]:
# Configure training: Adam optimizer, sparse categorical cross-entropy
# computed on logits (the model's last layer has no activation), and
# accuracy as the reported metric.
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy'],
)

In [43]:
# Print the layer-by-layer summary of the baseline model.
model.summary()

In [44]:
# Location of the weight checkpoint written during training.
# NOTE(review): Keras 3 expects weight-only checkpoints to end in
# `.weights.h5` — confirm the installed Keras version accepts this path.
checkpoint_path = "train_vinted_file_1/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Callback that saves only the model's weights (not the full model) to
# checkpoint_path and reports each save (verbose=1).
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=1,
    save_weights_only=True,
)

# Fit the model on train_ds, validating on val_ds and checkpointing through
# the callback; the returned History object feeds the plots further down.
epochs = 5
history = model.fit(
    x=train_ds,
    epochs=epochs,
    validation_data=val_ds,
    callbacks=[cp_callback],
)


In [45]:
# Plot training vs. validation accuracy and loss per epoch for the run stored
# in `history`, and save the figure as 'before_reduce_overfitting.png'.
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']

loss = history.history['loss']
val_loss = history.history['val_loss']

epochs_range = range(epochs)

plt.figure(figsize=(8, 8))

# Left panel: accuracy curves.
plt.subplot(1, 2, 1)
plt.plot(epochs_range, acc, label='Training Accuracy')
plt.plot(epochs_range, val_acc, label='Validation Accuracy')
plt.legend(loc='lower right')
plt.title('Training and Validation Accuracy')

# Right panel: loss curves.
plt.subplot(1, 2, 2)
plt.plot(epochs_range, loss, label='Training Loss')
plt.plot(epochs_range, val_loss, label='Validation Loss')
plt.legend(loc='upper right')
plt.title('Training and Validation Loss')

# Save BEFORE showing: once plt.show() has displayed the figure it may be
# closed, and a savefig() issued afterwards would write an empty image.
plt.savefig('before_reduce_overfitting.png')
plt.show()

In [46]:
# Augmentation stage placed in front of the network: random horizontal
# flips, random rotations (factor 0.1) and random zooms (factor 0.1).
# The input shape is declared on the first layer.
data_augmentation = keras.Sequential([
    layers.RandomFlip(mode="horizontal",
                      input_shape=(img_height, img_width, 3)),
    layers.RandomRotation(factor=0.1),
    layers.RandomZoom(height_factor=0.1),
])

In [47]:
# Regularised variant of the baseline CNN: the augmentation stage runs first
# and a Dropout(0.2) layer is inserted after the last pooling stage.
model = Sequential([
  data_augmentation,
  layers.Rescaling(1./255),
  layers.Conv2D(16, 3, padding='same', activation='relu'),
  layers.MaxPooling2D(),
  layers.Conv2D(32, 3, padding='same', activation='relu'),
  layers.MaxPooling2D(),
  layers.Conv2D(64, 3, padding='same', activation='relu'),
  layers.MaxPooling2D(),
  layers.Dropout(0.2),
  layers.Flatten(),
  layers.Dense(128, activation='relu'),
  layers.Dense(num_classes)
])

# This is a brand-new model object, so it must be compiled and trained before
# its curves can be plotted; without this step `history` would still hold the
# baseline run and the "after" figure would be a copy of the "before" one.
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

# cp_callback is deliberately not reused here so the baseline model's
# checkpoint file is not overwritten.
history = model.fit(
  train_ds,
  validation_data=val_ds,
  epochs=epochs
)

In [48]:
# Print the layer-by-layer summary of the model that includes augmentation and dropout.
model.summary()

In [49]:
# Plot training vs. validation accuracy and loss per epoch for the run stored
# in `history`, and save the figure as 'after_reduce_overfitting.png'.
# NOTE(review): this figure only differs from the earlier one if `history`
# comes from fitting the augmented/dropout model — confirm that fit has run
# before this cell.
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']

loss = history.history['loss']
val_loss = history.history['val_loss']

epochs_range = range(epochs)

plt.figure(figsize=(8, 8))

# Left panel: accuracy curves.
plt.subplot(1, 2, 1)
plt.plot(epochs_range, acc, label='Training Accuracy')
plt.plot(epochs_range, val_acc, label='Validation Accuracy')
plt.legend(loc='lower right')
plt.title('Training and Validation Accuracy')

# Right panel: loss curves.
plt.subplot(1, 2, 2)
plt.plot(epochs_range, loss, label='Training Loss')
plt.plot(epochs_range, val_loss, label='Validation Loss')
plt.legend(loc='upper right')
plt.title('Training and Validation Loss')

# Save BEFORE showing: once plt.show() has displayed the figure it may be
# closed, and a savefig() issued afterwards would write an empty image.
plt.savefig('after_reduce_overfitting.png')
plt.show()
