__Shopee is the leading e-commerce platform in Southeast Asia and Taiwan.__

# Competition Goal

__In this competition, you’ll apply your machine learning skills to build a model that predicts which items are the same products.__

# Evaluation Metric

__Submissions will be evaluated based on their mean F1 score.__

# Code Requirements

Submissions to this competition must be made through Notebooks. In order for the "Submit" button to be active after a commit, the following conditions must be met:

- CPU Notebook <= 9 hours run-time
- GPU Notebook <= 2 hours run-time
- Internet access disabled
- Freely & publicly available external data is allowed, including pre-trained models
- Submission file must be named "submission.csv"

In [None]:
%matplotlib inline

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
#pd.set_option('display.max_rows', None)

import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

plt.rcParams["figure.figsize"] = (12, 8)
plt.rcParams['axes.titlesize'] = 16
plt.style.use('seaborn-whitegrid')
sns.set_palette('Set3')

import cv2
import gc

import os
print(os.listdir('/kaggle/input/shopee-product-matching/'))

from time import time, strftime, gmtime
start = time()
import datetime
print(str(datetime.datetime.now()))

import warnings
warnings.simplefilter('ignore')

In [None]:
base_dir = '/kaggle/input/shopee-product-matching/'

In [None]:
train = pd.read_csv(base_dir + 'train.csv')
print(train.shape)
train.head()

In [None]:
test = pd.read_csv(base_dir + 'test.csv')
print(test.shape)
test.head()

In [None]:
sub = pd.read_csv(base_dir + 'sample_submission.csv')
print(sub.shape)
sub.head()

In [None]:
print(f'Number of train images: {len(os.listdir(base_dir + "train_images/"))}')
print(f'Number of test images: {len(os.listdir(base_dir + "test_images/"))}')

In [None]:
train.info()

- Let's add a column to the train and test sets holding the full path of each image

In [None]:
train['image_path'] = base_dir + 'train_images/' + train['image']
test['image_path'] = base_dir + 'test_images/' + test['image']
display(train.head(2), test.head(2))

In [None]:
def display_images(paths, rows, cols, title = None):
    """Show the images stored at ``paths`` on a ``rows`` x ``cols`` grid.

    Each image is read with OpenCV, converted from BGR to RGB and drawn on
    its own axes; the axes title is the image's array shape.

    Args:
        paths: iterable of image file paths to display.
        rows: number of grid rows.
        cols: number of grid columns.
        title: optional figure-level title; omitted when falsy.
    """
    # squeeze=False always returns a 2-D array of axes, so flatten() also
    # works for a 1x1 grid (where plt.subplots would otherwise return a
    # bare Axes object).
    fig, ax = plt.subplots(rows, cols, figsize = (16, 12), squeeze = False)
    ax = ax.flatten()
    # Iterate over the `paths` argument (not a notebook-level global) and stop
    # at whichever runs out first: paths beyond rows*cols are not drawn.
    for axis, path in zip(ax, paths):
        img = cv2.imread(path)
        # OpenCV loads images as BGR; matplotlib expects RGB.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        axis.set_title(img.shape)
        axis.imshow(img)
        axis.grid(False)
    if title:
        plt.suptitle(title, fontsize = 15, y = 1.0)

__Display random train Images__

In [None]:
image_paths = np.random.choice(train['image_path'], 9)
display_images(image_paths, 3, 3, 'Display Random Train Images')

In [None]:
image_paths = np.random.choice(train['image_path'], 9)
display_images(image_paths, 3, 3)

__Display Test Images__

In [None]:
image_paths = test['image_path'].values
display_images(image_paths, 1, 3)

__Display Images by Label_Group__

In [None]:
train['label_group'].value_counts()

In [None]:
image_paths = np.random.choice(train['image_path'][train['label_group'] == 3627744656].values, 9)
display_images(image_paths, 3, 3, 'Train Images with most frequent label group')

In [None]:
image_paths = np.random.choice(train['image_path'][train['label_group'] == 994676122].values, 9)
display_images(image_paths, 3, 3, 'Train Images with most frequent label group')

In [None]:
image_paths = train['image_path'][train['label_group'] == 1615893885].values
display_images(image_paths, 1, 2, 'Train Images with least frequent label group')

In [None]:
plt.title('Distribution of trainset title length')
sns.histplot(train['title'].apply(lambda x: len(x)), kde = True);

In [None]:
print(f'Number of unqiue titles in trainset: {train["title"].nunique()}')

- So there are images that share the same title in the dataset

In [None]:
train['title_len'] = train['title'].apply(lambda x: len(x))
print(f'Max. title length: {train["title_len"].max()}')
print(f'Min. title length: {train["title_len"].min()}')

__Display Images with the same title__

In [None]:
train['title'].value_counts()

In [None]:
t = 'Koko syubbanul muslimin koko azzahir koko baju'
image_paths = np.random.choice(train['image_path'][train['title'] == t].values, 6)
display_images(image_paths, 2, 3, t)

In [None]:
t = 'Emina Glossy Stain'
image_paths = np.random.choice(train['image_path'][train['title'] == t].values, 6)
display_images(image_paths, 2, 3, t)

In [None]:
t = 'Viva Air Mawar'
image_paths = np.random.choice(train['image_path'][train['title'] == t].values, 6)
display_images(image_paths, 2, 3, t)

__Finding Similar Images using Nearest Neighbor__

- Extract image embeddings using a pre-trained tensorflow model
- Find nearest neighbor of an image based on Euclidean Distance using sklearn 

In [None]:
import tensorflow as tf

from kaggle_datasets import KaggleDatasets

from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.applications import EfficientNetB1

print(f'Tensorflow version: {tf.__version__}')

In [None]:
# TPU CONFIG
# Detect the available hardware and pick a matching tf.distribute strategy.
# The resulting `strategy`, `BATCH_SIZE` and `AUTO` are notebook-level names
# set here for the cells that follow.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection. No parameters necessary if TPU_NAME environment variable is set. On Kaggle this is always the case.
    print('Running on TPU ', tpu.master())
except ValueError:
    # A ValueError from the resolver (or from the print above) is treated as "no TPU".
    tpu = None

if tpu:
    # Connect to the TPU cluster and initialise it before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    strategy = tf.distribute.get_strategy() # default distribution strategy in Tensorflow. Works on CPU and single GPU.

print("REPLICAS: ", strategy.num_replicas_in_sync)

# Global batch size: 128 samples per replica.
BATCH_SIZE = 128 * strategy.num_replicas_in_sync
print(BATCH_SIZE)
# Lets tf.data choose parallelism / prefetch buffer sizes at runtime.
AUTO = tf.data.experimental.AUTOTUNE

In [None]:
GCS_DS_PATH = KaggleDatasets().get_gcs_path()
print(GCS_DS_PATH)

In [None]:
train_paths = GCS_DS_PATH + '/train_images/' + train['image']
train_paths[:5]

In [None]:
#Create TF Dataset
def decode_image(filename, label = None, image_size = (256, 256)):
    bits = tf.io.read_file(filename)
    image = tf.image.decode_jpeg(bits, channels = 3)
    image = tf.cast(image, tf.float32) / 255.0
    image = tf.image.resize(image, image_size)
    return image

In [None]:
# Input pipeline over the train image paths: decode every file with
# decode_image, group the images into batches of BATCH_SIZE and prefetch.
train_dataset = (
    tf.data.Dataset
    # One dataset element per entry of train_paths.
    .from_tensor_slices((train_paths))
    # Decode files in parallel; AUTO leaves the degree of parallelism to tf.data.
    .map(decode_image, num_parallel_calls = AUTO)
    .batch(BATCH_SIZE)
    # Prepare upcoming batches while the current one is being consumed.
    .prefetch(AUTO)
)

In [None]:
for t in train_dataset.unbatch().batch(10):
    print(t.numpy().shape)
    break

__Check Images have loaded correctly__

In [None]:
for img in train_dataset.take(1):
    for i in range(12):
        ax = plt.subplot(3, 4, i + 1)
        plt.imshow(img[i].numpy())
        plt.grid(False)
        plt.axis('off')
        plt.title(img[i].shape)

In [None]:
# Build the feature extractor inside the distribution-strategy scope so its
# variables are created on the devices selected by `strategy` (the TPU
# replicas when a TPU was detected). With the default strategy the scope is a
# no-op, so CPU / single-GPU runs behave as before.
with strategy.scope():
    # include_top=False + pooling='avg' yields one pooled feature vector per image.
    model = EfficientNetB1(weights = 'imagenet', include_top = False, pooling = 'avg', input_shape = None)
# NOTE(review): decode_image divides pixels by 255, while the Keras EfficientNet
# documentation says the model expects inputs in the [0, 255] range (rescaling
# is built into the model). Confirm whether the inputs should be left unscaled;
# changing this would alter the embeddings and any distance threshold tuned on them.
image_embeddings = model.predict(train_dataset, verbose = 1)
print(f'Train Image Embeddings shape: {image_embeddings.shape}')

In [None]:
from sklearn.neighbors import NearestNeighbors

# Number of neighbours returned per query.
knn = 20
# Nearest-neighbour index over the train image embeddings (default metric).
nn = NearestNeighbors(n_neighbors = knn)
nn.fit(image_embeddings)
# Neighbour distances and row positions for every train embedding.
# NOTE(review): find_similar re-queries `nn` per image and does not read these
# two arrays — confirm whether another cell uses them.
distances, indices = nn.kneighbors(image_embeddings)

__Predict similar images for a few train images__

In [None]:
def find_similar(index, max_distance = 3.0):
    """Return the posting ids of train images close to the image at ``index``.

    The embedding at row ``index`` of ``image_embeddings`` is used to query
    the fitted ``nn`` index; only neighbours whose distance is strictly below
    ``max_distance`` are kept.

    Args:
        index: row position of the query image in ``image_embeddings`` / ``train``.
        max_distance: exclusive upper bound on the neighbour distance
            (defaults to 3.0, the cut-off previously hard-coded here).

    Returns:
        Array of ``posting_id`` values for the retained neighbours, in the
        order returned by ``nn.kneighbors``.
    """
    # kneighbors expects a 2-D array: one row per query.
    query_image = image_embeddings[index].reshape(1, -1)
    distances, indices = nn.kneighbors(query_image)

    # Boolean mask over the returned neighbours that are close enough.
    close_enough = distances[0] < max_distance
    neighbor_rows = indices[0][close_enough]
    return train.iloc[neighbor_rows]['posting_id'].values

In [None]:
def plot_similar(postings):
    """Plot up to six images, looked up by posting id, on a 2 x 3 grid.

    The first image is titled 'Query Image' and the remaining ones
    'Prediction Image'.

    Args:
        postings: sequence of posting ids; only the first six are drawn.
            Fewer than six ids leave the remaining grid cells empty.
    """
    # Slicing caps the loop at the grid size and, unlike a fixed range(6),
    # does not raise IndexError when fewer than six ids are supplied.
    for i, posting in enumerate(postings[:6]):
        plt.subplot(2, 3, i + 1)
        # Look up the image path of the row with this posting id.
        matches = train['image_path'][train['posting_id'] == str(posting)]
        img = cv2.imread(matches.values[0])
        # OpenCV loads images as BGR; matplotlib expects RGB.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = cv2.resize(img, (256, 256))
        plt.imshow(img)
        plt.grid(False)
        plt.axis('off')
        if i == 0:
            plt.title('Query Image')
        else:
            plt.title('Prediction Image')

In [None]:
posting_ids = find_similar(1000)
plot_similar(posting_ids)

In [None]:
posting_ids = find_similar(100)
plot_similar(posting_ids)

In [None]:
posting_ids = find_similar(1990)
plot_similar(posting_ids)

In [None]:
finish = time()
print(strftime("%H:%M:%S", gmtime(finish - start)))