__Shopee is the leading e-commerce platform in Southeast Asia and Taiwan.__

Thanks to Chris @cdeotte for his great work and for sharing his knowledge!

# Competition Goal

__In this competition, you’ll apply your machine learning skills to build a model that predicts which items are the same products.__

# Evaluation Metric

__Submissions will be evaluated based on their mean F1 score.__

# Code Requirements

Submissions to this competition must be made through Notebooks. In order for the "Submit" button to be active after a commit, the following conditions must be met:

- CPU Notebook <= 9 hours run-time
- GPU Notebook <= 2 hours run-time
- Internet access disabled
- Freely & publicly available external data is allowed, including pre-trained models
- Submission file must be named "submission.csv"

In [None]:
%matplotlib inline

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

#pd.set_option('display.max_columns', None)
#pd.set_option('display.max_colwidth', None)
#pd.set_option('display.max_rows', None)

import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

# Global plotting defaults for the notebook: figure size, axes title size, style and palette.
plt.rcParams["figure.figsize"] = (12, 8)
plt.rcParams['axes.titlesize'] = 16
# NOTE(review): newer Matplotlib releases renamed the bundled seaborn styles
# (e.g. 'seaborn-v0_8-whitegrid') - confirm this name exists in the target environment.
plt.style.use('seaborn-whitegrid')
sns.set_palette('Set3')

import cv2
import gc

import itertools
import collections
from collections import Counter

from nltk.corpus import stopwords

import re
from wordcloud import WordCloud

import os
print(os.listdir('/kaggle/input/shopee-product-matching/'))

from time import time, strftime, gmtime
start = time()
import datetime
print(str(datetime.datetime.now()))

import warnings
warnings.simplefilter('ignore')

In [None]:
# Root directory of the competition data; every data path below is built from it.
base_dir = '/kaggle/input/shopee-product-matching/'

In [None]:
# Load the training metadata, then show its shape and first rows.
train = pd.read_csv(base_dir + 'train.csv')
print(train.shape)
train.head()

In [None]:
# Load the test metadata, then show its shape and first rows.
test = pd.read_csv(base_dir + 'test.csv')
print(test.shape)
test.head()

In [None]:
# Load the sample submission to inspect the expected output layout.
sub = pd.read_csv(base_dir + 'sample_submission.csv')
print(sub.shape)
sub.head()

In [None]:
# Count the entries in the train and test image directories.
print(f'Number of train images: {len(os.listdir(base_dir + "train_images/"))}')
print(f'Number of test images: {len(os.listdir(base_dir + "test_images/"))}')

In [None]:
# Column dtypes, non-null counts and memory usage of the training frame.
train.info()

- Let's add a column in the train/test set with the train/test images path

In [None]:
# Build the full path of every image by prefixing the file name with its image directory.
train['image_path'] = base_dir + 'train_images/' + train['image']
test['image_path'] = base_dir + 'test_images/' + test['image']
display(train.head(), test.head())

In [None]:
# Ground truth per row: the unique posting ids that share the row's label_group
# (the row's own posting id is part of that set).
tmp = train.groupby('label_group')['posting_id'].agg('unique').to_dict()
train['target'] = train['label_group'].map(tmp)
train.head(2)

In [None]:
def get_f1metric(col):
    """Build a row-wise F1 scorer for the predictions stored in column ``col``.

    The returned function takes a row exposing ``row.target`` (true matches)
    and ``row[col]`` (predicted matches) and returns
    ``2 * |target ∩ prediction| / (|target| + |prediction|)``.
    It is meant to be used with ``DataFrame.apply(..., axis = 1)``.
    """
    def f1score(row):
        truth, predicted = row.target, row[col]
        # intersect1d de-duplicates, so every shared id is counted once
        overlap = np.intersect1d(truth, predicted).size
        return 2 * overlap / (len(truth) + len(predicted))
    return f1score

In [None]:
#To calculate F1 score - local
# Baseline prediction: treat every posting with an identical image_phash as a match.
tmp = train.groupby('image_phash')['posting_id'].agg('unique').to_dict()
train['oof'] = train['image_phash'].map(tmp)
train.head(2)

In [None]:
# Row-wise F1 of the phash baseline against the label_group ground truth, averaged over all rows.
train['f1_base'] = train.apply(get_f1metric('oof'), axis = 1)
print(f"Train F1 Score: {train['f1_base'].mean()}")

In [None]:
def display_images(paths, rows, cols, title = None):
    """Show the images at ``paths`` on a ``rows`` x ``cols`` grid of subplots.

    Each subplot is titled with the shape of the decoded image.

    Parameters
    ----------
    paths : iterable of str
        Image file paths. If there are more paths than grid cells, the
        surplus paths are ignored.
    rows, cols : int
        Grid dimensions.
    title : str, optional
        Figure-level title.

    Raises
    ------
    FileNotFoundError
        If an image cannot be read from disk.
    """
    # squeeze = False always returns a 2-D array of axes, so flatten() also
    # works for a 1 x 1 grid (where subplots would otherwise return a bare Axes).
    fig, ax = plt.subplots(rows, cols, figsize = (16, 12), squeeze = False)
    ax = ax.flatten()
    # Iterate over the `paths` argument rather than a notebook-level variable;
    # zip() stops at the last available axis instead of indexing past it.
    for axis, path in zip(ax, paths):
        img = cv2.imread(path)
        if img is None:
            # cv2.imread signals failure by returning None, not by raising
            raise FileNotFoundError(f'Could not read image: {path}')
        # OpenCV decodes to BGR; matplotlib expects RGB
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        axis.set_title(img.shape)
        axis.imshow(img)
        axis.grid(False)
    if title:
        plt.suptitle(title, fontsize = 15, y = 1.0)

__Display random train Images__

In [None]:
# Draw 9 training image paths at random (np.random.choice samples with replacement by default).
image_paths = np.random.choice(train['image_path'], 9)
display_images(image_paths, 3, 3, 'Display Random Train Images')

In [None]:
# A second random draw of 9 training images, shown without a figure title.
image_paths = np.random.choice(train['image_path'], 9)
display_images(image_paths, 3, 3)

__Display Test Images__

In [None]:
# Show at most 3 test images: the grid below has only 3 cells, and on
# submission the test set is replaced with a much bigger one.
image_paths = test['image_path'].values[:3]
display_images(image_paths, 1, 3)

__Display Images by Label_Group__

In [None]:
# Number of postings per label group, largest groups first.
train['label_group'].value_counts()

In [None]:
# Random sample (with replacement) of 9 images from label group 3627744656.
image_paths = np.random.choice(train['image_path'][train['label_group'] == 3627744656].values, 9)
display_images(image_paths, 3, 3, 'Train Images with most frequent label group')

In [None]:
# Random sample (with replacement) of 9 images from label group 994676122.
image_paths = np.random.choice(train['image_path'][train['label_group'] == 994676122].values, 9)
display_images(image_paths, 3, 3, 'Train Images with most frequent label group')

In [None]:
# All images of label group 1615893885.
# NOTE(review): the 1 x 2 grid presumes this group holds exactly two images - confirm.
image_paths = train['image_path'][train['label_group'] == 1615893885].values
display_images(image_paths, 1, 2, 'Train Images with least frequent label group')

In [None]:
# Histogram (with KDE overlay) of the character length of training titles.
plt.title('Distribution of trainset title length')
sns.histplot(train['title'].apply(lambda x: len(x)), kde = True);

In [None]:
# Count distinct titles; a count below the number of rows means some titles repeat.
print(f'Number of unqiue titles in trainset: {train["title"].nunique()}')

- So there are images with the same title in the dataset

In [None]:
# Character count of every title, stored as a new column on both splits.
train['title_len'] = train['title'].map(len)
test['title_len'] = test['title'].map(len)

# Report the longest and the shortest training title.
print(f'Max. train title length: {train["title_len"].max()}',
      f'Min. train title length: {train["title_len"].min()}',
      sep = '\n')

__Title Text WordCloud__

In [None]:
def plot_wordcloud(data, senti = None, text = None):
    """Draw a word cloud of the words occurring in ``data['title']``.

    Titles are split on whitespace, words found in NLTK's English stop-word
    list are dropped (exact, case-sensitive comparison) and the remaining
    words are sized by how often they occur.

    Parameters
    ----------
    data : DataFrame-like
        Must provide a 'title' column of strings.
    senti : optional
        Unused; kept so existing calls keep working.
    text : str, optional
        Title drawn above the word cloud.
    """
    # A set gives O(1) membership tests; the list returned by NLTK would be
    # scanned linearly for every single word.
    stop = set(stopwords.words('english'))
    # Count straight from a generator instead of materialising every word first.
    word_freq = Counter(word
                        for each in data['title']
                        for word in each.split()
                        if word not in stop)

    wordcloud = WordCloud(width = 900,
                          height = 500,
                          max_words = 200,
                          max_font_size = 100,
                          relative_scaling = 0.5,
                          background_color = "rgba(255, 255, 255, 0)",
                          mode = "RGBA",
                          normalize_plurals = True).generate_from_frequencies(word_freq)
    plt.figure(figsize = (16, 12))
    plt.imshow(wordcloud, interpolation = 'bilinear')
    plt.title(text)
    plt.axis("off")
    plt.show()

In [None]:
# Word cloud over all training titles.
plot_wordcloud(train, text = 'Train Title WordCloud')

__Display Images with same title__

In [None]:
# Frequency of each title, most repeated titles first.
train['title'].value_counts()

In [None]:
# Sample (with replacement) 6 images whose title equals t exactly, and show them under that title.
t = 'Koko syubbanul muslimin koko azzahir koko baju'
image_paths = np.random.choice(train['image_path'][train['title'] == t].values, 6)
display_images(image_paths, 2, 3, t)

In [None]:
# Sample (with replacement) 6 images whose title equals t exactly, and show them under that title.
t = 'Emina Glossy Stain'
image_paths = np.random.choice(train['image_path'][train['title'] == t].values, 6)
display_images(image_paths, 2, 3, t)

In [None]:
# Sample (with replacement) 6 images whose title equals t exactly, and show them under that title.
t = 'Viva Air Mawar'
image_paths = np.random.choice(train['image_path'][train['title'] == t].values, 6)
display_images(image_paths, 2, 3, t)

In [None]:
#For submission test set will be replaced with bigger dataset
# A 3-row test set means we are running interactively: fall back to the
# training data so the pipeline can be exercised and scored locally.
# Paths are built from base_dir so they match the image_path columns and do
# not depend on the notebook's working directory.
if len(test) == 3:
    df = train
    img_dir = base_dir + 'train_images/'
else:
    df = test
    img_dir = base_dir + 'test_images/'
print(df.shape)

__Finding Similar Images using Nearest Neighbor__

- Extract image embeddings using a pre-trained tensorflow model
- Find nearest neighbor of an image based on Euclidean Distance using sklearn 

In [None]:
import tensorflow as tf

from tensorflow.keras.applications import EfficientNetB0

print(f'Tensorflow version: {tf.__version__}')

# Find similar images using image embeddings

In [None]:
class ImageDataGen(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that serves batches of resized images (no labels).

    Every batch is a float32 array of shape ``(n, dim, dim, 3)`` holding the
    images named in ``data['image']``, read from ``img_path`` with OpenCV.

    Parameters
    ----------
    img_path : str
        Directory prefix that is concatenated with each file name.
    data : pandas.DataFrame
        Frame with an 'image' column; its index labels are used for lookup.
    batch_size : int
        Maximum number of images per batch (the last batch may be smaller).
    dim : int
        Images are resized to ``dim`` x ``dim``.
    shuffle : bool, default False
        Reshuffle the sample order at construction and after every epoch.
    """
    def __init__(self, img_path, data, batch_size,
                 dim, shuffle = False):
        # Let the Keras base class initialise its own state
        super().__init__()
        self.dim  = dim
        self.data = data
        self.shuffle  = shuffle
        self.img_path = img_path
        self.batch_size = batch_size
        # Index labels of `data`; positions in self.indices point into this array
        self.list_idx = self.data.index.values
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches, rounding up for a partial last batch."""
        return int(np.ceil(float(len(self.data)) / float(self.batch_size)))

    def __getitem__(self, index):
        """Load and return batch number ``index``.

        Raises
        ------
        FileNotFoundError
            If an image of the batch cannot be read.
        """
        batch_idx = self.indices[index * self.batch_size:(index + 1) * self.batch_size]

        # Translate positions into the frame's index labels
        labels = [self.list_idx[k] for k in batch_idx]
        # Look the column up once instead of once per image
        file_names = self.data['image']

        Data = np.zeros((len(batch_idx), self.dim, self.dim, 3), dtype = 'float32')

        for i, label in enumerate(labels):
            file_path = self.img_path + file_names[label]
            # cv2.imread returns None (it does not raise) when the file is
            # missing or unreadable; fail here with a clear message.
            # NOTE(review): OpenCV decodes to BGR channel order and the pixels
            # are passed on unchanged - confirm this is what the model expects.
            image = cv2.imread(file_path)
            if image is None:
                raise FileNotFoundError(f'Could not read image: {file_path}')
            Data[i, ] = cv2.resize(image, (self.dim, self.dim))

        return Data

    def on_epoch_end(self):
        """Reset the sample order, shuffling it when ``shuffle`` is set."""
        self.indices = np.arange(len(self.list_idx))
        if self.shuffle:
            np.random.shuffle(self.indices)

__Check Images have loaded correctly__

In [None]:
def plot_images(dataset, row, col):
    """Show the first image of randomly chosen batches on a ``row`` x ``col`` layout.

    One figure with ``col`` subplots is created per row. Sets the global
    default figure size to 20 x 10 as a side effect.

    Parameters
    ----------
    dataset : sequence of image batches
        ``dataset[i]`` must return an array of images; batches are assumed to
        be in OpenCV's BGR channel order, as produced by ``cv2.imread``.
    row, col : int
        Number of figures and number of images per figure.
    """
    plt.rcParams['figure.figsize'] = 20, 10
    for i in range(row):
        # squeeze = False keeps a 2-D axes array, so col == 1 works as well
        f, ax = plt.subplots(1, col, squeeze = False)
        for axis in ax[0]:
            idx = np.random.randint(0, len(dataset))
            img = dataset[idx]
            axis.grid(False)
            axis.axis('off')
            # Reverse the channel axis (BGR -> RGB) so colours are displayed
            # correctly, matching what display_images does.
            axis.imshow(img[0][..., ::-1].astype('uint8'))
    plt.show()
    
# Visual sanity check: build a throwaway generator and plot a 3 x 3 sample of its output.
traingen = ImageDataGen(img_dir, df, batch_size = 32, dim = 256)
plot_images(traingen, 3, 3)

# Drop the generator and trigger a garbage-collection pass.
del traingen
gc.collect()

__Define model to extract embeddings__

In [None]:
# Pre-trained weights are loaded from a local dataset file.
weights = '../input/tfkeras-efficientnet-weights/efficientnetb0_notop.h5'

# No classification head (include_top = False); pooling = 'avg' requests
# average pooling of the final feature map as the model output.
model = EfficientNetB0(weights = weights, include_top = False, pooling = 'avg', input_shape = None)

- Since a GPU Notebook must finish in under 2 hours, we process the input data in chunks to speed things up

In [None]:
def chunker(data, size):
    """Split ``data`` into consecutive slices of at most ``size`` items.

    Returns a lazy generator; the final slice may be shorter than ``size``.
    ``data`` must support ``len()`` and slicing.
    """
    offsets = range(0, len(data), size)
    return (data[lo:lo + size] for lo in offsets)

In [None]:
%%time
# Embed the images chunk by chunk: each chunk of rows gets its own generator,
# and the per-chunk predictions are concatenated at the end.
chunk_size = 4096
embeddings = []

for k, chunk in enumerate(chunker(df, chunk_size)):
    print(f'Chunk: {k + 1}')
    datagen = ImageDataGen(img_dir, chunk, batch_size = 128, dim = 256)
    img_embed = model.predict(datagen, verbose = 1)
    embeddings.append(img_embed)
    
# One row of features per row of df, in df's order (the generators do not shuffle by default).
image_embeddings = np.concatenate(embeddings)
print(f'Train Image Embeddings shape: {image_embeddings.shape}')

gc.collect()

__Use KNN to find similar images__

In [None]:
from sklearn.neighbors import NearestNeighbors

In [None]:
# Fit a nearest-neighbour index on the embeddings; each query returns the n closest rows.
# NOTE(review): presumably requires at least n rows in image_embeddings at query time - confirm.
n = 50
nn = NearestNeighbors(n_neighbors = n)
nn.fit(image_embeddings)
#distances, indices = nn.kneighbors(image_embeddings)

In [None]:
%%time
#chunk_size = 4096
preds = []

for k, chunk in enumerate(chunker(image_embeddings, chunk_size)):
    print(f'Chunk: {k + 1}')
    distances, indices = nn.kneighbors(chunk)
    for i in range(len(chunk)):
        dists = np.where(distances[i,] < 6.0)[0]
        idx = indices[i, dists]
        post_ids = df['posting_id'].iloc[idx].values
        preds.append(post_ids)

print(len(preds))
gc.collect()

In [None]:
# Attach the predicted matches to each row, together with how many were found.
df['matches'] = preds
df['num_similar_img'] = df['matches'].apply(lambda x: len(x))
df.head()

__Display similar images__

In [None]:
from textwrap import wrap

def plot_similar():
    """Plot a random posting alongside the postings predicted to match it.

    A posting is drawn uniformly from those with more than one predicted
    match, and up to six of its matches are shown side by side, each titled
    with its wrapped product title. Reads the notebook-level ``df`` (columns
    'posting_id', 'matches', 'image_path' and 'title'). If no posting has
    more than one match, a message is printed and nothing is plotted.
    """
    max_cols = 6

    # Pick directly among eligible postings; retrying random draws until one
    # qualifies would never terminate when no posting is eligible.
    eligible = df['posting_id'][df['matches'].apply(len) > 1].values
    if len(eligible) == 0:
        print('No posting has more than one predicted match.')
        return
    pid = np.random.choice(eligible)
    pred_ids = df['matches'][df['posting_id'] == pid].values[0]

    # Cap the number of images at max_cols for every match count
    shown = pred_ids[:max_cols]
    fig, ax = plt.subplots(1, len(shown), figsize = (10, 4))

    for axis, ids in zip(ax, shown):
        row = df[df['posting_id'] == ids]
        path = row['image_path'].values[0]
        title = row['title'].values[0]
        # OpenCV decodes to BGR; convert so matplotlib shows true colours
        img = cv2.cvtColor(cv2.imread(path), cv2.COLOR_BGR2RGB)
        axis.imshow(img)
        axis.set_title("\n".join(wrap(title, 30)), fontsize = 10)
        axis.grid(False)
        axis.axis('off')

In [None]:
# Two independent random examples of predicted matches.
plot_similar()
plot_similar()

In [None]:
#To calculate F1 score - local
# A score can only be computed when ground-truth groups are available.
# Without this guard the cell fails with a KeyError whenever df has no
# 'label_group' column, which stops the run before the submission is written.
if 'label_group' in df.columns:
    tmp = df.groupby('label_group')['posting_id'].agg('unique').to_dict()
    df['target'] = df['label_group'].map(tmp)
    df['f1_img'] = df.apply(get_f1metric('matches'), axis = 1)
    print(f"CV Score: {df['f1_img'].mean()}")
else:
    print('No label_group column available: skipping local CV score.')

# Submission

In [None]:
# Each `matches` cell must be one space-delimited string of posting ids.
# Writing the raw arrays would store their numpy repr (brackets, quotes and
# possibly line breaks) instead.
df[['posting_id', 'matches']].assign(
    matches = df['matches'].apply(' '.join)
).to_csv('submission.csv', index = False)
# Read the file back to check what was actually written.
subs = pd.read_csv('submission.csv')
subs.head()

In [None]:
# Wall-clock time since `start` was recorded, formatted as HH:MM:SS.
finish = time()
print(strftime("%H:%M:%S", gmtime(finish - start)))