In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the mounted input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np, pandas as pd, gc, glob
import cv2, matplotlib.pyplot as plt
import cudf, cuml, cupy
from cuml.feature_extraction.text import TfidfVectorizer, CountVectorizer
from cuml.neighbors import NearestNeighbors
import tensorflow as tf
from wordcloud import WordCloud,STOPWORDS
from tensorflow.keras.applications import EfficientNetB0
from colorama import Fore, Back, Style
# Shorthand names for colorama foreground colour codes, used to colour printed text.
y_ = Fore.YELLOW
r_ = Fore.RED
g_ = Fore.GREEN
b_ = Fore.BLUE
m_ = Fore.MAGENTA

# Report the library versions in use.
print('RAPIDS',cuml.__version__)
print('TF',tf.__version__)

In [None]:
# RESTRICT TENSORFLOW TO 1GB OF GPU RAM
# SO THAT WE HAVE 15GB RAM FOR RAPIDS
LIMIT = 1  # TensorFlow GPU budget in GB, as used by the messages printed below
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  try:
    # Cap the first GPU with a single virtual device of 1024*LIMIT
    # (presumably MB, matching the 1GB stated above — confirm).
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],
        [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024*LIMIT)])
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    #print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # The failure is only printed; execution continues without the cap.
    print(e)
# NOTE(review): both messages are printed even when no GPU was found, and the
# 16 below is a hard-coded total — confirm it matches the actual device.
print('We will restrict TensorFlow to max %iGB GPU RAM'%LIMIT)
print('then RAPIDS can use %iGB GPU RAM'%(16-LIMIT))

In [None]:
# Root of the competition data and the two image folders.
# The trailing slash is intentional: image file names are appended to these
# paths by plain string concatenation.
BASE_PATH = '../input/shopee-product-matching'
TRAIN_PATH = os.path.join(BASE_PATH, "train_images/")
TEST_PATH = os.path.join(BASE_PATH, "test_images/")

print(os.listdir(BASE_PATH))
# Summarise the image folder instead of dumping every file name into the output.
train_image_files = sorted(os.listdir(TRAIN_PATH))
print(len(train_image_files), 'train images, e.g.', train_image_files[:5])

In [None]:
# Reading train & test CSV file
# COMPUTE_CV stays True only when the test CSV has at most 3 rows.
COMPUTE_CV = True

train_df = pd.read_csv(BASE_PATH + '/train.csv')
test_df = pd.read_csv(BASE_PATH + '/test.csv')
# NOTE(review): presumably a 3-row test.csv is the public placeholder and a
# longer one is the real hidden test set — confirm.
if len(test_df)>3: COMPUTE_CV = False
else: print('this submission notebook will compute CV score, but commit notebook will not')
train_df.head()


In [None]:
# Preview the first rows of the test metadata.
test_df.head()

In [None]:
# Full path of every training image (TRAIN_PATH already ends with a slash).
train_images = TRAIN_PATH + train_df['image']
train_df['path'] = train_images
# For each row, the unique posting_ids that share its label_group
# (this column is what getMetric scores predictions against).
train_same_id =  train_df.groupby('label_group').posting_id.agg('unique').to_dict()
train_df['same_target'] = train_df.label_group.map(train_same_id)
# For each row, the unique posting_ids that share its image_phash.
train_hash = train_df.groupby('image_phash').posting_id.agg('unique').to_dict()
train_df['same_hash'] = train_df.image_phash.map(train_hash)
train_df.head()


In [None]:
def getMetric(col):
    """Build a row-wise F1 scorer for the prediction column `col`.

    The returned callable takes a row that exposes `same_target` as an
    attribute and `col` through item access, and returns
    2 * |target ∩ prediction| / (|target| + |prediction|).
    """
    def f1score(row):
        expected = row.same_target
        predicted = row[col]
        overlap = np.intersect1d(expected, predicted)
        total = len(expected) + len(predicted)
        return 2*len(overlap) / total
    return f1score

In [None]:
# Score the phash-only baseline: mean row-wise F1 of `same_hash` against `same_target`.
train_df['f1'] = train_df.apply(getMetric('same_hash'),axis=1)
print('CV score for baseline =',train_df.f1.mean())
train_df.head()

In [None]:
# Full path of every test image, stored alongside the test metadata.
test_images = TEST_PATH + test_df['image']
test_df['path'] = test_images
test_df.head()

In [None]:
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() only appends a stray "None" line.
train_df.info()
test_df.info()

In [None]:
# Reading multiple train_images
def display_multiple_images(images_paths, rows, cols):
    """Show images in a rows x cols grid.

    Args:
        images_paths: iterable of image file paths. Only the first rows*cols
            entries are read; any further entries are ignored.
        rows: number of grid rows.
        cols: number of grid columns.

    Files that OpenCV cannot read are reported and leave their cell blank.
    """
    # squeeze=False keeps the axes a 2-D array even for a 1x1 grid, so
    # ravel() is always available.
    fig, axes_grid = plt.subplots(nrows=rows, ncols=cols, figsize=(16,8), squeeze=False)
    axes = axes_grid.ravel()
    # Hide the frame of every cell up front so unused cells stay blank.
    for axis in axes:
        axis.set_axis_off()
    # zip stops at the shorter input, so surplus paths are never loaded.
    for axis, image_path in zip(axes, images_paths):
        image = cv2.imread(image_path)
        if image is None:  # cv2.imread signals failure by returning None
            print('Could not read image:', image_path)
            continue
        # OpenCV loads BGR; matplotlib expects RGB.
        axis.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    plt.tight_layout()
    plt.show()

In [None]:
# A 5x5 grid holds 25 images, so request exactly 25 paths.
display_multiple_images(train_images[100:125], 5, 5)

In [None]:
# Number of distinct label groups in the training data
print(train_df['label_group'].nunique())

In [None]:
# Number of rows in every label group.
train_label_count = train_df['label_group'].value_counts()
# Groups tied for the largest / smallest row count.
most_label_count = train_label_count[train_label_count == train_label_count.max()]
low_label_count = train_label_count[train_label_count == train_label_count.min()]
# m_ / y_ are colour codes placed at the start of the printed message.
print(f"{m_} Most frequent label group: ", most_label_count)
print(f"{y_} low frequent label group: ", low_label_count)

# Unique label_group ids belonging to those largest / smallest groups.
most_label = np.unique(train_df['label_group'][train_df['label_group'].isin(most_label_count.index)].values)
less_label = np.unique(train_df['label_group'][train_df['label_group'].isin(low_label_count.index)].values)

print(f"{m_} Most frequent label group: ", most_label)
print(f"{y_} Less frequent label group: ", less_label)

In [None]:
#Reading Most frequent images from train_images

def frequent_images(group, m):
    """Return the full paths of the training images selected by `group`.

    Args:
        group: value to match — compared with `label_group` when m == 'l'
            and with `title` when m == 't'.
        m: selector mode, either 'l' or 't'.

    Returns:
        list of str: TRAIN_PATH joined with the `image` value of each matching row.

    Raises:
        ValueError: if `m` is neither 'l' nor 't'.
    """
    column_by_mode = {'l': 'label_group', 't': 'title'}
    if m not in column_by_mode:
        # Fail early with a clear message rather than on an unbound variable later.
        raise ValueError(f"m must be 'l' or 't', got {m!r}")

    selected = train_df['image'][train_df[column_by_mode[m]] == group].values
    return [os.path.join(TRAIN_PATH, filename) for filename in selected]
    

In [None]:
# Show images of label group 562358068 in a 3x3 grid.
display_multiple_images(frequent_images(562358068,'l'), 3, 3)

In [None]:
# Show images of label group 887886 in a 1x2 grid.
display_multiple_images(frequent_images(887886,'l'), 1, 2)

# **Observations from EDA📝:**


* Visually similar images in different label groups 

* Same images with different titles 
 
* Same titles have different images

# WordCloud

A word cloud is a visualization that shows which words occur most frequently in a given text. Before calling `WordCloud`, check its docstring to see all required and optional arguments.

In [None]:
# Build a word cloud from all training titles joined into one string,
# ignoring the library's default stop words and keeping at most 150 words.
stopwords = set(STOPWORDS)
wordcloud = WordCloud(width = 1500, 
                      height = 800,
                      background_color ='white',
                      min_font_size = 10,
                      colormap = "cool",
                      max_words=150,
                      stopwords = stopwords,).generate(' '.join(train_df['title'])) 

# plot the WordCloud image                        
plt.figure(figsize = (8, 8), facecolor = None) 
plt.imshow(wordcloud, interpolation="bilinear") 
plt.axis("off") 
plt.tight_layout(pad = 0) 

plt.show() 

In [None]:
# Build a cuDF copy of the frame to be scored: train_df when COMPUTE_CV is
# set, test_df otherwise.
if COMPUTE_CV:
    test_gf = cudf.DataFrame(train_df)
    print('Using train as test to compute CV (since commit notebook). Shape is', test_gf.shape )
else:
    test_gf = cudf.DataFrame(test_df)
    print('Test shape is', test_gf.shape )
# NOTE(review): test_gf is not referenced by any later cell visible here —
# confirm it is still needed.
test_gf.head()

In [None]:
class DataGenerator(tf.keras.utils.Sequence):
    """Keras Sequence that yields batches of resized images read from disk.

    Each batch is a float32 array of shape (n, img_size, img_size, 3) holding
    the pixel values exactly as OpenCV returns them (no rescaling). No labels
    are produced, so the generator is meant for inference.

    Args:
        df: DataFrame with an `image` column of file names.
        img_size: side length the images are resized to.
        batch_size: maximum number of images per batch.
        path: prefix prepended to every file name by string concatenation.
    """
    def __init__(self, df, img_size=256, batch_size=32, path=''):
        # Let the Keras base class initialise its own state.
        super().__init__()
        self.df = df
        self.img_size = img_size
        self.batch_size = batch_size
        self.path = path
        self.indexes = np.arange( len(self.df) )

    def __len__(self):
        """Return the number of batches, counting a final partial batch."""
        return (len(self.df) + self.batch_size - 1) // self.batch_size

    def __getitem__(self, index):
        """Return batch number `index` as an image array."""
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        return self.__data_generation(indexes)

    def __data_generation(self, indexes):
        """Load and resize the images at the given row positions.

        Raises:
            FileNotFoundError: if an image is missing or cannot be decoded.
        """
        X = np.zeros((len(indexes),self.img_size,self.img_size,3),dtype='float32')
        image_names = self.df['image'].iloc[indexes]
        for i, image_name in enumerate(image_names):
            img_path = self.path + image_name
            img = cv2.imread(img_path)
            if img is None:  # cv2.imread returns None instead of raising
                raise FileNotFoundError(f'Missing or unreadable image: {img_path}')
            # NOTE(review): channels stay in OpenCV's BGR order and no
            # rescaling is applied — confirm this is what the model expects.
            X[i,] = cv2.resize(img,(self.img_size,self.img_size))
        return X

In [None]:
# EfficientNetB0 without its classification head; with pooling='avg' the
# model outputs one feature vector per image.
WGT = '../input/effnetb0/efficientnetb0_notop.h5'
model = EfficientNetB0(weights=WGT,include_top=False, pooling='avg', input_shape=None)

embed = []
chunks_size = 1024*4
print('Computing image embeddings...')
count = len(train_df)//chunks_size

if len(train_df)%chunks_size!=0:
    count += 1

for j in range(count):
    a = j*chunks_size
    b = min((j + 1)*chunks_size, len(train_df))
    print('chunks_size',a,'to',b)

    # Embed only the rows of this chunk. Feeding the whole frame on every
    # iteration would recompute all embeddings `count` times.
    test_gen = DataGenerator(train_df.iloc[a:b],img_size = 256, batch_size=32, path=TRAIN_PATH)
    embed.append(model.predict(test_gen,verbose=1,use_multiprocessing=True, workers=4))

# Stitch the per-chunk results back together in train_df row order.
image_embedd = np.concatenate(embed)

del model
_ = gc.collect()
print('image embeddings shape',image_embedd.shape)

In [None]:
# Number of neighbours retrieved per image; reduced to 2 when the test CSV has exactly 3 rows.
KNN = 50
if len(test_df)==3: KNN = 2
# `model` now names the nearest-neighbour index fitted on the image embeddings.
model = NearestNeighbors(n_neighbors=KNN)
model.fit(image_embedd)

# **Predicting Similar images**

In [None]:
Pred_img_id = []
print('Predicting Similar images...')
count = len(image_embedd)//chunks_size

if len(image_embedd)%chunks_size!=0:
    count += 1

# Loop-invariant lookup from row position to posting_id, built once instead
# of indexing the DataFrame for every single row.
posting_ids = train_df.posting_id.values

for j in range(count):
    a = j*chunks_size
    b = min((j+1)*chunks_size, len(image_embedd))
    print('chunks_size',a,'to',b)
    distances, indices = model.kneighbors(image_embedd[a:b,])

    for k in range(b-a):
        # Keep only the neighbours closer than the distance threshold.
        IDX = np.where(distances[k,]<5.0)[0]
        IDS = indices[k,IDX]
        Pred_img_id.append(posting_ids[IDS])

# Release the index and the embedding buffers.
del model, distances, indices, image_embedd, embed
_ = gc.collect()



In [None]:
# Attach the per-row image-neighbour posting_ids to the frame.
train_df['Pred_img_id'] = Pred_img_id

# **Text Embedding Using Cosine Similarity**

In [None]:
import string

def rem_punctuation(text):
    """Return `text` with every ASCII punctuation character replaced by a space."""
    space_for_punct = str.maketrans(dict.fromkeys(string.punctuation, ' '))
    return text.translate(space_for_punct)

# Titles with punctuation replaced by spaces.
train_df['text_clean'] = train_df['title'].apply(rem_punctuation)
# Convert only the cleaned titles to cuDF; converting the whole frame would
# also copy every other column (including the array-valued ones) just to
# read this single column.
title_to_use = cudf.from_pandas(train_df['text_clean'])

In [None]:
print('Computing text embeddings using Tfidf Vectorizer...')
# Binary term presence, English stop words removed, at most 30000 features.
tfid_vec = TfidfVectorizer(stop_words = 'english',
                          binary = True,
                          max_features = 30000)
# Fit on the cleaned titles and densify the result as float32.
text_embedd = tfid_vec.fit_transform(title_to_use).toarray().astype(np.float32)
print('text embedd shape', text_embedd.shape)
print(text_embedd)

In [None]:
Pred_titles = []
print('Predicting Similar titles...')

count = len(train_df)//chunks_size

if len(train_df)%chunks_size!=0:
    count += 1

# Loop-invariant lookup from row position to posting_id.
posting_ids = train_df.posting_id.values
# Defined up front so the cleanup below works even if the loop never runs.
sims = None

for j in range(count):
    a = j*chunks_size
    b = min((j + 1)*chunks_size, len(train_df))
    print('chunks_size',a,'to',b)

    # Dot products between this chunk's titles (rows) and all titles (columns).
    # Kept in its own name so it does not overwrite the chunk counter `count`.
    # NOTE(review): this equals cosine similarity only if the TF-IDF rows are
    # L2-normalised — confirm the vectorizer's norm setting.
    sims = cupy.matmul(text_embedd, text_embedd[a:b].T).T

    for k in range(b-a):
        IDX = cupy.where(sims[k,]>0.7)[0]
        Pred_titles.append(posting_ids[cupy.asnumpy(IDX)])

# Release the vectorizer, the embedding matrix and the last similarity block.
del tfid_vec, text_embedd, sims
_ = gc.collect()

In [None]:
# Attach the per-row title-match posting_ids to the frame.
train_df['Pred_titles'] = Pred_titles
train_df.head()

In [None]:
# For each row, the unique posting_ids that share its image_phash.
img_hash_tmp = train_df.groupby('image_phash').posting_id.agg('unique').to_dict()
train_df['img_hash_simi'] = train_df.image_phash.map(img_hash_tmp)
train_df.head()

In [None]:
def comb_sub(row):
    """Merge a row's image, title and phash matches into a submission string.

    Args:
        row: object exposing `Pred_img_id`, `Pred_titles` and `img_hash_simi`
            as array-likes of posting_id strings.

    Returns:
        str: the unique posting_ids in sorted order, separated by single spaces.
    """
    x = np.concatenate([row.Pred_img_id, row.Pred_titles, row.img_hash_simi])
    # Space-delimited so the individual ids stay separable when the string is parsed.
    return ' '.join(np.unique(x))

def comb_cv(row):
    """Return the sorted unique posting_ids from a row's three match columns."""
    sources = (row.Pred_img_id, row.Pred_titles, row.img_hash_simi)
    merged = np.concatenate(sources)
    return np.unique(merged)

In [None]:
if COMPUTE_CV:
    # Per-row target list: the unique posting_ids sharing the row's label_group.
    train_same_id = train_df.groupby('label_group').posting_id.agg('unique').to_dict()
    train_df['same_target'] = train_df.label_group.map(train_same_id)
    # `same_hash` is overwritten here with the combined image + title + phash matches.
    train_df['same_hash'] = train_df.apply(comb_cv,axis=1)
    train_df['f1'] = train_df.apply(getMetric('same_hash'),axis=1)
    print('CV Score =', round(train_df.f1.mean(), 3) )

# String form of the combined matches for every row (computed regardless of COMPUTE_CV).
train_df['matches'] = train_df.apply(comb_sub,axis=1)

In [None]:
# Mean row-wise F1 of each individual match source against `same_target`.
for cv_label, pred_col in (
    ("CV for image :", 'Pred_img_id'),
    ("CV for text  :", 'Pred_titles'),
    ("CV for phash :", 'img_hash_simi'),
):
    source_score = train_df.apply(getMetric(pred_col), axis=1).mean()
    print(cv_label, round(source_score, 3))

In [None]:
# Write posting_id and matches to submission.csv, then read the file back to inspect it.
# NOTE(review): the rows written come from train_df, not test_df — confirm
# this is intended for a real submission.
test = train_df.copy()
test[['posting_id','matches']].to_csv('submission.csv',index=False)
sub = pd.read_csv('submission.csv')
sub.head()