In [1]:
import numpy as np 
import pandas as pd 
import tensorflow as tf 
from tensorflow.keras.utils import Sequence
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import math
import os 
import cv2 as cv
import gc 
from tqdm import tqdm 
import tensorflow_hub as hub 
#import shutil
#shutil.copy(src="../input/tokenization/tokenization.py",dst="./")
#import tokenization
from transformers import TFBertModel,BertTokenizer
import re 
import string
from cuml import NearestNeighbors
import cupy
from tensorflow.keras.applications import EfficientNetB3

In [2]:
def getMetric(col):
    """Build a row-wise scorer comparing `row.target` with the predictions in `row[col]`.

    The returned callable computes 2 * |target ∩ prediction| / (|target| + |prediction|),
    i.e. the F1 / Dice overlap between the two id collections of a single row.
    """
    def f1score(row):
        truth = row.target
        guess = row[col]
        # intersect1d yields the unique ids present in both collections.
        shared = np.intersect1d(truth, guess)
        return (2 * len(shared)) / (len(truth) + len(guess))
    return f1score

In [3]:
def sub_matches(col):
    """Return a row function that renders `row[col]` as one space-separated string."""
    def matches(row):
        ids = row[col]
        return str.join(" ", ids)
    return matches

In [4]:
def combine(*features):
    """Return a row function that merges the id arrays stored under `features`.

    The per-column arrays are concatenated and de-duplicated; the result is the
    sorted array of unique values produced by `np.unique`.
    """
    def concat(row):
        pieces = [row[name] for name in features]
        return np.unique(np.concatenate(pieces))
    return concat

In [5]:
def clean_title(tx):
    """Normalise a product title before tokenisation.

    Steps, in order: lower-case; blank out literal backslash escape residue
    (a backslash followed by word characters ending in digits); turn decimal
    commas between digits into dots; replace every punctuation character except
    "." with a space; collapse whitespace runs into a single space.
    Leading/trailing whitespace is collapsed but not stripped.
    """
    lowered = tx.lower()
    no_escapes = re.sub(r"\\(\w)+(\d)+", " ", lowered)
    dotted = re.sub(r"(\d+),(\d+)", r"\1.\2", no_escapes)
    # Map each punctuation mark (dot excluded) to a space in a single pass.
    to_space = str.maketrans({mark: " " for mark in string.punctuation if mark != "."})
    spaced = dotted.translate(to_space)
    return re.sub(r"\s+", " ", spaced)

In [6]:
# Load the competition test set; `images` is the directory the image loader reads from.
# NOTE(review): despite its name, `submission` becomes True only when the notebook falls
# back to the training data below; later cells use it to gate the CV-score computation.
submission = False 
test = pd.read_csv("../input/shopee-product-matching/test.csv")
images = "../input/shopee-product-matching/test_images"
# A test file with 3 rows or fewer triggers the switch to train.csv and the train images.
if len(test) <=3 :
    test = pd.read_csv("../input/shopee-product-matching/train.csv")
    images = "../input/shopee-product-matching/train_images"
    submission=True


In [7]:
# Ground-truth matches: for every row, the array of posting_ids sharing its label_group.
# The real test.csv is loaded without the train fallback, so the label column may be
# absent; building the target unconditionally would then raise KeyError and abort the run.
if "label_group" in test.columns:
    lab = test.groupby("label_group")["posting_id"].agg("unique")
    test["target"] = test["label_group"].map(lab)

In [8]:
# Cap TensorFlow's share of the first GPU at LIMIT GB (expressed in MB below).
LIMIT = 4
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        capped_device = tf.config.experimental.VirtualDeviceConfiguration(
            memory_limit=1024 * LIMIT
        )
        tf.config.experimental.set_virtual_device_configuration(gpus[0], [capped_device])
        logical_gpus = tf.config.experimental.list_logical_devices("GPU")
    except RuntimeError as e:
        # Report the failure and carry on with TensorFlow's default configuration.
        print(e)
print('We will restrict TensorFlow to max %iGB GPU RAM'%LIMIT)
print('then RAPIDS can use %iGB GPU RAM'%(16-LIMIT))

We will restrict TensorFlow to max 4GB GPU RAM
then RAPIDS can use 12GB GPU RAM


# 1. Duplicate images:

In [9]:
class DataGenerator(Sequence):
    """Keras `Sequence` that yields batches of resized images listed in a DataFrame.

    Parameters
    ----------
    df : DataFrame with an `image` column holding file names relative to `path`.
    path : directory containing the image files (defaults to the notebook-level `images`).
    batch_size : number of images per batch; the last batch may be smaller.
    img_size : images are resized to `img_size` x `img_size`.

    Each item is an array of shape (batch, img_size, img_size, 3) with raw pixel values
    exactly as decoded by `cv.imread`.
    # NOTE(review): OpenCV decodes to BGR channel order — confirm this matches training.
    """

    def __init__(self,df,path=images,batch_size=32,img_size=512):
        self.img_size = img_size
        self.batch_size = batch_size
        self.path = path
        self.df = df
        self.ranges = np.arange(len(self.df))

    def __len__(self):
        """Number of batches, counting a trailing partial batch."""
        return math.ceil(len(self.df) / self.batch_size)

    def __getitem__(self,index):
        """Return batch number `index` as an image array."""
        indices = self.ranges[index * self.batch_size:(index + 1) * self.batch_size]
        return self.__generation(indices)

    def __generation(self,indices) :
        """Read and resize the images at the given positional `indices`.

        Raises
        ------
        FileNotFoundError
            If an image file is missing or cannot be decoded.
        """
        # float32 holds 8-bit pixel values exactly and halves the buffer compared with
        # numpy's default float64.
        data = np.zeros((len(indices),self.img_size,self.img_size,3),dtype=np.float32)
        # Only the file name is needed, so read the column directly instead of iterrows().
        names = self.df["image"].iloc[indices]
        for i, name in enumerate(names):
            img_path = os.path.join(self.path,name)
            img = cv.imread(img_path)
            if img is None:
                # imread signals failure by returning None; fail here with the offending
                # path instead of an opaque error inside cv.resize.
                raise FileNotFoundError("Could not read image: %s" % img_path)
            data[i,] = cv.resize(img,(self.img_size,self.img_size))
        return data

In [10]:
# Load the saved Keras model from disk; a later cell aliases it as `md` to embed images.
model = tf.keras.models.load_model("../input/arc-face-training-part2/embadding_arcface/")

# Print the layer / parameter overview of the loaded model.
model.summary()

In [11]:
#md = tf.keras.models.Model(inputs=model.layers[-3].input,outputs=model.layers[-3].output)

# NOTE(review): disabled. This statement replaced the model loaded from disk with a stock
# ImageNet backbone, but `EfficientNetB0` is never imported in this notebook (only
# `EfficientNetB3` is), so executing it raises NameError. The recorded model summary
# (efficientnetb3, 1536-d output) also shows it was not part of the executed run.
# Kept for reference only:
#model = EfficientNetB0(weights="imagenet",input_shape=(512,512,3),\
#                       include_top = False,pooling="avg")

In [12]:
# Print the layer / parameter overview of `model` (output shown below).
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_image (InputLayer)     [(None, 512, 512, 3)]     0         
_________________________________________________________________
efficientnetb3 (Functional)  (None, 1536)              10783535  
Total params: 10,783,535
Trainable params: 10,696,232
Non-trainable params: 87,303
_________________________________________________________________


# Display the model's output tensor.
model.output

In [13]:
#md = tf.keras.models.load_model("../input/arc-face-training-part2/embadding_arcface/")
md = model
chunk = 256
embedding = []
# Embed the frame slice by slice: each slice of `chunk` rows gets its own generator.
for a in tqdm(range(0, len(test), chunk)):
    b = min(a + chunk, len(test))
    dt = DataGenerator(test.iloc[a:b, :])
    embedding.append(md.predict(dt, use_multiprocessing=True, workers=4))
# NOTE(review): this only drops the `md` alias; `model` still references the same object.
del md
img_embedding = np.concatenate(embedding, axis=0)
gc.collect()

100%|██████████| 134/134 [35:07<00:00, 15.73s/it]


130

In [14]:
# These two imports repeat names already imported in the first cell.
from cuml.neighbors import NearestNeighbors
import cupy 
# Neighbour index for image embeddings: 50 neighbours per query, cosine distance.
duplicated_images = NearestNeighbors(n_neighbors=50,metric="cosine")

In [15]:
# Index every image embedding so each row can be queried against all the others.
duplicated_images.fit(img_embedding)

NearestNeighbors(n_neighbors=50, verbose=4, handle=<cuml.raft.common.handle.Handle object at 0x7fc75366beb0>, algorithm='brute', metric='cosine', p=2, metric_params=None, output_type='numpy')

In [16]:
pred = []
chunk = 4 * 1024
# Positional lookup table: neighbour indices are row positions in `test`.
posting_ids = test.posting_id.values
for a in tqdm(range(0, len(test), chunk)):
    b = min(a + chunk, len(test))
    distances, indices = duplicated_images.kneighbors(img_embedding[a:b, ])
    for row_dist, row_idx in zip(distances, indices):
        # Keep only the neighbours closer than the 0.39 distance cut-off.
        close = row_idx[row_dist < 0.39]
        pred.append(posting_ids[close])

100%|██████████| 9/9 [00:12<00:00,  1.37s/it]


In [17]:
# One array of matched posting_ids per row, from the image-embedding neighbours.
test["pred_img"] = pred

In [18]:
# Only when the flag is set: score image-based matches against `target`.
# Column `f2` stores the per-row value returned by getMetric (an F1-style overlap).
if submission :
    
    test["f2"] = test.apply(getMetric("pred_img"),axis=1)
    
    print('CV score for tf embedding image =',test.f2.mean())

CV score for tf embedding image = 0.7973066509928955


# 2. Duplicate text:

In [19]:
# "cahya/bert-base-indonesian-522M"
# Load the BERT tokenizer files from the local dataset directory.
tokenizer = BertTokenizer.from_pretrained("../input/indonesian-distilbert-finetuning-with-arcmargin/tokenizer/")

In [20]:
def encode(seq_title, tokenizer, max_seq):
    """Tokenise titles into a 2-D array of input ids.

    Parameters
    ----------
    seq_title : iterable of title strings.
    tokenizer : object exposing `encode_plus(text, padding=, max_length=, truncation=)`
        and returning a mapping with an "input_ids" entry.
    max_seq : length every sequence is padded / truncated to.

    Returns
    -------
    Array with one row of input ids per title (rows stacked with `np.vstack`).
    """
    def _input_ids(title):
        encoded = tokenizer.encode_plus(
            title, padding="max_length", max_length=max_seq, truncation=True
        )
        return encoded.get("input_ids")

    return np.vstack([_input_ids(title) for title in seq_title])

# NOTE(review): disabled. This legacy TF-Hub path depends on the `tokenization` module,
# whose import is commented out in the first cell, so running it raises NameError. It
# would also overwrite `tokenizer` with an object that is not the BertTokenizer on which
# encode() calls `encode_plus`. Kept for reference only:
#bert_layer = hub.KerasLayer("../input/bert-en-uncased-l-12-h-768-a-12-1",trainable=False)
#vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
#do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
#tokenizer = tokenization.FullTokenizer(vocab_file,do_lower_case)

In [21]:
# Token sequence length passed to encode() as `max_seq` (titles are padded/truncated to it).
max_len = 61 

In [22]:
# Normalised copy of each title (see clean_title); this is what gets tokenised.
test["cleaned_title"] = test["title"].map(clean_title)

In [23]:
#arc_face_bert = TFBertModel.from_pretrained("cahya/bert-base-indonesian-522M")

In [24]:
# Load the saved Keras text model from disk.
model_bert = tf.keras.models.load_model("../input/arc-face-training-bert-part1/model/")

In [25]:
# Sub-model running from the input of layer 1 to the tensor feeding layer 2, so predict()
# returns that intermediate tensor instead of the full model's output.
# NOTE(review): presumably layer 2 is the ArcFace head and its input is the embedding —
# confirm against the saved model's layer order.
arc_face_bert = tf.keras.models.Model(inputs=model_bert.layers[1].input,outputs= model_bert.\
                                     layers[2].input)

In [26]:
# arc_face_bert = tf.keras.models.load_model("../input/arc-face-training-bert-part1/bert_embedding/")
chunk = 1024
embedding = []
# Tokenise and embed the cleaned titles one slice of `chunk` rows at a time.
for a in tqdm(range(0, len(test), chunk)):
    b = min(a + chunk, len(test))
    seq = test.iloc[a:b].cleaned_title.values
    all_tokens = encode(seq, tokenizer, max_len)
    embedding.append(arc_face_bert.predict([all_tokens]))
txt_embedding = np.concatenate(embedding, axis=0)
# NOTE(review): `model_bert` still references the layers after this delete.
del arc_face_bert
gc.collect()

100%|██████████| 34/34 [01:36<00:00,  2.85s/it]


33268

In [27]:
# Build a CuPy array from the text embeddings; note this rebinds the name `embedding`.
embedding = cupy.array(txt_embedding)

In [28]:
# Neighbour index for text embeddings: 50 neighbours, library-default metric
# (reported as 'euclidean' in the recorded output below).
duplicated_text = NearestNeighbors(n_neighbors=50)
duplicated_text.fit(embedding)

NearestNeighbors(n_neighbors=50, verbose=4, handle=<cuml.raft.common.handle.Handle object at 0x7fc74efbd1b0>, algorithm='brute', metric='euclidean', p=2, metric_params=None, output_type='cupy')

In [29]:
pred = []
chunk = 1024 *4
for a in tqdm(range(0, len(test), chunk)):
    b = min(a + chunk, len(test))
    distances, indices = duplicated_text.kneighbors(embedding[a:b, ])
    for j in range(b - a):
        # Neighbours under the distance cut-off for this query row.
        near = distances[j] < 11  # 0.1 for cos distance
        neighbours = indices[j][near]
        # Move the indices to host memory before the positional DataFrame lookup.
        pred.append(test.iloc[cupy.asnumpy(neighbours)].posting_id.values)

100%|██████████| 9/9 [00:35<00:00,  3.93s/it]


In [30]:
# One array of matched posting_ids per row, from the text-embedding neighbours.
test["pred_text"] = pred 

In [31]:
# Only when the flag is set: score text-based matches against `target`.
# Column `f3` stores the per-row value returned by getMetric.
if submission :
    
    test["f3"] = test.apply(getMetric("pred_text"),axis=1)
    
    print('CV score for tf embedding text =',test.f3.mean())

CV score for tf embedding text = 0.8400313095426022


# 3. Combination of text-embedding and image-embedding predictions:

In [32]:
# Per row: de-duplicated union of the text-based and image-based matches (see combine).
test["comb_text_im"] = test.apply(combine("pred_text","pred_img"),axis=1)

In [33]:
# Only when the flag is set: score the union of text and image matches; stored in `f0`.
if submission :
    
    test["f0"] = test.apply(getMetric("comb_text_im"),axis=1)
    
    print('CV score for combinaisons =',test.f0.mean())

CV score for combinaisons = 0.8518491883980326


# 4. Combination of text and image embeddings:

In [34]:
# Join image and text embeddings side by side: one longer feature vector per row.
comb_embedding = np.concatenate([img_embedding,txt_embedding],axis=1)

In [35]:
# Rebind the combined embeddings as a CuPy array.
comb_embedding = cupy.array(comb_embedding)

In [36]:
# Neighbour index over the combined embeddings: 50 neighbours, library-default metric
# (reported as 'euclidean' in the recorded output below).
combined_distance = NearestNeighbors(n_neighbors=50)
combined_distance.fit(comb_embedding)

NearestNeighbors(n_neighbors=50, verbose=4, handle=<cuml.raft.common.handle.Handle object at 0x7fc750df6c50>, algorithm='brute', metric='euclidean', p=2, metric_params=None, output_type='cupy')

In [37]:
# Query the combined-embedding index chunk by chunk and keep, for every row, the
# posting_ids of the neighbours whose distance is below 12.
chunk = 4 * 1024 
cls = len(test) // chunk 
cls += int((len(test) % chunk) !=0)
prediction = []
for i in tqdm(range(cls)):
    a = i * chunk
    b = (i+1) * chunk 
    b = min(b,len(test))
    distances,indices = combined_distance.kneighbors(comb_embedding[a:b,])
    for j in range(b-a) :
        distance = distances[j,:]
        ind = np.where(distance<12)[0]
        ind = indices[j,ind]
        # Move the neighbour indices to host memory before indexing the DataFrame.
        ind = cupy.asnumpy(ind)
        # Neighbour indices are row POSITIONS in the fitted matrix, so look them up
        # positionally (as the image and text loops do); label-based .loc is only
        # correct while the frame keeps its default 0..n-1 index.
        prediction.append(test.iloc[ind].posting_id.values)
        

100%|██████████| 9/9 [00:31<00:00,  3.48s/it]


In [38]:
# One array of matched posting_ids per row, from the combined-embedding neighbours.
test["com_embedding"] = prediction 

In [39]:
# Only when the flag is set: score the combined-embedding matches; stored in `f4`.
if submission :
    
    test["f4"] = test.apply(getMetric("com_embedding"),axis=1)
    
    print('CV score for combinaisons =',test.f4.mean())

CV score for combinaisons = 0.8589025070965064


# All combinations:

In [40]:
# Per row: de-duplicated union of the text, image and combined-embedding matches.
test["all_combinaisons"] = test.apply(combine("pred_text","pred_img","com_embedding"),axis=1)

In [41]:
# Only when the flag is set: score the union of all three match sets; stored in `f5`.
if submission :
    
    test["f5"] = test.apply(getMetric("all_combinaisons"),axis=1)
    
    print('CV score for combinaisons =',test.f5.mean())

CV score for combinaisons = 0.8518554143344446


# Submission:

In [42]:
# Submission string per row: the combined-embedding matches joined with spaces.
test["matches"] = test.apply(sub_matches("com_embedding"),axis=1)

In [43]:
# Write the two submission columns, then read the file back and preview its first rows.
test[["posting_id","matches"]].to_csv("submission.csv",index = False)
sub = pd.read_csv('submission.csv')
sub.head()

Unnamed: 0,posting_id,matches
0,train_129225211,train_129225211 train_2278313361
1,train_3386243561,train_3386243561 train_3423213080
2,train_2288590299,train_2288590299 train_3803689425
3,train_2406599165,train_2406599165 train_3576714541 train_174495...
4,train_3369186413,train_3369186413 train_921438619
