<a href="https://colab.research.google.com/github/greyhound101/shopee/blob/main/best_ensemble_5fold_best.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:


import numpy as np
import pandas as pd
import gc
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
!pip install ../input/keras-efficientnet-whl/Keras_Applications-1.0.8-py3-none-any.whl
!pip install ../input/keras-efficientnet-whl/efficientnet-1.1.1-py3-none-any.whl



In [None]:
# Cap on TensorFlow's GPU memory: memory_limit below is 1024*LIMIT, which the prints report as LIMIT GB
LIMIT = 1
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  try:
    # Configure the first physical GPU as a single virtual device with the memory cap
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],
        [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024*LIMIT)])
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    #print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Best effort: the error is printed and the notebook continues without the cap
    # NOTE(review): presumably raised when the GPU was already initialized - confirm
    print(e)
# Both messages are printed unconditionally, even when no GPU was found or the cap failed
print('We will restrict TensorFlow to max %iGB GPU RAM'%LIMIT)
# The remaining-memory figure assumes a 16 GB GPU
print('then RAPIDS can use %iGB GPU RAM'%(16-LIMIT))

In [None]:
import efficientnet.tfkeras as efn
from tqdm.notebook import tqdm
import math
# AUTOTUNE constant, passed as num_parallel_calls and prefetch size in get_dataset
AUTO = tf.data.experimental.AUTOTUNE

# Configuration
# Batch size used by get_dataset
BATCH_SIZE = 8
# Size passed to tf.image.resize and used for the model input shape
IMAGE_SIZE = [512, 512]
# Seed
SEED = 42
# Verbosity
VERBOSE = 1
# Number of classes of each fold
N_CLASSES = [8811, 8811, 8811, 8812, 8811]
# Cross-validation mode flag; switched off below when test.csv has more than 3 rows
GET_CV = True
# Flag to check ram allocations (debug)
CHECK_SUB = False

df = pd.read_csv('../input/shopee-product-matching/test.csv')
# If test.csv holds more than 3 rows, leave CV mode
# (presumably the 3-row file is the public sample and a larger file means a real submission run - confirm)
if len(df) > 3:
    GET_CV = False
# Only the row count was needed here; read_dataset() reads the file again
del df

# Function to get our row-wise f1 score
def f1_score(y_true, y_pred):
    '''
    Compute one F1 value per row from two Series of whitespace-separated ids.

    Each string is split on whitespace and turned into a set; the score of a
    row is 2 * |true & pred| / (|true| + |pred|).

    Args:
        y_true: pandas Series of ground-truth id strings.
        y_pred: pandas Series of predicted id strings, paired with y_true
            by position.

    Returns:
        numpy array with one F1 value per row.
    '''
    true_sets = y_true.apply(lambda text: set(text.split()))
    pred_sets = y_pred.apply(lambda text: set(text.split()))
    overlap = np.array([len(t & p) for t, p in zip(true_sets, pred_sets)])
    pred_sizes = pred_sets.apply(lambda ids: len(ids)).values
    true_sizes = true_sets.apply(lambda ids: len(ids)).values
    return 2 * overlap / (pred_sizes + true_sizes)

# Function to read our dataset
def read_dataset():
    '''
    Load test.csv and build the path of every test image.

    Returns:
        Tuple (df, image_paths): the DataFrame read from test.csv and a Series
        made by prefixing the test image directory to the 'image' column.
    '''
    frame = pd.read_csv('../input/shopee-product-matching/test.csv')
    return frame, '../input/shopee-product-matching/test_images/' + frame['image']

# Function to decode our images
def decode_image(image_data):
    '''
    Decode JPEG bytes into a 3-channel image, resize it to IMAGE_SIZE and
    return it as float32 divided by 255.0.
    '''
    pixels = tf.image.decode_jpeg(image_data, channels = 3)
    resized = tf.image.resize(pixels, IMAGE_SIZE)
    return tf.cast(resized, tf.float32) / 255.0

# Function to read our test image and return the decoded image
def read_image(image):
    '''
    Read the file at path `image` and pass its contents through decode_image.
    '''
    raw_bytes = tf.io.read_file(image)
    return decode_image(raw_bytes)

# Function to get the tf.data pipeline that reads our images
def get_dataset(image):
    '''
    Build a tf.data.Dataset over the given image paths.

    Every path is mapped through read_image (parallelism chosen by AUTO),
    the results are grouped into batches of BATCH_SIZE and prefetched.
    '''
    return (
        tf.data.Dataset.from_tensor_slices(image)
        .map(read_image, num_parallel_calls = AUTO)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)
    )

# ArcMarginProduct Keras layer (additive angular margin head)
class ArcMarginProduct(tf.keras.layers.Layer):
    '''
    Implements large margin arc distance.

    The layer is called on a pair [X, y] (features and integer class labels)
    and returns scaled logits in which the entry of the labelled class has the
    angular margin `m` applied.

    Args:
        n_classes: number of classes; second dimension of the weight matrix W.
        s: factor the output logits are multiplied by.
        m: angular margin.
        easy_margin: selects which fallback rule is used in `call`.
        ls_eps: label smoothing epsilon; applied to the one-hot labels only
            when it is greater than 0.

    Reference:
        https://arxiv.org/pdf/1801.07698.pdf
        https://github.com/lyakaap/Landmark2019-1st-and-3rd-Place-Solution/
            blob/master/src/modeling/metric_learning.py
    '''
    def __init__(self, n_classes, s=30, m=0.50, easy_margin=False,
                 ls_eps=0.0, **kwargs):
        '''Store the hyper-parameters and precompute the margin constants.'''

        super(ArcMarginProduct, self).__init__(**kwargs)

        self.n_classes = n_classes
        self.s = s
        self.m = m
        self.ls_eps = ls_eps
        self.easy_margin = easy_margin
        # Constants used by call(): cos(m) and sin(m) for the angle-addition formula,
        # th as the cosine threshold and mm as the fallback penalty
        self.cos_m = tf.math.cos(m)
        self.sin_m = tf.math.sin(m)
        self.th = tf.math.cos(math.pi - m)
        self.mm = tf.math.sin(math.pi - m) * m

    def get_config(self):
        '''Return the base layer config extended with this layer's constructor arguments.'''

        config = super().get_config().copy()
        config.update({
            'n_classes': self.n_classes,
            's': self.s,
            'm': self.m,
            'ls_eps': self.ls_eps,
            'easy_margin': self.easy_margin,
        })
        return config

    def build(self, input_shape):
        '''
        Create the weight matrix W of shape (feature_dim, n_classes).

        `input_shape` is indexed with [0], i.e. it is expected to be the list
        of shapes of the [X, y] inputs, the first one being the feature shape.
        '''
        super(ArcMarginProduct, self).build(input_shape[0])

        self.W = self.add_weight(
            name='W',
            shape=(int(input_shape[0][-1]), self.n_classes),
            initializer='glorot_uniform',
            dtype='float32',
            trainable=True,
            regularizer=None)

    def call(self, inputs):
        '''
        Compute the margin-adjusted, scaled logits.

        Args:
            inputs: pair (X, y) of features and class labels.

        Returns:
            Tensor of logits multiplied by `s`.
        '''
        X, y = inputs
        y = tf.cast(y, dtype=tf.int32)
        # Cosine between each L2-normalized feature row and each L2-normalized column of W
        cosine = tf.matmul(
            tf.math.l2_normalize(X, axis=1),
            tf.math.l2_normalize(self.W, axis=0)
        )
        sine = tf.math.sqrt(1.0 - tf.math.pow(cosine, 2))
        # cos(theta + m) expanded with the angle-addition formula
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            # Apply the margin only where the cosine is positive
            phi = tf.where(cosine > 0, phi, cosine)
        else:
            # Where the cosine is not above th, use cosine - mm instead of phi
            phi = tf.where(cosine > self.th, phi, cosine - self.mm)
        one_hot = tf.cast(
            tf.one_hot(y, depth=self.n_classes),
            dtype=cosine.dtype
        )
        if self.ls_eps > 0:
            # Label smoothing of the one-hot targets
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.n_classes

        # Weight phi by one_hot and the plain cosine by (1 - one_hot), then scale by s
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s
        return output

# The function that gets the embeddings of our images with the fine-tuned model (get_image_embeddings) is defined further below


# Function to get our text title embeddings
def get_text_embeddings(df, max_features = 15500):
    '''
    Fit a binary TF-IDF vectorizer (English stop words removed) on df['title']
    and return the transformed titles.

    Args:
        df: DataFrame with a 'title' column.
        max_features: value forwarded to TfidfVectorizer(max_features=...).

    Returns:
        The matrix produced by TfidfVectorizer.fit_transform.
    '''
    vectorizer = TfidfVectorizer(
        stop_words = 'english',
        binary = True,
        max_features = max_features,
    )
    title_matrix = vectorizer.fit_transform(df['title'])
    print(f'Our title text embedding shape is {title_matrix.shape}')
    return title_matrix

# Function to get the nearest neighbors of each image and text embedding and apply the thresholds found in the training phase that optimize the f1 cv score
def get_neighbors(df, image_embeddings, text_embeddings, KNN = 50):
    '''
    Predict matches for every row as the union of its close image neighbours
    and its close text neighbours (cosine distance).

    Args:
        df: DataFrame with a 'posting_id' column, row-aligned with both
            embedding matrices.
        image_embeddings: image embedding matrix, one row per df row.
        text_embeddings: text embedding matrix, one row per df row.
        KNN: maximum number of neighbours retrieved per row; capped at the
            number of rows because scikit-learn rejects larger values.

    Returns:
        Tuple (df, predictions) where predictions is a list with one
        space-separated string of posting ids per row.
    '''
    # Asking for more neighbours than there are samples raises in kneighbors()
    n_neighbors = min(KNN, df.shape[0])

    # Get distances and indices from image and text embeddings
    neighbors_model = NearestNeighbors(n_neighbors = n_neighbors, metric = 'cosine').fit(image_embeddings)
    image_distances, image_indices = neighbors_model.kneighbors(image_embeddings)
    neighbors_model = NearestNeighbors(n_neighbors = n_neighbors, metric = 'cosine').fit(text_embeddings)
    text_distances, text_indices = neighbors_model.kneighbors(text_embeddings)

    # The image threshold is the same in both modes; the text threshold is
    # 0.30 in CV mode and 0.21 otherwise
    image_threshold = 0.35
    text_threshold = 0.30 if GET_CV else 0.21

    # Look the id column up once instead of once per row
    all_posting_ids = df['posting_id'].values
    predictions = []
    for k in range(df.shape[0]):
        ids_image = image_indices[k, image_distances[k] < image_threshold]
        ids_text = text_indices[k, text_distances[k] < text_threshold]
        # Get the union of both id sets
        ids = list(set(list(ids_image) + list(ids_text)))
        predictions.append(' '.join(all_posting_ids[ids]))

    del neighbors_model, image_distances, image_indices, text_distances, text_indices
    gc.collect()
    return df, predictions

# Read the test DataFrame and the Series of image paths; both are used as globals by the cells below
df, image_paths = read_dataset()
def get_image_embeddings(image_paths, fold = 4):
    '''
    Compute image embeddings with one of the fine-tuned EfficientNet models.

    `fold` selects the backbone and its weights file: 0 -> EfficientNetB0,
    1 -> B1, 2 -> B2, 3 -> B3 (weights in ../input/se-weights/). The training
    graph (backbone, global average pooling, ArcMarginProduct, softmax) is
    rebuilt so the weights can be loaded, then cut at model.layers[-4] and fed
    from the image input only.

    Args:
        image_paths: sliceable sequence of image file paths.
        fold: backbone selector, one of 0, 1, 2, 3. The default value 4 has
            no backbone in this function and is rejected.

    Returns:
        numpy array with one embedding row per image path.

    Raises:
        ValueError: if `fold` is not one of the supported values (the
            previous code failed later with an unbound local variable).
    '''
    # Backbone constructor and weights file for every supported fold value
    backbones = {
        0: (efn.EfficientNetB0, '../input/se-weights/b0.hdf5'),
        1: (efn.EfficientNetB1, '../input/se-weights/b1.hdf5'),
        2: (efn.EfficientNetB2, '../input/se-weights/b2.hdf5'),
        3: (efn.EfficientNetB3, '../input/se-weights/b3.hdf5'),
    }
    if fold not in backbones:
        raise ValueError(f'Unsupported fold {fold!r}; expected one of {sorted(backbones)}')
    build_backbone, weights_path = backbones[fold]
    mod = build_backbone(weights = None, include_top = False)

    inp = tf.keras.layers.Input(shape = (*IMAGE_SIZE, 3), name = 'inp1')
    label = tf.keras.layers.Input(shape = (), name = 'inp2')
    x = mod(inp)
    x = tf.keras.layers.GlobalAveragePooling2D()(x)
    margin = ArcMarginProduct(
                n_classes = 11014, 
                s = 30, 
                m = 0.7, 
                name='head/arc_margin', 
                dtype='float32'
                )
    x = margin([x, label])
        
    output = tf.keras.layers.Softmax(dtype='float32')(x)

    model = tf.keras.models.Model(inputs = [inp, label], outputs = [output])
    model.load_weights(weights_path)
    # Keep only the image input and the output of layers[-4] as the embedding
    model = tf.keras.models.Model(inputs = model.input[0], outputs = model.layers[-4].output)

    # Predict in chunks of 5000 paths; the chunking follows the paths passed
    # in rather than the length of the global df
    chunk = 5000
    embeds = []
    for a in tqdm(range(0, len(image_paths), chunk)):
        image_dataset = get_dataset(image_paths[a:a + chunk])
        embeds.append(model.predict(image_dataset))
    del model
    image_embeddings = np.concatenate(embeds)
    print(f'Our image embeddings shape is {image_embeddings.shape}')
    del embeds
    gc.collect()
    return image_embeddings

# Compute the embeddings of every test image with each of the four backbones.
# The variable suffix does not always match the fold argument: image_embeddings_4 holds the fold = 0 result.
image_embeddings_4 = get_image_embeddings(image_paths, fold = 0)
image_embeddings_3 = get_image_embeddings(image_paths, fold = 3)
image_embeddings_2 = get_image_embeddings(image_paths, fold = 2)
image_embeddings_1 = get_image_embeddings(image_paths, fold = 1)
gc.collect()

In [None]:
    # First candidate set (predictions): averaged image embeddings plus TF-IDF title embeddings
    text_embeddings = get_text_embeddings(df, max_features = 25000)
    # Average the four embedding matrices; three of them are cut to their first 1280 columns
    # (presumably because those backbones emit wider feature vectors - confirm)
    image_embeddings = np.average([image_embeddings_4, image_embeddings_3[:,:1280], image_embeddings_2[:,:1280], image_embeddings_1[:,:1280]], axis = 0)
    # Number of neighbours retrieved per row: 3 in CV mode, 50 otherwise
    if GET_CV:
        KNN=3
    else:
        KNN=50
    neighbors_model = NearestNeighbors(n_neighbors = KNN,metric='cosine').fit(image_embeddings)
    image_distances, image_indices = neighbors_model.kneighbors(image_embeddings)
    # NOTE(review): no metric is passed for the text model, so scikit-learn's default metric is used
    # here rather than cosine; the text thresholds below apply to that distance - confirm this is intended
    neighbors_model = NearestNeighbors(n_neighbors = KNN).fit(text_embeddings)
    text_distances, text_indices = neighbors_model.kneighbors(text_embeddings)
    print('got it')
    # The two branches differ only in their thresholds
    if GET_CV:
        predictions = []
        for k in range(df.shape[0]):
            # Thresholds used in CV mode: image distance < 0.25, text distance < 0.6
            idx_image = np.where(image_distances[k,] < 0.25)[0]
            ids_image = image_indices[k,idx_image]
            idx_text = np.where(text_distances[k,] < 0.6)[0]
            ids_text = text_indices[k,idx_text]
            # Get the union of both id sets
            ids = list(set(list(ids_image) + list(ids_text)))
            posting_ids = ' '.join(df['posting_id'].iloc[ids].values)
            predictions.append(posting_ids)
    
    else:
        predictions = []
        for k in range(df.shape[0]):
            # Thresholds used outside CV mode: image distance < 0.35, text distance < 0.7
            # (both are larger than the CV-mode thresholds above)
            idx_image = np.where(image_distances[k,] <0.35)[0]
            ids_image = image_indices[k,idx_image]
            idx_text = np.where(text_distances[k,] < 0.7)[0]
            ids_text = text_indices[k,idx_text]
            # Get the union of both id sets
            ids = list(set(list(ids_image) + list(ids_text)))
            posting_ids = ' '.join(df['posting_id'].iloc[ids].values)
            predictions.append(posting_ids)
        
    # Free the neighbour arrays before the next stage allocates its own
    del neighbors_model, image_distances, image_indices, text_distances, text_indices
    gc.collect()
    # Second candidate set (predictions1): neighbours in the image_embeddings_2 space united with
    # neighbours in the image_embeddings_1 space. No metric is passed, so scikit-learn's default is used.
    neighbors_model = NearestNeighbors(n_neighbors = KNN).fit(image_embeddings_2)
    image_distances, image_indices = neighbors_model.kneighbors(image_embeddings_2)
    neighbors_model = NearestNeighbors(n_neighbors = KNN).fit(image_embeddings_1)
    # Despite the text_ prefix, these arrays hold the image_embeddings_1 neighbours
    text_distances, text_indices = neighbors_model.kneighbors(image_embeddings_1)
    predictions1 = []
    for k in range(df.shape[0]):
            # Keep the neighbours closer than 0.5 in each of the two spaces
            idx_image = np.where(image_distances[k,] < 0.5)[0]
            ids_image = image_indices[k,idx_image]
            idx_text = np.where(text_distances[k,] < 0.5)[0]
            ids_text = text_indices[k,idx_text]
            # Get the union of both id sets
            ids = list(set(list(ids_image) + list(ids_text)))
            posting_ids = ' '.join(df['posting_id'].iloc[ids].values)
            predictions1.append(posting_ids)
    
    # Free the neighbour arrays before the next stage allocates its own
    del neighbors_model, image_distances, image_indices, text_distances, text_indices
    gc.collect()
    # Third candidate set (predictions2): neighbours in the image_embeddings_3 space united with
    # neighbours in the image_embeddings_4 space. No metric is passed, so scikit-learn's default is used.
    # Despite the text_ prefix, all arrays below hold image-embedding neighbours.
    neighbors_model = NearestNeighbors(n_neighbors = KNN).fit(image_embeddings_3)
    text_distances, text_indices = neighbors_model.kneighbors(image_embeddings_3)
    neighbors_model_1 = NearestNeighbors(n_neighbors = KNN).fit(image_embeddings_4)
    text_distances_1, text_indices_1 = neighbors_model_1.kneighbors(image_embeddings_4)
    predictions2 = []
    for k in range(df.shape[0]):
        # Neighbours closer than 0.5 in the image_embeddings_3 space
        idx_text = np.where(text_distances[k,] < 0.5)[0]
        ids_text = text_indices[k,idx_text]
        # Neighbours closer than 0.5 in the image_embeddings_4 space. They have to be read from
        # text_indices_1 with idx_text1; reading text_indices with idx_text (as before) just
        # repeated ids_text, so the second model never contributed to the union.
        idx_text1 = np.where(text_distances_1[k,] < 0.5)[0]
        ids_text1 = text_indices_1[k,idx_text1]
        # Get the union of both id sets
        ids = list(set(list(ids_text)+ list(ids_text1)))
        posting_ids = ' '.join(df['posting_id'].iloc[ids].values)
        predictions2.append(posting_ids)
    del neighbors_model, text_distances, text_indices,neighbors_model_1,text_distances_1, text_indices_1
    gc.collect()

In [None]:
# Drop the four per-backbone embedding matrices; the names are reused for the five-fold models below
del([image_embeddings_4, image_embeddings_3, image_embeddings_2, image_embeddings_1])
gc.collect()
def get_image_embeddings(image_paths, fold = 4):
    '''
    Compute image embeddings with one of the five EfficientNetB0 fold models.

    `fold` selects the number of classes of the ArcMarginProduct head and the
    weights file under ../input/b0-5fold/. The training graph (backbone,
    global average pooling, ArcMarginProduct, softmax) is rebuilt so the
    weights can be loaded, then cut at model.layers[-4] and fed from the image
    input only.

    Args:
        image_paths: sliceable sequence of image file paths.
        fold: fold selector, one of 0, 1, 2, 3, 4.

    Returns:
        numpy array with one embedding row per image path.

    Raises:
        ValueError: if `fold` is not one of the supported values (the
            previous code failed later with an unbound local variable).
    '''
    # Head size and weights file for every fold
    fold_settings = {
        4: (8793, '../input/b0-5fold/results(5)/weights.hdf5'),
        3: (8804, '../input/b0-5fold/results(4)/weights.hdf5'),
        2: (8811, '../input/b0-5fold/results(3)/weights.hdf5'),
        1: (8819, '../input/b0-5fold/results(2)/weights.hdf5'),
        0: (8829, '../input/b0-5fold/weights(1).hdf5'),
    }
    if fold not in fold_settings:
        raise ValueError(f'Unsupported fold {fold!r}; expected one of {sorted(fold_settings)}')
    n_classes, weights_path = fold_settings[fold]

    margin = ArcMarginProduct(
                n_classes = n_classes, 
                s = 30, 
                m = 0.7, 
                name='head/arc_margin', 
                dtype='float32'
                )

    inp = tf.keras.layers.Input(shape = (*IMAGE_SIZE, 3), name = 'inp1')
    label = tf.keras.layers.Input(shape = (), name = 'inp2')
    x = efn.EfficientNetB0(weights = None, include_top = False)(inp)
    x = tf.keras.layers.GlobalAveragePooling2D()(x)
    x = margin([x, label])
        
    output = tf.keras.layers.Softmax(dtype='float32')(x)

    model = tf.keras.models.Model(inputs = [inp, label], outputs = [output])
    model.load_weights(weights_path)
    print('done')
    # Keep only the image input and the output of layers[-4] as the embedding
    model = tf.keras.models.Model(inputs = model.input[0], outputs = model.layers[-4].output)

    # Predict in chunks of 5000 paths; the chunking follows the paths passed
    # in rather than the length of the global df
    chunk = 5000
    embeds = []
    for a in range(0, len(image_paths), chunk):
        image_dataset = get_dataset(image_paths[a:a + chunk])
        embeds.append(model.predict(image_dataset))
    del model
    image_embeddings = np.concatenate(embeds)
    print(f'Our image embeddings shape is {image_embeddings.shape}')
    del embeds
    gc.collect()
    return image_embeddings

# Compute the embeddings of every test image with each of the five fold models
image_embeddings_4 = get_image_embeddings(image_paths.values, fold = 4)
image_embeddings_3 = get_image_embeddings(image_paths.values, fold = 3)
image_embeddings_2 = get_image_embeddings(image_paths.values, fold = 2)
image_embeddings_1 = get_image_embeddings(image_paths.values, fold = 1)
image_embeddings_0 = get_image_embeddings(image_paths.values, fold = 0)
# Element-wise mean of the fold embeddings.
# NOTE(review): image_embeddings_2 is computed above but is not part of this average - confirm this is intended
image_embeddings = np.average([image_embeddings_4, image_embeddings_3, image_embeddings_1, image_embeddings_0], axis = 0)

In [None]:
# Drop the per-fold matrices that went into the average; image_embeddings_2 is not listed and stays alive
del ([image_embeddings_4, image_embeddings_3, image_embeddings_1, image_embeddings_0])
gc.collect()
def get_neighbors(df, embeddings,threshold = 0.0):
    '''
    Predict matches for every row from its cosine nearest neighbours.

    Args:
        df: DataFrame with a 'posting_id' column, row-aligned with
            `embeddings`.
        embeddings: embedding matrix, one row per df row.
        threshold: a neighbour is kept when its cosine distance is strictly
            below this value.

    Returns:
        List with one space-separated string of posting ids per row.
    '''
    # Retrieve up to 50 neighbours, but never more than there are samples:
    # scikit-learn raises when n_neighbors exceeds the number of fitted rows.
    # This gives 3 for the 3-row sample file and 50 for large test sets, as before.
    KNN = min(50, embeddings.shape[0])
    
    model = NearestNeighbors(n_neighbors = KNN, metric = 'cosine')
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)
    
    # Look the id column up once instead of once per row
    all_posting_ids = df['posting_id'].values
    predictions = []
    for k in tqdm(range(embeddings.shape[0])):
        ids = indices[k, distances[k] < threshold]
        predictions.append(' '.join(all_posting_ids[ids]))
        
    del model, distances, indices
    gc.collect()
    return predictions


# Fourth candidate set: neighbours of the averaged five-fold embeddings with cosine distance below 0.35
predictions4 = get_neighbors(df, image_embeddings, threshold=0.35)

In [None]:
# Attach the four candidate sets as columns so they can be combined row by row
df['pre1']=predictions
df['pre2']=predictions1
df['pre3']=predictions2
df['pre4']=predictions4
def combine_predictions(row):
    '''
    Merge the four candidate sets of one row into a single prediction string.

    The result holds every id of 'pre1' plus the ids that appear in all of
    'pre2', 'pre3' and 'pre4', sorted, de-duplicated and joined with spaces.
    '''
    primary = set(row['pre1'].split(' '))
    consensus = set(row['pre4'].split(' '))
    consensus = consensus.intersection(row['pre3'].split(' '))
    consensus = consensus.intersection(set(row['pre2'].split(' ')))
    merged = list(primary.union(consensus))
    return ' '.join( np.unique(merged))
# Combine the candidate sets of every row and write the submission file
df['matches'] = df.apply(combine_predictions, axis = 1)
df[['posting_id', 'matches']].to_csv('submission.csv', index = False)