In [1]:
!pip install ../input/faiss-170-latest-cpu-gpu/faiss_gpu-1.7.0-cp37-cp37m-manylinux2014_x86_64.whl

Processing /kaggle/input/faiss-170-latest-cpu-gpu/faiss_gpu-1.7.0-cp37-cp37m-manylinux2014_x86_64.whl
Installing collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.0


In [2]:
import sys
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')

In [3]:
import os
import cv2
import math
import faiss
import torch
import timm
import imagehash
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import albumentations as a

from sklearn.feature_extraction.text import TfidfVectorizer
from albumentations.pytorch import ToTensorV2
from typing import List, Tuple, Dict
from pathlib import Path
from tqdm import tqdm
from transformers import AutoModel
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizerFast

# NLP Model Utils

In [4]:
class ShopeeNLPModel(nn.Module):
    """
    Text encoder followed by a margin-based classification head.

    The backbone is loaded with `AutoModel.from_pretrained(model_path)`.
    The hidden state at sequence position 0 is used as the text embedding;
    `forward` additionally applies dropout and the margin head.

    Args:
        model_path: forwarded to `AutoModel.from_pretrained`
        num_classes: `out_features` of the margin head
        dropout: probability given to `nn.Dropout`
        margin_func: callable that builds the head; it receives
            `in_features`, `out_features` and `**margin_params`
    """

    def __init__(self, model_path, num_classes, dropout,
                 margin_func, **margin_params):
        super().__init__()

        backbone = AutoModel.from_pretrained(model_path)
        self.model = backbone
        self.dropout = nn.Dropout(dropout)
        # The embedding width is whatever the backbone config reports
        self.feature_dim = backbone.config.hidden_size
        self.arc_margin = margin_func(
            in_features=self.feature_dim,
            out_features=num_classes,
            **margin_params)

    def forward(self, input_ids, token_type_ids, attention_mask, label):
        """Return the margin head output for a tokenised batch and labels."""
        embedding = self.extract_features(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
        )
        return self.arc_margin(self.dropout(embedding), label)

    def extract_features(self, **data):
        """Return `last_hidden_state[:, 0, :]` of the backbone output."""
        hidden_states = self.model(**data).last_hidden_state
        return hidden_states[:, 0, :]

In [5]:
class ArcMarginProduct(nn.Module):
    """
    Large margin arc distance head.

    For the labelled class the cosine logit cos(theta) is replaced by
    cos(theta + m); every logit is then multiplied by `s`.

        Args:
            in_features: size of each input sample
            out_features: size of each output sample
            s: scale applied to the final logits
            m: angular margin
            easy_margin: if True, the margin is only applied where
                cos(theta) > 0 and the plain cosine is used elsewhere
    Reference: https://github.com/ronghuaiyang/arcface-pytorch/blob/47ace80b128042cd8d2efd408f55c5a3e156b032/models/metrics.py#L10
    """  # noqa

    def __init__(self, in_features, out_features,
                 s=30.0, m=0.50, easy_margin=False):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.s = s
        self.m = m

        # One weight row per class, Xavier-initialised
        self.weight = nn.Parameter(
            torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        # Constants for cos(theta + m) = cos(theta)cos(m) - sin(theta)sin(m)
        self.cos_m, self.sin_m = math.cos(m), math.sin(m)
        # Threshold and penalty used when easy_margin is False
        self.th = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m

    def forward(self, data, label):
        """Return scaled logits with the margin applied to the label column."""
        cos_theta = F.linear(F.normalize(data), F.normalize(self.weight))
        sin_theta = (1.0 - cos_theta.pow(2)).clamp(0, 1).sqrt()
        with_margin = cos_theta * self.cos_m - sin_theta * self.sin_m

        if self.easy_margin:
            keep_margin, fallback = cos_theta > 0, cos_theta
        else:
            keep_margin, fallback = cos_theta > self.th, cos_theta - self.mm
        with_margin = torch.where(keep_margin, with_margin, fallback)

        # 1.0 in the column of each sample's label, 0.0 elsewhere
        target_mask = torch.zeros(cos_theta.size(), device=data.device)
        target_mask.scatter_(1, label.view(-1, 1).long(), 1)

        logits = (target_mask * with_margin) + ((1.0 - target_mask) * cos_theta)
        logits *= self.s

        return logits

In [6]:
class PriceMatchNLPData(Dataset):
    """
    Dataset that tokenises one text column of a dataframe on access.

    Args:
        df: source dataframe; its index is reset so rows are addressed
            by position
        text_col: column holding the text passed to the tokenizer
        label_col: optional label column; when given it is passed through
            `encode_label` and returned under the key `'label'`
        **tokenizer_args: forwarded to `BertTokenizerFast.from_pretrained`
    """

    def __init__(
            self, df,
            text_col='title',
            label_col=None,
            **tokenizer_args):

        self.df = df.reset_index(drop=True)
        self.text_col = text_col
        self.label_col = label_col
        self.tokenizer = BertTokenizerFast.from_pretrained(**tokenizer_args)

        if self.label_col is not None:
            # NOTE(review): `encode_label` is not defined in the visible part
            # of this notebook; this branch needs it to be in scope.
            self.df = encode_label(
                self.df, col_to_encode=label_col, col_encoded=label_col)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return a dict of tokenizer outputs (plus `'label'` if configured)."""
        row = self.df.iloc[idx]
        encoded = self.tokenizer(
            row[self.text_col],
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Drop only the leading batch axis added by return_tensors='pt'.
        # A bare squeeze() would also remove the sequence axis whenever its
        # length is 1, yielding 0-d tensors that collate to the wrong shape.
        data = {key: value.squeeze(0) for key, value in encoded.items()}

        if self.label_col is not None:
            data['label'] = row[self.label_col]

        return data


In [7]:
def get_nlp_data_loader(
        df,
        text_col='title',
        label_col=None,
        shuffle=False,
        batch_size=32,
        **tokenizer_args):
    """
    Wrap `df` in a `PriceMatchNLPData` dataset and return a `DataLoader`
    over it.

    `text_col`, `label_col` and `**tokenizer_args` go to the dataset;
    `shuffle` and `batch_size` go to the loader. The loader uses
    `os.cpu_count()` worker processes.
    """
    return DataLoader(
        PriceMatchNLPData(df, text_col, label_col, **tokenizer_args),
        shuffle=shuffle,
        batch_size=batch_size,
        num_workers=os.cpu_count(),
    )

# IMG Model Utils

In [8]:
import torch
import torch.nn as nn


class ShopeeIMGModel(nn.Module):
    """
    Image encoder followed by a margin-based classification head.

    The backbone is read with `torch.load(model_path)`. Its classifier and
    pooling layers are replaced by `nn.Identity` when the file name contains
    'efficientnet' or 'nfnet'. Embeddings come from average pooling,
    dropout, a linear projection to `feature_dim` and batch norm.

    Args:
        model_path: path object of the serialised backbone (its `.name`
            is inspected)
        num_classes: `out_features` of the margin head
        dropout: probability given to `nn.Dropout`
        margin_func: callable that builds the head; it receives
            `in_features`, `out_features` and `**margin_params`
        feature_dim: width of the returned embedding
    """

    def __init__(self, model_path, num_classes, dropout,
                 margin_func, feature_dim=512, **margin_params):
        super().__init__()

        self.feature_dim = feature_dim

        # NOTE(review): torch.load unpickles the file; only load trusted ones
        self.model = torch.load(model_path)

        backbone_file = model_path.name
        if 'efficientnet' in backbone_file:
            self.model.classifier = nn.Identity()
            self.model.global_pool = nn.Identity()
        elif 'nfnet' in backbone_file:
            self.model.head.fc = nn.Identity()
            self.model.head.global_pool = nn.Identity()

        self.pooling = nn.AdaptiveAvgPool2d(1)
        self.dropout = nn.Dropout(dropout)
        self.lin = nn.Linear(self.model.num_features, self.feature_dim)
        self.bn = nn.BatchNorm1d(self.feature_dim)
        self.arc_margin = margin_func(
            in_features=self.feature_dim,
            out_features=num_classes,
            **margin_params)

    def forward(self, image, label):
        """Return the margin head output for a batch of images and labels."""
        return self.arc_margin(self.extract_features(image), label)

    def extract_features(self, image):
        """Return one `feature_dim`-wide embedding per image in the batch."""
        n_images = image.shape[0]
        feature_map = self.model(image)
        # Pool each channel to one value, then flatten to (batch, channels)
        pooled = self.pooling(feature_map).view(n_images, -1)
        return self.bn(self.lin(self.dropout(pooled)))

In [9]:
class PriceMatchImgData(Dataset):
    """
    Dataset that reads one image per dataframe row and applies `transforms`.

    Args:
        df: source dataframe; its index is reset so rows are addressed
            by position
        img_folder: path object joined with the value of `img_path_col`
        transforms: callable invoked as `transforms(image=...)` that returns
            a mapping with an `'image'` entry
        img_path_col: column holding the image file name
        label_col: optional label column; when given it is passed through
            `encode_label` and returned under the key `'label'`
    """

    def __init__(self, df, img_folder, transforms, img_path_col='image',
                 label_col=None):

        self.df = df.reset_index(drop=True)
        self.img_folder = img_folder
        self.img_path_col = img_path_col
        self.label_col = label_col
        self.transforms = transforms

        if self.label_col is not None:
            # NOTE(review): `encode_label` is not defined in the visible part
            # of this notebook; this branch needs it to be in scope.
            self.df = encode_label(
                self.df, col_to_encode=label_col, col_encoded=label_col)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """
        Return `{'image': transformed_image}` (plus `'label'` if configured).

        Raises:
            FileNotFoundError: if the image file is missing or unreadable
        """
        row = self.df.iloc[idx]
        img_path = self.img_folder / row[self.img_path_col]
        image = cv2.imread(img_path.as_posix())

        # cv2.imread signals failure by returning None instead of raising;
        # fail here with the offending path rather than inside cvtColor.
        if image is None:
            raise FileNotFoundError(
                f'Image is missing or unreadable: {img_path}')

        # OpenCV loads BGR; convert to RGB before the transforms
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        augmented = self.transforms(image=image)
        data = dict(image=augmented['image'])

        if self.label_col is not None:
            data['label'] = row[self.label_col]

        return data

In [10]:
def get_val_transforms(img_dim):
    """
    Build the evaluation-time image pipeline: resize to a square of side
    `img_dim`, normalise with the library defaults, convert to a tensor.
    """
    steps = [
        a.Resize(img_dim, img_dim, always_apply=True),
        a.Normalize(),
        ToTensorV2(),
    ]

    return a.Compose(steps)

In [11]:
def get_img_data_loader(
        df,
        img_dim,
        img_folder,
        img_path_col='image',
        label_col=None,
        shuffle=False,
        batch_size=32):
    """
    Wrap `df` in a `PriceMatchImgData` dataset and return a `DataLoader`
    over it.

    Validation transforms are used when `label_col` is None, training
    transforms otherwise. The loader uses `os.cpu_count()` worker processes.
    """
    # NOTE(review): `get_train_transforms` is not defined in the visible part
    # of this notebook, so the labelled path needs it to be in scope.
    build_transforms = (
        get_val_transforms if label_col is None else get_train_transforms)

    dataset = PriceMatchImgData(
        df, img_folder, build_transforms(img_dim), img_path_col, label_col)

    return DataLoader(
        dataset,
        shuffle=shuffle,
        batch_size=batch_size,
        num_workers=os.cpu_count(),
    )


In [12]:
def generate_embeddings(model, dataloader, device, feature_dim):
    """
    Run `model.extract_features` over every batch of `dataloader` and return
    the results as one float32 array of shape
    `(len(dataloader.dataset), feature_dim)`.

    The model is switched to eval mode, each batch dict is moved to `device`
    and passed as keyword arguments, and gradients are disabled. Rows are
    written at `batch_number * dataloader.batch_size`, so the loader is
    expected to iterate in dataset order.
    """
    model.eval()

    progress = tqdm(
        enumerate(dataloader),
        total=len(dataloader),
        desc='>> Generating embeddings'
    )
    step = dataloader.batch_size
    embeddings = np.zeros(
        (len(dataloader.dataset), feature_dim), dtype=np.float32)

    for batch_no, batch in progress:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # Inference only: no autograd graph is needed
        with torch.no_grad():
            batch_features = model.extract_features(**batch)

        start = batch_no * step
        embeddings[start:start + step] = batch_features.cpu().numpy()

    return embeddings

In [13]:
def load_model(model_class, state_dict_path, device, final_layer_key, **model_config):
    """
    Build `model_class` and load the weights stored at `state_dict_path`.

    The number of classes is taken from the length of the state-dict entry
    named `final_layer_key`; `**model_config` is forwarded to the
    constructor. The state dict is mapped to `device` while loading.

    For `final_layer_key`, use 'lin.bias' for old model, 'arc_margin.weight' for new model
    """
    weights = torch.load(state_dict_path, map_location=device)
    n_classes = len(weights[final_layer_key])

    net = model_class(num_classes=n_classes, **model_config)
    net.load_state_dict(weights)

    return net

# KNN Utils

In [14]:
def faiss_knn_cosine(
        emb_arr: np.ndarray,
        query_idx: List,
        neighbors: int = 50,
        chunksize: int = 512,
        use_gpu: bool = True) -> Tuple[np.ndarray, np.ndarray]:
    """
    Cosine-similarity nearest neighbours with a flat Faiss inner-product
    index built over all rows of `emb_arr`.

    Args:
        emb_arr: 2-D array of embeddings, one row per item
        query_idx: row positions of `emb_arr` to use as queries
        neighbors: number of neighbours returned per query
        chunksize: number of queries searched per call to the index
        use_gpu: move the index to GPU device 0 before searching

    Returns:
        `(similarities, indices)`, each of shape
        `(len(query_idx), neighbors)`. An empty `query_idx` yields two
        arrays with zero rows.
    """
    # Nothing to search: return correctly shaped empty results instead of
    # failing on unbound accumulators after a zero-iteration loop.
    if len(query_idx) == 0:
        return (np.empty((0, neighbors), dtype=np.float32),
                np.empty((0, neighbors), dtype=np.int64))

    # Infer feature dimension from emb_arr
    dim = emb_arr.shape[1]

    # Faiss works on float32. astype() returns a copy, so the in-place
    # normalisation below never modifies the caller's array.
    emb_arr = emb_arr.astype(np.float32)
    faiss.normalize_L2(emb_arr)

    # Inner product of L2-normalised vectors equals cosine similarity
    index = faiss.IndexFlatIP(dim)

    if use_gpu:
        res = faiss.StandardGpuResources()
        index = faiss.index_cpu_to_gpu(res, 0, index)

    index.add(emb_arr)

    # Search in chunks; collect the pieces and stack once at the end rather
    # than re-copying a growing array on every iteration.
    starts = range(0, len(query_idx), chunksize)
    dist_chunks, index_chunks = [], []

    for start in tqdm(starts, total=len(starts), desc='>> Finding Neighbours'):

        query_idx_sub = query_idx[start:start + chunksize]

        chunk_dists, chunk_indices = index.search(
            emb_arr[query_idx_sub], neighbors)

        dist_chunks.append(chunk_dists)
        index_chunks.append(chunk_indices)

    return np.vstack(dist_chunks), np.vstack(index_chunks)

In [15]:
def faiss_knn_hamming(
        emb_arr: np.ndarray,
        query_idx: List,
        neighbors: int = 50,
        chunksize: int = 512) -> Tuple[np.ndarray, np.ndarray]:
    """
    Hamming nearest neighbours with a flat Faiss binary index built over
    all rows of `emb_arr`.

    Binary arr should be packed to their bytes equivalent, before
    the packed array is passed as `emb_arr`

    i.e.
        emb_arr = array([0, 0, 0, 0, 0, 0, 1, 0])
        packed_arr = np.packbits(emb_arr) # array([2], dtype=uint8)

    Args:
        emb_arr: 2-D array of packed bytes, one row per item
        query_idx: row positions of `emb_arr` to use as queries
        neighbors: number of neighbours returned per query
        chunksize: number of queries searched per call to the index

    Returns:
        `(similarities, indices)`, each of shape
        `(len(query_idx), neighbors)`. An empty `query_idx` yields two
        arrays with zero rows.

    Note:
        The distance returned is actually similarity ranges from 0 to 1.
        1 means both items are exactly the same.
    """
    # Nothing to search: return correctly shaped empty results instead of
    # failing on unbound accumulators after a zero-iteration loop.
    if len(query_idx) == 0:
        return (np.empty((0, neighbors), dtype=np.float64),
                np.empty((0, neighbors), dtype=np.int64))

    # Each packed byte carries 8 bits of the binary code
    dim = emb_arr.shape[1] * 8

    # Binary Faiss indexes take uint8 input
    emb_arr = emb_arr.astype(np.uint8)

    # Initialize Faiss Index
    index = faiss.IndexBinaryFlat(dim)
    index.add(emb_arr)

    # Search in chunks; collect the pieces and stack once at the end rather
    # than re-copying a growing array on every iteration.
    starts = range(0, len(query_idx), chunksize)
    dist_chunks, index_chunks = [], []

    for start in tqdm(starts, total=len(starts), desc='>> Finding Neighbours'):

        query_idx_sub = query_idx[start:start + chunksize]

        chunk_dists, chunk_indices = index.search(
            emb_arr[query_idx_sub], neighbors)

        dist_chunks.append(chunk_dists)
        index_chunks.append(chunk_indices)

    # Change dist to similarity: fraction of bits that agree
    dists = 1 - (np.vstack(dist_chunks) / dim)

    return dists, np.vstack(index_chunks)


In [16]:
def get_similar_items(
        df: pd.DataFrame, emb: np.ndarray, val_idx: List,
        idx_to_id_col_mapping: Dict, id_col: str = 'posting_id',
        metric: str = 'cosine',
        n: int = 50, chunksize: int = 512) -> pd.DataFrame:
    """
    Given a `df`, its embeddings `emb` (must be of the same length as `df`),
    and the `val_idx` (row index for validation, must correspond to the
    indices in `df`), find the number of closest `n` neighbors in batches
    using `chunksize`.

    `id_col` is the item id that is used to identify the neighbors, and
    `idx_to_id_col_mapping` is the row index to id mapping to map the nearest
    neighbors index (returned by faiss) to the actual item id.

    `metric` should be either cosine or hamming.

    After search, a similarity df `sim_df` is returned, which contains the
    row index, distance and item id of each row's top n neighbors. Every
    queried item is also listed as its own neighbour with distance 1.0.
    Neighbour slots that the index could not fill (index -1) are dropped.

    Raises:
        AssertionError: if `emb`, `df` and `val_idx` are inconsistent
        ValueError: if `metric` is neither 'cosine' nor 'hamming'
    """
    assert len(emb) == len(df),\
        f'Num of emb {len(emb)} != Num of rows in df {len(df)}'
    assert type(val_idx) is list, 'val_idx must be a list'
    assert len(set(val_idx)) <= len(df),\
        f'Num of elements in val_idx {len(set(val_idx))}'\
        f' > df len {len(df)}'
    assert max(val_idx) < len(df),\
        f'{max(val_idx)} is out of range for '\
        f'df of shape {df.shape}'

    if metric == 'cosine':
        distances, indices = faiss_knn_cosine(
            emb, val_idx, n, chunksize)
    elif metric == 'hamming':
        distances, indices = faiss_knn_hamming(
            emb, val_idx, n, chunksize)
    else:
        # Previously fell through and failed later with an unbound variable
        raise ValueError(
            f"metric must be 'cosine' or 'hamming', got {metric!r}")

    # One row per query: the item paired with itself at similarity 1.0
    sim_df_base = df.loc[val_idx, [id_col]].reset_index(drop=True).copy()
    sim_df_base['neighbors'] = val_idx
    sim_df_base['distances'] = 1.0

    sim_df = sim_df_base.copy()

    sim_df['neighbors'] = pd.Series(indices.tolist())
    sim_df['distances'] = pd.Series(distances.tolist())

    # Exploding multiple columns of equal len & sequence
    # https://stackoverflow.com/a/59330040/10841164
    sim_df = (
        sim_df
        .set_index(id_col)
        .apply(pd.Series.explode)
        .reset_index()
    )

    # Faiss pads with index -1 when fewer than `n` results exist; such rows
    # have no item id to map to, so drop them.
    sim_df = sim_df[sim_df['neighbors'].astype('int64') >= 0]

    # This step makes sure each item should at least be its own neighbour.
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    sim_df = (
        pd.concat([sim_df_base, sim_df])
        .drop_duplicates(subset=[id_col, 'neighbors'])
        .reset_index(drop=True)
    )

    sim_df['matches'] = sim_df['neighbors'].map(idx_to_id_col_mapping)

    return sim_df

In [17]:
def generate_idx_to_col_map(
        df: pd.DataFrame, col: str = 'posting_id') -> Dict:
    """
    Return a dictionary that maps each index label of `df` to the value of
    column `col` in that row.
    """
    return df[col].to_dict()

# General Config

In [18]:
# KNN Config
# Neighbours requested per query by the NLP and image searches below.
KNN_N = 51 # Including self-match
# NOTE(review): CHUNKSIZE is not referenced by any visible cell; the
# get_similar_items calls pass literal chunk sizes instead.
CHUNKSIZE = 1024

# Others
# Use the GPU for model inference when one is available.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
# Keyword arguments forwarded to ArcMarginProduct through load_model.
LOSS_PARAMS={"m": 0.5, "s": 30.0, "easy_margin": False}
# Folder that the image file names in test_df are resolved against.
IMG_FOLDER = Path('../input/shopee-product-matching/test_images/')

# Dataframes
TEST_DF_PATH = '../input/shopee-product-matching/test.csv'
test_df = pd.read_csv(TEST_DF_PATH)

# NOTE(review): train_df is loaded but not used by any visible cell.
TRAIN_DF_PATH = '../input/shopee-product-matching/train.csv'
train_df = pd.read_csv(TRAIN_DF_PATH)

# Row index -> posting_id, used to turn neighbour indices into item ids.
idx_to_id_col_mapping = generate_idx_to_col_map(test_df)

# TFIDF

In [19]:
# Vocabulary size cap for the TF-IDF representation of the titles
max_features = 25_000

vect = TfidfVectorizer(
    max_features=max_features,
    binary=True,
    stop_words='english',
)
# Fit on the test titles and densify the result as float32
tfidf_emb_arr = (
    vect
    .fit_transform(test_df['title'])
    .astype(np.float32)
    .toarray()
)

# Find neighbors & Generate submission

idx_to_id_col_mapping = generate_idx_to_col_map(test_df)

sim_df_tfidf = get_similar_items(
    df=test_df,
    emb=tfidf_emb_arr,
    val_idx=test_df.index.tolist(),
    idx_to_id_col_mapping=idx_to_id_col_mapping,
    id_col='posting_id',
    n=50,
    chunksize=8,
)

# The dense matrix is no longer needed once the neighbours are stored
del tfidf_emb_arr

>> Finding Neighbours: 100%|██████████| 1/1 [00:00<00:00, 641.33it/s]


# Indobert (NLP)

In [20]:
# NLP Model Config
# Directory handed to AutoModel.from_pretrained (as model_path) by ShopeeNLPModel.
NLP_PRETRAINED_MODEL_DIR = '../input/price-matchindobert-lite-p2-mlm-checkpoint/'
# Directory handed to BertTokenizerFast.from_pretrained by the data loader.
NLP_PRETRAINED_TOKENIZER_DIR = '../input/indobert-lite-p2-tokenizer-config/'
# Fine-tuned state dict loaded on top of the pretrained backbone.
NLP_STATE_DICT = Path('../input/price-match-final-models/indobert_fold_0_best_train_loss.pt')
NLP_BATCH_SIZE = 512
# Passed to the tokenizer as model_max_length; items are padded to 'max_length'.
MODEL_MAX_LENGTH = 48

In [21]:
# Unshuffled loader over the test titles. label_col=None means no labels are
# returned, so batches can be passed straight to extract_features.
# The last two keyword arguments are forwarded to the tokenizer constructor.
test_nlp_loader = get_nlp_data_loader(
        test_df, text_col='title', label_col=None,
        shuffle=False, batch_size=NLP_BATCH_SIZE,
        pretrained_model_name_or_path=NLP_PRETRAINED_TOKENIZER_DIR,
        model_max_length=MODEL_MAX_LENGTH)

In [22]:
# for i, state_dict_path in enumerate(NLP_STATE_DICT_DIR.glob('*.pt')):
    
#     # Generate embeddings
#     model = load_model(model_class=ShopeeNLPModel,
#                        state_dict_path=state_dict_path,
#                        device=DEVICE,
#                        final_layer_key='arc_margin.weight',
#                        model_path=NLP_PRETRAINED_MODEL_DIR,
#                        dropout=0.0,
#                        margin_func=ArcMarginProduct,
#                        **LOSS_PARAMS
#                       )

#     model = model.to(DEVICE)
    
#     if i == 0:
#         nlp_emb_arr = generate_embeddings(model, test_nlp_loader, DEVICE, model.feature_dim)
        
#     else:
#         nlp_emb_arr += generate_embeddings(model, test_nlp_loader, DEVICE, model.feature_dim)


# Generate embeddings
model = load_model(model_class=ShopeeNLPModel,
                   state_dict_path=NLP_STATE_DICT,
                   device=DEVICE,
                   final_layer_key='arc_margin.weight',
                   model_path=NLP_PRETRAINED_MODEL_DIR,
                   dropout=0.0,
                   margin_func=ArcMarginProduct,
                   **LOSS_PARAMS
                  )

model = model.to(DEVICE)

nlp_emb_arr = generate_embeddings(model, test_nlp_loader, DEVICE, model.feature_dim)


def normalize(arr):
    """
    Standardise every column of `arr` to zero mean and unit variance.

    Columns with zero variance are left at 0 after centring instead of
    being divided by 0, which would fill them with NaN and poison every
    similarity computed from the embeddings.
    """
    std = arr.std(axis=0)
    # std is a fresh array, so patching it in place does not touch `arr`
    std[std == 0] = 1
    return (arr - arr.mean(axis=0)) / std


nlp_emb_arr = normalize(nlp_emb_arr)

Some weights of AlbertModel were not initialized from the model checkpoint at ../input/price-matchindobert-lite-p2-mlm-checkpoint/ and are newly initialized: ['albert.pooler.weight', 'albert.pooler.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
>> Generating embeddings: 100%|██████████| 1/1 [00:00<00:00,  1.04it/s]


In [23]:
# Find neighbors & Generate submission
# Cosine neighbours (the default metric) of every test row in the NLP
# embedding space; KNN_N neighbours are requested per row.

sim_df_nlp = get_similar_items(
    df=test_df,
    emb=nlp_emb_arr,
    val_idx=test_df.index.tolist(),
    idx_to_id_col_mapping=idx_to_id_col_mapping,
    id_col='posting_id',
    n=KNN_N,
    chunksize=5120
)

# Drop the references to the embeddings and the model; neither is used again.
del nlp_emb_arr, model

>> Finding Neighbours: 100%|██████████| 1/1 [00:00<00:00, 698.93it/s]


# EfficientNet B0 (IMG)

In [24]:
# # Img model config
# IMG_DIM = 512
# IMG_BATCH_SIZE = 128
# IMG_STATE_DICT_DIR = Path('../input/price-match-final-models-4fold-grouped/eb0_final_grouped/')
# IMG_PRETRAINED_MODEL_DIR = Path('../input/efficient-net-b0-b3/efficientnet_b0.pth')
# LOSS_PARAMS={"m": 0.5, "s": 30.0, "easy_margin": False}

In [25]:
# # Prepare dataloader

# test_img_loader = get_img_data_loader(
#         test_df, img_dim=IMG_DIM,
#         img_folder=IMG_FOLDER,
#         img_path_col='image',
#         label_col=None,
#         shuffle=False,
#         batch_size=IMG_BATCH_SIZE)

In [26]:
# for i, state_dict_path in enumerate(IMG_STATE_DICT_DIR.glob('*.pt')):
        
#     # Generate embeddings
#     model = load_model(model_class=ShopeeIMGModel,
#                        state_dict_path=state_dict_path,
#                        device=DEVICE,
#                        final_layer_key='arc_margin.weight',
#                        dropout=0.0,
#                        model_path=IMG_PRETRAINED_MODEL_DIR,
#                        margin_func=ArcMarginProduct,
#                        **LOSS_PARAMS
#                       )

#     model = model.to(DEVICE)

#     if i == 0:
#         img_emb_arr = generate_embeddings(model, test_img_loader, DEVICE, model.feature_dim)

#     else:
#         img_emb_arr += generate_embeddings(model, test_img_loader, DEVICE, model.feature_dim)

In [27]:
# # Find neighbors & Generate submission

# sim_df_img_eb0 = get_similar_items(
#     df=test_df,
#     emb=img_emb_arr,
#     val_idx=test_df.index.tolist(),
#     idx_to_id_col_mapping=idx_to_id_col_mapping,
#     id_col='posting_id',
#     n=KNN_N,
#     chunksize=5120
# )

# del img_emb_arr, model

# EfficientNet B3 (IMG)

In [28]:
# Img model config
# Side length images are resized to by the validation transforms.
IMG_DIM = 512
IMG_BATCH_SIZE = 64
# Folder searched for '*.pt' fold checkpoints by the embedding loop below.
IMG_STATE_DICT_DIR = Path('../input/price-match-final-models-4fold-grouped/eb3_final_grouped/')
# Serialised backbone read with torch.load inside ShopeeIMGModel.
IMG_PRETRAINED_MODEL_DIR = Path('../input/efficient-net-b0-b3/efficientnet_b3.pth')
# Same values as the LOSS_PARAMS defined in the general config cell.
LOSS_PARAMS={"m": 0.5, "s": 30.0, "easy_margin": False}

In [29]:
# Prepare dataloader
# label_col=None selects the validation transforms and yields image-only
# batches; shuffle=False keeps the loader in dataframe order.

test_img_loader = get_img_data_loader(
        test_df, img_dim=IMG_DIM,
        img_folder=IMG_FOLDER,
        img_path_col='image',
        label_col=None,
        shuffle=False,
        batch_size=IMG_BATCH_SIZE)

In [30]:
# Sum the embeddings of every fold checkpoint. The sum is only ever compared
# by cosine similarity, which ignores overall scale, so no division by the
# number of folds is needed.
# Starting from None (rather than relying on the first iteration to bind the
# name) prevents a stale array from an earlier run being reused silently.
img_emb_arr = None

# sorted(): glob() yields files in arbitrary order; a fixed order keeps the
# floating-point accumulation reproducible between runs.
for state_dict_path in sorted(IMG_STATE_DICT_DIR.glob('*.pt')):

    # Generate embeddings
    model = load_model(model_class=ShopeeIMGModel,
                       state_dict_path=state_dict_path,
                       device=DEVICE,
                       final_layer_key='arc_margin.weight',
                       dropout=0.0,
                       model_path=IMG_PRETRAINED_MODEL_DIR,
                       margin_func=ArcMarginProduct,
                       **LOSS_PARAMS
                      )

    model = model.to(DEVICE)

    if img_emb_arr is None:
        img_emb_arr = generate_embeddings(model, test_img_loader, DEVICE, model.feature_dim)

    else:
        img_emb_arr += generate_embeddings(model, test_img_loader, DEVICE, model.feature_dim)

# Fail here, with the folder name, rather than later with a NameError
if img_emb_arr is None:
    raise FileNotFoundError(
        f'No *.pt checkpoints found in {IMG_STATE_DICT_DIR}')

>> Generating embeddings: 100%|██████████| 1/1 [00:00<00:00,  3.79it/s]
>> Generating embeddings: 100%|██████████| 1/1 [00:00<00:00,  6.72it/s]
>> Generating embeddings: 100%|██████████| 1/1 [00:00<00:00,  6.18it/s]
>> Generating embeddings: 100%|██████████| 1/1 [00:00<00:00,  6.68it/s]


In [31]:
# Find neighbors & Generate submission
# Cosine neighbours (the default metric) of every test row in the summed
# image embedding space; KNN_N neighbours are requested per row.

sim_df_img_eb3 = get_similar_items(
    df=test_df,
    emb=img_emb_arr,
    val_idx=test_df.index.tolist(),
    idx_to_id_col_mapping=idx_to_id_col_mapping,
    id_col='posting_id',
    n=KNN_N,
    chunksize=5120
)

# Drop the references to the embeddings and the last fold's model.
del img_emb_arr, model

>> Finding Neighbours: 100%|██████████| 1/1 [00:00<00:00, 1579.18it/s]


# Image Phash

In [32]:
# Util function to convert hex to hash array
def hex_to_arr(x):
    """Decode the hex hash string `x` into a flat array of integer bits."""
    bit_matrix = imagehash.hex_to_hash(x).hash
    return bit_matrix.ravel().astype(int)

In [33]:
# One row of hash bits per test image, packed 8 bits to a byte along each
# row, which is the input format faiss_knn_hamming documents.
phash_emb = np.packbits(
    np.array(test_df['image_phash'].apply(hex_to_arr).tolist()),
    axis=1,
)

In [34]:
# Find neighbors & Generate submission
# Hamming search over the packed perceptual hashes; the returned 'distances'
# are similarities in [0, 1] (see faiss_knn_hamming).

# Recomputed here although the same mapping is already built from test_df
# in the config cell.
idx_to_id_col_mapping = generate_idx_to_col_map(test_df)

# NOTE(review): n is the literal 50 here, not KNN_N as in the NLP and image
# searches — confirm this is intended.
sim_df_phash = get_similar_items(
    df=test_df,
    emb=phash_emb,
    val_idx=test_df.index.tolist(),
    idx_to_id_col_mapping=idx_to_id_col_mapping,
    id_col='posting_id',
    metric='hamming',
    n=50,
    chunksize=5120
)

>> Finding Neighbours: 100%|██████████| 1/1 [00:00<00:00, 32.56it/s]


In [35]:
# Increase threshold by 0.1
# Minimum similarity a neighbour needs, per source, to count as a match.
best_thres_nlp = 0.90
best_thres_eb3 = 0.70
best_thres_tfidf = 0.75
best_thres_phash = 0.974375

# Union of the matches from all four sources, then one row per posting with
# its match ids joined by spaces.
# pd.concat replaces the chained DataFrame.append calls (removed in pandas
# 2.0). agg(...).reset_index() always yields the columns
# ['posting_id', 'matches'], whatever the pandas version.
submission_df = (
    pd.concat([
        sim_df_img_eb3.query('distances > @best_thres_eb3'),
        sim_df_phash.query('distances > @best_thres_phash'),
        sim_df_nlp.query('distances > @best_thres_nlp'),
        sim_df_tfidf.query('distances > @best_thres_tfidf'),
    ])
    .drop_duplicates(subset=['posting_id', 'matches'])
    .groupby('posting_id')
    ['matches']
    .agg(' '.join)
    .reset_index()
)

submission_df

Unnamed: 0,posting_id,matches
0,test_2255846744,test_2255846744
1,test_3588702337,test_3588702337
2,test_4015706929,test_4015706929


In [36]:
# Write the submission file without the dataframe index column.
submission_df.to_csv('submission.csv', index=False)