# **相似度计算专题**

### 此次Kernel的核心目的为后处理的**相似度计算代码填空**专题，关于图像和文本的特征张量已生成（对于图像已训练调优模型，直接加载使用即可），你需要利用这两个特征张量分别/结合进行内部相似度计算。

### 上传至Kaggle时，需要在dataset添加Add data：使用url搜索添加以下两个包：
### timm 图像模型构件库
### https://www.kaggle.com/kozodoi/timm-pytorch-image-models
### 已训练好的nfnet网络
### https://www.kaggle.com/winniy/pretrain-nfnet

## **第一步，简单了解预处理及特征提取部分**

### 以下部分仅需了解即可，可自行查阅谷歌或官方文档了解语句用途

### **1.1 导包**

In [None]:
import sys
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')
import numpy as np 
import pandas as pd 

import math
import random 
import os 
import cv2

from tqdm import tqdm 

import torch 
from torch.utils.data import Dataset 
from torch import nn
import torch.nn.functional as F 

import gc
import cudf
import cuml
import cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors

# 新增：albumentations 是一个图像增强库
import albumentations as A 
from albumentations.pytorch.transforms import ToTensorV2
# timm 是一个快速构建图像网络的模型库
import timm

In [None]:

import transformers

import gc
import matplotlib.pyplot as plt
import cudf
import cuml
import cupy
from cuml import PCA
from cuml.neighbors import NearestNeighbors
from sklearn.preprocessing import Normalizer

### **1.2 基本参数设置**

In [None]:
class CFG:
    """Static settings for the image branch of the notebook."""

    # --- reproducibility / hardware ---
    seed = 2021          # handed to seed_torch() right after its definition
    device = 'cuda'      # device the image model is moved to

    # --- data loading ---
    img_size = 512       # images are resized to img_size x img_size
    batch_size = 12      # batch size of the image DataLoader

    # --- model ---
    classes = 11014      # default n_classes of ShopeeModel
    model_name = 'eca_nfnet_l0'
    model_path = '../input/pretrain-nfnet/nfnet_epoch_15.pt'

In [None]:
# Fix the random seeds
def seed_torch(seed=42):
    """Seed Python, NumPy and PyTorch (CPU and CUDA) with the same value.

    Also exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.

    :param seed: integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.deterministic = True
    
# Seed every random generator once, up front, with the seed stored in CFG.
seed_torch(CFG.seed)

### **1.3 加载数据集和Pytorch的数据集读取loader**

In [None]:
# Device the text model is moved to in get_xlmR_text_embeddings.
device = torch.device('cuda')
# DataLoader settings for the text branch.
NUM_WORKERS = 4
BATCH_SIZE = 16
# NOTE(review): SEED is not read by any code visible in this notebook
# (seed_torch is called with CFG.seed instead) - confirm before relying on it.
SEED = 42
# When True, read_dataset('train') duplicates the frame by concatenating it with itself.
CHECK_SUB = False
# When True, the neighbour-search helpers sweep over several thresholds
# instead of using a single fixed one.
GET_CV = True
# Local directory of the pretrained transformer; used for both tokenizer and model.
transformer_model = '../input/sentence-transformer-models/paraphrase-xlm-r-multilingual-v1/0_Transformer'
# Tokenizer shared by every ShopeeTextDataset instance (loaded once at import time).
TOKENIZER = transformers.AutoTokenizer.from_pretrained(transformer_model)

################################################ MODEL PATH ###############################################################

# Checkpoint loaded into ShopeeNet by get_xlmR_text_embeddings.
TEXT_MODEL_PATH = '../input/best-multilingual-model/sentence_transfomer_xlm_best_loss_num_epochs_25_arcface.bin'

# Keyword arguments forwarded verbatim to ShopeeNet(**model_params).
model_params = {
    'n_classes':11014,
    'model_name':transformer_model,
    'use_fc':False,
    'fc_dim':512,
    'dropout':0.3,
}

In [None]:
def read_dataset(mode):
    """Load the Shopee CSV for *mode* and build the image path column.

    For ``mode == 'train'`` a ``matches`` column is added that holds, for each
    row, the space-joined ``posting_id`` values sharing its ``label_group``;
    when ``CHECK_SUB`` is set the frame is additionally duplicated.
    Any other mode reads ``test.csv``.

    :param mode: ``'train'`` or ``'test'``; also used as the prefix of the
        ``<mode>_images/`` directory.
    :return: ``(df, df_cu, image_paths)`` - the pandas frame, its cudf copy,
        and a pandas Series of image file paths.
    """
    # Single base directory for CSVs and images. The train CSV used to be read
    # from './input/...' while everything else used '../input/...'.
    data_dir = '../input/shopee-product-matching/'

    if mode == 'train':
        df = pd.read_csv(data_dir + mode + '.csv')
        tmp = df.groupby(['label_group'])['posting_id'].unique().to_dict()
        df['matches'] = df['label_group'].map(tmp)
        df['matches'] = df['matches'].apply(lambda x: ' '.join(x))
        if CHECK_SUB:
            df = pd.concat([df, df], axis = 0)
            df.reset_index(drop = True, inplace = True)
    else:
        df = pd.read_csv(data_dir + 'test.csv')

    df_cu = cudf.DataFrame(df)
    image_paths = data_dir + mode + '_images/' + df['image']
    return df, df_cu, image_paths


In [None]:
class ShopeeTextDataset(Dataset):
    """Dataset that tokenizes the ``title`` column of a product frame."""

    def __init__(self, csv):
        # reset_index() gives contiguous positions (old index kept as a column).
        self.csv = csv.reset_index()

    def __len__(self):
        n_rows, _ = self.csv.shape
        return n_rows

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask)`` for the title at *index*."""
        title = self.csv.iloc[index].title

        encoded = TOKENIZER(
            title,
            padding='max_length',
            truncation=True,
            max_length=128,
            return_tensors="pt",
        )
        # The tokenizer returns a batch of one; strip that leading dimension.
        return encoded['input_ids'][0], encoded['attention_mask'][0]

In [None]:
class ShopeeNet(nn.Module):
    """Transformer text encoder that returns L2-normalised sentence features."""

    def __init__(self,
                 n_classes,
                 model_name='bert-base-uncased',
                 use_fc=False,
                 fc_dim=512,
                 dropout=0.0):
        """
        :param n_classes: accepted for interface compatibility; not used here.
        :param model_name: name or path passed to
            ``transformers.AutoModel.from_pretrained``.
        :param use_fc: when True, add a dropout -> linear -> batch-norm head.
        :param fc_dim: output width of the optional linear layer.
        :param dropout: dropout probability of the optional head.
        """
        super().__init__()

        self.transformer = transformers.AutoModel.from_pretrained(model_name)
        self.use_fc = use_fc

        if self.use_fc:
            hidden_size = self.transformer.config.hidden_size
            self.dropout = nn.Dropout(p=dropout)
            self.fc = nn.Linear(hidden_size, fc_dim)
            self.bn = nn.BatchNorm1d(fc_dim)
            self._init_params()

    def _init_params(self):
        """Xavier-init the linear weight; reset biases and batch-norm scale."""
        nn.init.xavier_normal_(self.fc.weight)
        for zeroed in (self.fc.bias, self.bn.bias):
            nn.init.constant_(zeroed, 0)
        nn.init.constant_(self.bn.weight, 1)

    def forward(self, input_ids,attention_mask):
        """Return the normalised feature produced by ``extract_feat``."""
        return F.normalize(self.extract_feat(input_ids, attention_mask))

    def extract_feat(self, input_ids,attention_mask):
        """Return the un-normalised feature for a tokenized batch."""
        outputs = self.transformer(input_ids=input_ids,attention_mask=attention_mask)

        # First element of the model output, restricted to sequence position 0.
        features = outputs[0][:, 0, :]

        if not self.use_fc:
            return features

        return self.bn(self.fc(self.dropout(features)))

In [None]:
def get_xlmR_text_embeddings(df):
    """Encode every title in *df* with the fine-tuned text model.

    Builds ``ShopeeNet(**model_params)``, loads ``TEXT_MODEL_PATH`` (minus the
    last entry of its state dict), runs inference batch by batch and returns
    the stacked features as one NumPy array.

    :param df: frame with a ``title`` column.
    :return: NumPy array with one embedding row per row of *df*.
    """
    net = ShopeeNet(**model_params)
    net.eval()

    # Drop the final entry of the checkpoint before loading it.
    state = torch.load(TEXT_MODEL_PATH)
    net.load_state_dict(dict(list(state.items())[:-1]))
    net = net.to(device)

    loader = torch.utils.data.DataLoader(
        ShopeeTextDataset(df),
        batch_size=BATCH_SIZE,
        pin_memory=True,
        drop_last=False,
        num_workers=NUM_WORKERS
    )

    batches = []
    with torch.no_grad():
        for input_ids, attention_mask in tqdm(loader):
            feat = net(input_ids.cuda(), attention_mask.cuda())
            batches.append(feat.detach().cpu().numpy())

    del net
    text_embeddings = np.concatenate(batches)
    print(f'Our text embeddings shape is {text_embeddings.shape}')
    del batches
    gc.collect()
    return text_embeddings

In [None]:
def f1_score(y_true, y_pred):
    """Row-wise F1 between two Series of space-separated id strings.

    :param y_true: pandas Series of ground-truth id strings.
    :param y_pred: pandas Series of predicted id strings.
    :return: NumPy array with one F1 value per row.
    """
    true_sets = y_true.apply(lambda ids: set(ids.split()))
    pred_sets = y_pred.apply(lambda ids: set(ids.split()))

    overlap = np.array([len(t & p) for t, p in zip(true_sets, pred_sets)])
    total = pred_sets.apply(len).values + true_sets.apply(len).values

    return 2 * overlap / total

In [None]:
def _knn_matches(posting_ids, distances, indices, threshold, join, progress = False):
    """Collect, per row, the posting ids of neighbours closer than *threshold*."""
    rows = range(distances.shape[0])
    if progress:
        rows = tqdm(rows)

    matches = []
    for k in rows:
        close = distances[k] < threshold
        ids = posting_ids[indices[k, close]]
        matches.append(' '.join(ids) if join else ids)
    return matches


def get_neighbors_knn(df, embeddings, KNN = 50, threshold = 0.60):
    '''
    Find, for every row of *embeddings*, its nearest neighbours within a
    distance cut-off and return their posting ids.

    https://www.kaggle.com/ragnar123/unsupervised-baseline-arcface?scriptVersionId=57121538

    When GET_CV is set, a sweep over thresholds 0.0 .. 1.9 is scored against
    df['matches'] (this adds 'pred_matches' and 'f1' columns to *df*) and the
    best one is printed; the returned predictions still use *threshold*.

    :param df: frame with a 'posting_id' column (and 'matches' when GET_CV).
    :param embeddings: one feature row per row of *df*.
    :param KNN: maximum number of neighbours to retrieve; capped at the
        number of rows so small frames do not fail.
    :param threshold: distance cut-off for the returned predictions.
    :return: (df, predictions) - predictions are space-joined strings when
        GET_CV is set, arrays of posting ids otherwise.
    '''
    # Asking for more neighbours than there are rows makes the search fail.
    n_neighbors = min(KNN, embeddings.shape[0])

    model = NearestNeighbors(n_neighbors = n_neighbors)
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)

    # Move the results to host memory once so pandas/NumPy indexing works
    # regardless of whether the embeddings were NumPy or cupy arrays.
    distances = cupy.asnumpy(distances)
    indices = cupy.asnumpy(indices)
    posting_ids = df['posting_id'].values

    # Iterate through different thresholds to maximize cv, run this in interactive mode
    if GET_CV:
        thresholds = list(np.arange(0,2,0.1))

        scores = []
        for candidate in thresholds:
            df['pred_matches'] = _knn_matches(posting_ids, distances, indices, candidate, join = True)
            df['f1'] = f1_score(df['matches'], df['pred_matches'])
            score = df['f1'].mean()
            print(f'Our f1 score for threshold {candidate} is {score}')
            scores.append(score)
        thresholds_scores = pd.DataFrame({'thresholds': thresholds, 'scores': scores})
        max_score = thresholds_scores[thresholds_scores['scores'] == thresholds_scores['scores'].max()]
        best_threshold = max_score['thresholds'].values[0]
        best_score = max_score['scores'].values[0]
        print(f'Our best score is {best_score} and has a threshold {best_threshold}')

        # Because we are predicting the test set that have 70K images and
        # different label groups, the fixed cut-off is used instead of the best one.
        predictions = _knn_matches(posting_ids, distances, indices, threshold, join = True)
    else:
        predictions = _knn_matches(posting_ids, distances, indices, threshold, join = False, progress = True)

    del model, distances, indices
    gc.collect()
    return df, predictions

In [None]:
def _cos_sim_matches(df, embeddings, threshold, chunk = 1024*4, join = False, verbose = False):
    """Return, per row, the posting ids whose dot-product similarity exceeds *threshold*.

    The similarity matrix is computed in column chunks of *chunk* rows so the
    full N x N product never has to be held in memory at once.
    """
    preds = []
    n_rows = len(embeddings)

    for a in range(0, n_rows, chunk):
        b = min(a + chunk, n_rows)
        if verbose:
            print('chunk',a,'to',b)

        # Rows of cts are the similarities of items a..b-1 against every item.
        cts = cupy.matmul(embeddings, embeddings[a:b].T).T

        for k in range(b - a):
            IDX = cupy.where(cts[k,] > threshold)[0]
            o = df.iloc[cupy.asnumpy(IDX)].posting_id.values
            preds.append(' '.join(o) if join else o)

    return preds


def get_neighbours_cos_sim(df,embeddings):
    '''
    Match every row of *df* against all rows by dot-product similarity.

    When using cos_sim use normalized features else use normal features

    With GET_CV set, thresholds 0.5, 0.55, 0.6, 0.65 are tried in turn; the
    scoring step is disabled, so the predictions returned are those of the
    LAST threshold, as space-joined strings. Without GET_CV a fixed
    threshold of 0.6 is used and each prediction is an array of posting ids.

    :param df: frame with a 'posting_id' column, row-aligned with *embeddings*.
    :param embeddings: feature matrix (converted to a cupy array).
    :return: (df, preds)
    '''
    embeddings = cupy.array(embeddings)

    if GET_CV:
        thresholds = list(np.arange(0.5,0.7,0.05))

        for threshold in thresholds:
            print('Finding similar titles...for threshold :',threshold)
            preds = _cos_sim_matches(df, embeddings, threshold, join = True)
            # NOTE: f1 scoring against df['matches'] is intentionally not run
            # here; each iteration overwrites the previous predictions.
    else:
        threshold = 0.6

        print('Finding similar texts...for threshold :',threshold)
        preds = _cos_sim_matches(df, embeddings, threshold, verbose = True)

    return df, preds

In [None]:
class ShopeeDataset(Dataset):
    """Image dataset: reads a file, converts BGR->RGB, resizes and normalises it."""

    def __init__(self, image_paths):
        self.image_paths = image_paths

        # Fixed inference-time pipeline (no random augmentation).
        pipeline = [
            A.Resize(CFG.img_size,CFG.img_size,always_apply=True),
            A.Normalize(),
            ToTensorV2(p=1.0)
        ]
        self.augmentations = A.Compose(pipeline)

    def __len__(self):
        return self.image_paths.shape[0]

    def __getitem__(self, index):
        """Return ``(image_tensor, dummy_label)`` for the path at *index*."""
        bgr = cv2.imread(self.image_paths[index])
        image = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

        if self.augmentations:
            image = self.augmentations(image=image)['image']

        # The label is a constant placeholder; only the image is meaningful.
        return image,torch.tensor(1)

### **1.4 图像特征提取模型构建**

In [None]:
class ShopeeModel(nn.Module):
    """timm backbone with its classifier removed, plus an optional embedding head."""

    def __init__(self,n_classes = CFG.classes,model_name = CFG.model_name,fc_dim = 512,use_fc = True,pretrained = False):
        """
        :param n_classes: accepted for interface compatibility; not used here.
        :param model_name: architecture name passed to ``timm.create_model``.
        :param fc_dim: output width of the linear embedding layer.
        :param use_fc: when True, apply dropout -> linear -> batch-norm.
        :param pretrained: forwarded to ``timm.create_model``.
        """
        super().__init__()

        self.backbone = timm.create_model(model_name, pretrained=pretrained)
        backbone_width = self.backbone.head.fc.in_features

        # Strip the backbone's own classifier and pooling; pooling is done here.
        self.backbone.head.fc = nn.Identity()
        self.final = nn.Identity()
        self.backbone.head.global_pool = nn.Identity()
        self.pooling =  nn.AdaptiveAvgPool2d(1)

        self.use_fc = use_fc
        self.dropout = nn.Dropout(p=0.0)
        self.fc = nn.Linear(backbone_width, fc_dim)
        self.bn = nn.BatchNorm1d(fc_dim)

    def forward(self, image, label):
        """Return the feature for *image*; *label* is ignored."""
        return self.extract_feat(image)

    def extract_feat(self, x):
        """Backbone -> global average pool -> optional embedding head."""
        n_samples = x.shape[0]
        pooled = self.pooling(self.backbone(x)).view(n_samples, -1)

        if not self.use_fc:
            return pooled

        return self.bn(self.fc(self.dropout(pooled)))

### **1.5 图像特征图、文本特征图抽取（关键）**

#### **图像特征图**

In [None]:
def get_image_embeddings(image_paths, model_name = CFG.model_name):
    """Run the trained image model over *image_paths* and stack the features.

    :param image_paths: array of image file paths.
    :param model_name: timm architecture name used to build ``ShopeeModel``.
    :return: cupy array with one feature row per image.
    """
    net = ShopeeModel(model_name = model_name)
    net.eval()

    # The checkpoint carries a 'final.weight' entry the inference model has
    # no parameter for; remove it so the strict load succeeds.
    state = torch.load(CFG.model_path)
    del state['final.weight']
    net.load_state_dict(state,strict=True)
    net = net.to(CFG.device)

    loader = torch.utils.data.DataLoader(
        ShopeeDataset(image_paths=image_paths),
        batch_size=CFG.batch_size,
        pin_memory=True,
        drop_last=False,
        num_workers=4,
    )

    batches = []
    with torch.no_grad():
        for img, label in tqdm(loader):
            feat = net(img.cuda(), label.cuda())
            batches.append(feat.detach().cpu().numpy())

    image_embeddings = np.concatenate(batches)
    del net, batches
    gc.collect()
    return cupy.asarray(image_embeddings)

#### **文本特征图**

## **第二步，相似度计算**

In [None]:
# Load the test split: pandas frame, its cudf copy, and the image paths.
df,df_cu,image_paths = read_dataset('test')
df.head()

In [None]:
# Compute the feature tensors (embeddings) for both modalities.
# image_embeddings is returned as a cupy array, text_embeddings as a NumPy array.
image_embeddings = get_image_embeddings(image_paths.values)
text_embeddings = get_xlmR_text_embeddings(df)

In [None]:
# Sanity check: print the shape and container type of both embedding matrices.
print(image_embeddings.shape)
print(text_embeddings.shape)
print(type(image_embeddings))
print(type(text_embeddings))

### **现在你拥有两个特征张量，image_embeddings为提取的图像特征，text_embeddings为提取的文本特征。**
### **在这里，Query和Features Database为同一个，即内部检索相似度。**
### **张量的每一行为某一件商品，请匹配与之相近的商品。**
### **你可以两个张量分别检索相似度，再将结果合并去重。**
### **你也可以同时使用跨模态的两个张量，直接检索得到结果。**

----------------

## **现成例子演示，仅使用【文本】计算乘法相似度**
## **方法来源：https://www.kaggle.com/finlay/unsupervised-image-text-baseline-in-20min/data#image-CNN**
## **图像相似度KNN K近邻计算相似方法：https://www.kaggle.com/parthdhameliya77/pytorch-resnext50-32x4d-image-tfidf-inference**
## **你还可以使用其他计算相似度的方法**
## **参考文献：https://arxiv.org/pdf/1611.01747.pdf Figure1**

In [None]:
def get_image_predictions(df, embeddings,threshold = 1.2):
    """Return, per row, the posting ids of image neighbours closer than *threshold*.

    :param df: frame with a 'posting_id' column, row-aligned with *embeddings*.
    :param embeddings: image feature matrix, one row per item.
    :param threshold: neighbours with a distance below this value are kept.
    :return: list with one array of posting ids per row.
    """
    # Retrieve up to 50 neighbours, but never more than there are rows.
    # The previous rule (50 if len(df) > 3 else 3) requested more neighbours
    # than samples for frames of 4..49 rows and for frames of 1..2 rows.
    KNN = min(50, embeddings.shape[0])

    model = NearestNeighbors(n_neighbors = KNN)
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)

    # Copy the results to host memory once instead of once per row.
    distances = cupy.asnumpy(distances)
    indices = cupy.asnumpy(indices)
    posting_ids = df['posting_id'].values

    predictions = []
    for k in tqdm(range(embeddings.shape[0])):
        close = distances[k] < threshold
        predictions.append(posting_ids[indices[k, close]])

    del model, distances, indices
    gc.collect()
    return predictions


In [None]:
# def get_predictions(df, embeddings, threshold=0.75):
    
#     preds = []
    
#     # 面对大数据时，矩阵过大计算成本较高，一般做分块处理
#     CHUNK = 1024*2
#     CTS = len(df)//CHUNK
#     if len(df)%CHUNK!=0: CTS += 1
#     for j in range( CTS ):

#         a = j*CHUNK
#         b = (j+1)*CHUNK
#         b = min(b,len(df))
#         print('chunk',a,'to',b)

#         # ================ 你需要修改这里 ================
#         # 乘法相似度计算
#         # ==============================================
#         cts = cupy.matmul( embeddings, embeddings[a:b].T).T
        
#         # ==============================================
        
        
#         for k in range(b-a):
#             IDX = cupy.where(cts[k,]>threshold)[0]
#             o = df.iloc[cupy.asnumpy(IDX)].posting_id.values
#             preds.append(o)
    
#     del embeddings
#     gc.collect()
#     return preds
df,text_predictions = get_neighbours_cos_sim(df,text_embeddings)
print((text_predictions))
# Normalise every prediction to a 1-D array of individual posting ids.
# get_neighbours_cos_sim returns space-joined strings when GET_CV is set and
# id arrays otherwise. Wrapping the joined string as a single array element
# would keep np.unique (in combine_predictions) from de-duplicating ids, and
# wrapping an array would create a 2-D array that cannot be concatenated
# with the 1-D image predictions.
text_pre = [
    np.array(p.split(), dtype=object) if isinstance(p, str) else np.asarray(p)
    for p in text_predictions
]
print(text_pre)
# text_prediction=[array(['test_2255846744'], dtype=object), array(['test_3588702337'], dtype=object), array(['test_4015706929'], dtype=object)]
# if not GET_CV:
#     text_predictions = [' '.join(text_preds) for text_preds in text_predictions]
#     df['matches'] = text_predictions
#     df[['posting_id','matches']].to_csv('submission.csv',index=False)
# else:
#     df['matches'] = text_predictions
#     df[['posting_id','matches']].to_csv('submission.csv',index=False)

In [None]:
# text_predictions = get_predictions(df, text_embeddings, 0.75)
# Image-side matches: KNN search with a distance cut-off of 4.8.
image_predictions = get_image_predictions(df, image_embeddings, threshold = 4.8)
# The embedding matrices are no longer needed once predictions exist.
del text_embeddings
del image_embeddings
print((image_predictions))
print(type(image_predictions))

-----------------------

## **第三步，提交答案**

In [None]:
# Used to build the submission
def combine_predictions(row):
    """Merge a row's image and text predictions into one space-joined string.

    :param row: mapping with 'image_predictions' and 'text_predictions'
        entries, each an array of posting ids.
    :return: the sorted, de-duplicated ids joined by single spaces.
    """
    merged = np.concatenate((row['image_predictions'], row['text_predictions']))
    unique_ids = np.unique(merged)
    return ' '.join(unique_ids)

In [None]:
# When using the image modality, remember to adjust the lines below.
# Attach both prediction lists to the frame, merge them row by row,
# and write the two-column submission file.
df['image_predictions'] = image_predictions
df['text_predictions'] = text_pre
df['matches'] = df.apply(combine_predictions, axis = 1)
df[['posting_id', 'matches']].to_csv('submission.csv', index = False)