# Inference notebook with nearest neighbors using images and text

In [1]:
import numpy as np
import pandas as pd
import cudf, cuml, cupy
import cv2, matplotlib.pyplot as plt

from tqdm import tqdm_notebook
from tqdm.notebook import tqdm
from PIL import Image
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors

def getMetric(col):
    """Build a row-wise F1 scorer for the prediction column ``col``.

    The returned function takes one row that exposes ``row.target`` (the
    ground-truth ids) and ``row[col]`` (the predicted ids) and returns
    ``2 * |target ∩ predicted| / (|target| + |predicted|)``.
    """
    def f1score(row):
        truth, predicted = row.target, row[col]
        overlap = np.intersect1d(truth, predicted)
        return 2 * len(overlap) / (len(truth) + len(predicted))
    return f1score

In [2]:
# Create the torch hub checkpoint directory (no error if it already exists) and
# copy the pretrained ResNet weight files from the attached input dataset into it.
# Presumably this lets torchvision load pretrained weights from the local cache
# instead of downloading them — TODO confirm.
# NOTE(review): only resnet18 is instantiated in the visible notebook; the
# resnet50 copy may be unnecessary.
!mkdir -p /root/.cache/torch/hub/checkpoints/
!cp ../input/pretrained-pytorch-models/resnet18-5c106cde.pth /root/.cache/torch/hub/checkpoints/
!cp ../input/pretrained-pytorch-models/resnet50-19c8e357.pth /root/.cache/torch/hub/checkpoints/

# Config

In [3]:
# Root folder of the competition data (CSV files and image directories).
DATA_PATH = '../input/shopee-product-matching/'
# Start in CV mode to score locally; the data-loading cell switches this off
# when test.csv has more than 3 rows.
COMPUTE_CV = True
# Number of rows scored per similarity matmul in the neighbour searches.
CHUNK = 1024*4

# images
IMG_SIZE = 512          # images are resized to IMG_SIZE x IMG_SIZE
BATCH_SIZE = 10         # DataLoader batch size
WORKERS = 2             # DataLoader worker processes
THRESHOLD_IMG = 0.95    # image pairs with similarity above this are matches

# titles
THRESHOLD_TEXT = 0.7    # title pairs with similarity above this are matches
MAX_FEATURES = 25000    # vocabulary cap passed to the TF-IDF vectorizer
# Backward-compatible alias for the original misspelled name, which other
# cells still reference.
MAX_FETURES = MAX_FEATURES

# Load data

In [4]:
test = pd.read_csv(DATA_PATH + 'test.csv')

# A test.csv with more than 3 rows is taken to mean a submission run, so CV
# scoring is switched off (presumably the locally visible test.csv is only a
# tiny sample — TODO confirm).
if len(test) > 3:
    COMPUTE_CV = False

if not COMPUTE_CV:
    # Submission mode: run the pipeline on the provided test data.
    # `train` is the same object as `test` from here on.
    train = test
    train_gf = cudf.read_csv(DATA_PATH + 'test.csv')

    # Expand file names to full image paths.
    train['image'] = (DATA_PATH + 'test_images/') + train['image']
else:
    # CV mode: the labelled training data, once as a pandas frame and once as a
    # cudf frame (the latter is used for the title TF-IDF).
    train = pd.read_csv(DATA_PATH + 'train.csv')
    train_gf = cudf.read_csv(DATA_PATH + 'train.csv')

    # Expand file names to full image paths.
    train['image'] = (DATA_PATH + 'train_images/') + train['image']

    # Ground truth for the F1 metric (getMetric): every posting_id that shares
    # the row's label_group.
    match_labels_images = train.groupby('label_group')['posting_id'].agg('unique').to_dict()
    train['target'] = train['label_group'].map(match_labels_images)

# Image CNN

In [5]:
import torch
import torchvision.models as models
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data.dataset import Dataset

# Seed PyTorch's random number generator with a fixed value.
torch.manual_seed(0)
# cuDNN is left in non-deterministic mode with benchmark mode switched on.
# NOTE(review): with these two flags GPU results are presumably not guaranteed
# to be bit-for-bit reproducible despite the fixed seed — confirm this
# speed/reproducibility trade-off is intended.
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True

class ShopeeImageDataset(Dataset):
    """Dataset that opens product images from disk and applies a transform.

    Args:
        img_path: indexable, sized collection of image file paths.
        transform: callable applied to each image after conversion to RGB.
    """

    def __init__(self, img_path, transform):
        self.transform = transform
        self.img_path = img_path

    def __len__(self):
        """Return the number of image paths held by the dataset."""
        return len(self.img_path)

    def __getitem__(self, index):
        """Open the image at ``index``, convert it to RGB and return the transformed result."""
        path = self.img_path[index]
        rgb_image = Image.open(path).convert('RGB')
        return self.transform(rgb_image)
    

class ShopeeImageEmbeddingNet(nn.Module):
    """Image embedding extractor built from a torchvision ResNet-18.

    The network's ``avgpool`` is replaced with an adaptive max-pool to 1x1 and
    the last child module (presumably the classification layer) is dropped, so
    the forward pass returns pooled feature maps instead of class scores.
    """

    def __init__(self):
        super().__init__()
        # ResNet-18 created with the pretrained flag set.
        backbone = models.resnet18(True)
        backbone.avgpool = nn.AdaptiveMaxPool2d(output_size=(1, 1))
        # Keep every child module except the last one.
        layers = list(backbone.children())
        self.model = nn.Sequential(*layers[:-1])
        # The extractor is only used for inference, so put it in eval mode.
        self.model.eval()

    def forward(self, img):
        """Run a batch of images through the truncated backbone."""
        return self.model(img)

In [6]:
# Dataset over every image path in `train`: resize to IMG_SIZE x IMG_SIZE,
# convert to a tensor and normalise each channel with the given mean/std.
img_dataset = ShopeeImageDataset(
    train['image'].values,
    transforms.Compose([
        transforms.Resize((IMG_SIZE, IMG_SIZE)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]))

# shuffle=False keeps the feature rows in the same order as the rows of `train`.
img_loader = torch.utils.data.DataLoader(img_dataset,
                                          batch_size=BATCH_SIZE,
                                          shuffle=False,
                                          num_workers=WORKERS)

img_model = ShopeeImageEmbeddingNet().cuda()

# image features: one numpy array per batch, stacked in a later cell
image_features = []

with torch.no_grad():
    # tqdm.notebook.tqdm replaces the deprecated tqdm.tqdm_notebook
    # (see the warning this cell used to emit).
    for data in tqdm(img_loader):
        feat = img_model(data.cuda())

        # Drop the trailing 1x1 spatial dims left by the adaptive pooling.
        feat = feat.reshape(feat.shape[0], feat.shape[1])
        # No autograd graph exists under no_grad, so the legacy `.data`
        # access is not needed before moving to the CPU.
        feat = feat.cpu().numpy()

        image_features.append(feat)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/3425 [00:00<?, ?it/s]

In [7]:
from sklearn.preprocessing import normalize

# Stack the per-batch feature arrays into one matrix and L2-normalise it, so
# the dot products taken later in the neighbour search act as cosine similarities.
image_features = normalize(np.vstack(image_features))

In [8]:
# 50 matches is the maximum allowed
# NOTE(review): the threshold-based searches (findImageNeighbors /
# findTitleNeighbors) do not cap their results at this number — confirm that
# is acceptable.
neighbors = 50

# NOTE(review): this fitted model is never queried in the visible notebook
# (the image search uses a matmul over the features instead), and the name
# `model` is rebound to the TF-IDF vectorizer in a later cell — this cell may
# be leftover code.
model = NearestNeighbors(n_neighbors=neighbors)
model.fit(image_features)

NearestNeighbors(n_neighbors=50, verbose=4, handle=<cuml.raft.common.handle.Handle object at 0x7f656f28a190>, algorithm='brute', metric='euclidean', p=2, metric_params=None, output_type='numpy')

In [9]:
def findImageNeighbors(image_features):
    """Find, for every image, the postings whose image embedding is similar.

    Rows of ``image_features`` are compared through dot products, CHUNK rows at
    a time. Every row whose score exceeds ``THRESHOLD_IMG`` counts as a match.

    Relies on the module-level ``train`` frame (row order must match
    ``image_features``), ``CHUNK`` and ``THRESHOLD_IMG``.

    Returns:
        A list with one array of matching ``posting_id`` values per input row.
    """
    feats = cupy.array(image_features)
    total = len(feats)

    print('Finding similar images...')
    # Number of chunks, rounding up to cover a trailing partial chunk.
    n_chunks = total // CHUNK + (1 if total % CHUNK != 0 else 0)

    matches = []
    for chunk_id in range(n_chunks):
        start = chunk_id * CHUNK
        stop = min(start + CHUNK, total)

        print('chunk',start,'to',stop)

        # Similarity of each row in this chunk against all rows:
        # shape (stop - start, total).
        sims = cupy.matmul(feats, feats[start:stop].T).T

        for row in range(stop - start):
            hits = cupy.where(sims[row,] > THRESHOLD_IMG)[0]
            matches.append(train.iloc[cupy.asnumpy(hits)].posting_id.values)

    return matches

In [10]:
# Image-based matches: for every posting, the posting_ids whose image
# similarity exceeds THRESHOLD_IMG.
train['oof_cnn'] = findImageNeighbors(image_features)

if COMPUTE_CV:
    # Mean row-wise F1 of the image-only predictions against `target`.
    # NOTE: the 'f1' column is overwritten by the later scoring cells.
    train['f1'] = train.apply(getMetric('oof_cnn'),axis=1)
    print('CV score for only using images =',train.f1.mean())

Finding similar images...
chunk 0 to 4096
chunk 4096 to 8192
chunk 8192 to 12288
chunk 12288 to 16384
chunk 16384 to 20480
chunk 20480 to 24576
chunk 24576 to 28672
chunk 28672 to 32768
chunk 32768 to 34250
CV score for only using images = 0.6527899883424048


# Title TF-IDF

In [11]:
# Binary TF-IDF over the posting titles, with the vocabulary capped at
# MAX_FETURES terms and no stop-word filtering.
# NOTE: this rebinds `model`, which previously held the NearestNeighbors
# object fitted on the image features.
model = TfidfVectorizer(stop_words=None, binary=True, max_features=MAX_FETURES)
# toarray() turns the result into a dense (n_titles, n_terms) array, which is
# what the matmul in findTitleNeighbors operates on.
text_embeddings = model.fit_transform(train_gf.title).toarray()

print('text embeddings shape', text_embeddings.shape)

  "right", dtype_r, dtype_l, libcudf_join_type


text embeddings shape (34250, 25000)


In [12]:
def findTitleNeighbors(text_embeddings):
    """Find, for every title, the postings whose title embedding is similar.

    Rows of ``text_embeddings`` are compared through dot products, CHUNK rows
    at a time. Every row whose score exceeds ``THRESHOLD_TEXT`` counts as a
    match.

    Chunk boundaries are derived from ``text_embeddings`` itself (not from the
    global ``train`` frame), matching ``findImageNeighbors``. The function
    therefore never indexes past the rows it was given.

    Relies on the module-level ``train`` frame (row order must match
    ``text_embeddings``), ``CHUNK`` and ``THRESHOLD_TEXT``.

    Args:
        text_embeddings: 2-D array with one embedding row per posting.

    Returns:
        A list with one array of matching ``posting_id`` values per input row.
    """
    preds = []
    n_rows = text_embeddings.shape[0]

    print('Finding similar titles...')
    # Number of chunks, rounding up to cover a trailing partial chunk.
    CTS = n_rows // CHUNK

    if n_rows % CHUNK != 0:
        CTS += 1

    for j in range(CTS):
        a = j*CHUNK  # first row of this chunk
        b = min((j+1)*CHUNK, n_rows)  # one past the last row of this chunk

        print('chunk',a,'to',b)

        # Similarity of each row in this chunk against all rows:
        # shape (b - a, n_rows).
        similarities = cupy.matmul(text_embeddings, text_embeddings[a:b].T).T

        for k in range(b-a):
            IDX = cupy.where(similarities[k,] > THRESHOLD_TEXT)[0]
            o = train.iloc[cupy.asnumpy(IDX)].posting_id.values
            preds.append(o)

    return preds

In [13]:
# Title-based matches: for every posting, the posting_ids whose title
# similarity exceeds THRESHOLD_TEXT.
train['oof_text'] = findTitleNeighbors(text_embeddings)

if COMPUTE_CV:
    # Mean row-wise F1 of the text-only predictions against `target`.
    # NOTE: this overwrites the 'f1' column written by the image-only scoring cell.
    train['f1'] = train.apply(getMetric('oof_text'),axis=1)
    print('CV score for text = ', train.f1.mean())

Finding similar titles...
chunk 0 to 4096
chunk 4096 to 8192
chunk 8192 to 12288
chunk 12288 to 16384
chunk 16384 to 20480
chunk 20480 to 24576
chunk 24576 to 28672
chunk 28672 to 32768
chunk 32768 to 34250
CV score for text =  0.6137154152579091


# Submission

In [14]:
def combine_for_cv(row):
    """Return the sorted, de-duplicated union of a row's text and image matches."""
    return np.union1d(row.oof_text, row.oof_cnn)

def combine_for_sub(row):
    """Return a row's unique text and image matches as one space-separated string."""
    unique_ids = np.unique(np.concatenate((row.oof_text, row.oof_cnn)))
    return ' '.join(unique_ids)

In [15]:
if COMPUTE_CV:
    # Ground truth for the F1 metric (getMetric): every posting_id that shares
    # the row's label_group.
    match_labels_images = train.groupby('label_group')['posting_id'].agg('unique').to_dict()
    train['target'] = train['label_group'].map(match_labels_images)

    # Union of the text-based and image-based matches for each posting.
    train['oof'] = train.apply(combine_for_cv, axis=1)

    # Row-wise F1 of the combined predictions, then its mean over all rows.
    train['f1'] = train.apply(getMetric('oof'), axis=1)
    print('CV Score =', train['f1'].mean())


# Space-separated match string for every posting; built in both CV and
# submission modes because the submission file is written from this column.
train['matches'] = train.apply(combine_for_sub, axis=1)

CV Score = 0.7342644345623364


In [16]:
# Write only the two submission columns, without the DataFrame index.
train[['posting_id','matches']].to_csv('submission.csv', index=False)
# Read the file back and display it as a sanity check of what was written.
sub = pd.read_csv('submission.csv')
sub

Unnamed: 0,posting_id,matches
0,train_129225211,train_129225211 train_2278313361
1,train_3386243561,train_3386243561
2,train_2288590299,train_2288590299
3,train_2406599165,train_1508100548 train_1744956981 train_240659...
4,train_3369186413,train_3369186413
...,...,...
34245,train_4028265689,train_4028265689
34246,train_769054909,train_1463059254 train_769054909
34247,train_614977732,train_1264798465 train_512157627 train_6034107...
34248,train_3630949769,train_1431563868 train_3419392575 train_363094...
