In [None]:
# Install the libturbojpeg shared library (.deb) and the PyTurboJPEG bindings
# from the files under ../input/pyturbojpeg.
!apt install ../input/pyturbojpeg/libturbojpeg_1.4.2-0ubuntu3.4_amd64.deb
!pip install ../input/pyturbojpeg/PyTurboJPEG-1.4.1

In [None]:
import numpy as np
import pandas as pd 

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
import torchvision.utils

from torch.utils.data import DataLoader,Dataset
from torchvision.models import *
from torchvision.datasets import ImageFolder
from torch.autograd import Variable
import torchvision.models as models

import torchvision.datasets as datasets
import torch.optim as optim
from torch.utils.data.dataset import Dataset

from pathlib import Path
import sys

from glob import glob
from PIL import Image

import itertools
import random

from tqdm.notebook import tqdm_notebook

from turbojpeg import TurboJPEG
import albumentations as A
from albumentations.pytorch import ToTensorV2
import matplotlib.pyplot as plt

# Fix PyTorch's global RNG seed.
torch.manual_seed(0)
# cuDNN settings: deterministic algorithms are NOT enforced and the autotuner
# (benchmark mode) is switched on, i.e. speed is preferred over bit-exact
# reproducibility of GPU runs.
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True

In [None]:
# Seed the remaining global RNGs. NumPy's is used by pandas `.sample()` when no
# random_state is given; the stdlib `random` module is used by this notebook for
# the validation split and for choosing positive/negative pairs, so it has to be
# seeded too for a Restart & Run All to be repeatable.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

BATCH_SIZE = 32      # image pairs per mini-batch
NUMBER_EPOCHS = 5    # full passes over the training pairs
IMG_SIZE = 200       # intended side length (pixels) of the square image crops

In [None]:
def imshow(img, figsize=(15, 15)):
    """Display a channel-first image tensor with matplotlib.

    Args:
        img: CPU tensor laid out as (channels, height, width), for example the
            result of `torchvision.utils.make_grid`.
        figsize: (width, height) of the figure in inches.
    """
    npimg = img.numpy()
    # The figure must be created before drawing into it; creating it after
    # `plt.imshow` leaves the image at the default size and adds an empty figure.
    plt.figure(figsize=figsize)
    # matplotlib wants channel-last data, so move the channel axis to the end.
    plt.imshow(np.transpose(npimg, (1, 2, 0)))
    plt.show()

def show_plot(iteration,loss):
    """Plot `loss` (y axis) against `iteration` (x axis) as a line and show it."""
    axes = plt.gca()
    axes.plot(iteration, loss)
    plt.show()

In [None]:
# Paths of all *.jpg files in the training image directory.
all_images = glob("../input/shopee-product-matching/train_images/*.jpg")

In [None]:
# Load the training metadata and shuffle its rows with a fixed seed.
train = (
    pd.read_csv("../input/shopee-product-matching/train.csv")
    .sample(frac=1.0, random_state=666)
    .reset_index(drop=True)
)

train.shape

In [None]:
# Ground-truth table: one row per (posting_id, matches) where `matches` runs over
# every posting in the same label_group (the posting itself included).
group_members = train.groupby('label_group', as_index=False).agg({'posting_id': list})

correct = train.loc[:, ['posting_id', 'label_group']].merge(
    group_members, on='label_group', how='left'
)
# Both frames carry a `posting_id` column, so the merge suffixes them _x / _y.
correct = correct.rename(columns={'posting_id_x': 'posting_id', 'posting_id_y': 'matches'})
correct = correct.drop(columns=['label_group'])
correct = correct.explode('matches').reset_index(drop=True)

correct['check'] = 1
correct

In [None]:
# Hold out whole label groups for validation, so no label group contributes rows
# to both splits. A dedicated, seeded RNG makes the split repeatable no matter
# how much of the global `random` state has been consumed before this cell runs.
N_VAL_GROUPS = 1014
SPLIT_SEED = 666

val_groups = random.Random(SPLIT_SEED).sample(list(train['label_group'].unique()), N_VAL_GROUPS)

in_validation = train['label_group'].isin(val_groups)
splits = {
    'train': train.loc[~in_validation, :],
    'valid': train.loc[in_validation, :]
}

groups = dict()
pairs = dict()

# posting_id -> image file name; identical for both splits, so built once.
image_lookup = train.loc[:, ['posting_id', 'image']]

for split, split_frame in splits.items():
    groups[split] = split_frame.groupby('label_group', as_index=False).agg({'posting_id': list})

    # Every unordered pair of postings inside one label group is a positive pair.
    combs = [
        pair
        for members in groups[split]['posting_id']
        for pair in itertools.combinations(members, 2)
    ]

    # The two merges both bring in an `image` column, which pandas renames to
    # `image_x` (for item_1) and `image_y` (for item_2).
    pairs[split] = pd.DataFrame({
        'item_1': [first for first, _ in combs],
        'item_2': [second for _, second in combs]
    }).sample(frac=1.0, random_state=SPLIT_SEED).drop_duplicates() \
        .merge(image_lookup.rename(columns={'posting_id': 'item_1'}), on='item_1', how='left') \
        .merge(image_lookup.rename(columns={'posting_id': 'item_2'}), on='item_2', how='left')

pairs

In [None]:
# (image_1, image_2) file-name tuples for every positive pair of each split.
tra = list(zip(pairs['train'].image_x.values, pairs['train'].image_y.values))
val = list(zip(pairs['valid'].image_x.values, pairs['valid'].image_y.values))

# Report the number of pairs: `tra`, not `train` (the metadata DataFrame).
print("Total train pairs:", len(tra))
print("Total val pairs:", len(val))

In [None]:
# Smoke test: decode one random training image and preview resize + random crop.
jpeg_reader = TurboJPEG()

img = random.choice(glob("../input/shopee-product-matching/train_images/*.jpg"))

with open(img, "rb") as f:
    raw_bytes = f.read()
# Second argument is the TurboJPEG pixel format (0 here, 1 in the training code).
img0 = jpeg_reader.decode(raw_bytes, 0)

resize_step = A.SmallestMaxSize(max_size=200, p=1.0)
crop_step = A.RandomCrop(width=200, height=200, p=1.0)
transform = A.Compose([resize_step, crop_step])

plt.imshow(transform(image=img0)['image'])

In [None]:
class trainingDataset(Dataset):
    """Image pairs labelled 1 (known match) or 0 (random other image).

    Every item starts from a known matching pair `(file_a, file_b)`. A fair coin
    decides whether the pair is returned as-is with label 1, or whether the
    second image is replaced by one drawn uniformly from the whole image
    directory, with label 0.

    NOTE(review): the random replacement is not checked against the label group
    of `file_a`, so a "negative" can occasionally be a true match.

    Args:
        relationships: sequence of `(file_name_a, file_name_b)` entries.
        transform: optional callable invoked as `transform(image=arr)['image']`.
        image_dir: directory prefix (including the trailing separator) that is
            prepended to the file names.
    """

    DEFAULT_IMAGE_DIR = "../input/shopee-product-matching/train_images/"

    def __init__(self,relationships, transform=None, image_dir=DEFAULT_IMAGE_DIR):
        self.relationships = relationships
        self.transform = transform
        self.image_dir = image_dir
        self.jpeg_reader = TurboJPEG()
        # List the directory once here. Doing it inside __getitem__ would rescan
        # every file for each negative sample. Sorted so that the candidate order
        # does not depend on the file system.
        self.all_images = sorted(glob(self.image_dir + "*.jpg"))

    def _read(self, path):
        """Decode the JPEG at `path` with pixel format 1."""
        # NOTE(review): pixel format 1 appears to be BGR in TurboJPEG, while the
        # Normalize constants used with this dataset look like the ImageNet RGB
        # statistics -- confirm the channel order is intended.
        with open(path, "rb") as f:
            return self.jpeg_reader.decode(f.read(), 1)

    def __getitem__(self, index):
        img0_info = self.relationships[index]
        # File names are exact, so the path is built directly rather than globbed.
        img0_path = self.image_dir + img0_info[0]

        should_get_same_class = random.choice([0,1])

        if should_get_same_class==1:
            img1_path = self.image_dir + img0_info[1]
        else:
            img1_path = random.choice(self.all_images)

        img0 = self._read(img0_path)
        img1 = self._read(img1_path)

        if self.transform is not None:
            img0 = self.transform(image=img0)['image']
            img1 = self.transform(image=img1)['image']

        return img0, img1, should_get_same_class

    def __len__(self):
        return len(self.relationships)

In [None]:
# Inspect the first validation pair (a tuple of two image file names).
val[0]

In [None]:
# Training pipeline: resize so the shorter side equals IMG_SIZE, take a random
# square crop, apply light augmentation, then normalise and convert to a tensor.
transform = A.Compose([
    A.SmallestMaxSize(max_size=IMG_SIZE, p=1.0),
    A.RandomCrop(width=IMG_SIZE, height=IMG_SIZE, p=1.0),
    A.HorizontalFlip(p=0.5),
    A.RandomBrightnessContrast(p=0.2),
    A.Rotate(p=0.25),
    A.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
    ToTensorV2()
])


# Evaluation pipeline: same resize and normalisation, but a *centre* crop so the
# same image always produces the same tensor (a random crop here would make
# validation accuracy and inference scores vary from run to run).
transform_val = A.Compose([
    A.SmallestMaxSize(max_size=IMG_SIZE, p=1.0),
    A.CenterCrop(width=IMG_SIZE, height=IMG_SIZE, p=1.0),
    A.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
    ToTensorV2()
])

# Only the first 10,000 training pairs are used.
trainset = trainingDataset(relationships=tra[:10000],
                           transform=transform)

trainloader = DataLoader(trainset,
                        shuffle=True,
                        num_workers=8,
                        batch_size=BATCH_SIZE)

# Only the first 1,000 validation pairs are used.
valset = trainingDataset(relationships=val[:1000],
                         transform=transform_val)

# Evaluation does not depend on sample order, so the loader is not shuffled.
valloader = DataLoader(valset,
                        shuffle=False,
                        num_workers=8,
                        batch_size=BATCH_SIZE)

In [None]:
# Visual check: fetch one batch of 8 pairs and draw the first images followed by
# the second images in a single grid, then print the pair labels.
vis_dataloader = DataLoader(trainset, batch_size=8, shuffle=True, num_workers=8)
dataiter = iter(vis_dataloader)

example_batch = next(dataiter)
# Items 0 and 1 of the batch are the two image tensors; item 2 holds the labels.
concatenated = torch.cat(example_batch[:2], 0)

grid = torchvision.utils.make_grid(concatenated)
imshow(grid)

print(example_batch[2])

In [None]:
class SiameseNetwork(nn.Module):
    """Pair classifier: one shared ResNet-50 branch plus a three-layer MLP head.

    Both inputs go through the same `cnn1` backbone; the two flattened outputs
    are concatenated (hence the 2*1000 input width of `fc1`) and mapped to two
    logits.
    """

    def __init__(self):
        super().__init__()
        self.cnn1 = models.resnet50(pretrained=True)
        self.fc1 = nn.Linear(2*1000, 500)
        self.fc2 = nn.Linear(500, 500)
        self.fc3 = nn.Linear(500, 2)

    def _embed(self, images):
        """Run the shared backbone and flatten everything but the batch axis."""
        features = self.cnn1(images)
        return features.view(features.size(0), -1)

    def forward(self, input1, input2):
        """Return the (batch, 2) logits for the pair (`input1`, `input2`)."""
        joined = torch.cat((self._embed(input1), self._embed(input2)), 1)
        hidden = F.relu(self.fc1(joined))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

In [None]:
# Train the Siamese classifier, validating after every epoch and keeping the
# checkpoint with the best validation accuracy.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

net = SiameseNetwork().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

counter = []          # x values for the loss curve
loss_history = []     # training loss sampled every 10th batch
iteration_number= 0
best = 0              # best validation accuracy (percent) seen so far

for epoch in range(0,NUMBER_EPOCHS):
    print("Epoch：", epoch, " start.")

    # Training mode: BatchNorm layers use batch statistics and update their
    # running averages.
    net.train()
    for i, data in enumerate(tqdm_notebook(trainloader),0):
        img0, img1, labels = data
        img0, img1, labels = img0.to(device), img1.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = net(img0, img1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        if i %10 == 0:
            iteration_number +=10
            counter.append(iteration_number)
            loss_history.append(loss.item())

    # Evaluation mode: without this, BatchNorm would keep normalising with
    # per-batch statistics and would fold validation data into its running
    # averages.
    net.eval()
    correct_val = 0
    total_val = 0
    with torch.no_grad():
        for data in tqdm_notebook(valloader):
            img0, img1, labels = data
            img0, img1, labels = img0.to(device), img1.to(device) , labels.to(device)
            outputs = net(img0,img1)
            predicted = outputs.argmax(dim=1)
            total_val += labels.size(0)
            correct_val += (predicted == labels).sum().item()

    # Guard against an empty validation loader.
    accuracy = 100 * correct_val / total_val if total_val else 0.0

    if accuracy > best:
        # Saves the whole pickled module (not just a state_dict); loading it
        # requires the SiameseNetwork class to be importable.
        torch.save(net, "best_siamese.pth")
        best = accuracy
    print(accuracy)
    show_plot(counter,loss_history)

In [None]:
# Shape of `img0` as left behind by the loop above (the last batch it processed).
img0.shape

In [None]:
def image_similarity(img0,img1):
    """Show two JPEG files side by side and return the model's match probability.

    Relies on the notebook globals `jpeg_reader`, `transform_val` and `net`.

    Args:
        img0: path of the first JPEG file.
        img1: path of the second JPEG file.

    Returns:
        float: softmax probability that `net` assigns to class 1 for the pair.
    """
    with open(img0, "rb") as f:
        img0 = jpeg_reader.decode(f.read(), 1)

    with open(img1, "rb") as f:
        img1 = jpeg_reader.decode(f.read(), 1)

    # Pixel format 1 is BGR in libjpeg-turbo's TJPF enum, whereas matplotlib
    # expects RGB, so the channel axis is reversed for display only. The model
    # input below keeps the decoded order, matching how the pairs were decoded
    # for training.
    _, axarr = plt.subplots(1,2)
    axarr[0].imshow(img0[..., ::-1])
    axarr[1].imshow(img1[..., ::-1])

    # Put the inputs on whatever device the model lives on; feeding CPU tensors
    # to a CUDA model raises a device-mismatch error.
    model_device = next(net.parameters()).device
    batch0 = transform_val(image=img0)['image'].unsqueeze(0).to(model_device)
    batch1 = transform_val(image=img1)['image'].unsqueeze(0).to(model_device)

    # Score in eval mode without autograd, then restore the previous mode so
    # that calling this function does not alter later training.
    was_training = net.training
    net.eval()
    try:
        with torch.no_grad():
            logits = net(batch0, batch1)
    finally:
        net.train(was_training)

    return torch.softmax(logits, 1)[0][1].item()

# Example: score two images from the test image directory.
image_similarity("../input/shopee-product-matching/test_images/0006c8e5462ae52167402bac1c2e916e.jpg", 
                 "../input/shopee-product-matching/test_images/0007585c4d0f932859339129f709bfdc.jpg")