In [None]:
# Thanks for Code from https://www.kaggle.com/mirzarahim/introduction-to-pca-image-compression-example
# Introduction to PCA: Image Compression example

# https://github.com/vivekrmk/Image-Compression-Principal-Component-Analysis-Pytorch/blob/main/Pytorch_PCA_journey.ipynb
# https://github.com/Erikfather/PCA-python/blob/master/Face_Rec.py

In [1]:
# Checking GPU Units

import torch
torch.manual_seed(0)
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True

import collections
import torchvision
import torchvision.models as models
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data.dataset import Dataset

print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))
print(torch.cuda.device_count())

True
NVIDIA GeForce RTX 2060
1


In [3]:
DATA_PATH = 'shopee-product-matching/'

import numpy as np
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import cv2, matplotlib.pyplot as plt
from matplotlib.pyplot import imshow
from matplotlib.image import imread
from IPython.display import Image
from PIL import Image
import tensorflow as tf
from tqdm import tqdm

train = pd.read_csv(DATA_PATH + 'train.csv')
train['image'] = DATA_PATH + 'train_images/' + train['image']

n_train = 8000
n_test = 1000
n_valid = 300
K = 120 # PCA, num of principal components

sample = train.head(n_train)
tmp = sample.groupby('label_group').posting_id.agg('unique').to_dict()
sample['target'] = sample.label_group.map(tmp)
test   = train.loc[n_train+1:n_train+n_test]
test = test.reset_index(drop=True) # initialize indexing
image_idx = sample['image']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample['target'] = sample.label_group.map(tmp)


In [4]:
def getF1score(col):
    def f1score(row):
        n = len( np.intersect1d(row.target,row[col]) )
        return 2*n / (len(row.target)+len(row[col]))
    return f1score

def getPrecision(col): # col = oof_cnn
    def precision(row):
        
        a = np.in1d(row.target,row[col])
        temp = collections.Counter(a)
        correct = temp[True]/len(a)
        
        # if np.where(row.oof_cnn == row.target[0]) != []:
        #     correct = 1
        # else:  
        #     correct = 0

        return correct

    return precision

def getRecall(col):
    def recall(row):
        return 1/len(row[col])
    return recall

In [5]:
class ImageTransform():
    def __init__(self, mean, std):
        self.data_transform = transforms.Compose([
            transforms.Resize((256,256)),
            transforms.ToTensor(),
            transforms.Normalize(mean, std)
        ])
    def __call__(self, img):
        return self.data_transform(img)

In [6]:
class Img_Dataset(torch.utils.data.Dataset):
    def __init__(self, file_list, transform):
        self.file_list = file_list
        self.transform = transform
        
    def __len__(self):
        return len(self.file_list)
    
    def __getitem__(self, index):
        img_path = self.file_list[index]
        img = Image.open(img_path)
        img = img.convert('L')
        img_transformed = self.transform(img)
        
        return img_transformed

In [7]:
# Preparing train dataset, 2021. 5. 19
train_img_list = image_idx

mean = (0.0,)
std = (1.0,)

train_dataset = Img_Dataset(file_list = train_img_list,
                            transform=ImageTransform(mean, std))
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=False)

batch_iterator = iter(train_dataloader)
images = next(batch_iterator)

print(images.size())

# torch.Size[Batch Size, Channel, Width, Height]

torch.Size([16, 1, 256, 256])


In [None]:
# prepare test features 2021. 5. 19
test_image_idx = test['image']
test_img_list = test_image_idx

test_dataset = Img_Dataset(file_list = test_img_list,
                            transform=ImageTransform(mean, std))
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=16, shuffle=False)

batch_iterator = iter(test_dataloader)
test_images = next(batch_iterator)

len(batch_iterator)
#print(test_images.size())

In [8]:
# Making **train** features, 2021. 5. 22
from sklearn.decomposition import PCA

K = 80
DEVICE = 'cuda'

train_feature = []
train_feature = torch.tensor(train_feature)

# transfer to DEVICE (GPU memory)
train_feature = train_feature.to(DEVICE)

a = 1
with torch.no_grad():
    for batch in tqdm(train_dataloader):
        batch = batch.to(DEVICE)
        batch = batch.permute(0,2,3,1)[:,:,:,0]
        idx, row, col = batch.shape
        batch = batch.view([len(batch),-1])
        train_feature = torch.cat([train_feature, batch], dim = 0)
    
    pca_feature = PCA(n_components = K)
    train_feature = train_feature.data.cpu()
    principalComponents = pca_feature.fit_transform(train_feature)
    principalComponents = torch.tensor(principalComponents)
    principalComponents = principalComponents.to(DEVICE)
    
    train_feature = principalComponents
    
        

100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [02:02<00:00,  4.07it/s]


In [9]:
train_feature.shape

torch.Size([8000, 80])

In [10]:
# Saving **train** Features 2021. 5. 19
train_feature = train_feature.data.cpu().numpy()
np.savetxt('trained_feature.csv', train_feature, delimiter=",")

In [11]:
# Loading **train** Features 2021. 5. 19
train_feature = np.loadtxt('trained_feature.csv', delimiter=",")
train_feature = torch.from_numpy(train_feature)
train_feature = train_feature.to(DEVICE)

# l2 norm to kill all the sim in 0-1   ** train_feature
from sklearn.preprocessing import normalize
train_feature = train_feature.data.cpu().numpy()
train_feature = np.vstack(train_feature)
train_feature = normalize(train_feature)
train_feature = torch.from_numpy(train_feature)
train_feature = train_feature.to(DEVICE)

In [None]:
# Making **test** features, 2021. 5. 19 
DEVICE = 'cuda'

test_feature = []
test_feature = torch.tensor(test_feature)

# transfer to DEVICE (GPU memory)
test_feature = test_feature.to(DEVICE)

a = 1
with torch.no_grad():
    for batch in tqdm(test_dataloader):
        batch = batch.to(DEVICE)
        batch = batch.permute(0,2,3,1)[:,:,:,0]
        idx, row, col = batch.shape
        batch = batch.view([len(batch),-1])

        U,S,V = torch.pca_lowrank(batch, q=len(batch), center=True, niter=3)
        V = torch.tensor(V)
        #print('U shape : ', U.shape)
        #print('S shape : ', S.shape)
        #print('V shape : ', V.shape)
        
        test_feature = torch.cat([test_feature, V.T[:,:K]], dim = 0)
        #print('test_feature shape: ', test_feature.shape)
        
        # For debugging (breaking)
        #if a == 3:
        #    break
        #print('iter num : ', a)
        a = a + 1
        

In [None]:
# Saving **test** Features 2021. 5. 19
test_feature = test_feature.data.cpu().numpy()
np.savetxt('test_feature.csv', test_feature, delimiter=",")

In [None]:
# Loading **test** Features 2021. 5. 19
test_feature = np.loadtxt('test_feature.csv', delimiter=",")
test_feature = torch.from_numpy(test_feature)
test_feature = test_feature.to(DEVICE)

# l2 norm to kill all the sim in 0-1    ** test_feature
test_feature = test_feature.data.cpu().numpy()
test_feature = np.vstack(test_feature)
test_feature = normalize(test_feature)
test_feature = torch.from_numpy(test_feature)
test_feature = test_feature.to(DEVICE)

In [15]:
# Checking train_feature with train_feature, 2021. 5. 19
preds = []
CHUNK = 100

print('Finding similar images...')
CTS = len(train_feature)//CHUNK
if len(train_feature)%CHUNK != 0:
    CTS += 1
    
for j in tqdm(range(CTS)):
    a = j*CHUNK
    b = (j+1)*CHUNK
    b = min(b, len(train_feature))
    #print('chunk', a, 'to', b)
    
    #distances = torch.cdist(train_feature, train_feature[a:b], p=2.0).T
    distances = torch.matmul(train_feature, train_feature[a:b].T).T
    distances = distances.data.cpu().numpy()
    
    #print(type(distances))
    #print(distances.shape)
    '''
    for k in range(b-a):
        IDX = np.argmin(distances[k][:])
        o = sample.iloc[IDX].label_group
        preds.append(o)
    '''
    
    for k in range(b-a):
        #IDX = np.argmax(distances[k][:])
        IDX = np.where(distances[k,]>0.90)[0][:]
        o = sample.iloc[IDX].posting_id.values
        preds.append(o)
        #print(len(IDX))
    
sample['predicted_label'] = preds


  4%|███                                                                                | 3/80 [00:00<00:02, 29.18it/s]

Finding similar images...


100%|██████████████████████████████████████████████████████████████████████████████████| 80/80 [00:02<00:00, 33.38it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample['predicted_label'] = preds


In [16]:
sample

Unnamed: 0,posting_id,image,image_phash,title,label_group,target,predicted_label,f1,Prec,Rec
0,train_129225211,shopee-product-matching/train_images/0000a6881...,94974f937d4c2433,Paper Bag Victoria Secret,249114794,[train_129225211],[train_129225211],1.000000,1.0,1.0
1,train_3386243561,shopee-product-matching/train_images/00039780d...,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045,[train_3386243561],"[train_3386243561, train_2509069129]",1.000000,1.0,1.0
2,train_2288590299,shopee-product-matching/train_images/000a190fd...,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891,[train_2288590299],[train_2288590299],1.000000,1.0,1.0
3,train_2406599165,shopee-product-matching/train_images/00117e4fc...,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,4093212188,[train_2406599165],[train_2406599165],1.000000,1.0,1.0
4,train_3369186413,shopee-product-matching/train_images/00136d1cf...,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,3648931069,[train_3369186413],[train_3369186413],1.000000,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...
7995,train_2193468166,shopee-product-matching/train_images/3c349438a...,d718282fbdf6420b,Mainan Anak Setrika Mesin Cuci Meja Laundry Ke...,3344605314,[train_2193468166],[train_2193468166],1.000000,1.0,1.0
7996,train_1834726971,shopee-product-matching/train_images/3c3660b60...,80bcbe1bb4c1cd99,kamper kg putih,4152361552,[train_1834726971],[train_1834726971],1.000000,1.0,1.0
7997,train_4264364778,shopee-product-matching/train_images/3c3abe4be...,be26c0c8d999b971,PRINCES - Flat Shoes Wanita,3184487800,[train_4264364778],[train_4264364778],1.000000,1.0,1.0
7998,train_526569647,shopee-product-matching/train_images/3c3cfd863...,f3cc4848e0379f59,CUP KERTAS 9.5 ISI 100LBR - KAP KERTAS - BAKIN...,1103215281,[train_526569647],[train_526569647],1.000000,1.0,1.0


In [13]:
# 2021. 5. 22. Scoring, LATEST

sample['f1'] = sample.apply(getF1score('predicted_label'),axis=1)
print('CV score for baseline = ', sample.f1.mean())
sample['Prec'] = sample.apply(getPrecision('predicted_label'),axis=1)
print('precision = ', sample.Prec.mean())
sample['Rec'] = sample.apply(getRecall('predicted_label'),axis=1)
print('recall = ', sample.Rec.mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample['f1'] = sample.apply(getF1score('predicted_label'),axis=1)


CV score for baseline =  0.7847582176595449
precision =  0.7808582625707626
recall =  0.8205647776981189


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample['Prec'] = sample.apply(getPrecision('predicted_label'),axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample['Rec'] = sample.apply(getRecall('predicted_label'),axis=1)


In [14]:
sample

Unnamed: 0,posting_id,image,image_phash,title,label_group,target,predicted_label,f1,Prec,Rec
0,train_129225211,shopee-product-matching/train_images/0000a6881...,94974f937d4c2433,Paper Bag Victoria Secret,249114794,[train_129225211],[train_129225211],1.000000,1.0,1.0
1,train_3386243561,shopee-product-matching/train_images/00039780d...,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045,[train_3386243561],[train_3386243561],1.000000,1.0,1.0
2,train_2288590299,shopee-product-matching/train_images/000a190fd...,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891,[train_2288590299],[train_2288590299],1.000000,1.0,1.0
3,train_2406599165,shopee-product-matching/train_images/00117e4fc...,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,4093212188,[train_2406599165],[train_2406599165],1.000000,1.0,1.0
4,train_3369186413,shopee-product-matching/train_images/00136d1cf...,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,3648931069,[train_3369186413],[train_3369186413],1.000000,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...
7995,train_2193468166,shopee-product-matching/train_images/3c349438a...,d718282fbdf6420b,Mainan Anak Setrika Mesin Cuci Meja Laundry Ke...,3344605314,[train_2193468166],[train_2193468166],1.000000,1.0,1.0
7996,train_1834726971,shopee-product-matching/train_images/3c3660b60...,80bcbe1bb4c1cd99,kamper kg putih,4152361552,[train_1834726971],[train_1834726971],1.000000,1.0,1.0
7997,train_4264364778,shopee-product-matching/train_images/3c3abe4be...,be26c0c8d999b971,PRINCES - Flat Shoes Wanita,3184487800,[train_4264364778],[train_4264364778],1.000000,1.0,1.0
7998,train_526569647,shopee-product-matching/train_images/3c3cfd863...,f3cc4848e0379f59,CUP KERTAS 9.5 ISI 100LBR - KAP KERTAS - BAKIN...,1103215281,[train_526569647],[train_526569647],1.000000,1.0,1.0


In [None]:
# Checking train_feature with test_feature, 2021. 5. 19
preds = []
CHUNK = 100

print('Finding similar images...')
CTS = len(test_feature)//CHUNK
if len(test_feature)%CHUNK != 0:
    CTS += 1
    
for j in tqdm(range(CTS)):
    a = j*CHUNK
    b = (j+1)*CHUNK
    b = min(b, len(test_feature))
    #print('chunk', a, 'to', b)
    
    #distances = torch.cdist(train_feature, test_feature[a:b], p=2.0).T
    distances = torch.matmul(train_feature, test_feature[a:b].T).T
    distances = distances.data.cpu().numpy()
    
    #print(type(distances))
    #print(distances.shape)
    
    for k in range(b-a):
        #IDX = np.argmax(distances[k][:])
        IDX = np.where(distances[k,]>0.9)[0][:]
        o = sample.iloc[IDX].label_group.values
        preds.append(o)
        #print(len(IDX))
        
test['predicted_label'] = preds


In [None]:
len(test['predicted_label'][0])

In [None]:
# Scoring
# Calculate Precision
correct = 0
for i in range(len(test)):
    if len( np.intersect1d(test['predicted_label'][i], test['label_group'][i])) == 1:
        correct = correct + 1

precision = correct/len(test) * 100
print('num of correct : ', correct)
print('precision : ', precision)

# Calculate Recall
correct = 0
recall = 0
temp = 0

for i in range(len(test)):
    if len( np.intersect1d(test['predicted_label'][i], test['label_group'][i])) == 1:
        L = len(test['predicted_label'][i])
        correct = correct + 1
        temp = 1/L
        recall = recall + temp

recall = recall / correct
print('recall : ', recall)

# Calculate F1 score
print('f1 : ', 2*(precision * recall)/(precision + recall))