In [1]:
# Thanks for Code from https://wikidocs.net/24603
# Extracting features by using TF-IDF features

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import numpy as np
import pandas as pd
import torch
from sklearn.metrics.pairwise import linear_kernel
import collections


In [4]:
DATA_PATH = 'shopee-product-matching/'
train = pd.read_csv(DATA_PATH + 'train.csv')
train = train.head(10000)
text_data = train['title']
tmp = train.groupby('label_group').posting_id.agg('unique').to_dict()
train['target'] = train.label_group.map(tmp)

In [5]:
def getF1score(col):
    def f1score(row):
        n = len( np.intersect1d(row.target,row[col]) )
        return 2*n / (len(row.target)+len(row[col]))
    return f1score

def getPrecision(col): # col = oof_cnn
    def precision(row):
        
        a = np.in1d(row.target,row[col])
        temp = collections.Counter(a)
        correct = temp[True]/len(a)
        
        # if np.where(row.oof_cnn == row.target[0]) != []:
        #     correct = 1
        # else:  
        #     correct = 0

        return correct

    return precision

def getRecall(col):
    def recall(row):
        return 1/len(row[col])
    return recall

In [None]:
def PCA_svd(X, k, center=True):
    n = X.size()[0]
    print('n = :',n)
    print('X shape :',X.shape)
    ones = torch.ones(n).view([n,1])
    h = ((1/n) * torch.mm(ones, ones.t())) if center  else torch.zeros(n*n).view([n,n])
    H = torch.eye(n) - h
    H = H.cuda()
    print('H.double() : ', H.double().shape)
    print('X.double() : ', X.double().shape)
    X_center =  torch.mm(H.double(), X.double())
    u, s, v = torch.svd(X_center)

    print('v shape : ',v.shape)
    components  = v[:k].t()
    #explained_variance = torch.mul(s[:k], s[:k])/(n-1)
    return components


In [6]:
train

Unnamed: 0,posting_id,image,image_phash,title,label_group,target
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,249114794,[train_129225211]
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045,[train_3386243561]
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891,[train_2288590299]
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,4093212188,[train_2406599165]
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,3648931069,[train_3369186413]
...,...,...,...,...,...,...
9995,train_3058694204,4aede00854990e26f645d4c842b96754.jpg,ab1fd07094942bbd,Kaos Hoodie Anak TERLARIS/ Kaos Anak DISTRO Or...,793355432,[train_3058694204]
9996,train_1422100530,4aefbe697fe7691317349d75d8de799e.jpg,dbb1314bd278054f,Kotak Tempat Perhiasan dan Aksesoris Cincin Ge...,1035443562,[train_1422100530]
9997,train_46290018,4af0235aee3fffabd7ba0867372d101b.jpg,bf0fc1e028e0571f,Makarizo hair energy shampoo 330ml,2944123046,[train_46290018]
9998,train_2643153468,4af24ae4093e6820f527483ac8e1bd8a.jpg,eaa9859689b796c1,Masker Wajah Partikel Rumput Laut 15g Untuk Pe...,114829279,"[train_2070419749, train_2643153468]"


In [7]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(text_data)
tfidf_vectorizer.vocabulary_
# sorted(tfidf_vectorizer.vocabulary_.items())

[('00', 0),
 ('000', 1),
 ('0000', 2),
 ('000mah', 3),
 ('001', 4),
 ('002', 5),
 ('0026', 6),
 ('003', 7),
 ('0038', 8),
 ('003angka', 9),
 ('00405', 10),
 ('00406', 11),
 ('005', 12),
 ('006', 13),
 ('007', 14),
 ('0073', 15),
 ('008', 16),
 ('0088', 17),
 ('009', 18),
 ('01', 19),
 ('010', 20),
 ('011', 21),
 ('0116', 22),
 ('0136', 23),
 ('013p', 24),
 ('014', 25),
 ('0160', 26),
 ('018', 27),
 ('019', 28),
 ('02', 29),
 ('021', 30),
 ('021052', 31),
 ('0219', 32),
 ('022', 33),
 ('023', 34),
 ('029', 35),
 ('03', 36),
 ('0303n', 37),
 ('036', 38),
 ('04', 39),
 ('0402', 40),
 ('042', 41),
 ('044', 42),
 ('045', 43),
 ('04936', 44),
 ('05', 45),
 ('050', 46),
 ('050gr', 47),
 ('0515', 48),
 ('056', 49),
 ('058', 50),
 ('06', 51),
 ('060', 52),
 ('0629', 53),
 ('063', 54),
 ('0655', 55),
 ('069', 56),
 ('06dr', 57),
 ('07', 58),
 ('071', 59),
 ('075', 60),
 ('08', 61),
 ('0800', 62),
 ('0814', 63),
 ('09', 64),
 ('0a', 65),
 ('0k', 66),
 ('0m', 67),
 ('10', 68),
 ('100', 69),
 ('100

In [9]:
# Extracting features from text_data
feature = tfidf_vectorizer.transform(text_data).toarray()
#idx = np.where(feature != 0.)
#print((feature))
print('feature shape : ',feature.shape)
print('type : ', type(feature))
print(len(feature))

feature shape :  (10000, 14337)
type :  <class 'numpy.ndarray'>
10000


In [10]:
# Principal Components Analysis , temp, Latest one 2021. 5. 22
from sklearn.decomposition import PCA

K = 50
DEVICE = 'cuda'
train_feature = []
train_feature = torch.tensor(train_feature)
train_feature = train_feature.to(DEVICE)

batch = range(0, len(feature), 10)
a = 0

with torch.no_grad():

    pca_feature = PCA(n_components = K)
    principalComponents = pca_feature.fit_transform(feature)

    principalComponents = torch.tensor(principalComponents)
    principalComponents = principalComponents.to(DEVICE)

    train_feature = principalComponents    
        
        

In [11]:
train_feature.shape

torch.Size([10000, 50])

In [12]:
# Saving **train** Features 2021. 5. 19
train_feature = train_feature.data.cpu().numpy()
np.savetxt('trained_text_feature.csv', train_feature, delimiter=",")

In [13]:
# Loading **train** Features 2021. 5. 19
DEVICE = 'cuda'
train_feature = np.loadtxt('trained_text_feature.csv', delimiter=",")
train_feature = torch.from_numpy(train_feature)
train_feature = train_feature.to(DEVICE)

# l2 norm to kill all the sim in 0-1   ** train_feature
from sklearn.preprocessing import normalize
train_feature = train_feature.data.cpu().numpy()
train_feature = np.vstack(train_feature)
train_feature = normalize(train_feature)
train_feature = torch.from_numpy(train_feature)
train_feature = train_feature.to(DEVICE)

In [14]:
# Checking train_text_feature with train_text_feature, 2021. 5. 21
preds = []
CHUNK = 100

print('Finding similar images...')
CTS = len(train_feature)//CHUNK
if len(train_feature)%CHUNK != 0:
    CTS += 1
    
for j in tqdm(range(CTS)):
    a = j*CHUNK
    b = (j+1)*CHUNK
    b = min(b, len(train_feature))
    
    distances = torch.matmul(train_feature, train_feature[a:b].T).T
    distances = distances.data.cpu().numpy()

    for k in range(b-a):
        #IDX = np.argmax(distances[k][:])
        IDX = np.where(distances[k,]>0.95)[0][:]
        #IDX = np.where(distances[k,]<0.1)[0][:]
        #o = sample.iloc[IDX].label_group.values
        o = train.iloc[IDX].posting_id.values
        preds.append(o)
        #print(len(IDX))
    
train['predicted'] = preds
#del train_feature

  1%|▊                                                                                 | 1/100 [00:00<00:20,  4.82it/s]

Finding similar images...


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:02<00:00, 33.74it/s]


In [15]:
train['f1'] = train.apply(getF1score('predicted'),axis=1)
print('CV score for baseline = ', train.f1.mean())
train['Prec'] = train.apply(getPrecision('predicted'),axis=1)
print('precision = ', train.Prec.mean())
train['Rec'] = train.apply(getRecall('predicted'),axis=1)
print('recall = ', train.Rec.mean())

CV score for baseline =  0.5514604356997606
precision =  0.7496064232173055
recall =  0.5379044077041456


In [19]:
train['predicted'][0]

array(['train_129225211', 'train_1220997311', 'train_1941131050',
       'train_3243826013', 'train_2078576963', 'train_2344463199'],
      dtype=object)