In [3]:
import gc
import os, shutil, glob, os.path
import pickle
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from PIL import Image as pil_image
from PIL import ImageFile
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.preprocessing import image
from tqdm import tqdm

In [9]:
# Configuration: model, data locations and targets

# Let Pillow (which Keras' load_img relies on) decode truncated image files
# instead of raising. The switch lives on PIL.ImageFile; setting it on the
# Keras `image` module only creates an unused attribute.
ImageFile.LOAD_TRUNCATED_IMAGES = True
# VGG16 with ImageNet weights and without its top (include_top=False).
model = VGG16(weights='imagenet', include_top=False)
imdir = "../data/train_images/"
df = pd.read_csv("../data/train.csv")
# Ground-truth group label of every posting.
y = df.label_group
# NOTE(review): not referenced by any cell visible here - confirm it is still needed.
number_clusters = 50

In [20]:
# Full path of every image file.
df['path'] = imdir + df['image']

# For each posting, the space-separated posting_ids of every posting that
# shares its label_group (the posting itself included).
tmp = df.groupby(['label_group'])['posting_id'].unique().to_dict()
df['targets'] = df['label_group'].map(tmp).apply(' '.join)

In [11]:
# Sanity check: path built for the row labelled 1.
df.loc[1, 'path']

'../data/train_images/00039780dfc94d01db8676fe789ecd05.jpg'

In [12]:
# Extract one flattened VGG16 feature vector per image, in df row order.
featurelist = []
# tqdm replaces the hand-rolled status print, whose counter stopped one short
# of the total.
for imagepath in tqdm(df.path, desc="Extracting features"):
    # Load the image resized to 224x224 and add the leading batch axis.
    img = image.load_img(imagepath, target_size=(224, 224))
    img_data = np.expand_dims(image.img_to_array(img), axis=0)
    img_data = preprocess_input(img_data)
    # Flatten the model output so every image contributes one 1-D vector.
    features = model.predict(img_data)
    featurelist.append(features.flatten())

    Status: 34249 / 34250

In [None]:
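# Optional: pickle the extracted feature list so it can be reloaded later (note: the load example below reads "test.txt", not "featurelist.txt")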
# with open("featurelist.txt", "wb") as fp:   #Pickling
#     pickle.dump(featurelist, fp)
 
# with open("test.txt", "rb") as fp:   # Unpickling
#     b = pickle.load(fp)

In [87]:
# image preds (KNN)
# original threshold: 3.4
# original KNN: 50

"""
To do: 
    Tune params
    Implement cosine version (?)
"""

def get_image_predictions(df, x, y, threshold=3.4, n_neighbors=50):
    """Find, for every image, the postings whose features lie within `threshold`.

    Parameters
    ----------
    df : DataFrame with a 'posting_id' column, row-aligned with `x`.
    x : 2-D array of image features, one row per posting.
    y : labels passed to ``KNeighborsClassifier.fit``; the classifier is used
        here only for its ``kneighbors`` lookup.
    threshold : neighbours are kept when their distance is strictly below this
        value (default 3.4, the value previously hard-coded).
    n_neighbors : maximum number of neighbours inspected per posting
        (default 50, the value previously hard-coded). Capped at the number of
        rows in `x`, because ``kneighbors`` cannot return more neighbours than
        fitted samples.

    Returns
    -------
    list with one array of posting_ids per row of `x`.
    """
    n_samples = x.shape[0]
    if n_samples == 0:
        return []
    n_neighbors = min(n_neighbors, n_samples)

    # NOTE(review): the features are fed in unnormalised, so a 3.4 cut-off may
    # keep little more than the posting itself - confirm when tuning.
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(x, y)
    distances, indices = knn.kneighbors(x)

    posting_ids = df['posting_id'].values
    predictions = []
    for row_distances, row_indices in zip(distances, indices):
        close_enough = row_indices[row_distances < threshold]
        predictions.append(posting_ids[close_enough])

    # Release the neighbour index and the distance/index matrices early.
    del knn, distances, indices
    gc.collect()
    return predictions

# text preds (cosine)
# original threshold: 0.7
# original max_feat: 25_000

"""
To do:
    Implement KNN version
    Tune params
    understand Chunk size (not important)
"""
def get_text_predictions(df, max_features = 25_000, threshold = 0.7, chunk_size = 1024*4):
    """Find, for every title, the postings whose TF-IDF similarity exceeds `threshold`.

    Parameters
    ----------
    df : DataFrame with 'title' and 'posting_id' columns.
    max_features : vocabulary size limit passed to ``TfidfVectorizer``.
    threshold : a posting is kept when its similarity to the query row is
        strictly greater than this value (default 0.7, previously hard-coded).
    chunk_size : positive number of query rows compared against all titles at
        once (default 4096, previously hard-coded); it bounds the size of the
        dense similarity block held in memory.

    Returns
    -------
    list with one array of posting_ids per row of `df`.
    """
    stopwords_list = stopwords.words('english') + stopwords.words('indonesian')
    vectorizer = TfidfVectorizer(stop_words = stopwords_list, binary = True, max_features = max_features)

    # Keep the embeddings sparse: densifying the whole (rows x vocabulary)
    # matrix is what makes this step run out of memory on the full data set.
    text_embeddings = vectorizer.fit_transform(df['title'])
    posting_ids = df['posting_id'].values
    n_rows = text_embeddings.shape[0]

    preds = []
    print('Finding similar titles...')
    for a in range(0, n_rows, chunk_size):
        # The last chunk may be shorter than chunk_size.
        b = min(a + chunk_size, n_rows)
        print('chunk',a,'to',b)
        # Dot products of the chunk rows with every row. The original author
        # treats these as cosine similarities - presumably relying on the
        # vectorizer normalising each row; verify if `norm` is ever changed.
        # Only this (chunk x rows) block is made dense.
        similarities = (text_embeddings[a:b] @ text_embeddings.T).toarray()
        for row in similarities:
            preds.append(posting_ids[np.where(row > threshold)[0]])

    # Release the vectorizer and embeddings early.
    del vectorizer, text_embeddings
    gc.collect()
    return preds

#combine prediction into 1 row

def combine_predictions(df):
    """Merge one row's image and text matches into a single string.

    `df` is a single row (anything indexable by 'img_prediction' and
    'text_prediction'). Returns the distinct posting_ids from both entries,
    sorted and joined with single spaces.
    """
    image_matches = df['img_prediction']
    text_matches = df['text_prediction']
    merged = np.concatenate((image_matches, text_matches))
    distinct = np.unique(merged)
    return ' '.join(distinct)

def f1_score(y_true, y_pred):
    """Row-wise F1 between two Series of space-separated id strings.

    Each string is split on whitespace into a set of ids; the score for a row
    is 2 * |true & pred| / (|true| + |pred|). Returns a NumPy array with one
    score per row.
    """
    true_sets = [set(ids.split()) for ids in y_true]
    pred_sets = [set(ids.split()) for ids in y_pred]

    shared = np.array([len(t & p) for t, p in zip(true_sets, pred_sets)])
    n_pred = np.array([len(p) for p in pred_sets])
    n_true = np.array([len(t) for t in true_sets])

    return 2 * shared / (n_pred + n_true)

In [25]:
# Stack the flattened per-image feature vectors into one 2-D matrix
# (one row per image).
x = np.stack(featurelist, axis=0)

# Pass only the required arguments: the neighbour count (50) and distance
# threshold (3.4) are set by get_image_predictions itself, so no extra keyword
# is needed here.
image_preds = get_image_predictions(df, x, y)
text_preds = get_text_predictions(df, max_features = 25_000)

KeyboardInterrupt: 

In [27]:
# Attach the per-posting match lists (one array of posting_ids per row) as new columns.
df['img_prediction'] = image_preds
df['text_prediction'] = text_preds

In [29]:
# Row by row, merge the image and text matches into one space-separated string.
df['matches'] = df.apply(combine_predictions, axis='columns')

In [31]:
# Per-posting F1 of the predicted matches against the ground-truth targets.
f1 = f1_score(df['targets'], df['matches'])

In [32]:
# Mean F1 over all postings.
# Current best: 0.6288451402918852 (original setup)
f1.mean()

0.6288451402918852

In [70]:
# Preview the first rows, including the prediction and matches columns.
df.head(n=5)

Unnamed: 0,posting_id,image,image_phash,title,label_group,path,targets,img_prediction,text_prediction,matches
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,249114794,../data/train_images/0000a68812bc7e98c42888dfb...,train_129225211 train_2278313361,[train_129225211],"[train_129225211, train_2278313361]",train_129225211 train_2278313361
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045,../data/train_images/00039780dfc94d01db8676fe7...,train_3386243561 train_3423213080,[train_3386243561],[train_3386243561],train_3386243561
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891,../data/train_images/000a190fdd715a2a36faed16e...,train_2288590299 train_3803689425,[train_2288590299],[train_2288590299],train_2288590299
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,4093212188,../data/train_images/00117e4fc239b1b641ff08340...,train_2406599165 train_3342059966,[train_2406599165],"[train_2406599165, train_3576714541, train_150...",train_1508100548 train_1744956981 train_240659...
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,3648931069,../data/train_images/00136d1cf4edede0203f32f05...,train_3369186413 train_921438619,[train_3369186413],[train_3369186413],train_3369186413


In [14]:
# Write the submission file: posting_id plus its space-separated matches, no index column.
df.to_csv('submission.csv', columns=['posting_id', 'matches'], index=False)