In [1]:
from keras.preprocessing import image
from keras.applications.vgg16 import VGG16
from keras.applications.vgg16 import preprocess_input
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
import os, shutil, glob, os.path
from PIL import Image as pil_image
from sklearn.decomposition import PCA
from tqdm import tqdm

import gc

import pickle

import nltk
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


from tensorflow.keras.optimizers import SGD 
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.utils import Sequence

In [3]:
#image.LOAD_TRUNCATED_IMAGES = True 
model = EfficientNetB0(weights='imagenet',include_top=False,pooling="avg",input_shape=None)
imdir = "data/train_images/"
df = pd.read_csv("data/train.csv")
y = df.label_group
number_clusters = 50

Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5


In [4]:
#add path to df
df['path'] = imdir + df.image

In [5]:
featurelist = []
for i, imagepath in enumerate(df.path):
    print("    Status: %s / %s" %(i, len(df.path)), end="\r")
    img = image.load_img(imagepath, target_size=(224, 224))
    img_data = image.img_to_array(img)
    img_data = np.expand_dims(img_data, axis=0)
    img_data = preprocess_input(img_data)
    features = np.array(model.predict(img_data))
    featurelist.append(features.flatten())

    Status: 34249 / 34250

In [7]:
#save file into pickle
with open("featurelist_effnet.txt", "wb") as fp:   #Pickling
    pickle.dump(featurelist, fp)
 
# with open("test.txt", "rb") as fp:   # Unpickling
#     b = pickle.load(fp)

In [9]:
def get_image_predictions(df, x,y ,threshold = 0.4):
    if len(df) > 3:
        KNN = 50
    else : 
        KNN = 3
    #model = NearestNeighbors(n_neighbors = KNN)
    model = KNeighborsClassifier(n_neighbors= KNN, metric='cosine')
    model.fit(x,y)
    distances, indices = model.kneighbors(x)
    predictions = []
    for k in range(x.shape[0]):
        idx = np.where(distances[k,] < threshold)[0]
        ids = indices[k,idx]
        posting_ids = df['posting_id'].iloc[ids].values
        predictions.append(posting_ids)
    del model, distances, indices
    gc.collect()
    return predictions


#text preds

def get_text_predictions(df, max_features = 25_000):
    stopwords_list = stopwords.words('english') + stopwords.words('indonesian')
    model = TfidfVectorizer(stop_words = stopwords_list, binary = True, max_features = max_features)
    
    text_embeddings = model.fit_transform(df['title']).toarray()
    preds = []
    # magic number
    CHUNK = 1024*4
    print('Finding similar titles...')
    # discern # of chunks
    CTS = len(df)//CHUNK
    if len(df)%CHUNK!=0: CTS += 1
    for j in range( CTS ):
        # start pos for chunk idx
        a = j*CHUNK
        # end pos for chunk idx
        b = (j+1)*CHUNK
        # change to end of input if necessary
        b = min(b,len(df))
        print('chunk',a,'to',b)
        # COSINE SIMILARITY DISTANCE
        # matrix multiply(text emb & transposed chunk) then transpose
        cts = np.matmul( text_embeddings, text_embeddings[a:b].T ).T
        for k in range(b-a):
            # find where cosine sim > 0.7
            IDX = np.where(cts[k,]>0.7)[0]
            # save to o
            o = df.iloc[IDX].posting_id.values
            # append to preds
            preds.append(o)
    # delet model/text emb
    del model,text_embeddings
    # garb collect
    gc.collect()
    return preds

# clean text from noise
def clean_text(text):
    # filter to allow only alphabets
    text = re.sub(r'[^a-zA-Z\']', ' ', text)
    
    # remove Unicode characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    
    # convert to lowercase to maintain consistency
    text = text.lower()
       
    return text

#train['clean_text'] = train.tweet.apply(clean_text)

#combine prediction into 1 row

def combine_predictions(row):
    x = np.concatenate([row['img_prediction'],     row['text_predictions']])
    return ' '.join( np.unique(x) )

def f1_score(y_true, y_pred):
    y_true = y_true.apply(lambda x: set(x.split()))
    y_pred = y_pred.apply(lambda x: set(x.split()))
    intersection = np.array([len(x[0] & x[1]) for x in zip(y_true, y_pred)])
    len_y_pred = y_pred.apply(lambda x: len(x)).values
    len_y_true = y_true.apply(lambda x: len(x)).values
    f1 = 2 * intersection / (len_y_pred + len_y_true)
    return f1

In [10]:
x = np.stack(featurelist, axis=0 )

In [31]:
#get image prediction 
preds = get_image_predictions(df, x, y, threshold = 0.2)

In [32]:
#list every label targets
tmp = df.groupby(['label_group'])['posting_id'].unique().to_dict()
df['targets'] = df['label_group'].map(tmp)
df['targets'] = df['targets'].apply(lambda x: ' '.join(x))

In [15]:
text_predictions = get_text_predictions(df, max_features = 25_000)



Finding similar titles...
chunk 0 to 4096
chunk 4096 to 8192
chunk 8192 to 12288
chunk 12288 to 16384
chunk 16384 to 20480
chunk 20480 to 24576
chunk 24576 to 28672
chunk 28672 to 32768
chunk 32768 to 34250


In [33]:
df['text_predictions'] = text_predictions
df['img_prediction'] = preds

In [34]:
df['matches'] = df.apply(combine_predictions, axis=1)

In [22]:
df.head(10)

Unnamed: 0,posting_id,image,image_phash,title,label_group,path,targets,text_predictions,img_prediction,matches
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,249114794,data/train_images/0000a68812bc7e98c42888dfb1c0...,train_129225211 train_2278313361,"[train_129225211, train_2278313361]","[train_129225211, train_197296533]",train_129225211 train_197296533 train_2278313361
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045,data/train_images/00039780dfc94d01db8676fe789e...,train_3386243561 train_3423213080,[train_3386243561],"[train_3386243561, train_3423213080, train_212...",train_1387702006 train_1553039102 train_181696...
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891,data/train_images/000a190fdd715a2a36faed16e2c6...,train_2288590299 train_3803689425,[train_2288590299],"[train_2288590299, train_2723454438, train_326...",train_2288590299 train_2723454438 train_326726...
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,4093212188,data/train_images/00117e4fc239b1b641ff08340b42...,train_2406599165 train_3342059966,"[train_2406599165, train_3576714541, train_150...","[train_2406599165, train_1593362411, train_256...",train_1002655969 train_1029583218 train_106133...
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,3648931069,data/train_images/00136d1cf4edede0203f32f05f66...,train_3369186413 train_921438619,[train_3369186413],"[train_3369186413, train_921438619, train_2194...",train_1043687807 train_1093166739 train_115421...
5,train_2464356923,0013e7355ffc5ff8fb1ccad3e42d92fe.jpg,bbd097a7870f4a50,CELANA WANITA (BB 45-84 KG)Harem wanita (bisa...,2660605217,data/train_images/0013e7355ffc5ff8fb1ccad3e42d...,train_2464356923 train_2753295474 train_305884580,[train_2464356923],"[train_2464356923, train_2753295474, train_305...",train_2464356923 train_2753295474 train_305884580
6,train_1802986387,00144a49c56599d45354a1c28104c039.jpg,f815c9bb833ab4c8,Jubah anak size 1-12 thn,1835033137,data/train_images/00144a49c56599d45354a1c28104...,train_1802986387 train_1396161074 train_713073...,"[train_1802986387, train_1396161074, train_249...","[train_1802986387, train_944158112, train_1513...",train_1385709310 train_1396161074 train_151399...
7,train_1806152124,0014f61389cbaa687a58e38a97b6383d.jpg,eea7e1c0c04da33d,KULOT PLISKET SALUR /CANDY PLISKET /WISH KULOT...,1565741687,data/train_images/0014f61389cbaa687a58e38a97b6...,train_1806152124 train_3227306976,[train_1806152124],"[train_1806152124, train_719431433, train_3560...",train_1064230135 train_1165410768 train_132369...
8,train_86570404,0019a3c6755a194cb2e2c12bfc63972e.jpg,ea9af4f483249972,"[LOGU] Tempelan kulkas magnet angka, tempelan ...",2359912463,data/train_images/0019a3c6755a194cb2e2c12bfc63...,train_86570404 train_2837452969 train_77364776,[train_86570404],"[train_86570404, train_2837452969, train_77364...",train_115157077 train_2264584728 train_2269068...
9,train_831680791,001be52b2beec40ddc1d2d7fc7a68f08.jpg,e1ce953d1a70618f,BIG SALE SEPATU PANTOFEL KULIT KEREN KERJA KAN...,2630990665,data/train_images/001be52b2beec40ddc1d2d7fc7a6...,train_831680791 train_3031035861,[train_831680791],"[train_831680791, train_3031035861, train_1480...",train_1035429011 train_1189949692 train_122084...


In [35]:
f1 = f1_score(df.targets, df.matches)

In [36]:
f1.mean()

0.7374797619328904

In [43]:
def f2_score(y_true, y_pred):
    y_true = y_true.apply(lambda x: set(x.split()))
    y_pred = y_pred.apply(lambda x: set(x[0].split()))
    intersection = np.array([len(x[0] & x[1]) for x in zip(y_true, y_pred)])
    len_y_pred = y_pred.apply(lambda x: len(x)).values
    len_y_true = y_true.apply(lambda x: len(x)).values
    f1 = 2 * intersection / (len_y_pred + len_y_true)
    return f1

In [44]:
f1_img = f2_score(df.targets, df.img_prediction)

In [48]:
f1_img.mean()

0.46016187277377063

In [49]:
f1.mean()

0.7374797619328904