In [1]:
# Credit: adapted from the tutorial code at https://wikidocs.net/24603
# Extract text features from product titles with TF-IDF

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import numpy as np
import pandas as pd
import torch
from sklearn.metrics.pairwise import linear_kernel
import collections
from enum import Enum, auto


In [2]:
# Folder holding the dataset CSV files.
DATA_PATH = 'shopee-product-matching/'
# Only the first 25000 rows of train.csv serve as the training split.
train = pd.read_csv(DATA_PATH + 'train.csv').head(25000)
# The product titles are the text that gets vectorised.
text_train_data = train['title']


In [3]:
# Fit the TF-IDF vocabulary on the training titles and encode those same
# titles as a document-term matrix (one row per title).
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(text_train_data)

In [4]:
def predict(list_arr):
    """Return the position of the largest value in ``list_arr``.

    Thin wrapper around ``np.argmax``; callers pass one row of a similarity
    matrix and use the result as the index of the best-matching item.
    """
    return np.argmax(list_arr)


In [5]:
# Match every training title against the training titles themselves and
# store the label_group of its highest-scoring match in train['predicted'].
# NOTE(review): each title is also compared with itself, so the best match is
# presumably the row itself (or an identical title). Mask the diagonal if the
# nearest *other* title is wanted - confirm intent.
preds = []
CHUNK = 10

print('Finding similar images...')
n_rows = tfidf_matrix.shape[0]
# Number of chunks, rounded up so a trailing partial chunk is still visited.
CTS = (n_rows + CHUNK - 1) // CHUNK
distances = cosine_similarity(tfidf_matrix)

for j in tqdm(range(CTS)):
    a = j * CHUNK
    # Clamp the upper bound: without this the last (partial) chunk would index
    # past the end of the matrix whenever n_rows is not a multiple of CHUNK.
    b = min((j + 1) * CHUNK, n_rows)

    for k in range(a, b):
        indices = predict(distances[k])
        # Read the label column directly instead of materialising a whole row.
        preds.append(train['label_group'].iloc[indices])

train['predicted'] = preds



Finding similar images...


100%|█████████████████████████████████████████████████████████████████████████████| 2500/2500 [00:03<00:00, 759.80it/s]


In [6]:
def getScore(train):
    """Return query-averaged precision, recall and F1 for ``train['predicted']``.

    Every row acts as one query whose class is its ``label_group``. With
    respect to that class, a row of the frame counts as

    * true positive  - ``label_group`` and ``predicted`` both equal the class,
    * false negative - only ``label_group`` equals the class,
    * false positive - only ``predicted`` equals the class.

    Precision, recall and F1 are computed per query (0 when a denominator is
    0) and then averaged over all rows.

    Rows are read positionally, so the frame may carry any index; the previous
    implementation only worked for an index starting at 25001.

    Parameters
    ----------
    train : pandas.DataFrame (or any mapping of column name to sequence)
        Must provide equally long ``'label_group'`` and ``'predicted'``
        columns.

    Returns
    -------
    tuple of float
        ``(precision, recall, f1_score)``; ``(0.0, 0.0, 0.0)`` for empty input.
    """
    labels = list(train['label_group'])
    predictions = list(train['predicted'])
    n_queries = len(labels)
    if n_queries == 0:
        return 0.0, 0.0, 0.0

    # The counts for a query depend only on the query's class, so tally each
    # class once instead of rescanning the whole frame for every row.
    actual_count = collections.Counter(labels)
    predicted_count = collections.Counter(predictions)
    hit_count = collections.Counter(
        label for label, pred in zip(labels, predictions) if label == pred
    )

    per_class = {}
    for label, n_actual in actual_count.items():
        tp = hit_count[label]
        fp = predicted_count[label] - tp
        fn = n_actual - tp
        pres = tp / (tp + fp) if tp + fp else 0
        rec = tp / (tp + fn) if tp + fn else 0
        f1 = 2 * pres * rec / (pres + rec) if pres + rec else 0
        per_class[label] = (pres, rec, f1)

    # Accumulate in row order so the sums match a naive per-row evaluation.
    precision = 0
    recall = 0
    f1_score = 0
    for label in labels:
        pres, rec, f1 = per_class[label]
        precision += pres
        recall += rec
        f1_score += f1

    return precision / n_queries, recall / n_queries, f1_score / n_queries
    

In [7]:
# Held-out split: the rows labelled 25001 through 30000 of the same CSV
# (.loc slicing includes both end labels).
test = pd.read_csv(DATA_PATH + 'train.csv').loc[25001:30000]
text_test_data = test['title']
# Reuse the already-fitted vectorizer so test vectors share the train vocabulary.
tfidf_matrix_test = tfidf_vectorizer.transform(text_test_data)
print("TF-IDF Matrix Shape for Train Data:", tfidf_matrix.shape)
print("TF-IDF Matrix Shape for Test Data:", tfidf_matrix_test.shape)
# One row per test title, one column per training title.
cosine_sim_mat = cosine_similarity(tfidf_matrix_test, tfidf_matrix)

TF-IDF Matrix Shape for Train Data: (25000, 21893)
TF-IDF Matrix Shape for Test Data: (5000, 21893)


In [8]:
# Match every test title against the training titles and store the
# label_group of its most similar training title in test['predicted'].
preds = []
CHUNK = 10

print('Finding similar images...')
n_queries = cosine_sim_mat.shape[0]
# Number of chunks, rounded up so a trailing partial chunk is still visited.
CTS = (n_queries + CHUNK - 1) // CHUNK

for j in tqdm(range(CTS)):
    a = j * CHUNK
    # Clamp the upper bound: without this the last (partial) chunk would index
    # past the end of the matrix whenever n_queries is not a multiple of CHUNK.
    b = min((j + 1) * CHUNK, n_queries)

    for k in range(a, b):
        # Column position in cosine_sim_mat == row position in train.
        indices = predict(cosine_sim_mat[k])
        # Read the label column directly instead of materialising a whole row.
        preds.append(train['label_group'].iloc[indices])

test['predicted'] = preds


Finding similar images...


100%|███████████████████████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 791.75it/s]


In [10]:
# Score the predictions stored on the held-out test frame.
# train holds the first 25000 rows; test holds the rows labelled 25001-30000.
prescision, recall, f1_score = getScore(test)
for caption, value in zip(("Precision: ", "Recall: ", "F1-score: "),
                          (prescision, recall, f1_score)):
    print(caption, value)

100%|██████████████████████████████████████████████████████████████████████████████| 5000/5000 [02:57<00:00, 28.17it/s]

0.5839522632922635
0.5922
0.5760730949557257





In [9]:
def user_process(user_input, top_k=5):
    """Print the training titles most similar to a free-text query.

    The query is vectorised with the fitted ``tfidf_vectorizer`` and compared
    with every row of ``tfidf_matrix`` by cosine similarity; the best matches
    are printed in descending order of similarity.

    Parameters
    ----------
    user_input : str
        Query text.
    top_k : int, default 5
        Maximum number of titles to print. Fewer are printed when the training
        matrix has fewer rows.

    Returns
    -------
    None
    """
    user_tfidf_matrix = tfidf_vectorizer.transform([user_input])
    # A single query row against all training rows: keep that one row.
    similarities = cosine_similarity(user_tfidf_matrix, tfidf_matrix)[0]
    # argsort is ascending, so reverse it and keep the leading top_k positions;
    # slicing caps the count at the number of available rows.
    ranked = np.argsort(similarities)[::-1][:max(top_k, 0)]

    print("[Search Result]")
    for rank, idx in enumerate(ranked):
        # idx is a row position in tfidf_matrix, hence positional lookup.
        print(rank, ") ", train['title'].iloc[idx])
        


In [10]:
# Example query: print the training titles most similar to the word "bed".
user_input = "bed"
user_process(user_input)



[Search Result]
0 )  Aero Bed/ Car Air Bed / Kasur Angin Mobil / Kasur Angin Portable mobil E-3 FREE POMPA ANGIN
1 )  b"Kintakun D'Luxe Bed Cover Set New Winter Minimalis Edition Uk. 180x200"
2 )  KASUR BAYI LIPAT KELAMBU BOAT MODEL, BED COVER MOTIF NAVY YELLOW  PERLENGKAPAN BAYI
3 )  Bestway 67002 Kasur Angin Double Biru [191cm x 137cm] / Air Bed Double
4 )  EDW 501 Bamboo Storage Jumbo 65 Liter Box 3 Sekat Organizer Pakaian, Selimut, Bed Cover, Sprei


In [15]:
# using SVM model
# Fit a linear-kernel support vector classifier on the TF-IDF title vectors,
# with label_group as the target class.
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

svm_model = SVC(kernel='linear', C=1.0)
# Kept as the cell's last expression so the fitted estimator is displayed.
svm_model.fit(tfidf_matrix, train['label_group'])

SVC(kernel='linear')

In [None]:
# Predict a label_group for every test title with the fitted SVM.
y_pred = svm_model.predict(tfidf_matrix_test)