In [1]:
import numpy as np
import sklearn
import underthesea
import json
import os
from tqdm.autonotebook import tqdm
from glob import glob
from pprint import pprint
import nltk
from underthesea import word_tokenize
from sklearn.decomposition import TruncatedSVD
import math
import sklearn
from difflib import get_close_matches



## Load data in json file

In dataset, item format is json with many fields and coresponding content 

In [2]:
import pandas as pd
df = pd.read_csv('./tdgd_laptop/laptops_all.csv')

def clean_laptop_name(name):
    name = name.lower()
    return name

items = []

for index, row in df.iterrows():
    items.append(str(row['code']) + '\n' + clean_laptop_name(row['full_name']))

### Tokenizer: word segmentation

In [22]:


class Tfidf:
    def __init__(self):
        self.corpus = {}
        self.corpus_counter = {}
        self.corpus_dict = []
        self.corpus_len = 0
        
    '''
    Xây dựng TF-IDF cho danh sách documents nhận đuợc
    documents: list<string>
    '''
    def fit_data(self, documents):
#         print(type(documents))
#         if type(documents) != "list":
#             raise ValueError('documents must be type of list')
        
#         đếm số lần xuất hiện của mỗi từ
        for d in documents:
#             tokens = word_tokenize(d, format='text').split()
            tokens = d.split()
            self.corpus_len += 1
            for t in tokens:
                if t in self.corpus_counter.keys():
                    self.corpus_counter[t] += 1
                else:
                    self.corpus_counter[t] = 1
        
        for k in self.corpus_counter.keys():
            self.corpus_dict.append(k)

    def print_data(self):
        print(self.corpus_counter)
        print(len(self.corpus_counter))
        
    def get_tfidf(self, string):
        string = string.lower()
        doc_counter = {}
        k = 2
        tokens = string.split()
#         tokens = word_tokenize(string, format='text').split()
        for t in tokens:
            if t in doc_counter.keys():
                doc_counter[t] += 1
            else:
                doc_counter[t] = 1
        
        vector_len = len(self.corpus_counter)
        tfidf_vector = np.zeros((vector_len,))
        for i, key in enumerate(self.corpus_counter.keys()):
            if key in doc_counter.keys():
                tf = (k+1)*doc_counter[key]/(k+doc_counter[key])
                idf = math.log((self.corpus_len+1)/(self.corpus_counter[key]))
                tfidf_vector[i] = tf*idf
        return tfidf_vector
    
    '''
    Xử lý query user đưa vào
    returns: list query mà hệ thống cho là người dùng muốn sử dụng để tìm kiếm
    '''
    def preprocess_query(self, query):
        tokens = query.split()
        refined = []
        # correct lại từng từ trong query
        for t in tokens:
            if t in self.corpus_counter.keys():
                refined.append(t)
                continue
            
            substitute = get_close_matches(t, self.corpus_dict, n=1, cutoff=0.5)
            if len(substitute) > 0:

                refined.append(substitute[0])
            else:
                refined.append(t)

        return " ".join(refined)
        # thêm bigram

class Storage:
    def __init__(self):
        self.tfidf_space = []
        self.tfidf = Tfidf()
        self.svd = TruncatedSVD(n_components=256)
        self.items = []
    
    '''
    items: list<string>
    '''
    def fit_data(self, items):
        self.tfidf.fit_data(items)
        
        for i in items:
            self.tfidf_space.append(self.tfidf.get_tfidf(i))
            self.items.append(i)
            
#         self.svd.fit(self.tfidf_space)
#         self.svd_tfidf_vector = self.svd.transform(self.tfidf_space)
        
    '''
    item: string
    '''
    def get_similiar_items(self, item):
        query_vector = self.tfidf.get_tfidf(item)
        query_vector = np.reshape(query_vector, (1,-1))
        # search
        sim_maxtrix = sklearn.metrics.pairwise.cosine_similarity(query_vector, self.tfidf_space)
        sim_maxtrix = np.reshape(sim_maxtrix, (-1,))
        idx = (-sim_maxtrix).argsort()[:30]
        for _id in idx:
            print(_id, sim_maxtrix[_id])
#             print(newItems[_id]['name'].upper())
            print(self.items[_id], "\n\n")

    def evaluate_query(self, query):
        
        query_vector = self.tfidf.get_tfidf(query)
        query_vector = np.reshape(query_vector, (1,-1))
        sim_maxtrix = sklearn.metrics.pairwise.cosine_similarity(query_vector, self.tfidf_space)
        sim_maxtrix = np.reshape(sim_maxtrix, (-1,))
        
        
        result = []
        for idx, val in enumerate(sim_maxtrix):
            if val > 0.2:
                result.append(self.items[idx].split()[0])
        
        return result

s = Storage()

item_descriptions = []
for i in items:
    item_descriptions.append(i)
    
s.fit_data(item_descriptions)

In [25]:
query_string = 'laptop tb'
print("Tokenize: ", word_tokenize(query_string, format='text'))
s.get_similiar_items(query_string)

Tokenize:  laptop tb
126 0.3025371936637314
220042001521
 laptop dell i5 10300h 2.5ghz 8 gb 1 tb hdd sata 3 có thể tháo ra, lắp thanh khác tối đa 2 tb card rời gtx 1650 4gb 15.6 full hd 1920 x 1080 1 x usb 3.2 2 x usb 3.0 vỏ nhựa  nắp lưng bằng kim loại 


50 0.28670735058463803
220042001872
 laptop hp i7 1165g7 2.8ghz 16 gb 1 tb ssd card tích hợp intel iris xe 13.3 full hd 1920 x 1080 2 x usb 3.1 jack tai nghe 3.5 mm vỏ kim loại 


21 0.2671995131565484
220042001561
 laptop msi i7 11800h 2.30 ghz 16 gb 2 tb ssd nvme pcie card rời rtx 3070 8gb 15.6 full hd 1920 x 1080 3 x usb 3.2 hdmi vỏ kim loại 


20 0.23553888494450173
220042001560
 laptop msi i7 11800h 2.30 ghz 32 gb 2 tb ssd nvme pcie card rời rtx 3070 maxq 8gb 15.6 full hd 1920 x 1080 3 x usb 3.2 hdmi vỏ kim loại 


34 0.22999378628169515
220042001691
 laptop hp i7 1165g7 2.8ghz 16 gb 1 tb ssd m.2 pcie card rời nvidia quadrot500, 4gb 14 full hd 1920 x 1080 2 x thunderbolt 4 usbc 2x superspeed usb a vỏ kim loại 


24 0.22853951513

In [5]:
s.evaluate_query("HP 16 GB")

['220042001441', '220042001743', '220042001872']

In [6]:
corpus_dict = []

keys = s.tfidf.corpus_counter.keys()
for k in keys:
    corpus_dict.append(k)

corpus_dict



['220042001752',
 'laptop',
 'acer',
 'ryzen',
 '5',
 '5600h',
 '3.3ghz',
 '8',
 'gb',
 '512',
 'ssd',
 'nvme',
 'pcie',
 'có',
 'thể',
 'tháo',
 'ra,',
 'lắp',
 'thanh',
 'khác',
 'tối',
 'đa',
 '1tb',
 'card',
 'rời',
 'gtx',
 '1650',
 '4gb',
 '15.6',
 'full',
 'hd',
 '1920',
 'x',
 '1080',
 '3',
 'usb',
 '3.2',
 'hdmi',
 'vỏ',
 'nhựa',
 '220042001666',
 'lenovo',
 'i7',
 '1165g7',
 '2.8ghz',
 'tích',
 'hợp',
 'intel',
 'iris',
 'xe',
 '14',
 '1',
 'always',
 'on',
 'kim',
 'loại',
 '220042001608',
 'msi',
 '10750h',
 '2.6ghz',
 '2tb',
 'maxq',
 'nắp',
 'lưng',
 'bằng',
 '220042001181',
 'i3',
 '1005g1',
 '1.2ghz',
 '4',
 '256',
 'uhd',
 '2',
 '2.0',
 'dài',
 '363.4',
 'mm',
 'rộng',
 '247.5',
 'dày',
 '19.9',
 'nặng',
 '1.7',
 'kg',
 '220042001754',
 'celeron',
 'n4020',
 '1.1ghz',
 '128',
 'm2',
 '600',
 '11.6',
 '1366',
 '768',
 '295',
 '215',
 '20.99',
 '1.4',
 '220042001724',
 '5500u',
 '2.1ghz',
 'radeon',
 'vega',
 '7',
 'và',
 'chiếu',
 'nghỉ',
 'tay',
 '220042001736',
 'dell

In [7]:
if 'hb' in s.tfidf.corpus_counter.keys():
    print("He")

In [8]:
matches = get_close_matches("hb", corpus_dict, n=1, cutoff=0.5)
matches

['tb']

In [23]:
s.tfidf.preprocess_query('laptop hb')

'laptop tb'

In [10]:
if 'hb' in s.tfidf.corpus_counter.keys():
    print(True)

In [24]:
s.tfidf.corpus_counter['tb']

14