In [4]:
import numpy as np
import sklearn
import underthesea
import json
import os
from tqdm.autonotebook import tqdm
from glob import glob
from pprint import pprint
import nltk
from underthesea import word_tokenize
from sklearn.decomposition import TruncatedSVD
import math
import sklearn


## Load data in json file

In the dataset, each item is a JSON record with many fields and their corresponding content. Note that the cell below loads the items from `./tdgd_laptop/laptop_all.csv`.

In [62]:
import pandas as pd

# Read the laptop catalogue; later cells read its `code` and `full_name` columns.
df = pd.read_csv(filepath_or_buffer='./tdgd_laptop/laptop_all.csv')

def clean_laptop_name(name):
    """Return `name` converted to lower case."""
    return name.lower()

# One text entry per catalogue row: the product code on the first line,
# followed by the lower-cased product name on the next line.
items = [
    str(row['code']) + '\n' + clean_laptop_name(row['full_name'])
    for _, row in df.iterrows()
]

### Tokenizer: word segmentation

In [66]:
class Tfidf:
    """Whitespace-token TF-IDF vectoriser built from a list of documents.

    Tokens are obtained by lower-casing the text and splitting on whitespace,
    both when fitting and when vectorising, so corpus and query tokens are
    always comparable.
    """

    # Saturation constant k of the term-frequency formula (k+1)*c / (k+c):
    # the weight of a token grows with its count c but stays below k+1.
    TF_SATURATION = 2

    def __init__(self):
        # Not used by any method of this class; kept so existing attribute
        # access keeps working.
        self.corpus = {}
        # token -> total number of occurrences over all fitted documents.
        # Its insertion order defines the column order of the vectors.
        self.corpus_counter = {}
        # token -> number of fitted documents that contain the token.
        self.doc_freq = {}
        # Number of documents fitted so far.
        self.corpus_len = 0

    @staticmethod
    def _tokenize(text):
        """Lower-case `text` and split it on whitespace."""
        # underthesea's word_tokenize(text, format='text') was the previously
        # tried alternative; plain whitespace splitting is what is in use.
        return text.lower().split()

    def fit_data(self, documents):
        """Add `documents` to the vocabulary statistics.

        Calls are cumulative: counts from earlier calls are kept.

        documents: list<string>
        """
        for doc in documents:
            tokens = self._tokenize(doc)
            self.corpus_len += 1
            # Total occurrences (also fixes the vector column of new tokens).
            for tok in tokens:
                self.corpus_counter[tok] = self.corpus_counter.get(tok, 0) + 1
            # Document frequency: each document counts once per distinct token.
            for tok in set(tokens):
                self.doc_freq[tok] = self.doc_freq.get(tok, 0) + 1

    def print_data(self):
        """Print the token occurrence table and the vocabulary size."""
        print(self.corpus_counter)
        print(len(self.corpus_counter))

    def get_tfidf(self, string):
        """Return the TF-IDF vector of `string` over the fitted vocabulary.

        string: text to vectorise; tokens outside the vocabulary are ignored.

        Returns a 1-D numpy array with one entry per vocabulary token, in the
        insertion order of `corpus_counter`. Each non-zero entry is
        tf * idf with tf = (k+1)*c / (k+c) and
        idf = log((corpus_len + 1) / document_frequency).
        """
        k = self.TF_SATURATION
        doc_counter = {}
        for tok in self._tokenize(string):
            doc_counter[tok] = doc_counter.get(tok, 0) + 1

        tfidf_vector = np.zeros((len(self.corpus_counter),))
        for i, key in enumerate(self.corpus_counter):
            count = doc_counter.get(key)
            if count:
                tf = (k + 1) * count / (k + count)
                # Document frequency never exceeds corpus_len, so the ratio is
                # > 1 and the weight is always positive. Dividing by the total
                # occurrence count instead could give zero or negative weights
                # for tokens repeated inside documents.
                idf = math.log((self.corpus_len + 1) / self.doc_freq[key])
                tfidf_vector[i] = tf * idf
        return tfidf_vector

In [42]:
class Storage:
    """In-memory index of item strings, searchable by TF-IDF cosine similarity."""

    def __init__(self):
        # One TF-IDF vector per stored item, aligned index-for-index with self.items.
        self.tfidf_space = []
        self.tfidf = Tfidf()
        # Constructed but not used by any method of this class (the SVD
        # reduction step is not wired in).
        self.svd = TruncatedSVD(n_components=256)
        # The stored item strings, in insertion order.
        self.items = []

    def fit_data(self, items):
        """Add `items` to the index.

        items: list<string>

        Every stored item is re-vectorised after the vocabulary is updated,
        because vector length and IDF weights depend on the whole fitted
        corpus. Vectors kept from an earlier call would no longer line up
        with the new ones.
        """
        items = list(items)
        self.tfidf.fit_data(items)
        self.items.extend(items)
        self.tfidf_space = [self.tfidf.get_tfidf(entry) for entry in self.items]

    def get_similiar_items(self, item, top_k=20):
        """Print the stored items most similar to `item`.

        item: string
        top_k: maximum number of results to print (default 20).

        For each result, prints its index in `self.items`, its cosine
        similarity to the query, and the stored item text. Prints nothing
        when the index is empty.
        """
        # Imported explicitly: a bare `import sklearn` does not guarantee that
        # the `sklearn.metrics` submodule is loaded.
        from sklearn.metrics.pairwise import cosine_similarity

        query_vector = np.reshape(self.tfidf.get_tfidf(item), (1, -1))
        # Nothing to compare against (no items, or an empty vocabulary).
        if not self.tfidf_space or query_vector.size == 0:
            return

        scores = cosine_similarity(query_vector, self.tfidf_space)
        scores = np.reshape(scores, (-1,))
        # Negate so argsort yields the highest similarity first.
        for _id in (-scores).argsort()[:top_k]:
            print(_id, scores[_id])
            print(self.items[_id], "\n\n")


In [67]:
# Build the search index over every catalogue entry.
s = Storage()

# Shallow copy, so the index is fitted on its own list rather than on `items` itself.
item_descriptions = list(items)

s.fit_data(item_descriptions)

In [68]:
# Inspect the fitted vocabulary: each token mapped to its total number of
# occurrences across all fitted items.
s.tfidf.corpus_counter

{'220042001752': 1,
 'laptop': 187,
 'acer': 20,
 'ryzen': 29,
 'ryzen_5': 15,
 '5600h': 7,
 '3.3ghz': 9,
 '8': 131,
 '8_gb': 127,
 '512': 133,
 'gb': 172,
 'ssd': 186,
 'nvme': 164,
 'pcie': 178,
 '(có': 101,
 'thể': 101,
 'tháo': 101,
 'ra,': 100,
 'lắp': 101,
 'thanh': 101,
 'khác': 101,
 'tối': 103,
 'đa': 103,
 '1tb)': 60,
 'card': 189,
 'rời': 68,
 'gtx': 13,
 '1650': 13,
 '4gb': 45,
 '15.6"': 87,
 'full': 149,
 'hd': 160,
 '(1920': 150,
 'x': 182,
 '1080)': 149,
 '220042001666': 1,
 'lenovo': 19,
 'i7': 50,
 '1165g7': 25,
 '2.8ghz': 25,
 'tích': 121,
 'hợp': 121,
 'intel': 106,
 'iris': 78,
 'xe': 75,
 '14"': 58,
 '220042001608': 1,
 'msi': 24,
 '10750h': 4,
 '2.6ghz': 5,
 '2tb)': 26,
 'max-q': 10,
 '220042001181': 1,
 'i3': 13,
 '1005g1': 2,
 '1.2ghz': 2,
 '4': 17,
 '4_gb': 17,
 '256': 35,
 'uhd': 25,
 '220042001754': 1,
 'celeron': 5,
 'n4020': 4,
 '1.1ghz': 10,
 '128': 4,
 'm2': 4,
 '600': 5,
 '11.6"': 3,
 '(1366': 11,
 '768)': 11,
 '220042001724': 1,
 '5500u': 4,
 '2.1ghz': 

In [77]:
query_string = 'Laptop HP 15s du1108TU i3 10110U'

# Show underthesea's segmentation of the query for reference, then list the
# closest stored items for the raw query string.
segmented_query = word_tokenize(query_string, format='text')
print("Tokenize: ", segmented_query)
s.get_similiar_items(query_string)

Tokenize:  Laptop HP 15 s du1108TU i3 10110U
7 0.5903337572143876
220042001761
 laptop  hp  i3  10110u 2.1ghz 4 4_gb 256 gb ssd nvme pcie card tích hợp intel uhd 15.6" full hd (1920 x 1080) 


178 0.542669106536324
220042001621
 laptop  hp  i3  10110u 2.1ghz 4 4_gb 256 gb ssd nvme pcie card tích hợp intel uhd 15.6" hd (1366 x 768) 


170 0.18811513594401125
220042001869
 laptop  hp  i3  1125g4 2ghz 4 4_gb 512 gb ssd nvme pcie (có thể tháo ra, lắp thanh khác tối đa 1tb) card tích hợp intel uhd 14" full hd (1920 x 1080) 


165 0.18811513594401125
220042001808
 laptop  hp  i3  1125g4 2ghz 4 4_gb 512 gb ssd nvme pcie (có thể tháo ra, lắp thanh khác tối đa 1tb) card tích hợp intel uhd 14" full hd (1920 x 1080) 


176 0.18527859527794416
220042001807
 laptop  hp  i3  1125g4 2ghz 4 4_gb 256 gb ssd nvme pcie (có thể tháo ra, lắp thanh khác tối đa 1tb) card tích hợp intel uhd 14" full hd (1920 x 1080) 


179 0.1611556470131461
220042001602
 laptop  hp  i3  1005g1 1.2ghz 4 4_gb 256 gb ssd nvme p