In [29]:
import numpy as np
import sklearn
import underthesea
import json
import os
from tqdm.autonotebook import tqdm
from glob import glob
from pprint import pprint
import nltk
from underthesea import word_tokenize
from sklearn.decomposition import TruncatedSVD
import math
import sklearn


## Load data from the CSV file

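Each item in the dataset is a record with many fields and their corresponding content. Here the records are read from a CSV file, and only the `code` and `full_name` columns are used.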

In [30]:
import pandas as pd
# Location of the laptop catalogue, relative to the notebook's working directory.
LAPTOP_CSV_PATH = './tdgd_laptop/laptop_all.csv'
df = pd.read_csv(LAPTOP_CSV_PATH)

def clean_laptop_name(name):
    """Return `name` converted to lower case.

    name: str -- the laptop's full name as read from the data file.
    """
    return name.lower()

# One searchable document per catalogue row: "<code>\n<lower-cased full name>".
# The code comes first so it can be recovered later as the first token.
items = [
    '\n'.join((str(record['code']), clean_laptop_name(record['full_name'])))
    for _, record in df.iterrows()
]

### TF-IDF vectorizer: whitespace tokenization (word segmentation disabled)

In [31]:
class Tfidf:
    """TF-IDF vectorizer over lower-cased, whitespace-separated tokens.

    Attributes:
        corpus: empty dict, not used by any method of this class.
        corpus_counter: maps each token to the number of fitted documents
            that contain it (document frequency). Its insertion order
            defines the layout of the vectors returned by `get_tfidf`.
        corpus_len: number of documents fitted so far.
    """

    def __init__(self):
        self.corpus = {}
        self.corpus_counter = {}
        self.corpus_len = 0

    @staticmethod
    def _tokenize(text):
        """Lower-case `text` and split it on whitespace."""
        # underthesea's word_tokenize was tried here and is currently disabled.
        return text.lower().split()

    def fit_data(self, documents):
        """Accumulate document frequencies for the given documents.

        May be called repeatedly; counts and `corpus_len` keep growing.

        documents: list<string>. A single string is treated as one
            document instead of being iterated character by character.
        """
        if isinstance(documents, str):
            documents = [documents]

        for doc in documents:
            self.corpus_len += 1
            # Count each token once per document so that the count never
            # exceeds corpus_len and the idf in get_tfidf stays positive.
            # dict.fromkeys (rather than set) keeps first-occurrence order,
            # which keeps the vector layout deterministic.
            for token in dict.fromkeys(self._tokenize(doc)):
                self.corpus_counter[token] = self.corpus_counter.get(token, 0) + 1

    def print_data(self):
        """Print the token -> document-frequency table and its size."""
        print(self.corpus_counter)
        print(len(self.corpus_counter))

    def get_tfidf(self, string):
        """Return the TF-IDF vector of `string` over the fitted vocabulary.

        string: str -- text to vectorize; tokens unknown to the fitted
            vocabulary are ignored.
        Returns a 1-D numpy array of length len(self.corpus_counter) with
        tf * idf for tokens present in `string` and 0 elsewhere, where
            tf  = (k + 1) * count / (k + count)   (saturating, k = 2)
            idf = log((corpus_len + 1) / document_frequency)
        """
        k = 2  # term-frequency saturation constant

        doc_counter = {}
        for token in self._tokenize(string):
            doc_counter[token] = doc_counter.get(token, 0) + 1

        tfidf_vector = np.zeros((len(self.corpus_counter),))
        for i, (token, doc_freq) in enumerate(self.corpus_counter.items()):
            count = doc_counter.get(token)
            if count:
                tf = (k + 1) * count / (k + count)
                idf = math.log((self.corpus_len + 1) / doc_freq)
                tfidf_vector[i] = tf * idf
        return tfidf_vector

In [32]:
class Storage:
    """In-memory TF-IDF index over item description strings.

    Attributes:
        tfidf_space: list of TF-IDF vectors, one per entry of `items`.
        tfidf: the Tfidf vectorizer fitted on the indexed items.
        svd: a TruncatedSVD(n_components=256) instance; it is constructed
            here but not fitted or used by any method of this class.
        items: the indexed item strings, in insertion order.
    """

    def __init__(self):
        self.tfidf_space = []
        self.tfidf = Tfidf()
        self.svd = TruncatedSVD(n_components=256)
        self.items = []

    def fit_data(self, items):
        """Add `items` to the index and (re)build every TF-IDF vector.

        items: list<string>
        """
        items = list(items)
        self.tfidf.fit_data(items)
        self.items.extend(items)
        # Fitting changes the vocabulary size and the idf weights, so vectors
        # from earlier calls would be stale and of a different length.
        # Rebuild all of them to keep tfidf_space rectangular.
        self.tfidf_space = [self.tfidf.get_tfidf(item) for item in self.items]
        # SVD dimensionality reduction of tfidf_space is currently disabled.

    def _similarities(self, query):
        """Return a 1-D array: cosine similarity of `query` to each indexed item."""
        if not self.tfidf_space:
            # Nothing indexed yet; cosine_similarity would reject an empty matrix.
            return np.zeros((0,))
        # Imported explicitly: a bare `import sklearn` does not by itself
        # guarantee that the `metrics.pairwise` submodule has been loaded.
        from sklearn.metrics.pairwise import cosine_similarity

        query_vector = np.reshape(self.tfidf.get_tfidf(query), (1, -1))
        sim_matrix = cosine_similarity(query_vector, self.tfidf_space)
        return np.reshape(sim_matrix, (-1,))

    def get_similiar_items(self, item, top_k=30):
        """Print the `top_k` indexed items most similar to `item`.

        item: string -- the query text.
        top_k: int -- how many results to print (default 30).
        For each hit prints its index and similarity, then the item text.
        Returns None.
        """
        sims = self._similarities(item)
        for _id in (-sims).argsort()[:top_k]:
            print(_id, sims[_id])
            print(self.items[_id], "\n\n")

    def evaluate_query(self, query, threshold=0.2):
        """Return the first token of every item whose similarity exceeds `threshold`.

        query: string -- the query text.
        threshold: float -- minimum (exclusive) cosine similarity (default 0.2).
        Returns a list of strings in index order; empty if nothing matches
        or nothing has been indexed.
        """
        sims = self._similarities(query)
        return [
            self.items[idx].split()[0]
            for idx, val in enumerate(sims)
            if val > threshold
        ]

In [33]:
# Build the search index from a copy of the prepared item strings.
item_descriptions = list(items)

s = Storage()
s.fit_data(item_descriptions)

In [34]:
query_string = 'Acer Ryzen 7 8 GB'

# Printed for reference only: the index itself splits on whitespace.
segmented_query = word_tokenize(query_string, format='text')
print("Tokenize: ", segmented_query)

s.get_similiar_items(query_string)

Tokenize:  Acer Ryzen 7 8 GB
46 0.41874392929102455
220042001763
laptop  acer  ryzen 7 5800h 3.2ghz 8 gb card rời rtx 3060 6gb 15.6" full hd (1920 x 1080) 


35 0.41309352622558004
220042001764
laptop  acer  ryzen 7 5800h 3.2ghz 8 gb card rời rtx 3070 8gb 15.6" full hd (1920 x 1080) 


60 0.30698247825367475
220042001845
laptop  dell  ryzen 7 5800h 3.2ghz 8 gb card rời rtx 3050 4gb 15.6" full hd (1920 x 1080) 


53 0.29275709803325645
220042001843
laptop  lenovo  ryzen 7 5800h 3.2ghz 8 gb card rời rtx 3050ti 4gb 15.6" full hd (1920 x 1080) 


39 0.2877965177753064
220042001892
laptop  dell  ryzen 7 5800h 3.2ghz 16 gb card rời rtx 3050ti 4gb 15.6" full hd (1920 x 1080) 


62 0.2790578391494529
220042001809
laptop  hp  ryzen 7 5800h 3.2ghz 8 gb card rời rtx 3050ti 4gb 16.1" full hd (1920 x 1080) 


103 0.26649490677611365
220042001798
laptop  dell  ryzen 7 5700u 1.8ghz 8 gb card tích hợp radeon 15.6" full hd (1920 x 1080) 


141 0.26559909548836014
220042001726
laptop  msi  ryzen 7 5700u

In [35]:
# Returns the leading token (the item code) of every indexed item whose
# cosine similarity to the query is above 0.2.
s.evaluate_query("HP 16 GB")

['220042001373',
 '220042001441',
 '220042001691',
 '220042001743',
 '220042001872']