In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from underthesea import word_tokenize, pos_tag, sent_tokenize
import warnings
from gensim import corpora, models, similarities
import jieba
import re

In [2]:
warnings.filterwarnings('ignore')

### Load data

In [3]:
products = pd.read_csv("Cung cap HV/Product_new.csv")
reviews = pd.read_csv("Cung cap HV/Review_new.csv")

In [4]:
products.head(2)

Unnamed: 0,item_id,name,description,rating,price,list_price,brand,group,url,image
0,48102821,Tai nghe Bluetooth Inpods 12 - Cảm biến vân ta...,THÔNG TIN CHI TIẾT\nDung lượng pin 300\nThời g...,4.0,77000,300000,OEM,Thiết Bị Số - Phụ Kiện Số/Thiết Bị Âm Thanh và...,https://tai-nghe-bluetooth-inpods-12-cam-bien-...,https://salt.tikicdn.com/cache/280x280/ts/prod...
1,52333193,Tai nghe bluetooth không dây F9 True wireless ...,THÔNG TIN CHI TIẾT\nDung lượng pin 2000mah\nTh...,4.5,132000,750000,OEM,Thiết Bị Số - Phụ Kiện Số/Thiết Bị Âm Thanh và...,https://tai-nghe-bluetooth-khong-day-f9-true-w...,https://salt.tikicdn.com/cache/280x280/ts/prod...


In [5]:
products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4373 entries, 0 to 4372
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   item_id      4373 non-null   int64  
 1   name         4373 non-null   object 
 2   description  4370 non-null   object 
 3   rating       4373 non-null   float64
 4   price        4373 non-null   int64  
 5   list_price   4373 non-null   int64  
 6   brand        4373 non-null   object 
 7   group        4373 non-null   object 
 8   url          4373 non-null   object 
 9   image        4373 non-null   object 
dtypes: float64(1), int64(3), object(6)
memory usage: 341.8+ KB


In [6]:
products.shape

(4373, 10)

### Data Transformation

In [7]:
products = products[products['name'].notnull()]

In [8]:
products['name_description'] = products.name + products.description

In [9]:
products = products[products['name_description'].notnull()]

In [10]:
products['name_desciption_pre'] = products['name_description'].apply(lambda x: word_tokenize(x,format='text'))

In [11]:
type(products)

pandas.core.frame.DataFrame

In [12]:
products.shape

(4370, 12)

### Solution 2: Gensim

In [13]:
intro_products = [[text for text in x.split()] for x in products.name_desciption_pre]

In [14]:
len(intro_products)

4370

In [15]:
intro_products[:1]

[['Tai_nghe',
  'Bluetooth_Inpods',
  '12',
  '-',
  'Cảm_biến',
  'vân',
  'tay',
  ',',
  'chống',
  'nước',
  ',',
  'màu_sắc',
  'đa_dạng',
  '-',
  '5',
  'màu_sắc',
  'lựa',
  'chọnTHÔNG',
  'TIN',
  'CHI_TIẾT',
  'Dung_lượng',
  'pin',
  '300',
  'Thời_gian',
  'pin',
  '-',
  'Thời_gian',
  'nghe',
  'nhạc',
  'liên_tục',
  'từ',
  '2.5',
  '-',
  '4',
  'h',
  '-',
  'Thời_gian',
  'sạc',
  'đầy',
  'chỉ',
  'khoảng',
  '60',
  'p',
  '-',
  'Thời_gian',
  'chờ',
  'lên',
  'tới',
  '140',
  'giờ',
  'Bluetooth',
  '5',
  'Thương_hiệu',
  'OEM',
  'Xuất_xứ',
  'thương_hiệu',
  'Trung_Quốc',
  'Độ',
  'nhạy_cảm_biến',
  'vân',
  'tay',
  'Model',
  'i12',
  'Loại',
  'Jack',
  'cắm',
  'USB_Cable',
  'Trọng_lượng',
  '300',
  'g',
  'Thời_gian',
  'sử_dụng',
  '-',
  'Thời_gian',
  'nghe',
  'nhạc',
  'liên_tục',
  'từ',
  '2.5',
  '-',
  '4',
  'h',
  'SKU',
  '4096608751631',
  'MÔ_TẢ',
  'SẢN_PHẨM',
  'INPOD_12',
  'là',
  'phiên_bản',
  'nâng_cấp',
  'mới',
  'nhất',
  ',',

In [16]:
# remove ' ' element in texts
intro_products_re = [[t.lower() for t in text if not t in ['', ' ', ',', '.', '...', '-',':', ';', '?', '%', '(', ')', '+', '\d+','...']] for text in  intro_products]

In [17]:
intro_products_re[:1]

[['tai_nghe',
  'bluetooth_inpods',
  '12',
  'cảm_biến',
  'vân',
  'tay',
  'chống',
  'nước',
  'màu_sắc',
  'đa_dạng',
  '5',
  'màu_sắc',
  'lựa',
  'chọnthông',
  'tin',
  'chi_tiết',
  'dung_lượng',
  'pin',
  '300',
  'thời_gian',
  'pin',
  'thời_gian',
  'nghe',
  'nhạc',
  'liên_tục',
  'từ',
  '2.5',
  '4',
  'h',
  'thời_gian',
  'sạc',
  'đầy',
  'chỉ',
  'khoảng',
  '60',
  'p',
  'thời_gian',
  'chờ',
  'lên',
  'tới',
  '140',
  'giờ',
  'bluetooth',
  '5',
  'thương_hiệu',
  'oem',
  'xuất_xứ',
  'thương_hiệu',
  'trung_quốc',
  'độ',
  'nhạy_cảm_biến',
  'vân',
  'tay',
  'model',
  'i12',
  'loại',
  'jack',
  'cắm',
  'usb_cable',
  'trọng_lượng',
  '300',
  'g',
  'thời_gian',
  'sử_dụng',
  'thời_gian',
  'nghe',
  'nhạc',
  'liên_tục',
  'từ',
  '2.5',
  '4',
  'h',
  'sku',
  '4096608751631',
  'mô_tả',
  'sản_phẩm',
  'inpod_12',
  'là',
  'phiên_bản',
  'nâng_cấp',
  'mới',
  'nhất',
  'tai_nghe',
  'bluetooth_5.0',
  'có',
  'thiết_kế',
  'tỉ_lệ',
  'chuẩn',

In [18]:
# Obtain the number of features based on dictionary: Use corpora.Dictionary
dictionary = corpora.Dictionary(intro_products_re)

In [19]:
# List of features in dictionary
dictionary.token2id

{'._v': 0,
 '1': 1,
 '12': 2,
 '140': 3,
 '2': 4,
 '2.5': 5,
 '300': 6,
 '4': 7,
 '4096608751631': 8,
 '5': 9,
 '60': 10,
 'airpod': 11,
 'apple': 12,
 'bao_gồm': 13,
 'bluetooth': 14,
 'bluetooth_5.0': 15,
 'bluetooth_inpods': 16,
 'bấm': 17,
 'bằng': 18,
 'chi_phí': 19,
 'chi_tiết': 20,
 'cho': 21,
 'chuyển': 22,
 'chuẩn': 23,
 'chính': 24,
 'chạm': 25,
 'chất': 26,
 'chỉ': 27,
 'chọnthông': 28,
 'chống': 29,
 'chờ': 30,
 'cuộc': 31,
 'các': 32,
 'có': 33,
 'có_thể': 34,
 'cải_thiện': 35,
 'cảm_biến': 36,
 'cảm_ứng': 37,
 'cắm': 38,
 'cồng_kềnh': 39,
 'dock': 40,
 'dung_lượng': 41,
 'dễ_dàng': 42,
 'g': 43,
 'giao': 44,
 'giá': 45,
 'giờ': 46,
 'gọi': 47,
 'h': 48,
 'hiện_hành': 49,
 'hoặc': 50,
 'huawei': 51,
 'hàng': 52,
 'hãng': 53,
 'hơn': 54,
 'i12': 55,
 'inpod_12': 56,
 'jack': 57,
 'khi': 58,
 'khoảng': 59,
 'khác': 60,
 'kết_nối': 61,
 'lenovo': 62,
 'liên_tục': 63,
 'loại': 64,
 'luật': 65,
 'là': 66,
 'lên': 67,
 'lược_bỏ': 68,
 'lấy': 69,
 'lần': 70,
 'lựa': 71,
 'model':

In [20]:
# Number of features (words) in dictionaries
feature_count = len(dictionary.token2id)

In [21]:
feature_count

57970

In [22]:
# Obtain corpus based on dictionary (dense matrix)
corpus = [dictionary.doc2bow(text) for text in intro_products_re]

In [23]:
corpus[0]

[(0, 1),
 (1, 3),
 (2, 1),
 (3, 2),
 (4, 1),
 (5, 3),
 (6, 2),
 (7, 4),
 (8, 1),
 (9, 2),
 (10, 2),
 (11, 1),
 (12, 1),
 (13, 1),
 (14, 2),
 (15, 3),
 (16, 1),
 (17, 1),
 (18, 1),
 (19, 1),
 (20, 1),
 (21, 4),
 (22, 1),
 (23, 1),
 (24, 1),
 (25, 1),
 (26, 1),
 (27, 3),
 (28, 1),
 (29, 1),
 (30, 2),
 (31, 1),
 (32, 2),
 (33, 1),
 (34, 1),
 (35, 1),
 (36, 1),
 (37, 1),
 (38, 1),
 (39, 1),
 (40, 2),
 (41, 1),
 (42, 2),
 (43, 1),
 (44, 1),
 (45, 1),
 (46, 2),
 (47, 1),
 (48, 3),
 (49, 1),
 (50, 1),
 (51, 1),
 (52, 2),
 (53, 1),
 (54, 1),
 (55, 1),
 (56, 1),
 (57, 1),
 (58, 1),
 (59, 2),
 (60, 1),
 (61, 3),
 (62, 1),
 (63, 3),
 (64, 2),
 (65, 1),
 (66, 2),
 (67, 2),
 (68, 1),
 (69, 1),
 (70, 1),
 (71, 1),
 (72, 1),
 (73, 1),
 (74, 2),
 (75, 1),
 (76, 1),
 (77, 3),
 (78, 4),
 (79, 1),
 (80, 2),
 (81, 4),
 (82, 1),
 (83, 3),
 (84, 1),
 (85, 1),
 (86, 2),
 (87, 1),
 (88, 1),
 (89, 1),
 (90, 2),
 (91, 1),
 (92, 1),
 (93, 1),
 (94, 1),
 (95, 1),
 (96, 4),
 (97, 1),
 (98, 1),
 (99, 1),
 (100, 5),

In [24]:
# Use TF-IDF Model to process corpus, obtaining index
tfidf = models.TfidfModel(corpus)
# tính toán sự tương tự trong ma trận thưa thớt
index = similarities.SparseMatrixSimilarity(tfidf[corpus],num_features=feature_count)

In [25]:
# When user choose one product: 1059892, 52333193
product_ID = 10001355
product = products[products.item_id == product_ID].head(1)

In [26]:
type(product['name_desciption_pre'])

pandas.core.series.Series

In [27]:
product[['item_id','name_desciption_pre']]

Unnamed: 0,item_id,name_desciption_pre
2660,10001355,RAM Laptop_Hynix_4GB_DDR4_2400MH z SODIMM - Hà...


In [28]:
# View product
name_description_pre = product['name_desciption_pre'].to_string(index=False)

In [29]:
name_description_pre

'RAM Laptop_Hynix_4GB_DDR4_2400MH z SODIMM - Hàn...'

In [30]:
# Suggest other products for customers
def recommendation (view_product, dictionary, tfidf, index):
    # Convert search words into Sparse Vectors
    view_product = view_product.lower().split()
    kw_vector = dictionary.doc2bow(view_product)
    print("View product 's vector:")
    print(kw_vector)
    # Similarity calculation
    sim = index[tfidf[kw_vector]]
    
    # print result
    list_id = []
    list_score = []
    for i in range(len(sim)):
        list_id.append(i)
        list_score.append(sim[i])
    
    df_result = pd.DataFrame({'id': list_id,
                              'score': list_score})
    
    # five highest scores
    five_highest_score = df_result.sort_values(by='score', ascending=False).head(6)
    print("Five highest scores:")
    print(five_highest_score)
    print("Ids to list:")
    idToList = list(five_highest_score['id'])
    print(idToList)
    
    products_find = products[products.index.isin(idToList)]
    results = products_find[['item_id','name']]
    results = pd.concat([results, five_highest_score], axis=1).sort_values(by='score', ascending=False)
    return results

In [31]:
results = recommendation(name_description_pre, dictionary, tfidf, index)

View product 's vector:
[(483, 1), (495, 1), (28985, 1), (38434, 1)]
Five highest scores:
        id     score
2658  2658  0.596752
2314  2314  0.263400
2505  2505  0.256717
2473  2473  0.251999
2413  2413  0.250081
2662  2662  0.239666
Ids to list:
[2658, 2314, 2505, 2473, 2413, 2662]


In [32]:
# Recommender 5 similarities products for the selected product
# Check and remove the selected product from the results
results = results[results.item_id!=product_ID]
results

Unnamed: 0,item_id,name,id,score
2658,57316596,Màn hình AOC 27G2 (27 inch/FHD/IPS/144Hz/1ms/G...,2658,0.596752
2314,58677681,N200RE_V5 - Mini Router Wi-Fi chuẩn N 300Mbps,2314,0.2634
2505,576880,Màn Hình Gaming Cong Samsung LC27F390FHEXXV 27...,2505,0.256717
2473,7975728,Chuột máy tính Newmen G10 - Hàng Chính Hãng,2473,0.251999
2413,14606621,Keo Tản Nhiệt ARCTIC MX4 - 4g (2020) - Hàng C...,2413,0.250081
2662,68308394,Card WIFI PC WIFI-6 AX200 GIG+ tích hợp Blueto...,2662,0.239666


### Save results

In [33]:
dictionary.save("Gensim_Model.sav")

In [34]:
dictionary.load("Gensim_Model.sav")

<gensim.corpora.dictionary.Dictionary at 0x1c04eb7ffa0>