# Topic 2: Recommendation System
### Demo: Underthesea, Gensim, Cosine Similarity

In [26]:
!pip install gensim
!pip install underthesea



In [3]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from underthesea import word_tokenize, pos_tag, sent_tokenize
import warnings
import re

In [12]:
from google.colab import drive
drive.mount("/content/gdrive", force_remount=True)

%cd '/content/gdrive/MyDrive/LDS0/Topic_12_Hasaki/demo'

ModuleNotFoundError: No module named 'google'

In [None]:
# Path of the Vietnamese stop-word list (relative to the current working directory)
STOP_WORD_FILE = 'vietnamese-stopwords.txt'

In [7]:
# Load the stop-word list: one entry per line of the file.
# Each entry is stripped and blank lines are skipped, so a trailing newline or
# Windows "\r\n" line endings do not add empty or "\r"-suffixed stop words.
with open(STOP_WORD_FILE, 'r', encoding='utf-8') as file:
    stop_words = [line.strip() for line in file if line.strip()]

# NOTE(review): the text is later segmented with word_tokenize(..., format="text"),
# whose preview shows multi-syllable words joined by "_". Stop-word entries written
# with spaces would presumably not match such tokens - confirm the file's format.

# Underthesea
#### word_tokenize, pos_tag, sent_tokenize
- link: https://github.com/undertheseanlp/underthesea

In [8]:
# Sample data for the demo: the product table is read from a CSV file
# NOTE(review): the preview shows 'Unnamed: 0.1' and 'Unnamed: 0' columns, which look
# like index columns written by earlier to_csv calls - confirm whether they are needed.
df= pd.read_csv('Product.csv')

In [9]:
# Preview the first rows of the loaded product table
df.head()

Unnamed: 0.1,Unnamed: 0,ma_san_pham,ten_san_pham,gia_ban,gia_goc,phan_loai,mo_ta,diem_trung_binh
0,0,318900012,Nước Hoa Hồng Klairs Không Mùi Cho Da Nhạy Cảm...,209000,435000.0,2x180ml 180ml Không Mùi Có Mùi Hương,Nước Hoa Hồng Klairs Supple Preparation là dòn...,4.8
1,1,205100137,"Nước Tẩy Trang L'Oreal Tươi Mát Cho Da Dầu, Hỗ...",147000,229000.0,2x400ml 95ml 400ml Làm Sạch Sâu Tươi Mát Da Ki...,Nước Tẩy Trang L'Oréal là dòng sản phẩm tẩy tr...,4.7
2,2,422208973,Sữa Rửa Mặt CeraVe Sạch Sâu Cho Da Thường Đến ...,343000,455000.0,88ml 236ml 473ml Da khô/Hỗn hợp khô Da dầu/Hỗn...,Sữa Rửa Mặt Cerave Sạch Sâu là sản phẩm sữa rử...,4.9
3,3,204900013,Kem Chống Nắng La Roche-Posay Kiểm Soát Dầu SP...,377000,560000.0,2x50ml 50ml,Kem chống nắng giúp bảo vệ da khỏi tia UVB & U...,4.6
4,4,253900006,Kem Chống Nắng Skin1004 Cho Da Nhạy Cảm SPF 50...,210000,445000.0,20ml 50ml,Kem Chống Nắng Skin1004 Cho Da Nhạy Cảm là sản...,4.6


In [10]:
# Build the text used for content-based recommendations.
# Any number of columns could be combined here; this demo uses only the product
# description (`mo_ta`) and keeps just its first MAX_CONTENT_WORDS
# whitespace-separated words so that the results are easy to inspect.
MAX_CONTENT_WORDS = 200
df['Content'] = (
    df['mo_ta']
    .fillna('')   # a missing description becomes empty text instead of failing on NaN.split()
    .astype(str)
    .apply(lambda text: ' '.join(text.split()[:MAX_CONTENT_WORDS]))
)

In [11]:
# Preview again to check the newly added Content column
df.head()

Unnamed: 0.1,Unnamed: 0,ma_san_pham,ten_san_pham,gia_ban,gia_goc,phan_loai,mo_ta,diem_trung_binh,Content
0,0,318900012,Nước Hoa Hồng Klairs Không Mùi Cho Da Nhạy Cảm...,209000,435000.0,2x180ml 180ml Không Mùi Có Mùi Hương,Nước Hoa Hồng Klairs Supple Preparation là dòn...,4.8,Nước Hoa Hồng Klairs Supple Preparation là dòn...
1,1,205100137,"Nước Tẩy Trang L'Oreal Tươi Mát Cho Da Dầu, Hỗ...",147000,229000.0,2x400ml 95ml 400ml Làm Sạch Sâu Tươi Mát Da Ki...,Nước Tẩy Trang L'Oréal là dòng sản phẩm tẩy tr...,4.7,Nước Tẩy Trang L'Oréal là dòng sản phẩm tẩy tr...
2,2,422208973,Sữa Rửa Mặt CeraVe Sạch Sâu Cho Da Thường Đến ...,343000,455000.0,88ml 236ml 473ml Da khô/Hỗn hợp khô Da dầu/Hỗn...,Sữa Rửa Mặt Cerave Sạch Sâu là sản phẩm sữa rử...,4.9,Sữa Rửa Mặt Cerave Sạch Sâu là sản phẩm sữa rử...
3,3,204900013,Kem Chống Nắng La Roche-Posay Kiểm Soát Dầu SP...,377000,560000.0,2x50ml 50ml,Kem chống nắng giúp bảo vệ da khỏi tia UVB & U...,4.6,Kem chống nắng giúp bảo vệ da khỏi tia UVB & U...
4,4,253900006,Kem Chống Nắng Skin1004 Cho Da Nhạy Cảm SPF 50...,210000,445000.0,20ml 50ml,Kem Chống Nắng Skin1004 Cho Da Nhạy Cảm là sản...,4.6,Kem Chống Nắng Skin1004 Cho Da Nhạy Cảm là sản...


In [12]:
# Segment every Content text into words with underthesea's word_tokenize;
# format="text" yields one string per row (see the preview that follows,
# where multi-syllable words are joined by "_").
df["Content_wt"] = df["Content"].map(lambda text: word_tokenize(text, format="text"))

In [13]:
# Compare the raw text with its word-segmented version for the first two rows
df[["Content", "Content_wt"]].head(2)

Unnamed: 0,Content,Content_wt
0,Nước Hoa Hồng Klairs Supple Preparation là dòn...,Nước Hoa_Hồng Klairs_Supple Preparation là dòn...
1,Nước Tẩy Trang L'Oréal là dòng sản phẩm tẩy tr...,Nước Tẩy_Trang L'Oréal là dòng sản_phẩm tẩy_tr...


In [14]:
import re

In [15]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Turn the word-segmented text into TF-IDF vectors, ignoring the stop words
vectorizer = TfidfVectorizer(stop_words=stop_words, analyzer='word')
tfidf_matrix = vectorizer.fit_transform(df['Content_wt'])

# Pairwise cosine similarity between all products
# (called with a single matrix, it is compared against itself)
cosine_sim = cosine_similarity(tfidf_matrix)

In [16]:
# Wrap the similarity matrix in a DataFrame for display;
# both the row and the column labels are row positions of `df`
df_show = pd.DataFrame(cosine_sim)
df_show

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1176,1177,1178,1179,1180,1181,1182,1183,1184,1185
0,1.000000,0.046318,0.044475,0.035910,0.069248,0.058948,0.018264,0.058948,0.014693,0.046318,...,0.029752,0.052243,0.024346,0.046318,0.034860,0.012785,0.018341,0.010476,0.057437,0.044334
1,0.046318,1.000000,0.054836,0.205022,0.323700,0.248911,0.344537,0.248911,0.037613,1.000000,...,0.054686,0.136066,0.047740,1.000000,0.114735,0.168948,0.226573,0.041848,0.199116,0.341472
2,0.044475,0.054836,1.000000,0.062368,0.036074,0.074268,0.031508,0.074268,0.034999,0.054836,...,0.037336,0.040826,0.028214,0.054836,0.031245,0.028053,0.054286,0.024248,0.041111,0.046516
3,0.035910,0.205022,0.062368,1.000000,0.379898,0.111864,0.285196,0.111864,0.213205,0.205022,...,0.141100,0.350201,0.024438,0.205022,0.295167,0.203178,0.090201,0.034405,0.167217,0.200933
4,0.069248,0.323700,0.036074,0.379898,1.000000,0.152833,0.396838,0.152833,0.188476,0.323700,...,0.137511,0.321143,0.026912,0.323700,0.333040,0.247542,0.116906,0.031364,0.218714,0.284811
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1181,0.012785,0.168948,0.028053,0.203178,0.247542,0.094574,0.260814,0.094574,0.250482,0.168948,...,0.094337,0.181597,0.008527,0.168948,0.205913,1.000000,0.061987,0.034977,0.105446,0.154940
1182,0.018341,0.226573,0.054286,0.090201,0.116906,0.147235,0.125313,0.147235,0.018774,0.226573,...,0.044534,0.053770,0.078876,0.226573,0.055424,0.061987,1.000000,0.036678,0.073900,0.127971
1183,0.010476,0.041848,0.024248,0.034405,0.031364,0.070822,0.034428,0.070822,0.017537,0.041848,...,0.008233,0.032805,0.007548,0.041848,0.033292,0.034977,0.036678,1.000000,0.018461,0.042098
1184,0.057437,0.199116,0.041111,0.167217,0.218714,0.075442,0.188614,0.075442,0.077795,0.199116,...,0.082927,0.106935,0.029593,0.199116,0.104109,0.105446,0.073900,0.018461,1.000000,0.174899


In [18]:
# Product recommendation helper:
# for a given product, return the `nums` most similar products.
def get_recommendations_(ma_san_pham, cosine_sim=cosine_sim, nums=3):
    """Return the `nums` products most similar to the product `ma_san_pham`.

    Reads the notebook-level DataFrame `df`.

    Parameters
    ----------
    ma_san_pham : value compared against the `ma_san_pham` column of `df`.
    cosine_sim : square similarity matrix; row/column i refers to the i-th row of `df`.
    nums : number of products to return (values below 0 are treated as 0).

    Returns
    -------
    pandas.DataFrame
        Rows of `df`, ordered from most to least similar. The queried row itself
        is never included.

    Raises
    ------
    IndexError
        If no row of `df` has the given product code.
    """
    # Work with row *positions*: both `cosine_sim` and `df.iloc` are positional,
    # so using index labels would break on a DataFrame without a 0..n-1 index.
    matches = (df['ma_san_pham'] == ma_san_pham).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"No product found with ID: {ma_san_pham}")
    idx = int(matches[0])

    # Drop the queried row explicitly. Skipping "the first sorted entry" is not
    # reliable: sorting is stable, so another row with the same score that sits
    # earlier in the table would be skipped instead and the product would end up
    # recommending itself.
    ranked = sorted(
        ((pos, score) for pos, score in enumerate(cosine_sim[idx]) if pos != idx),
        key=lambda pair: pair[1],
        reverse=True,
    )
    top_positions = [pos for pos, _ in ranked[:max(nums, 0)]]
    return df.iloc[top_positions]

In [19]:
# Call the function for one product code and display the result
recommendations = get_recommendations_(318900012)
recommendations

0


Unnamed: 0.1,Unnamed: 0,ma_san_pham,ten_san_pham,gia_ban,gia_goc,phan_loai,mo_ta,diem_trung_binh,Content,Content_wt
97,97,318900011,Nước Hoa Hồng Klairs Dành Cho Da Nhạy Cảm 180ml,217000,409000.0,2x180ml 180ml Không Mùi Có Mùi Hương,Nước Hoa Hồng Klairs Supple Preparation là dòn...,4.8,Nước Hoa Hồng Klairs Supple Preparation là dòn...,Nước Hoa_Hồng Klairs_Supple Preparation là dòn...
596,596,422203798,Nước Hoa Hồng Klairs Không Mùi Cho Da Nhạy Cảm...,60000,135000.0,,Nước Hoa Hồng Klairs Supple Preparation Unscen...,0.0,Nước Hoa Hồng Klairs Supple Preparation Unscen...,Nước Hoa_Hồng Klairs_Supple Preparation_Unscen...
67,67,211300002,Nước Hoa Hồng Simple Làm Dịu Da & Cấp Ẩm 200ml,109000,180000.0,,Nước Hoa Hồng Simple Kind to Skin Soothing Fac...,4.8,Nước Hoa Hồng Simple Kind to Skin Soothing Fac...,Nước Hoa_Hồng Simple Kind to Skin_Soothing Fac...


#### Lưu ma trận kết quả cosine và đọc lên khi cần đề xuất

In [21]:
# Save cosine_sim to a pickle file in the current working directory
import pickle
with open('products_cosine_sim.pkl', 'wb') as f:
    pickle.dump(cosine_sim, f)

# Read the file back into cosine_sim_new.
# pickle.load can execute arbitrary code, so only load files you created yourself.
with open('products_cosine_sim.pkl', 'rb') as f:
    cosine_sim_new = pickle.load(f)
# (uncomment to display the reloaded matrix) cosine_sim_new

In [23]:
def get_recommendations(df, ma_san_pham, cosine_sim=cosine_sim, nums=5):
    """Return the `nums` products most similar to the product `ma_san_pham`.

    Parameters
    ----------
    df : pandas.DataFrame with a `ma_san_pham` column; its i-th row corresponds
        to row/column i of `cosine_sim`.
    ma_san_pham : value compared against the `ma_san_pham` column.
    cosine_sim : square similarity matrix aligned with the rows of `df`.
    nums : number of products to return (values below 0 are treated as 0).

    Returns
    -------
    pandas.DataFrame
        Rows of `df`, ordered from most to least similar, never including the
        queried row itself. An empty DataFrame (after printing a message) when
        the product code is not found.
    """
    # Row *positions* of the matching products: both `cosine_sim` and `df.iloc`
    # are positional, so index labels must not be used here.
    matching_positions = (df['ma_san_pham'] == ma_san_pham).to_numpy().nonzero()[0]
    if len(matching_positions) == 0:
        print(f"No product found with ID: {ma_san_pham}")
        return pd.DataFrame()  # Return an empty DataFrame if no match
    idx = int(matching_positions[0])

    # Pair every other row position with its similarity to the queried product.
    # The queried row is removed explicitly: skipping "the first sorted entry"
    # fails when an earlier row has the same score, because the stable sort then
    # puts that row first and the product would be recommended to itself.
    sim_scores = [
        (pos, score) for pos, score in enumerate(cosine_sim[idx]) if pos != idx
    ]

    # Most similar products first
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Positions of the `nums` most similar products
    product_indices = [pos for pos, _ in sim_scores[:max(nums, 0)]]

    return df.iloc[product_indices]

In [24]:
# Example usage (make sure to use a valid ma_san_pham from your DataFrame).
# The similarity matrix passed here is the one reloaded from the pickle file.
recommendations = get_recommendations(df, 318900012, cosine_sim=cosine_sim_new, nums=3)
recommendations

Unnamed: 0.1,Unnamed: 0,ma_san_pham,ten_san_pham,gia_ban,gia_goc,phan_loai,mo_ta,diem_trung_binh,Content,Content_wt
97,97,318900011,Nước Hoa Hồng Klairs Dành Cho Da Nhạy Cảm 180ml,217000,409000.0,2x180ml 180ml Không Mùi Có Mùi Hương,Nước Hoa Hồng Klairs Supple Preparation là dòn...,4.8,Nước Hoa Hồng Klairs Supple Preparation là dòn...,Nước Hoa_Hồng Klairs_Supple Preparation là dòn...
596,596,422203798,Nước Hoa Hồng Klairs Không Mùi Cho Da Nhạy Cảm...,60000,135000.0,,Nước Hoa Hồng Klairs Supple Preparation Unscen...,0.0,Nước Hoa Hồng Klairs Supple Preparation Unscen...,Nước Hoa_Hồng Klairs_Supple Preparation_Unscen...
67,67,211300002,Nước Hoa Hồng Simple Làm Dịu Da & Cấp Ẩm 200ml,109000,180000.0,,Nước Hoa Hồng Simple Kind to Skin Soothing Fac...,4.8,Nước Hoa Hồng Simple Kind to Skin Soothing Fac...,Nước Hoa_Hồng Simple Kind to Skin_Soothing Fac...
