In [3]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
# --- 1. Load data ---
file_path = "/data/crawl_tiki_data/books_data_2025-10-01_14-30-45.csv"
df = pd.read_csv(file_path)

# Build one text per book for embedding: name, authors, category and short
# description joined by single spaces, with missing values replaced by ''.
text_fields = df[['name', 'authors', 'category', 'short_description']].fillna('')
df['content'] = (
    text_fields['name']
    + ' ' + text_fields['authors']
    + ' ' + text_fields['category']
    + ' ' + text_fields['short_description']
)

In [5]:
# --- 2. Load model ---
# Checkpoint identifier handed to SentenceTransformer; the resulting `model`
# is the encoder used by the encode() calls in the cells below.
MODEL_NAME = 'keepitreal/vietnamese-sbert'
model = SentenceTransformer(MODEL_NAME)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [6]:
# --- 3. Encode embeddings ---
print("üîÑ ƒêang sinh embedding...")

# Every column is handed to encode() as a plain Python list so that row i of
# each embedding matrix corresponds to the i-th row of df by position.
# (Previously `content` was passed as a pandas Series, which is indexed by
# label rather than position and was inconsistent with the other three calls.)
embeddings = model.encode(df['content'].tolist(), convert_to_tensor=False, show_progress_bar=True)
title_embeddings = model.encode(df['name'].fillna('').tolist(), convert_to_tensor=False, show_progress_bar=True)
authors_embeddings = model.encode(df['authors'].fillna('').tolist(), convert_to_tensor=False, show_progress_bar=True)
category_embeddings = model.encode(df['category'].fillna('').tolist(), convert_to_tensor=False, show_progress_bar=True)

# Pairwise similarity between all `content` embeddings (book-to-book).
cosine_sim = cosine_similarity(embeddings)

üîÑ ƒêang sinh embedding...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [02:20<00:00,  2.24s/it]
Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:28<00:00,  2.18it/s]
Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:12<00:00,  4.94it/s]
Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:10<00:00,  6.14it/s]


In [7]:
# --- 4. Detect the query type ---
def detect_query_type(query):
    """Label a query by the embedding set holding its single closest vector.

    The query is encoded with the notebook-level ``model`` and compared
    (cosine similarity) against the title, author, category and combined
    content embedding matrices. For each matrix only the best score is kept.

    Args:
        query: Text to classify.

    Returns:
        A ``(label, score)`` tuple where ``label`` is one of ``"title"``,
        ``"author"``, ``"category"`` or ``"content"`` and ``score`` is the
        best cosine similarity found for that label. On ties the label listed
        earliest in that order wins.
    """
    query_vec = model.encode(query, convert_to_tensor=False)

    # Order matters: it fixes both evaluation order and tie-breaking.
    candidates = (
        ("title", title_embeddings),
        ("author", authors_embeddings),
        ("category", category_embeddings),
        ("content", embeddings),
    )
    best_per_field = {
        label: cosine_similarity([query_vec], matrix)[0].max()
        for label, matrix in candidates
    }

    winner = max(best_per_field, key=best_per_field.get)
    return winner, best_per_field[winner]


In [8]:
# --- 5. Combine the embedding types ---
def find_best_match_across_embeddings(query_text):
    """Find the row with the highest equally-weighted blended similarity.

    The query is encoded with the notebook-level ``model``; its cosine
    similarity to every row is computed separately against the title,
    content, author and category embedding matrices, and the four score
    vectors are averaged with a weight of 0.25 each.

    Args:
        query_text: Text to search for.

    Returns:
        ``(top_idx, top_score)``: the position of the best-scoring row and
        its blended score.
    """
    query_vec = model.encode(query_text, convert_to_tensor=False)

    # (weight, matrix) pairs, accumulated in this order.
    weighted_sources = (
        (0.25, title_embeddings),
        (0.25, embeddings),
        (0.25, authors_embeddings),
        (0.25, category_embeddings),
    )

    blended = None
    for weight, matrix in weighted_sources:
        contribution = weight * cosine_similarity([query_vec], matrix)[0]
        blended = contribution if blended is None else blended + contribution

    best_pos = np.argmax(blended)
    return best_pos, blended[best_pos]

In [9]:
# --- 6. Recommend books ---
def _contains_literal(column, text):
    """Return a boolean mask of rows whose text contains `text`, ignoring case.

    The text is matched literally (``regex=False``), so characters such as
    ``+``, ``(`` or ``.`` are not interpreted as regular-expression syntax.
    Missing values never match (``na=False``).
    """
    return column.str.contains(text, case=False, na=False, regex=False)


def _closest_field_value(query_emb, field_embeddings, column):
    """Return ``df[column]`` for the row whose field embedding best matches the query.

    Returns None when that cell is missing, blank, or not a string, so the
    caller can fall back to another strategy instead of filtering on it.
    """
    scores = cosine_similarity([query_emb], field_embeddings)[0]
    # argmax yields a row position, so use iloc rather than a label lookup.
    value = df[column].iloc[int(np.argmax(scores))]
    if not isinstance(value, str) or not value.strip():
        return None
    return value


def recommend_books(query_text, top_n=5, threshold=0.5):
    """Recommend books for a free-text query (title, author, category or content).

    Strategy:
      1. Classify the query with ``detect_query_type``.
      2. ``author`` / ``category``: find the closest author/category value by
         embedding similarity and return the books whose field contains it.
      3. Otherwise (or when step 2 lands on a missing value): return books
         whose name contains the query literally; failing that, rank all
         books by cosine similarity of the query to the content embeddings.
      4. If the best content similarity is below ``threshold``, or the query
         is blank, return a random sample instead.

    Args:
        query_text: User query; surrounding whitespace is stripped.
        top_n: Maximum number of rows to return.
        threshold: Minimum best content similarity to accept a ranked result.

    Returns:
        A DataFrame copy with columns ``name``, ``authors``, ``category`` and
        ``link`` holding at most ``top_n`` rows.
    """
    output_columns = ['name', 'authors', 'category', 'link']
    query_text = query_text.strip()

    # A blank query would match every title as a substring; treat it as
    # "no good match" and fall back to random suggestions.
    if not query_text:
        print(" Kh√¥ng c√≥ match t·ªët ‚Äî g·ª£i √Ω ng·∫´u nhi√™n.")
        return df.sample(min(top_n, len(df)))[output_columns].head(top_n).copy()

    # Determine the query type (author / category / title / content).
    query_type, confidence = detect_query_type(query_text)
    print(f" Nh·∫≠n di·ªán query thu·ªôc lo·∫°i: {query_type} (ƒë·ªô tin c·∫≠y: {confidence:.2f})")

    # Encode once and reuse the vector in every branch.
    query_emb = model.encode(query_text, convert_to_tensor=False)

    # Case: the query is an author.
    if query_type == "author":
        author_name = _closest_field_value(query_emb, authors_embeddings, 'authors')
        if author_name is not None:
            print(f" G·ª£i √Ω c√°c s√°ch c·ªßa t√°c gi·∫£: {author_name}")
            result = df[_contains_literal(df['authors'], author_name)]
            return result[output_columns].head(top_n).copy()

    # Case: the query is a category.
    elif query_type == "category":
        category_name = _closest_field_value(query_emb, category_embeddings, 'category')
        if category_name is not None:
            print(f" G·ª£i √Ω c√°c s√°ch thu·ªôc th·ªÉ lo·∫°i: {category_name}")
            result = df[_contains_literal(df['category'], category_name)]
            return result[output_columns].head(top_n).copy()

    # Remaining cases: title or content (also the fallback when the closest
    # author/category cell was missing).
    # A direct substring match on the book name takes priority.
    matches = df[_contains_literal(df['name'], query_text)]
    if not matches.empty:
        print(f" Kh·ªõp ch√≠nh x√°c theo t√™n s√°ch: {matches.iloc[0]['name']}")
        return matches[output_columns].head(top_n).copy()

    print(" Truy v·∫•n ƒë∆∞·ª£c xem l√† li√™n quan ƒë·∫øn ti√™u ƒë·ªÅ ho·∫∑c n·ªôi dung.")

    # Similarity against the combined content text of every book.
    sim_content = cosine_similarity([query_emb], embeddings)[0]
    top_pos = int(np.argmax(sim_content))
    top_score = sim_content[top_pos]

    if top_score < threshold:
        print(" Kh√¥ng c√≥ match t·ªët ‚Äî g·ª£i √Ω ng·∫´u nhi√™n.")
        # Cap the sample size so a small catalogue does not raise.
        result = df.sample(min(top_n, len(df)))
    else:
        print(f" T√¨m th·∫•y s√°ch g·∫ßn nh·∫•t: {df['name'].iloc[top_pos]} (similarity: {top_score:.2f})")

        # Take the top N most similar books, best match included. The query
        # is free text rather than a catalogue row, so there is no "self"
        # entry to skip at rank 0. A stable sort keeps ties in row order.
        ranked = np.argsort(-sim_content, kind="stable")[:top_n]
        result = df.iloc[ranked]

    # Normalise the result to the public columns.
    return result[output_columns].head(top_n).copy()


In [11]:
# --- 7. Test ---
# Interactive smoke test: read one query from the user, fetch up to five
# recommendations and print them as a plain-text table without the index.
if __name__ == "__main__":
    query = input("Nh·∫≠p n·ªôi dung/t√™n/t√°c gi·∫£/th·ªÉ lo·∫°i: ")
    recs = recommend_books(query, top_n=5)
    print("\n Top s√°ch g·ª£i √Ω:\n", recs.to_string(index=False), sep="\n")

 Nh·∫≠n di·ªán query thu·ªôc lo·∫°i: title (ƒë·ªô tin c·∫≠y: 0.51)
 Truy v·∫•n ƒë∆∞·ª£c xem l√† li√™n quan ƒë·∫øn ti√™u ƒë·ªÅ ho·∫∑c n·ªôi dung.
 Kh√¥ng c√≥ match t·ªët ‚Äî g·ª£i √Ω ng·∫´u nhi√™n.

 Top s√°ch g·ª£i √Ω:

                                          name                authors                      category                                                   link
                           Th√©p ƒê√£ T√¥i Th·∫ø ƒê·∫•y   Nikolai A. Ostrovsky T√°c Ph·∫©m Kinh ƒêi·ªÉn N∆∞·ªõc Ngo√†i https://tiki.vn/product-p195545297.html?spid=195545299
                              Gi√≥ L·∫°nh ƒê·∫ßu M√πa              Th·∫°ch Lam            B√∫t K√Ω - T·ª± Truy·ªán https://tiki.vn/product-p195545261.html?spid=195545262
                 Brian Tracy - Thu·∫≠t H√πng Bi·ªán            Brian Tracy         S√°ch k·ªπ nƒÉng l√†m vi·ªác https://tiki.vn/product-p208198122.html?spid=208198124
                            Tr√≤ Ch∆°i Gi·∫´m B√≥ng           Okamoto Kido                Truy·ªán kinh d·ªã https