In [1]:
from qdrant_client import QdrantClient
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rapidfuzz import fuzz
from typing import List, Dict, Optional, Any, Tuple
from langdetect import detect_langs
import os
from keybert import KeyBERT


OPENAI_API_KEY=os.getenv("OPENAI_API_KEY")
QDRANT_URL = os.getenv("QDRANT_URL")
QDRANT_API_KEY =os.getenv("QDRANT_API_KEY")
QDRANT_COLLECTION_NAME = os.getenv("STORAGE")

qdrant_client = QdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY
)
class RecommendationConfig:
    MAX_RESULTS = 10
    BRAND_MATCH_BOOST = 15
    PRICE_RANGE_MATCH_BOOST = 15
    TYPE_MATCH_BOOST = 5
    HISTORY_MATCH_BOOST = 10
    FUZZY_WEIGHT = 0.5
    COSINE_WEIGHT = 0.5


def ai_model():
    return ChatOpenAI(
        openai_api_key=OPENAI_API_KEY,       
        model="gpt-4o-mini",     
        temperature=0,
        max_tokens=3000
    )

def convert_to_string(value) -> str:
    """Convert any value to a string in a standardized way."""
    if isinstance(value, list):
        return " ".join(str(item) for item in value)
    return str(value)

def fuzzy_score(user_query: str, metadata: Dict) -> float:
    """Calculate enhanced fuzzy matching score using multiple algorithms."""
    if not user_query:
        return 0.0
    
    user_query_lower = user_query.lower()
    priority_fields = ['device_name','brand','category','discount_percent', 'sales_perks',
                    'payment_perks','sales_price']
    fuzzy_sim = 0
    for field in priority_fields:
        if field in metadata:
            value_str = convert_to_string(metadata[field])
            fuzzy_sim += fuzz.partial_ratio(user_query_lower, value_str.lower())
    return fuzzy_sim

def check_similarity(text1: str, texts: Optional[List[str]], vectorizer=None) -> float:
    """Calculate max cosine similarity between text1 and a list of strings. Optionally reuse a vectorizer."""
    if not text1 or not texts:
        return 0.0
    corpus = [text1] + texts
    if vectorizer is None:
        vectorizer = TfidfVectorizer().fit(corpus)
    vectors = vectorizer.transform(corpus)
    similarities = cosine_similarity(vectors[0], vectors[1:])[0]
    return max(similarities) if similarities.size > 0 else 0.0

kw_model = KeyBERT(model='paraphrase-multilingual-MiniLM-L12-v2')

def keyword(user_query: str) -> str:
    keywords_with_scores = kw_model.extract_keywords(
    user_query,
    keyphrase_ngram_range=(1, 2),
    stop_words=None,
    top_n=5,
    use_mmr=True,
    diversity=0.5,
    seed_keywords=["phone", "laptop/pc", "earphone", "power bank", "mouse", "case", "keyboard","apple","xiaomi","realme","honor","samsung", "oppo", "dell", "macbook", "msi", "asus", "hp","lenovo","acer",'gigabyte',"logitech","marshall"]
)

    keywords_str = ", ".join([kw for kw, _ in keywords_with_scores])

    return keywords_str

_cached_all_points = None
def get_all_points(batch_size: int = 100):
    """Retrieve all points from Qdrant and cache the results for performance."""
    global _cached_all_points
    if _cached_all_points is None:
        _cached_all_points = qdrant_client.scroll(
            collection_name=QDRANT_COLLECTION_NAME,
            scroll_filter=None,
            with_vectors=False,
            with_payload=True,
            limit=batch_size
        )[0]
    return _cached_all_points

def recommend_system(
    user_input: Optional[str] = None,
    types: Optional[str] = None,
    recent_history: Optional[List[Dict]] = None,
    preference: Optional[Dict[str, Any]] = None,
    custom_config: Optional[RecommendationConfig] = None,
) -> Tuple[str, Optional[List[str]]]:
    """
    Recommend products based on user input, types, preferences, and history.
    User input is optional - system can recommend based on other parameters if not provided.
    """
    config = custom_config or RecommendationConfig()
    use_user_input = user_input is not None and user_input.strip() != ""
    
    main_query = ""
    main_query_lower = ""
    if use_user_input:
        main_query = keyword(user_input)
        main_query_lower = main_query.lower()
    
    language_input = user_input or types or ""
    if not language_input and preference:
        if "brand" in preference and isinstance(preference["brand"], list) and preference["brand"]:
            language_input = preference["brand"][0]
    language = detect_langs(language_input)

    all_points = get_all_points()
    matched_docs = []
    has_brands = has_types = has_history = has_price = False
    brands = []
    if preference and "brand" in preference and preference["brand"]:
        brands = [brand.lower() for brand in preference["brand"]]
        has_brands = len(brands) > 0
    
    user_types = []
    if types:
        user_types = types.lower().split()
        has_types = len(user_types) > 0
    
    history_device_names = []
    if recent_history:
        history_device_names = [rh["device_name"].lower() for rh in recent_history 
                            if isinstance(rh, dict) and "device_name" in rh]
        has_history = len(history_device_names) > 0
    
    price_min = price_max = None
    if preference and "price_range" in preference and preference["price_range"]:
        price_range = preference["price_range"]
        if isinstance(price_range, list):
            has_price = True
            if len(price_range) == 1:
                price_max = price_range[0]
            elif len(price_range) >= 2:
                price_min, price_max = price_range[:2]

    # Calculate scores for each document
    for doc in all_points:
        metadata = doc.payload.get("metadata", {})
        total_score = 0
        
        if use_user_input:
            combined_fields = [
                convert_to_string(metadata.get("device_name", "")),
                convert_to_string(metadata.get("brand", "")),
                convert_to_string(metadata.get("category", "")),
                convert_to_string(metadata.get("sales_perks", "")),
                convert_to_string(metadata.get("sales_price", "")),
                convert_to_string(metadata.get("discount_percent", "")),
                convert_to_string(metadata.get("payment_perks", ""))
            ]
            cos_score = check_similarity(main_query_lower, combined_fields)
            user_input_score = fuzzy_score(main_query_lower, metadata)
            total_score += user_input_score * config.FUZZY_WEIGHT + cos_score * 100 * config.COSINE_WEIGHT
        
        if has_types:
            doc_category = metadata.get("suitable_for", "").lower()
            if doc_category:
                type_score = sum(fuzz.partial_ratio(t, doc_category) for t in user_types)
                total_score += (type_score / len(user_types)) * config.TYPE_MATCH_BOOST / 100
        
        if has_brands:
            doc_brand = metadata.get("brand", "").lower()
            if doc_brand:
                brand_score = sum(fuzz.partial_ratio(b, doc_brand) for b in brands)
                total_score += (brand_score / len(brands)) * config.BRAND_MATCH_BOOST / 100
        
        if has_price:
            sale_price = metadata.get("sale_price")
            if isinstance(sale_price, (int, float)):
                if price_min is not None and price_max is not None and price_min <= sale_price <= price_max:
                    total_score += config.PRICE_RANGE_MATCH_BOOST
                elif price_max is not None and sale_price <= price_max:
                    total_score += config.PRICE_RANGE_MATCH_BOOST * 0.7
                elif price_min is not None and sale_price >= price_min:
                    total_score += config.PRICE_RANGE_MATCH_BOOST * 0.7
        
        if has_history:
            doc_info = f"{metadata.get('category', '').lower()} {metadata.get('brand', '').lower()} {metadata.get('device_name', '').lower()}"
            history_score = sum(fuzz.partial_ratio(h, doc_info) for h in history_device_names)
            total_score += (history_score / len(history_device_names)) * config.HISTORY_MATCH_BOOST / 100
        
        matched_docs.append({
            "doc": doc,
            "score": total_score
        })

    matched_docs.sort(key=lambda x: x["score"], reverse=True)
    top_match_count = min(len(matched_docs), config.MAX_RESULTS)
    top_matches = matched_docs[:top_match_count]
    
    top_device_names = []
    for match in top_matches:
        device_name = match["doc"].payload.get("metadata", {}).get("device_name")
        if device_name:
            top_device_names.append(device_name)

    if not top_matches:
        return "I couldn't find any products matching your criteria. Could you provide more specific details?", []

    search_context = ""

    meta_fields = [
        "device_name", "cpu", "card", "screen", "storage", "image_link",
        "sale_price", "discount_percent", "installment_price",
        "colors", "sales_perks", "guarantee_program", "payment_perks", "source"
    ]

    for idx, item in enumerate(top_matches, start=1):
        meta = item["doc"].payload.get("metadata", {})
        content = f"Product {idx}:\n"
        for field in meta_fields:
            if field in meta:
                if field in ["sale_price","installment_price"]:
                    value = meta[field]
                    if isinstance(value, (int, float)):
                        content += f"- {field}: {value:,} VND\n"
                    else:
                        content += f"- {field}: {value} VND\n"
                elif field == "discount_percent":
                    content += f"- {field}: {meta[field]}%\n"
                else:
                    content += f"- {field}: {meta[field]}\n"
        search_context += content + "\n\n"
    
    source = top_matches[0]["doc"].payload.get("metadata", {}).get("source") if top_matches else ""
    images = top_matches[0]["doc"].payload.get("metadata", {}).get("image_link") if top_matches else ""

    llm = ai_model()
    prompt = ChatPromptTemplate.from_messages([
        ("system", """
            You are a friendly and polite salesman for FPT Shop, specializing in phones and other tech devices.  
            Your goal is to recommend products based on the `search_context` and the user's `user_query`.  

            **IMPORTANT:**  
            - Reply in the language that best matches the user's query (English or Vietnamese only; default to English if unclear).  
            - Identify the best matching device from {retrieved_devices} by focusing on brand and price range, especially prioritizing the device that fits the user's preferences most closely.  
            - Be brief and to the point. Focus on the best match.
            - include all details from {search_context} for your main recommendation,(make sure to include sales_perks, paymen_perks) and add:  
            - {source}  
            - {images} 
            - Very briefly mention 1-2 alternative products if available.
            - End with a simple question about which device they want more information about.
        """),
        ("human", "User query: {user_query}\n\nSearch results:\n{search_context}")
    ])
    chain = prompt | llm
    response = chain.invoke({
        "user_query": user_input,
        "search_context": search_context,
        "retrieved_devices": top_device_names,
        "source": source,
        "images": images,
        "language": language
    })
    
    return response.content if hasattr(response, 'content') else response, top_device_names

def get_device_details(user_query: str) -> str:
    """
    Retrieve detailed information about a specific device.
    Uses a cache to avoid repeated lookups for the same device.
    """
    
    language = detect_langs(user_query)
    
    # Initialize variables
    all_points = get_all_points()
    matching_doc = None
    source = None

    try:
        # Debug state contents
        
        # Get recommended devices from state
        recommended_devices = ['iPhone 15 128GB', 'iPhone 16 128GB', 'iPhone 11 64GB', 'iPhone 13 128GB', 'iPhone 16e 128GB', 'iPhone 14 128GB', 'iPhone 15 Plus 128GB', 'iPhone 15 Pro 128GB', 'iPhone 14 Plus 128GB', 'iPhone 16 Plus 128GB']

        
        # Initialize matching_doc to None
        matching_doc = None
        
        # Find the best match for the device
        best_score = -1
        best_match = None

        if recommended_devices:
            for rec_device in recommended_devices:
                score = fuzz.ratio(user_query, rec_device.lower())
                if score > best_score:
                    for doc in all_points:
                        metadata = doc.payload.get("metadata", {})
                        if metadata.get("device_name", "") == rec_device:
                            best_match = doc
                            best_score = score
                            print(f"Top fuzzy match from recommendations: {rec_device} (score: {score})")
                            break
        
        if best_match:
            matching_doc = best_match
            source = matching_doc.payload.get("metadata", {}).get("source")
            device_name = matching_doc.payload.get("metadata", {}).get("device_name", user_query)
            print(f"Using fuzzy match: {device_name} (score: {best_score})")
        
        if not matching_doc:
            return f"No detailed information found for {user_query}. Please try another product."
        
    except Exception as e:
        return f"An error occurred: {str(e)}"
    
    detail = matching_doc.payload['page_content']

    llm = ai_model()
    prompt = ChatPromptTemplate.from_messages([
        ("system", """
                You are a product assistant for FPT Shop selling tech devices.
                Provide concise information about the specific device.
                Include links and image links if present.
                Maintain the detected language.
                End with a reference to {source} for more details.
            """),
        ("human", "device_name: {device_name}\n\nSearch results:\n{detail}\n\nLanguage:\n{language}")
    ])

    chain = prompt | llm
    response = chain.invoke({
        "device_name": device_name,
        "detail": detail,
        "language": language,
        "source": source
    })
    
    # Return the result
    result = response.content if hasattr(response, 'content') else response
    return result



  from .autonotebook import tqdm as notebook_tqdm





In [2]:
response = recommend_system(
    user_input="Tôi muốn xem các mẫu của Iphone 14",
    types='',
    recent_history=[],
    preference={
        "brand": ["Iphone, Xiaomi, Oppo, MSI"],
        "price_range": [10000000]
    }
)


In [3]:
response

("I see you're interested in the iPhone 14. Here are the details for the **iPhone 14 128GB**:\n\n- **Sale Price**: 12,790,000 VND\n- **Discount**: 42%\n- **Installment Price**: 1,067,000 VND\n- **Colors**: Xanh dương, Trắng, Đen, Đỏ, Tím, Vàng\n- **Sales Perks**: \n  - Giảm ngay 9,200,000đ áp dụng đến 19/05\n  - AirPods giảm đến 500,000đ khi mua kèm iPhone\n  - Giảm thêm đến 2 triệu khi mua kèm SIM FPT FVIP150/F299 6-12 tháng\n- **Payment Perks**: Giảm ngay 500,000đ cho đơn trên 15 triệu khi trả góp 100% qua thẻ VISA (áp dụng Sacombank và Muadee by HDBank)\n- **Source**: [iPhone 14](https://fptshop.com.vn/dien-thoai/iphone-14)\n- ![iPhone 14](https://cdn2.fptshop.com.vn/unsafe/iphone_14_48a46d1684.png)\n\nIf you're also considering alternatives, you might look at the **iPhone 15 128GB** or **iPhone 16 128GB**.\n\nWhich device would you like more information about?",
 ['iPhone 15 Pro 128GB',
  'iPhone 16 Plus 128GB',
  'iPhone 15 Pro Max 256GB',
  'Samsung Galaxy S25 5G 12GB 256GB',
  '