In [1]:
from dotenv import load_dotenv
import os
import pandas as pd
import numpy as np
import io
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct

load_dotenv()

QDRANT_API_KEY = os.getenv('QDRANT_API_KEY')
QDRANT_URL = os.getenv('QDRANT_URL')

qdrant_client = QdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
collection_name = "car_data"  

# Load the sentence transformer model
model = SentenceTransformer("intfloat/multilingual-e5-small")

# Get the embedding dimension
sample_text = "Sample text for dimension check"
sample_embedding = model.encode(sample_text)
embedding_dim = len(sample_embedding)
print(f"Embedding dimension: {embedding_dim}")

Embedding dimension: 384


In [3]:
# Function to search car brands with typo tolerance
def search_car_brand(query, top_k=10):
    # Add context to the query to match our embeddings
    query_with_context = query
    query_vector = model.encode(query_with_context)
    
    # Use query_filter instead of filter parameter
    search_result = qdrant_client.search(
        collection_name=collection_name,
        query_vector=query_vector,
        limit=top_k * 3,
        query_filter={
            "must": [
                {
                    "key": "vector_type",
                    "match": {
                        "value": "brand"
                    }
                }
            ]
        }
    )
    
    # Remove duplicates based on car_brand
    unique_results_brand = {}
    for result in search_result:
        car_model = result.payload['car_brand']
        if car_model not in unique_results_brand:
            unique_results_brand[car_model] = result
    
    # Return top_k unique results
    return list(unique_results_brand.values())[:top_k]


# Function to search car models with typo tolerance
def search_car_model(query, top_k=10):
    # Add generic context to the query to match our embeddings
    query_with_context = query
    query_vector = model.encode(query_with_context)
    
    # Use query_filter instead of filter parameter
    search_result = qdrant_client.search(
        collection_name=collection_name,
        query_vector=query_vector,
        limit=top_k * 3,
        query_filter={
            "must": [
                {
                    "key": "vector_type",
                    "match": {
                        "value": "model"
                    }
                }
            ]
        }
    )
    
    # Remove duplicates based on car_model
    unique_results_model = {}
    for result in search_result:
        car_model = result.payload['car_model']
        if car_model not in unique_results_model:
            unique_results_model[car_model] = result
    
    # Return top_k unique results
    return list(unique_results_model.values())[:top_k]

In [None]:
from rapidfuzz import process as rapidfuzz_process
from rapidfuzz import fuzz
import re

# Load car data
df = pd.read_csv('car_dataset.csv')
model_choices = list(df['car_model'].unique())
brand_choices = list(df['car_brand'].unique())

def normalize_text(s):
    """Normalize text to improve matching"""
    if not isinstance(s, str):
        return s
    s = s.strip().lower()
    s = re.sub(r'[\-–—_/]', ' ', s)  # unify hyphens/underscores to spaces
    s = re.sub(r'\s+', ' ', s)       # collapse multiple spaces
    return s

def hybrid_search(query, choices, vector_type="brand", fuzzy_threshold=75, top_k=3):
    """
    Hybrid search that combines RapidFuzz and embeddings:
    1. Try fuzzy matching first with 75% threshold (fast + handles typos)
    2. If no good fuzzy matches, fall back to embeddings (semantic understanding)
    
    Args:
        query: The search query
        choices: List of choices to search against
        vector_type: "brand" or "model"
        fuzzy_threshold: Minimum score (0-100) for fuzzy matches
        top_k: Number of results to return
    """
    # Normalize query for better matching
    query_norm = normalize_text(query)
    
    # Adjust scorer based on query characteristics
    if ' ' in query_norm or len(query_norm) > 10:
        scorer = fuzz.token_sort_ratio  # Better for word order/spacing differences
    else:
        scorer = fuzz.ratio  # Standard for character-level typos
    
    # Step 1: Try RapidFuzz first (faster than embeddings)
    fuzzy_matches = rapidfuzz_process.extract(
        query_norm, 
        [normalize_text(c) for c in choices],  # Normalize choices too
        scorer=scorer,
        limit=top_k * 2  # Get more candidates for filtering
    )
    
    # Map normalized choices back to original labels
    norm_to_orig = {normalize_text(c): c for c in choices}
    fuzzy_matches = [(norm_to_orig.get(match, match), score, idx) for match, score, idx in fuzzy_matches]
    
    # Filter matches that meet our threshold
    good_fuzzy_matches = [(match, score) for match, score, _ in fuzzy_matches if score >= fuzzy_threshold]
    
    results = []
    
    # If we have good fuzzy matches, return those
    if good_fuzzy_matches:
        print(f"Found {len(good_fuzzy_matches)} good fuzzy matches for '{query}'")
        for match, score in good_fuzzy_matches:
            results.append({
                "text": match,
                "score": score / 100.0,  # Normalize to 0-1 scale
                "source": "fuzzy"
            })
    
    # Step 2: If not enough good fuzzy matches, use embeddings
    if len(results) == 0:
        print(f"No good fuzzy matches above threshold {fuzzy_threshold}, using embeddings")
        
        # Use appropriate search function based on vector_type
        if vector_type == "brand":
            search_fn = search_car_brand
        else:  # model
            search_fn = search_car_model
            
        # Get embedding results
        embedding_results = search_fn(query, top_k=top_k)
        
        # Extract relevant information
        for result in embedding_results:
            if vector_type == "brand":
                text = result.payload.get("car_brand")
            else:  # model
                text = result.payload.get("car_model")
                
            # Skip if this result is already in our list from fuzzy matching
            if any(r["text"] == text for r in results):
                continue
                
            # Add to results
            results.append({
                "text": text,
                "score": result.score,
                "source": "embedding"
            })
    
    # Return top_k results, sorted by score
    return sorted(results, key=lambda x: x["score"], reverse=True)[:top_k]

# Test cases from comprehensive eval_cases list
eval_cases = [
    # Models only
    ("bezxa", "Bezza"),
    ("bizza", "Bezza"),
    ("axla", "Axia"),
    ("aksia", "Axia"),
    ("myvee", "Myvi"),
    ("sagha", "Saga"),
    ("alzza", "Alza"),
    ("attiva", "Ativa"),
    ("vi0s", "Vios"),   # OCR: 0 vs o
    ("ciity", "City"),
    ("x5o", "X50"),    # 5 vs S vs o
    ("hilax", "Hilux"),
    ("hiluks", "Hilux"),

    # Brands
    ("perodue", "Perodua"),
    ("perjdia", "Perodua"),
    ("[roton", "Proton"),
    ("protoon", "Proton"),
    ("toyouta", "Toyota"),
    ("hinda", "Honda"),
    ("neesun", "Nissan"),          # phonetic
    ("merc benz", "Mercedes-Benz"),
    ("benz", "Mercedes-Benz"),     # abbreviation
    ("bmww", "BMW"),
    ("bydd", "BYD"),
    ("wolkswagen", "Volkswagen"), 
    ("p495on", "Proton"),          # OCR: numbers
    ("p4tons", "Proton"),
    ("jpnda", "Honda"),            # keyboard typo
    ("protin", "Proton"),
    ("e-mas seven", "E-Mas 7"),    # number word
    ("xseventy", "X70"),           # number word
    ("cx thirty", "CX-30"),        # number word
    ("x-7o", "X70"),               # OCR: 7 vs T
    (",yvi", "Myvi"),              # OCR: , vs m
    ("sivic", "Civic"),            # phonetic

    # Extra OCR confusions
    ("t0y0ta", "Toyota"),          # 0 vs o
    ("h0nda", "Honda"),
    ("v1os", "Vios"),

    # Extra phonetic cases
    ("nisan", "Nissan"),
    ("merz", "Mercedes-Benz"),
    ("bemer", "BMW"),
    ("civik", "Civic"),

    # Concatenation / spacing
    ("mercedesbenz", "Mercedes-Benz"),
    ("perod u a", "Perodua"),
    ("hondacity", "City"),

    # Word-number transcriptions
    ("x fifty", "X50"),
    ("c x thirty", "CX-30"),

    # Keyboard adjacency mistakes
    ("aqia", "Axia"),
    ("citu", "City"),
    ("hilud", "Hilux"),
]

print("\nTESTING HYBRID SEARCH WITH EVAL CASES\n")

# Test specific cases or run through them all
sample_cases = eval_cases[:]  # Use all cases, or select a subset with slicing

# Track stats
stats = {"total": len(sample_cases), "fuzzy": 0, "embedding": 0}

for query, expected in sample_cases:
    # Determine if it's likely a model or brand query
    is_model = expected in model_choices
    is_brand = expected in brand_choices
    
    if is_model:
        results = hybrid_search(query, model_choices, vector_type="model", fuzzy_threshold=75, top_k=3)
    elif is_brand:
        results = hybrid_search(query, brand_choices, vector_type="brand", fuzzy_threshold=75, top_k=3)
    else:
        # Try both if uncertain
        print(f"Warning: '{expected}' not found in model or brand lists, trying both")
        results = hybrid_search(query, model_choices + brand_choices, fuzzy_threshold=75, top_k=3)
    
    # Track which method provided the results
    if results and results[0]["source"] == "fuzzy":
        stats["fuzzy"] += 1
    elif results:
        stats["embedding"] += 1
        
    # Print results
    print(f"\nQuery: '{query}' (expected: {expected})")
    for i, res in enumerate(results, 1):
        match = "✓" if res["text"] == expected else " "
        print(f"  {i}. {res['text']} ({res['score']:.4f}, {res['source']}) {match}")
    
    # Visual separator
    print("-" * 60)

# Print summary stats
print("\n=== SUMMARY ===")
print(f"Total cases: {stats['total']}")
print(f"Resolved by fuzzy: {stats['fuzzy']} ({stats['fuzzy']/stats['total']*100:.1f}%)")
print(f"Resolved by embeddings: {stats['embedding']} ({stats['embedding']/stats['total']*100:.1f}%)")


TESTING HYBRID SEARCH WITH EVAL CASES

Found 1 good fuzzy matches for 'bezxa'

Query: 'bezxa' (expected: Bezza)
  1. Bezza (0.8000, fuzzy) ✓
------------------------------------------------------------
Found 1 good fuzzy matches for 'bizza'

Query: 'bizza' (expected: Bezza)
  1. Bezza (0.8000, fuzzy) ✓
------------------------------------------------------------
Found 2 good fuzzy matches for 'axla'

Query: 'axla' (expected: Axia)
  1. Alza (0.7500, fuzzy)  
  2. Axia (0.7500, fuzzy) ✓
------------------------------------------------------------
No good fuzzy matches above threshold 75, using embeddings


  search_result = qdrant_client.search(



Query: 'aksia' (expected: Axia)
  1. Axia (0.8948, embedding) ✓
  2. Accord (0.8585, embedding)  
  3. Attrage (0.8569, embedding)  
------------------------------------------------------------
No good fuzzy matches above threshold 75, using embeddings

Query: 'myvee' (expected: Myvi)
  1. Myvi (0.9166, embedding) ✓
  2. Ativa (0.8590, embedding)  
  3. Viva (0.8530, embedding)  
------------------------------------------------------------
Found 1 good fuzzy matches for 'sagha'

Query: 'sagha' (expected: Saga)
  1. Saga (0.8889, fuzzy) ✓
------------------------------------------------------------
Found 1 good fuzzy matches for 'alzza'

Query: 'alzza' (expected: Alza)
  1. Alza (0.8889, fuzzy) ✓
------------------------------------------------------------
Found 1 good fuzzy matches for 'attiva'

Query: 'attiva' (expected: Ativa)
  1. Ativa (0.9091, fuzzy) ✓
------------------------------------------------------------
Found 1 good fuzzy matches for 'vi0s'

Query: 'vi0s' (expected: Vios

  search_result = qdrant_client.search(



Query: 'perjdia' (expected: Perodua)
  1. Chery (0.8523, embedding)  
  2. Perodua (0.8396, embedding) ✓
------------------------------------------------------------
Found 1 good fuzzy matches for '[roton'

Query: '[roton' (expected: Proton)
  1. Proton (0.8333, fuzzy) ✓
------------------------------------------------------------
Found 1 good fuzzy matches for 'protoon'

Query: 'protoon' (expected: Proton)
  1. Proton (0.9231, fuzzy) ✓
------------------------------------------------------------
Found 1 good fuzzy matches for 'toyouta'

Query: 'toyouta' (expected: Toyota)
  1. Toyota (0.9231, fuzzy) ✓
------------------------------------------------------------
Found 1 good fuzzy matches for 'hinda'

Query: 'hinda' (expected: Honda)
  1. Honda (0.8000, fuzzy) ✓
------------------------------------------------------------
No good fuzzy matches above threshold 75, using embeddings

Query: 'neesun' (expected: Nissan)
  1. Nissan (0.8647, embedding) ✓
  2. Toyota (0.8265, embedding)  
--