In [5]:
import pandas as pd
import numpy as np
import torch
import os
import sys
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split

from datasets import Dataset 

notebook_dir = os.path.dirname(os.path.abspath("__file__"))
if notebook_dir not in sys.path:
    sys.path.append(notebook_dir)

MODEL_NAME = "intfloat/multilingual-e5-small"
OUTPUT_PATH = "./model/finetuned-embedding-model"
EPOCHS = 10
BATCH_SIZE = 4

In [6]:
from dotenv import load_dotenv
import os
import pandas as pd
import numpy as np
import io
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct

load_dotenv()

QDRANT_API_KEY = os.getenv('QDRANT_API_KEY')
QDRANT_URL = os.getenv('QDRANT_URL')

qdrant_client = QdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY,
)

In [3]:
model = SentenceTransformer(MODEL_NAME)

In [7]:
from rapidfuzz import fuzz

collection_name = "car_data_modelbrand"  

# Function to search car brands with typo tolerance
def search_car_brand(query, top_k=10):
    # Add context to the query to match our embeddings
    query_with_context = f"car brand: {query}" 
    query_vector = model.encode(query_with_context)
    
    # Use query_filter instead of filter parameter
    search_result = qdrant_client.search(
        collection_name=collection_name,
        query_vector=query_vector,
        limit=top_k * 3,
        query_filter={
            "must": [
                {
                    "key": "vector_type",
                    "match": {
                        "value": "brand"
                    }
                }
            ]
        }
    )

    # Remove duplicates based on car_brand
    unique_results_brand = {}
    for result in search_result:
        car_model = result.payload['car_brand']
        if car_model not in unique_results_brand:
            unique_results_brand[car_model] = result
    
    # Return top_k unique results
    return list(unique_results_brand.values())[:top_k]


# Function to search car models with typo tolerance
def search_car_model(query, top_k=10):
    # Add generic context to the query to match our embeddings
    query_with_context = f"car model: {query}"  
    query_vector = model.encode(query_with_context)
    search_result = qdrant_client.search(
        collection_name=collection_name,
        query_vector=query_vector,
        limit=top_k * 3,
        query_filter={
            "must": [
                {
                    "key": "vector_type",
                    "match": {
                        "value": "model"
                    }
                }
            ]
        }
    )
    
    # Remove duplicates based on car_model
    unique_results_model = {}
    for result in search_result:
        car_model = result.payload['car_model']
        if car_model not in unique_results_model:
            unique_results_model[car_model] = result
    
    # Return top_k unique results
    return list(unique_results_model.values())[:top_k]

In [8]:
from rapidfuzz import process as rapidfuzz_process
from rapidfuzz import fuzz
import re

def normalize_text(s):
    """Normalize text to improve matching"""
    if not isinstance(s, str):
        return s
    s = s.strip().lower()
    s = re.sub(r'[\-–—_/]', ' ', s)  
    s = re.sub(r'\s+', ' ', s)     
    return s  

def hybrid_search(query, choices, vector_type="brand", fuzzy_threshold=75, top_k=3, search_model = None):
    """
    Hybrid search that combines RapidFuzz and embeddings:
    1. Try fuzzy matching first with 75% threshold (fast + handles typos)
    2. If no good fuzzy matches, fall back to embeddings (semantic understanding)
    
    Args:
        query: The search query
        choices: List of choices to search against
        vector_type: "brand" or "model"
        fuzzy_threshold: Minimum score (0-100) for fuzzy matches
        top_k: Number of results to return
    """
    # Normalize query for better matching
    query_norm = normalize_text(query)

    model_to_use = search_model if search_model is not None else model

    
    # Adjust scorer based on query characteristics
    if ' ' in query_norm or len(query_norm) > 10:
        scorer = fuzz.token_sort_ratio  # Better for word order/spacing differences
    else:
        scorer = fuzz.ratio  # Standard for character-level typos
    
    # Step 1: Try RapidFuzz first (faster than embeddings)
    fuzzy_matches = rapidfuzz_process.extract(
        query_norm, 
        [normalize_text(c) for c in choices],  # Normalize choices too
        scorer=scorer,
        limit=top_k * 2  # Get more candidates for filtering
    )
    
    # Map normalized choices back to original labels
    norm_to_orig = {normalize_text(c): c for c in choices}
    fuzzy_matches = [(norm_to_orig.get(match, match), score, idx) for match, score, idx in fuzzy_matches]
    
    # Filter matches that meet our threshold
    good_fuzzy_matches = [(match, score) for match, score, _ in fuzzy_matches if score >= fuzzy_threshold]
    
    results = []
    
    # If we have good fuzzy matches, return those
    if good_fuzzy_matches:
        print(f"Found {len(good_fuzzy_matches)} good fuzzy matches for '{query}'")
        for match, score in good_fuzzy_matches:
            results.append({
                "text": match,
                "score": score / 100.0,  # Normalize to 0-1 scale
                "source": "fuzzy"
            })
    
    # Step 2: If not enough good fuzzy matches, use embeddings
    if len(results) == 0:
        print(f"No good fuzzy matches above threshold {fuzzy_threshold}, using embeddings")
        
        # Use appropriate search function based on vector_type
        if vector_type == "brand":
            # Modify how you call search_car_brand to use the provided model
            query_with_context = f"car brand: {query}"
            query_vector = model_to_use.encode(query_with_context)
            
            # Call Qdrant directly with the new embedding
            search_result = qdrant_client.search(
                collection_name=collection_name,
                query_vector=query_vector,
                limit=top_k * 3,
                query_filter={"must": [{"key": "vector_type", "match": {"value": "brand"}}]}
            )
            
            # Process results as in search_car_brand
            embedding_results = []
            unique_results = {}
            for result in search_result:
                car_brand = result.payload['car_brand']
                if car_brand not in unique_results:
                    unique_results[car_brand] = result
                    embedding_results.append(result)
        else:  # model
            # Similar for model search
            query_with_context = f"car model: {query}"
            query_vector = model_to_use.encode(query_with_context)
            
            # Call Qdrant directly
            search_result = qdrant_client.search(
                collection_name=collection_name,
                query_vector=query_vector,
                limit=top_k * 3,
                query_filter={"must": [{"key": "vector_type", "match": {"value": "model"}}]}
            )
            
            # Process results
            embedding_results = []
            unique_results = {}
            for result in search_result:
                car_model = result.payload['car_model']
                if car_model not in unique_results:
                    unique_results[car_model] = result
                    embedding_results.append(result)
        
        # Extract relevant information
        for result in embedding_results:
            if vector_type == "brand":
                text = result.payload.get("car_brand")
            else:  # model
                text = result.payload.get("car_model")
                
            # Skip if this result is already in our list from fuzzy matching
            if any(r["text"] == text for r in results):
                continue
                
            # Add to results
            results.append({
                "text": text,
                "score": result.score,
                "source": "embedding"
            })
    
    # Return top_k results, sorted by score
    return sorted(results, key=lambda x: x["score"], reverse=True)[:top_k]

In [16]:
# 3. Collect training pairs from your examples

# Brand test cases
brand_typo_pairs = [
    ("neesun", "Nissan"),
    ("benz", "Mercedes-Benz"),
    ("merz", "Mercedes-Benz"),
    ("mercedesbenz", "Mercedes-Benz"),
    ("toyata", "Toyota"),
    ("toyta", "Toyota"),
    ("hunda", "Honda"),
    ("hoonda", "Honda"),
    ("protan", "Proton"),
    ("perodwa", "Perodua"),
]

# Model test cases - expanded
model_typo_pairs = [
    ("axla", "Axia"),
    ("xseventy", "X70"),
    ("vios", "Vios"),
    ("vios", "Vios"),
    ("sivic", "Civic"),
    ("civek", "Civic"),
    ("cityy", "City"),
    ("x fivty", "X50"),
    ("exora", "Exora"),
]

# Convert to DataFrame for easier manipulation
train_df = pd.DataFrame([
    {"query": query, "correction": correction, "domain": "brand"} 
    for query, correction in brand_typo_pairs
] + [
    {"query": query, "correction": correction, "domain": "model"} 
    for query, correction in model_typo_pairs
])

print(f"Total training pairs: {len(train_df)}")
train_df

Total training pairs: 19


Unnamed: 0,query,correction,domain
0,neesun,Nissan,brand
1,benz,Mercedes-Benz,brand
2,merz,Mercedes-Benz,brand
3,mercedesbenz,Mercedes-Benz,brand
4,toyata,Toyota,brand
5,toyta,Toyota,brand
6,hunda,Honda,brand
7,hoonda,Honda,brand
8,protan,Proton,brand
9,perodwa,Perodua,brand


In [9]:
# 4. Create sentence-transformers training examples

def prepare_training_examples(df):
    """Create training examples for fine-tuning"""
    train_examples = []
    
    for _, row in df.iterrows():
        query = row['query']
        correction = row['correction']
        domain = row['domain']
        
        # Format with context prefixes matching your ingest format
        if domain == "brand":
            # The typo should map to the correct brand embedding
            query_text = f"car brand: {query}"
            correction_text = f"car brand: {correction}"
            
            # Create training pair (these should map to the same vector)
            train_examples.append(InputExample(texts=[query_text, correction_text]))
            
            # Also add reverse to strengthen the connection
            train_examples.append(InputExample(texts=[correction_text, query_text]))
            
        else:  # model
            query_text = f"car model: {query}"
            correction_text = f"car model: {correction}"
            
            # Create training pair
            train_examples.append(InputExample(texts=[query_text, correction_text]))
            train_examples.append(InputExample(texts=[correction_text, query_text]))
    
    return train_examples

# Create training examples
train_examples = prepare_training_examples(train_df)
print(f"Created {len(train_examples)} training examples")

NameError: name 'train_df' is not defined

In [18]:
# 5. Load car data for testing
df = pd.read_csv('car_dataset.csv')
brand_choices = list(df['car_brand'].unique())
model_choices = list(df['car_model'].unique())

# Create test cases based on your examples
brand_eval_cases = [
    ("neesun", "Nissan"),
    ("benz", "Mercedes-Benz"),
    ("merz", "Mercedes-Benz"),
    ("mercedesbenz", "Mercedes-Benz"),
]
model = SentenceTransformer(MODEL_NAME)

print("\nTESTING BRAND TYPO DETECTION\n")

# Track stats
brand_stats = {"total": len(brand_eval_cases), "fuzzy": 0, "embedding": 0}

for query, expected in brand_eval_cases:
    # Test brand search
    results = hybrid_search(query, brand_choices, vector_type="brand", fuzzy_threshold=75, top_k=3, search_model=model)
    
    # Track which method provided the results
    if results and results[0]["source"] == "fuzzy":
        brand_stats["fuzzy"] += 1
    elif results:
        brand_stats["embedding"] += 1
        
    # Print results
    print(f"\nQuery: '{query}' (expected: {expected})")
    for i, res in enumerate(results, 1):
        match = "✓" if res["text"] == expected else " "
        print(f"  {i}. {res['text']} ({res['score']:.4f}, {res['source']}) {match}")
    
    # Visual separator
    print("-" * 60)

# Print summary stats
print("\n=== BRAND TYPO DETECTION SUMMARY ===")
print(f"Total cases: {brand_stats['total']}")
print(f"Resolved by fuzzy: {brand_stats['fuzzy']} ({brand_stats['fuzzy']/brand_stats['total']*100:.1f}%)")
print(f"Resolved by embeddings: {brand_stats['embedding']} ({brand_stats['embedding']/brand_stats['total']*100:.1f}%)")


TESTING BRAND TYPO DETECTION

No good fuzzy matches above threshold 75, using embeddings


  search_result = qdrant_client.search(



Query: 'neesun' (expected: Nissan)
  1. Perodua (0.9028, embedding)  
  2. Nissan (0.9001, embedding) ✓
  3. Proton (0.8953, embedding)  
------------------------------------------------------------
No good fuzzy matches above threshold 75, using embeddings

Query: 'benz' (expected: Mercedes-Benz)
  1. Perodua (0.9207, embedding)  
  2. Mercedes-Benz (0.9107, embedding) ✓
  3. Toyota (0.9030, embedding)  
------------------------------------------------------------
No good fuzzy matches above threshold 75, using embeddings

Query: 'merz' (expected: Mercedes-Benz)
  1. Chery (0.9141, embedding)  
  2. Perodua (0.9092, embedding)  
  3. Mercedes-Benz (0.8999, embedding) ✓
------------------------------------------------------------
No good fuzzy matches above threshold 75, using embeddings

Query: 'mercedesbenz' (expected: Mercedes-Benz)
  1. Perodua (0.9236, embedding)  
  2. Mercedes-Benz (0.9232, embedding) ✓
  3. Chery (0.9030, embedding)  
------------------------------------------

In [None]:
# 6. Fine-tune the model
# Create output directory if it doesn't exist
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

print("\nFINE-TUNING THE MODEL\n")

# Load base model for fine-tuning
fine_tune_model = SentenceTransformer(MODEL_NAME)

# Create data loader
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=BATCH_SIZE)

# Define loss function - we use contrastive loss to make similar pairs closer
train_loss = losses.MultipleNegativesRankingLoss(fine_tune_model)

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Training on: {device}")

# Fine-tune
print(f"Starting fine-tuning for {EPOCHS} epochs...")
warmup_steps = int(len(train_dataloader) * EPOCHS * 0.1)  # 10% of training as warmup

fine_tune_model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=EPOCHS,
    warmup_steps=warmup_steps,
    output_path=OUTPUT_PATH,
    show_progress_bar=True
)

print(f"Model fine-tuned and saved to {OUTPUT_PATH}")


FINE-TUNING THE MODEL

Training on: cpu
Starting fine-tuning for 10 epochs...


                                                                     

Step,Training Loss


Model fine-tuned and saved to ./model/finetuned-embedding-model


In [21]:
# 7. Test with fine-tuned model

print("\nTESTING FINE-TUNED MODEL\n")

# Load the fine-tuned model
finetuned_model = SentenceTransformer(OUTPUT_PATH)

# Create test cases based on your examples - include both brand and model
test_cases = [
    # Brand test cases
    ("neesun", "Nissan", "brand"),
    ("benz", "Mercedes-Benz", "brand"),
    ("merz", "Mercedes-Benz", "brand"),
    ("mercedesbenz", "Mercedes-Benz", "brand"),
    # Model test cases
    ("axla", "Axia", "model"),
    ("xseventy", "X70", "model"),
]

# Track stats
finetuned_stats = {"total": 0, "brand": {"fuzzy": 0, "embedding": 0}, "model": {"fuzzy": 0, "embedding": 0}}

# Count total cases by type
finetuned_stats["brand_total"] = sum(1 for _, _, domain in test_cases if domain == "brand")
finetuned_stats["model_total"] = sum(1 for _, _, domain in test_cases if domain == "model")
finetuned_stats["total"] = len(test_cases)

print("\nFINE-TUNED MODEL RESULTS\n")

for query, expected, domain in test_cases:
    # Set choices based on domain
    choices = brand_choices if domain == "brand" else model_choices
    
    # Test search with fine-tuned model
    results = hybrid_search(
        query, choices, vector_type=domain, 
        fuzzy_threshold=75, top_k=3, search_model=finetuned_model
    )
    
    # Track which method provided the results
    if results and results[0]["source"] == "fuzzy":
        finetuned_stats[domain]["fuzzy"] += 1
    elif results:
        finetuned_stats[domain]["embedding"] += 1
        
    # Print results
    print(f"\nQuery: '{query}' (expected: {expected})")
    for i, res in enumerate(results, 1):
        match = "✓" if res["text"] == expected else " "
        print(f"  {i}. {res['text']} ({res['score']:.4f}, {res['source']}) {match}")
    
    # Visual separator
    print("-" * 60)

# Print summary stats
print("\n=== FINE-TUNED MODEL DETECTION SUMMARY ===")
print(f"Total test cases: {finetuned_stats['total']}")

print(f"\nBrand cases: {finetuned_stats['brand_total']}")
if finetuned_stats['brand_total'] > 0:
    print(f"Resolved by fuzzy: {finetuned_stats['brand']['fuzzy']} ({finetuned_stats['brand']['fuzzy']/finetuned_stats['brand_total']*100:.1f}%)")
    print(f"Resolved by embeddings: {finetuned_stats['brand']['embedding']} ({finetuned_stats['brand']['embedding']/finetuned_stats['brand_total']*100:.1f}%)")

print(f"\nModel cases: {finetuned_stats['model_total']}")
if finetuned_stats['model_total'] > 0:
    print(f"Resolved by fuzzy: {finetuned_stats['model']['fuzzy']} ({finetuned_stats['model']['fuzzy']/finetuned_stats['model_total']*100:.1f}%)")
    print(f"Resolved by embeddings: {finetuned_stats['model']['embedding']} ({finetuned_stats['model']['embedding']/finetuned_stats['model_total']*100:.1f}%)")


TESTING FINE-TUNED MODEL


FINE-TUNED MODEL RESULTS

No good fuzzy matches above threshold 75, using embeddings


  search_result = qdrant_client.search(



Query: 'neesun' (expected: Nissan)
  1. Nissan (0.6039, embedding) ✓
  2. Toyota (0.4988, embedding)  
  3. Mitsubishi (0.4841, embedding)  
------------------------------------------------------------
No good fuzzy matches above threshold 75, using embeddings

Query: 'benz' (expected: Mercedes-Benz)
  1. Mercedes-Benz (0.5434, embedding) ✓
  2. Perodua (0.5218, embedding)  
  3. BMW (0.4875, embedding)  
------------------------------------------------------------
No good fuzzy matches above threshold 75, using embeddings

Query: 'merz' (expected: Mercedes-Benz)
  1. Mercedes-Benz (0.5536, embedding) ✓
  2. Chery (0.5491, embedding)  
  3. Perodua (0.5171, embedding)  
------------------------------------------------------------
No good fuzzy matches above threshold 75, using embeddings

Query: 'mercedesbenz' (expected: Mercedes-Benz)
  1. Mercedes-Benz (0.6245, embedding) ✓
  2. Perodua (0.5866, embedding)  
  3. Chery (0.5607, embedding)  
------------------------------------------

  search_result = qdrant_client.search(


## Retrain the trained model

In [10]:
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
import os

# Path to your saved model
EXISTING_MODEL_PATH = "./model/finetuned-embedding-model"
OUTPUT_PATH = "./model/finetuned-embedding-model-v2"  # New save location

# Load the existing model
model = SentenceTransformer(EXISTING_MODEL_PATH)

In [None]:
# Cell to fetch training data from Supabase
import pandas as pd
from app.api.services.config import SUPABASE_ANON_KEY, SUPABASE_URL
from supabase import create_client, Client

def fetch_training_data_from_supabase():
    """Fetch typo correction data from Supabase"""
    try:
        # Initialize Supabase client
        client = create_client(SUPABASE_URL, SUPABASE_ANON_KEY)
        
        # Fetch all records from typo_training_dataset table
        response = client.table("typo_training_dataset").select("typo", "corrected", "domain").execute()
        
        if not response.data:
            print("No training data found in Supabase table")
            return pd.DataFrame()
        
        # Convert to DataFrame
        df = pd.DataFrame(response.data)
        
        # Check required columns exist
        required_cols = ["typo", "corrected", "domain"]
        missing_cols = [col for col in required_cols if col not in df.columns]
        if missing_cols:
            print(f"Missing required columns in Supabase data: {missing_cols}")
            return pd.DataFrame()
        
        # Rename columns to match existing code
        df = df.rename(columns={"typo": "query", "corrected": "correction"})
        
        print(f"Successfully fetched {len(df)} training examples from Supabase")
        return df
        
    except Exception as e:
        print(f"Error fetching data from Supabase: {e}")
        return pd.DataFrame()

# Fetch data from Supabase
supabase_train_df = fetch_training_data_from_supabase()
supabase_train_df

if not supabase_train_df.empty:
    
    # Generate training examples from the combined dataset
    train_examples = prepare_training_examples(supabase_train_df)
    print(f"Created {len(train_examples)} combined training examples")
    
    # Create data loader with the combined examples
    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=4)

# Define loss function
train_loss = losses.MultipleNegativesRankingLoss(model)

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Training on: {device}")

# Continue fine-tuning
print(f"Starting additional fine-tuning for 5 epochs...")
warmup_steps = int(len(train_dataloader) * 5 * 0.1)  # 10% of training as warmup

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=5,  # Fewer epochs for continued training
    optimizer_params={'lr': 1e-5},
    warmup_steps=warmup_steps,
    output_path=OUTPUT_PATH,
    show_progress_bar=True
)

print(f"Model fine-tuned and saved to {OUTPUT_PATH}")

Created 16 new training examples


In [12]:
# Create data loader for the new examples
train_dataloader = DataLoader(new_train_examples, shuffle=True, batch_size=4)

# Define loss function
train_loss = losses.MultipleNegativesRankingLoss(model)

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Training on: {device}")

# Continue fine-tuning
print(f"Starting additional fine-tuning for 5 epochs...")
warmup_steps = int(len(train_dataloader) * 5 * 0.1)  # 10% of training as warmup

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=5,  # Fewer epochs for continued training
    optimizer_params={'lr': 1e-5},
    warmup_steps=warmup_steps,
    output_path=OUTPUT_PATH,
    show_progress_bar=True
)

print(f"Model fine-tuned and saved to {OUTPUT_PATH}")

Training on: cpu
Starting additional fine-tuning for 5 epochs...




Step,Training Loss


Model fine-tuned and saved to ./model/finetuned-embedding-model-v2
