In [1]:
import pandas as pd
import numpy as np
import torch
import os
import sys
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split

from datasets import Dataset 

notebook_dir = os.path.dirname(os.path.abspath("__file__"))
if notebook_dir not in sys.path:
    sys.path.append(notebook_dir)

MODEL_NAME = "intfloat/multilingual-e5-small"
OUTPUT_PATH = "./model/finetuned-embedding-model"
EPOCHS = 10
BATCH_SIZE = 4

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from dotenv import load_dotenv
import os
import pandas as pd
import numpy as np
import io
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct

load_dotenv()

QDRANT_API_KEY = os.getenv('QDRANT_API_KEY')
QDRANT_URL = os.getenv('QDRANT_URL')

qdrant_client = QdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY,
)

In [3]:
model = SentenceTransformer(MODEL_NAME)

In [3]:
from rapidfuzz import fuzz

collection_name = "car_data_modelbrand"  

# Function to search car brands with typo tolerance
def search_car_brand(query, top_k=10):
    # Add context to the query to match our embeddings
    query_with_context = f"car brand: {query}" 
    query_vector = model.encode(query_with_context)
    
    # Use query_filter instead of filter parameter
    search_result = qdrant_client.search(
        collection_name=collection_name,
        query_vector=query_vector,
        limit=top_k * 3,
        query_filter={
            "must": [
                {
                    "key": "vector_type",
                    "match": {
                        "value": "brand"
                    }
                }
            ]
        }
    )

    # Remove duplicates based on car_brand
    unique_results_brand = {}
    for result in search_result:
        car_model = result.payload['car_brand']
        if car_model not in unique_results_brand:
            unique_results_brand[car_model] = result
    
    # Return top_k unique results
    return list(unique_results_brand.values())[:top_k]


# Function to search car models with typo tolerance
def search_car_model(query, top_k=10):
    # Add generic context to the query to match our embeddings
    query_with_context = f"car model: {query}"  
    query_vector = model.encode(query_with_context)
    search_result = qdrant_client.search(
        collection_name=collection_name,
        query_vector=query_vector,
        limit=top_k * 3,
        query_filter={
            "must": [
                {
                    "key": "vector_type",
                    "match": {
                        "value": "model"
                    }
                }
            ]
        }
    )
    
    # Remove duplicates based on car_model
    unique_results_model = {}
    for result in search_result:
        car_model = result.payload['car_model']
        if car_model not in unique_results_model:
            unique_results_model[car_model] = result
    
    # Return top_k unique results
    return list(unique_results_model.values())[:top_k]

In [4]:
from rapidfuzz import process as rapidfuzz_process
from rapidfuzz import fuzz
import re

def normalize_text(s):
    """Normalize text to improve matching"""
    if not isinstance(s, str):
        return s
    s = s.strip().lower()
    s = re.sub(r'[\-–—_/]', ' ', s)  
    s = re.sub(r'\s+', ' ', s)     
    return s  

def hybrid_search(query, choices, vector_type="brand", fuzzy_threshold=75, top_k=3, search_model = None):
    """
    Hybrid search that combines RapidFuzz and embeddings:
    1. Try fuzzy matching first with 75% threshold (fast + handles typos)
    2. If no good fuzzy matches, fall back to embeddings (semantic understanding)
    
    Args:
        query: The search query
        choices: List of choices to search against
        vector_type: "brand" or "model"
        fuzzy_threshold: Minimum score (0-100) for fuzzy matches
        top_k: Number of results to return
    """
    # Normalize query for better matching
    query_norm = normalize_text(query)

    model_to_use = search_model if search_model is not None else model

    
    # Adjust scorer based on query characteristics
    if ' ' in query_norm or len(query_norm) > 10:
        scorer = fuzz.token_sort_ratio  # Better for word order/spacing differences
    else:
        scorer = fuzz.ratio  # Standard for character-level typos
    
    # Step 1: Try RapidFuzz first (faster than embeddings)
    fuzzy_matches = rapidfuzz_process.extract(
        query_norm, 
        [normalize_text(c) for c in choices],  # Normalize choices too
        scorer=scorer,
        limit=top_k * 2  # Get more candidates for filtering
    )
    
    # Map normalized choices back to original labels
    norm_to_orig = {normalize_text(c): c for c in choices}
    fuzzy_matches = [(norm_to_orig.get(match, match), score, idx) for match, score, idx in fuzzy_matches]
    
    # Filter matches that meet our threshold
    good_fuzzy_matches = [(match, score) for match, score, _ in fuzzy_matches if score >= fuzzy_threshold]
    
    results = []
    
    # If we have good fuzzy matches, return those
    if good_fuzzy_matches:
        print(f"Found {len(good_fuzzy_matches)} good fuzzy matches for '{query}'")
        for match, score in good_fuzzy_matches:
            results.append({
                "text": match,
                "score": score / 100.0,  # Normalize to 0-1 scale
                "source": "fuzzy"
            })
    
    # Step 2: If not enough good fuzzy matches, use embeddings
    if len(results) == 0:
        print(f"No good fuzzy matches above threshold {fuzzy_threshold}, using embeddings")
        
        # Use appropriate search function based on vector_type
        if vector_type == "brand":
            # Modify how you call search_car_brand to use the provided model
            query_with_context = f"car brand: {query}"
            query_vector = model_to_use.encode(query_with_context)
            
            # Call Qdrant directly with the new embedding
            search_result = qdrant_client.search(
                collection_name=collection_name,
                query_vector=query_vector,
                limit=top_k * 3,
                query_filter={"must": [{"key": "vector_type", "match": {"value": "brand"}}]}
            )
            
            # Process results as in search_car_brand
            embedding_results = []
            unique_results = {}
            for result in search_result:
                car_brand = result.payload['car_brand']
                if car_brand not in unique_results:
                    unique_results[car_brand] = result
                    embedding_results.append(result)
        else:  # model
            # Similar for model search
            query_with_context = f"car model: {query}"
            query_vector = model_to_use.encode(query_with_context)
            
            # Call Qdrant directly
            search_result = qdrant_client.search(
                collection_name=collection_name,
                query_vector=query_vector,
                limit=top_k * 3,
                query_filter={"must": [{"key": "vector_type", "match": {"value": "model"}}]}
            )
            
            # Process results
            embedding_results = []
            unique_results = {}
            for result in search_result:
                car_model = result.payload['car_model']
                if car_model not in unique_results:
                    unique_results[car_model] = result
                    embedding_results.append(result)
        
        # Extract relevant information
        for result in embedding_results:
            if vector_type == "brand":
                text = result.payload.get("car_brand")
            else:  # model
                text = result.payload.get("car_model")
                
            # Skip if this result is already in our list from fuzzy matching
            if any(r["text"] == text for r in results):
                continue
                
            # Add to results
            results.append({
                "text": text,
                "score": result.score,
                "source": "embedding"
            })
    
    # Return top_k results, sorted by score
    return sorted(results, key=lambda x: x["score"], reverse=True)[:top_k]

In [16]:
# 3. Collect training pairs from your examples

# Brand test cases
brand_typo_pairs = [
    ("neesun", "Nissan"),
    ("benz", "Mercedes-Benz"),
    ("merz", "Mercedes-Benz"),
    ("mercedesbenz", "Mercedes-Benz"),
    ("toyata", "Toyota"),
    ("toyta", "Toyota"),
    ("hunda", "Honda"),
    ("hoonda", "Honda"),
    ("protan", "Proton"),
    ("perodwa", "Perodua"),
]

# Model test cases - expanded
model_typo_pairs = [
    ("axla", "Axia"),
    ("xseventy", "X70"),
    ("vios", "Vios"),
    ("vios", "Vios"),
    ("sivic", "Civic"),
    ("civek", "Civic"),
    ("cityy", "City"),
    ("x fivty", "X50"),
    ("exora", "Exora"),
]

# Convert to DataFrame for easier manipulation
train_df = pd.DataFrame([
    {"query": query, "correction": correction, "domain": "brand"} 
    for query, correction in brand_typo_pairs
] + [
    {"query": query, "correction": correction, "domain": "model"} 
    for query, correction in model_typo_pairs
])

print(f"Total training pairs: {len(train_df)}")
train_df

Total training pairs: 19


Unnamed: 0,query,correction,domain
0,neesun,Nissan,brand
1,benz,Mercedes-Benz,brand
2,merz,Mercedes-Benz,brand
3,mercedesbenz,Mercedes-Benz,brand
4,toyata,Toyota,brand
5,toyta,Toyota,brand
6,hunda,Honda,brand
7,hoonda,Honda,brand
8,protan,Proton,brand
9,perodwa,Perodua,brand


In [None]:
# 4. Create sentence-transformers training examples



# Create training examples
train_examples = prepare_training_examples(train_df)
print(f"Created {len(train_examples)} training examples")

NameError: name 'train_df' is not defined

In [18]:
# 5. Load car data for testing
df = pd.read_csv('car_dataset.csv')
brand_choices = list(df['car_brand'].unique())
model_choices = list(df['car_model'].unique())

# Create test cases based on your examples
brand_eval_cases = [
    ("neesun", "Nissan"),
    ("benz", "Mercedes-Benz"),
    ("merz", "Mercedes-Benz"),
    ("mercedesbenz", "Mercedes-Benz"),
]
model = SentenceTransformer(MODEL_NAME)

print("\nTESTING BRAND TYPO DETECTION\n")

# Track stats
brand_stats = {"total": len(brand_eval_cases), "fuzzy": 0, "embedding": 0}

for query, expected in brand_eval_cases:
    # Test brand search
    results = hybrid_search(query, brand_choices, vector_type="brand", fuzzy_threshold=75, top_k=3, search_model=model)
    
    # Track which method provided the results
    if results and results[0]["source"] == "fuzzy":
        brand_stats["fuzzy"] += 1
    elif results:
        brand_stats["embedding"] += 1
        
    # Print results
    print(f"\nQuery: '{query}' (expected: {expected})")
    for i, res in enumerate(results, 1):
        match = "✓" if res["text"] == expected else " "
        print(f"  {i}. {res['text']} ({res['score']:.4f}, {res['source']}) {match}")
    
    # Visual separator
    print("-" * 60)

# Print summary stats
print("\n=== BRAND TYPO DETECTION SUMMARY ===")
print(f"Total cases: {brand_stats['total']}")
print(f"Resolved by fuzzy: {brand_stats['fuzzy']} ({brand_stats['fuzzy']/brand_stats['total']*100:.1f}%)")
print(f"Resolved by embeddings: {brand_stats['embedding']} ({brand_stats['embedding']/brand_stats['total']*100:.1f}%)")


TESTING BRAND TYPO DETECTION

No good fuzzy matches above threshold 75, using embeddings


  search_result = qdrant_client.search(



Query: 'neesun' (expected: Nissan)
  1. Perodua (0.9028, embedding)  
  2. Nissan (0.9001, embedding) ✓
  3. Proton (0.8953, embedding)  
------------------------------------------------------------
No good fuzzy matches above threshold 75, using embeddings

Query: 'benz' (expected: Mercedes-Benz)
  1. Perodua (0.9207, embedding)  
  2. Mercedes-Benz (0.9107, embedding) ✓
  3. Toyota (0.9030, embedding)  
------------------------------------------------------------
No good fuzzy matches above threshold 75, using embeddings

Query: 'merz' (expected: Mercedes-Benz)
  1. Chery (0.9141, embedding)  
  2. Perodua (0.9092, embedding)  
  3. Mercedes-Benz (0.8999, embedding) ✓
------------------------------------------------------------
No good fuzzy matches above threshold 75, using embeddings

Query: 'mercedesbenz' (expected: Mercedes-Benz)
  1. Perodua (0.9236, embedding)  
  2. Mercedes-Benz (0.9232, embedding) ✓
  3. Chery (0.9030, embedding)  
------------------------------------------

In [None]:
# 6. Fine-tune the model
# Create output directory if it doesn't exist
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

print("\nFINE-TUNING THE MODEL\n")

# Load base model for fine-tuning
fine_tune_model = SentenceTransformer(MODEL_NAME)

# Create data loader
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=BATCH_SIZE)

# Define loss function - we use contrastive loss to make similar pairs closer
train_loss = losses.MultipleNegativesRankingLoss(fine_tune_model)

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Training on: {device}")

# Fine-tune
print(f"Starting fine-tuning for {EPOCHS} epochs...")
warmup_steps = int(len(train_dataloader) * EPOCHS * 0.1)  # 10% of training as warmup

fine_tune_model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=EPOCHS,
    warmup_steps=warmup_steps,
    output_path=OUTPUT_PATH,
    show_progress_bar=True
)

print(f"Model fine-tuned and saved to {OUTPUT_PATH}")


FINE-TUNING THE MODEL

Training on: cpu
Starting fine-tuning for 10 epochs...


                                                                     

Step,Training Loss


Model fine-tuned and saved to ./model/finetuned-embedding-model


In [21]:
# 7. Test with fine-tuned model

print("\nTESTING FINE-TUNED MODEL\n")

# Load the fine-tuned model
finetuned_model = SentenceTransformer(OUTPUT_PATH)

# Create test cases based on your examples - include both brand and model
test_cases = [
    # Brand test cases
    ("neesun", "Nissan", "brand"),
    ("benz", "Mercedes-Benz", "brand"),
    ("merz", "Mercedes-Benz", "brand"),
    ("mercedesbenz", "Mercedes-Benz", "brand"),
    # Model test cases
    ("axla", "Axia", "model"),
    ("xseventy", "X70", "model"),
]

# Track stats
finetuned_stats = {"total": 0, "brand": {"fuzzy": 0, "embedding": 0}, "model": {"fuzzy": 0, "embedding": 0}}

# Count total cases by type
finetuned_stats["brand_total"] = sum(1 for _, _, domain in test_cases if domain == "brand")
finetuned_stats["model_total"] = sum(1 for _, _, domain in test_cases if domain == "model")
finetuned_stats["total"] = len(test_cases)

print("\nFINE-TUNED MODEL RESULTS\n")

for query, expected, domain in test_cases:
    # Set choices based on domain
    choices = brand_choices if domain == "brand" else model_choices
    
    # Test search with fine-tuned model
    results = hybrid_search(
        query, choices, vector_type=domain, 
        fuzzy_threshold=75, top_k=3, search_model=finetuned_model
    )
    
    # Track which method provided the results
    if results and results[0]["source"] == "fuzzy":
        finetuned_stats[domain]["fuzzy"] += 1
    elif results:
        finetuned_stats[domain]["embedding"] += 1
        
    # Print results
    print(f"\nQuery: '{query}' (expected: {expected})")
    for i, res in enumerate(results, 1):
        match = "✓" if res["text"] == expected else " "
        print(f"  {i}. {res['text']} ({res['score']:.4f}, {res['source']}) {match}")
    
    # Visual separator
    print("-" * 60)

# Print summary stats
print("\n=== FINE-TUNED MODEL DETECTION SUMMARY ===")
print(f"Total test cases: {finetuned_stats['total']}")

print(f"\nBrand cases: {finetuned_stats['brand_total']}")
if finetuned_stats['brand_total'] > 0:
    print(f"Resolved by fuzzy: {finetuned_stats['brand']['fuzzy']} ({finetuned_stats['brand']['fuzzy']/finetuned_stats['brand_total']*100:.1f}%)")
    print(f"Resolved by embeddings: {finetuned_stats['brand']['embedding']} ({finetuned_stats['brand']['embedding']/finetuned_stats['brand_total']*100:.1f}%)")

print(f"\nModel cases: {finetuned_stats['model_total']}")
if finetuned_stats['model_total'] > 0:
    print(f"Resolved by fuzzy: {finetuned_stats['model']['fuzzy']} ({finetuned_stats['model']['fuzzy']/finetuned_stats['model_total']*100:.1f}%)")
    print(f"Resolved by embeddings: {finetuned_stats['model']['embedding']} ({finetuned_stats['model']['embedding']/finetuned_stats['model_total']*100:.1f}%)")


TESTING FINE-TUNED MODEL


FINE-TUNED MODEL RESULTS

No good fuzzy matches above threshold 75, using embeddings


  search_result = qdrant_client.search(



Query: 'neesun' (expected: Nissan)
  1. Nissan (0.6039, embedding) ✓
  2. Toyota (0.4988, embedding)  
  3. Mitsubishi (0.4841, embedding)  
------------------------------------------------------------
No good fuzzy matches above threshold 75, using embeddings

Query: 'benz' (expected: Mercedes-Benz)
  1. Mercedes-Benz (0.5434, embedding) ✓
  2. Perodua (0.5218, embedding)  
  3. BMW (0.4875, embedding)  
------------------------------------------------------------
No good fuzzy matches above threshold 75, using embeddings

Query: 'merz' (expected: Mercedes-Benz)
  1. Mercedes-Benz (0.5536, embedding) ✓
  2. Chery (0.5491, embedding)  
  3. Perodua (0.5171, embedding)  
------------------------------------------------------------
No good fuzzy matches above threshold 75, using embeddings

Query: 'mercedesbenz' (expected: Mercedes-Benz)
  1. Mercedes-Benz (0.6245, embedding) ✓
  2. Perodua (0.5866, embedding)  
  3. Chery (0.5607, embedding)  
------------------------------------------

  search_result = qdrant_client.search(


## Retrain the trained model

In [10]:
import os
import shutil
import datetime
from supabase import create_client
import zipfile
from datetime import datetime, timedelta

def zip_model(model_path, output_zip=None):
    """
    Zip a model directory into a compressed file
    
    Args:
        model_path: Path to the model directory
        output_zip: Path for the output zip file (optional)
    
    Returns:
        Path to the created zip file
    """
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Model directory not found: {model_path}")
    
    # Create a zip filename with current date if not provided
    if output_zip is None:
        today = datetime.now().strftime("%Y%m%d")
        output_zip = f"model_{today}.zip"
    
    # Create the zip file
    with zipfile.ZipFile(output_zip, 'w', zipfile.ZIP_DEFLATED) as zipf:
        # Walk through all files in the directory
        for root, dirs, files in os.walk(model_path):
            for file in files:
                # Calculate path for file in zip
                file_path = os.path.join(root, file)
                arcname = os.path.relpath(file_path, os.path.dirname(model_path))
                
                # Add file to zip
                zipf.write(file_path, arcname=arcname)
    
    print(f"Model zipped successfully to {output_zip}")
    return output_zip

def upload_model_to_supabase(zip_path, supabase_client, bucket_name="codenection-sss-model-training"):
    """
    Upload a zipped model to Supabase storage
    
    Args:
        zip_path: Path to the zipped model file
        supabase_client: Initialized Supabase client
        bucket_name: Name of the storage bucket
    
    Returns:
        URL of the uploaded file
    """
    if not os.path.exists(zip_path):
        raise FileNotFoundError(f"Zip file not found: {zip_path}")
    
    # Get the filename
    file_name = os.path.basename(zip_path)
    
    # Upload to Supabase storage
    with open(zip_path, 'rb') as f:
        file_content = f.read()
        
    # Upload to model/ folder in the bucket
    path = f"model/{file_name}"
    response = supabase_client.storage.from_(bucket_name).upload(
        path,
        file_content,
        file_options={"content-type": "application/zip"}
    )
    
    # Generate the public URL
    url = supabase_client.storage.from_(bucket_name).get_public_url(path)
    
    print(f"Model uploaded to Supabase: {url}")
    return url

In [11]:
def get_latest_model_url(supabase_client, bucket_name="codenection-sss-model-training", days_ago=0):
    """
    Get the URL of the latest model (or from X days ago)
    
    Args:
        supabase_client: Initialized Supabase client
        bucket_name: Name of the storage bucket
        days_ago: How many days back to look for the model (0 = latest)
    
    Returns:
        URL of the model
    """
    # List all files in the model/ directory
    files = supabase_client.storage.from_(bucket_name).list("model")
    
    # Filter only zip files
    model_files = [f for f in files if f["name"].endswith(".zip")]
    
    if not model_files:
        raise FileNotFoundError("No model files found in storage")
    
    # Sort by name (which contains the date)
    model_files.sort(key=lambda x: x["name"], reverse=True)
    
    if days_ago == 0:
        # Get the most recent model
        target_file = model_files[0]
    else:
        # Try to find a model from days_ago days
        target_date = (datetime.now() - timedelta(days=days_ago)).strftime("%Y%m%d")
        matching_files = [f for f in model_files if target_date in f["name"]]
        
        if matching_files:
            target_file = matching_files[0]
        elif model_files:
            # Fall back to the most recent model before the target date
            earlier_files = [f for f in model_files if f["name"].split("_")[1].split(".")[0] < target_date]
            if earlier_files:
                target_file = earlier_files[0]
            else:
                target_file = model_files[-1]  # Oldest file
        else:
            raise FileNotFoundError(f"No model files found for {target_date} or earlier")
    
    # Generate the public URL
    path = f"model/{target_file['name']}"
    url = supabase_client.storage.from_(bucket_name).get_public_url(path)
    
    print(f"Found model: {target_file['name']}")
    return url

def download_model_from_supabase(url, output_dir=None):
    """
    Download a model from a Supabase URL and extract it
    
    Args:
        url: URL of the model zip file
        output_dir: Directory to extract the model to (optional)
    
    Returns:
        Path to the extracted model directory
    """
    import requests
    import tempfile
    
    # Create temporary file for downloading
    with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as tmp_file:
        temp_path = tmp_file.name
    
    # Download the file
    print(f"Downloading model from {url}...")
    response = requests.get(url, stream=True)
    response.raise_for_status()
    
    with open(temp_path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)
    
    # Create output directory if not specified
    if output_dir is None:
        output_dir = "./downloaded_model"
    
    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)
    
    # Extract the zip file
    with zipfile.ZipFile(temp_path, 'r') as zip_ref:
        zip_ref.extractall(output_dir)
    
    # Clean up the temporary file
    os.unlink(temp_path)
    
    print(f"Model downloaded and extracted to {output_dir}")
    return output_dir

def load_model_from_supabase(supabase_client, days_ago=0, output_dir=None):
    """
    Load a model from Supabase storage (either latest or from X days ago)
    
    Args:
        supabase_client: Initialized Supabase client
        days_ago: How many days back to look for the model (0 = latest)
        output_dir: Directory to extract the model to (optional)
    
    Returns:
        Loaded SentenceTransformer model
    """
    from sentence_transformers import SentenceTransformer
    
    # Get the URL of the model
    url = get_latest_model_url(supabase_client, days_ago=days_ago)
    
    # Download and extract the model
    model_dir = download_model_from_supabase(url, output_dir)
    
    # Load the model
    model = SentenceTransformer(model_dir)
    
    print(f"Model loaded successfully from {model_dir}")
    return model

In [5]:
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
import os

def prepare_training_examples(df):
    """Create training examples for fine-tuning"""
    train_examples = []
    
    for _, row in df.iterrows():
        query = row['query']
        correction = row['correction']
        domain = row['domain']
        
        # Format with context prefixes matching your ingest format
        if domain == "brand":
            # The typo should map to the correct brand embedding
            query_text = f"car brand: {query}"
            correction_text = f"car brand: {correction}"
            
            # Create training pair (these should map to the same vector)
            train_examples.append(InputExample(texts=[query_text, correction_text]))
            
            # Also add reverse to strengthen the connection
            train_examples.append(InputExample(texts=[correction_text, query_text]))
            
        else:  # model
            query_text = f"car model: {query}"
            correction_text = f"car model: {correction}"
            
            # Create training pair
            train_examples.append(InputExample(texts=[query_text, correction_text]))
            train_examples.append(InputExample(texts=[correction_text, query_text]))
    
    return train_examples

# Path to your saved model
EXISTING_MODEL_PATH = "./model/finetuned-embedding-model-v3"
OUTPUT_PATH = "./model/finetuned-embedding-model-v4"  # New save location

# Load the existing model
model = SentenceTransformer(EXISTING_MODEL_PATH)

In [9]:
# Cell to fetch training data from Supabase
%pip install supabase

import pandas as pd
import sys
import os
from datetime import datetime, timedelta

# Add the parent directory to sys.path so 'app' can be imported
notebook_dir = os.path.dirname(os.path.abspath("__file__"))
parent_dir = os.path.abspath(os.path.join(notebook_dir, ".."))
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from app.api.services.config import SUPABASE_ANON_KEY, SUPABASE_URL
from supabase import create_client, Client

def fetch_training_data_from_supabase():
    """Fetch typo correction data from Supabase"""
    try:
        # Initialize Supabase client
        client = create_client(SUPABASE_URL, SUPABASE_ANON_KEY)
        
        # Fetch all records from typo_training_dataset table
        response = client.table("typo_training_dataset").select("typo", "corrected", "domain").execute()
        
        if not response.data:
            print("No training data found in Supabase table")
            return pd.DataFrame()
        
        # Convert to DataFrame
        df = pd.DataFrame(response.data)
        
        # Check required columns exist
        required_cols = ["typo", "corrected", "domain"]
        missing_cols = [col for col in required_cols if col not in df.columns]
        if missing_cols:
            print(f"Missing required columns in Supabase data: {missing_cols}")
            return pd.DataFrame()
        
        # Rename columns to match existing code
        df = df.rename(columns={"typo": "query", "corrected": "correction"})
        
        print(f"Successfully fetched {len(df)} training examples from Supabase")
        return df
        
    except Exception as e:
        print(f"Error fetching data from Supabase: {e}")
        return pd.DataFrame()

# Fetch data from Supabase
def retrain_model_and_upload(supabase_train_df):
    """Retrain the model and upload it to Supabase"""
    # Initialize Supabase client
    supabase_client = create_client(SUPABASE_URL, SUPABASE_ANON_KEY)
    
    # Define paths
    today = datetime.now().strftime("%Y%m%d")
    LOCAL_MODEL_PATH = "./model/finetuned-embedding-model"
    OUTPUT_PATH = f"./model/finetuned-embedding-model-{today}"
    ZIP_PATH = f"./model/finetuned-embedding-model-{today}.zip"
    
    try:
        # Try to load the previous week's model for continued training
        try:
            print("Attempting to load last week's model...")
            model = load_model_from_supabase(supabase_client, days_ago=7)
            print(f"Successfully loaded model from last week: {model}")
        except Exception as e:
            print(f"Could not load last week's model: {e}. Using base model instead.")
            model = SentenceTransformer(MODEL_NAME)
    
        if not supabase_train_df.empty:
            # Generate training examples
            train_examples = prepare_training_examples(supabase_train_df)
            print(f"Created {len(train_examples)} training examples")
            
            # Create data loader
            train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=4)
            
            # Define loss function
            train_loss = losses.MultipleNegativesRankingLoss(model)
            
            # Check if CUDA is available
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            print(f"Training on: {device}")
            
            # Fine-tune
            print(f"Starting fine-tuning for 5 epochs...")
            warmup_steps = int(len(train_dataloader) * 5 * 0.1)
            
            model.fit(
                train_objectives=[(train_dataloader, train_loss)],
                epochs=5,
                optimizer_params={'lr': 1e-5},
                warmup_steps=warmup_steps,
                output_path=OUTPUT_PATH,
                show_progress_bar=True
            )
            
            print(f"Model fine-tuned and saved to {OUTPUT_PATH}")
            
            # Zip the model
            zip_path = zip_model(OUTPUT_PATH, ZIP_PATH)
            
            # Upload to Supabase
            model_url = upload_model_to_supabase(zip_path, supabase_client)
            print(f"Model uploaded successfully: {model_url}")
            
            return model, model_url
        else:
            print("No training data available. Model not trained.")
            return None, None
    except Exception as e:
        print(f"Error during model training and upload: {e}")
        return None, None

# Call the function to retrain and upload
supabase_train_df = fetch_training_data_from_supabase()
model, model_url = retrain_model_and_upload(supabase_train_df)

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip
2025-09-26 13:21:49,189 - httpx - INFO - HTTP Request: GET https://cciyfbgiyqdutxdxwyxj.supabase.co/rest/v1/typo_training_dataset?select=typo%2Ccorrected%2Cdomain "HTTP/2 200 OK"


Successfully fetched 29 training examples from Supabase


2025-09-26 13:21:49,594 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: cpu
2025-09-26 13:21:49,595 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: intfloat/multilingual-e5-small


Attempting to load last week's model...
Could not load last week's model: name 'load_model_from_supabase' is not defined. Using base model instead.


KeyboardInterrupt: 

In [6]:
# Create data loader for the new examples
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=4)

# Define loss function
train_loss = losses.MultipleNegativesRankingLoss(model)

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Training on: {device}")

# Continue fine-tuning
print(f"Starting additional fine-tuning for 5 epochs...")
warmup_steps = int(len(train_dataloader) * 5 * 0.1)  # 10% of training as warmup

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=5,  # Fewer epochs for continued training
    optimizer_params={'lr': 1e-5},
    warmup_steps=warmup_steps,
    output_path=OUTPUT_PATH,
    show_progress_bar=True
)

print(f"Model fine-tuned and saved to {OUTPUT_PATH}")

NameError: name 'train_examples' is not defined

In [10]:
import os
import sys
from datetime import datetime
from supabase import create_client
import zipfile
import requests
from huggingface_hub import HfApi

api = HfApi(token=os.getenv("HF_TOKEN"))
api.upload_folder(
    folder_path="/path/to/local/model",
    repo_id="ShawnSean/AutoValidate-Embedding-Model",
    repo_type="model",
)


# Add the parent directory to sys.path so 'app' can be imported
notebook_dir = os.path.dirname(os.path.abspath("__file__"))
parent_dir = os.path.abspath(os.path.join(notebook_dir, ".."))
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# Import required functions from your notebook
from app.api.services.config import SUPABASE_ANON_KEY, SUPABASE_URL


def upload_model_to_supabase_direct_api(zip_path, bucket_name="codenection-sss-model-training"):
    """
    Upload a zipped model to Supabase storage using direct API calls and chunked upload
    
    Args:
        zip_path: Path to the zipped model file
        bucket_name: Name of the storage bucket
    
    Returns:
        URL of the uploaded file
    """
    if not os.path.exists(zip_path):
        raise FileNotFoundError(f"Zip file not found: {zip_path}")
    
    # Get the file size
    file_size = os.path.getsize(zip_path)
    print(f"File size: {file_size / (1024 * 1024):.2f} MB")
    
    # Get the filename
    file_name = os.path.basename(zip_path)
    storage_path = f"model/{file_name}"
    SUPABASE_SERVICE_KEY = os.getenv("SUPABASE_SERVICE_KEY") 
    print(f"Supabase Service Key")
    if not SUPABASE_SERVICE_KEY:
        print("SUPABASE_SERVICE_KEY environment variable not found.")
        print("Please set it to your service role key from Supabase dashboard.")
        return None
    
    supabase_client = create_client(SUPABASE_URL, SUPABASE_SERVICE_KEY)
    # Supabase API URL and headers
    api_url = f"{SUPABASE_URL}/storage/v1/object/{bucket_name}/{storage_path}"
    headers = {
        "apikey": SUPABASE_ANON_KEY,
        "Authorization": f"Bearer {SUPABASE_SERVICE_KEY}",
        "Content-Type": "application/octet-stream",
        "x-upsert": "true"  # Overwrite if exists
    }
    
    # Upload in chunks to avoid memory issues
    CHUNK_SIZE = 5 * 1024 * 1024  # 5MB chunks
    
    # Upload with requests using chunked transfer
    with open(zip_path, 'rb') as f:
        # Create a generator that yields chunks of the file
        def read_in_chunks():
            while True:
                chunk = f.read(CHUNK_SIZE)
                if not chunk:
                    break
                yield chunk
        
        print(f"Starting upload of {file_name} to {bucket_name}/{storage_path}")
        response = requests.post(
            api_url,
            headers=headers,
            data=read_in_chunks()  # Use a generator to read and send chunks
        )
    
    if response.status_code == 200:
        print(f"Upload successful with status code: {response.status_code}")
        public_url = f"{SUPABASE_URL}/storage/v1/object/public/{bucket_name}/{storage_path}"
        print(f"Public URL: {public_url}")
        return public_url
    else:
        print(f"Upload failed with status code: {response.status_code}")
        print(f"Response: {response.text}")
        raise Exception(f"Failed to upload file: {response.text}")

def upload_initial_model():
    """Upload the existing fine-tuned model to Supabase storage"""
    
    # Path to your existing model
    model_path = "./model/finetuned-embedding-model"
    
    if not os.path.exists(model_path):
        print(f"Error: Model directory not found at {model_path}")
        return None
    
    # Create a zip filename with current date
    today = datetime.now().strftime("%Y%m%d")
    zip_path = f"./model/finetuned-embedding-model-{today}.zip"
    
    print(f"1. Zipping model from {model_path}...")
    zip_path = zip_model_in_chunks(model_path, zip_path)
    
    print(f"2. Uploading model to Supabase...")
    model_url = upload_model_to_supabase_direct_api(zip_path)
    
    print(f"3. Model uploaded successfully!")
    print(f"   URL: {model_url}")
    
    return model_url

if __name__ == "__main__":
    print("Starting initial model upload...")
    try:
        upload_initial_model()
        print("Upload process completed successfully.")
    except Exception as e:
        print(f"Upload failed: {e}")

ValueError: Provided path: 'S:\path\to\local\model' is not a directory

In [12]:
import os
from datetime import datetime
import zipfile

def zip_model_in_chunks(model_path, output_zip=None):
    """
    Zip a model directory into a compressed file
    
    Args:
        model_path: Path to the model directory
        output_zip: Path for the output zip file (optional)
    
    Returns:
        Path to the created zip file
    """
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Model directory not found: {model_path}")
    
    # Create a zip filename with current date if not provided
    if output_zip is None:
        today = datetime.now().strftime("%Y%m%d")
        output_zip = f"model_{today}.zip"
    
    # Create the zip file with smaller compression for better memory usage
    with zipfile.ZipFile(output_zip, 'w', zipfile.ZIP_STORED) as zipf:
        # Walk through all files in the directory
        for root, dirs, files in os.walk(model_path):
            for file in files:
                # Calculate path for file in zip
                file_path = os.path.join(root, file)
                arcname = os.path.relpath(file_path, os.path.dirname(model_path))
                
                # Add file to zip
                zipf.write(file_path, arcname=arcname)
    
    print(f"Model zipped successfully to {output_zip}")
    return output_zip


def upload_model_to_hf(zip_path, repo_id=None, repo_type="model"):
    """
    Upload a single file (zip) to Hugging Face Hub.
    Requires HF_TOKEN in env and HF_REPO (or pass repo_id).
    Returns public URL to the uploaded file in the repo.
    """
    from huggingface_hub import HfApi
    HF_TOKEN = os.getenv("HF_TOKEN")
    if not HF_TOKEN:
        print("HF_TOKEN not set in environment.")
        return None

    # repo id from env if not provided
    repo_id = repo_id or os.getenv("HF_REPO")
    if not repo_id:
        print("HF_REPO not set. Set env HF_REPO like 'username/repo-name' or pass repo_id.")
        return None

    file_name = os.path.basename(zip_path)
    path_in_repo = f"models/{file_name}"  # where file will live in the repo

    api = HfApi(token=HF_TOKEN)
    print(f"Uploading {zip_path} -> {repo_id}:{path_in_repo} ...")

    # upload_file will stream the file and does not require loading whole file into memory
    api.upload_file(
        path_or_fileobj=zip_path,
        path_in_repo=path_in_repo,
        repo_id=repo_id,
        repo_type=repo_type,
        token=HF_TOKEN,
        commit_message=f"Upload model {file_name} ({datetime.now().isoformat()})"
    )

    url = f"https://huggingface.co/{repo_id}/resolve/main/{path_in_repo}"
    print(f"Upload complete. URL: {url}")
    return url

def upload_initial_model():
    """
    Zip local model and upload to Hugging Face Hub
    """
    # Path to your existing model
    model_path = "./model/finetuned-embedding-model"

    if not os.path.exists(model_path):
        print(f"Error: Model directory not found at {model_path}")
        return None

    # Create a zip filename with current date
    today = datetime.now().strftime("%Y%m%d")
    zip_path = f"./model/finetuned-embedding-model-{today}.zip"

    print(f"1. Zipping model from {model_path}...")
    zip_path = zip_model_in_chunks(model_path, zip_path)  # re-uses zip (no chunking while uploading)

    print(f"2. Uploading model to Hugging Face Hub...")
    model_url = upload_model_to_hf(zip_path)
    if model_url:
        print(f"3. Model uploaded successfully!\n   URL: {model_url}")
    else:
        print("Upload failed.")
    return model_url

if __name__ == "__main__":
    print("Starting initial model upload...")
    try:
        upload_initial_model()
        print("Upload process completed.")
    except Exception as e:
        print(f"Upload failed: {e}")

Starting initial model upload...
1. Zipping model from ./model/finetuned-embedding-model...
Model zipped successfully to ./model/finetuned-embedding-model-20250926.zip
2. Uploading model to Hugging Face Hub...
Uploading ./model/finetuned-embedding-model-20250926.zip -> ShawnSean/AutoValidate-Embedding-Model:models/finetuned-embedding-model-20250926.zip ...
Upload complete. URL: https://huggingface.co/ShawnSean/AutoValidate-Embedding-Model/resolve/main/models/finetuned-embedding-model-20250926.zip
3. Model uploaded successfully!
   URL: https://huggingface.co/ShawnSean/AutoValidate-Embedding-Model/resolve/main/models/finetuned-embedding-model-20250926.zip
Upload process completed.


In [13]:
import os
import zipfile
import tempfile
import shutil
from datetime import datetime
from huggingface_hub import snapshot_download, HfApi
from sentence_transformers import SentenceTransformer

def load_model_from_hf(
    repo_id="ShawnSean/AutoValidate-Embedding-Model", 
    model_date=None, 
    use_temp=True,
    cache_dir=None
):
    """
    Download and load a model from Hugging Face Hub
    
    Args:
        repo_id: Hugging Face repo ID (default: "ShawnSean/AutoValidate-Embedding-Model")
        model_date: Specific date of model to load (format: YYYYMMDD, default: latest)
        use_temp: Whether to use a temporary directory (deleted on program exit)
        cache_dir: Custom cache directory (if use_temp=False)
        
    Returns:
        Loaded SentenceTransformer model
    """
    # Set up HF_TOKEN from env if not already in environment
    if "HF_TOKEN" not in os.environ:
        from dotenv import load_dotenv
        load_dotenv()
    
    # Disable progress bars for cleaner output
    os.environ["HF_HUB_DISABLE_PROGRESS"] = "1"
    
    # Get all model files in the repo to find the latest if no date specified
    api = HfApi()
    
    try:
        # List model files in the repo
        model_files = [
            f for f in api.list_repo_files(repo_id=repo_id)
            if f.startswith("models/finetuned-embedding-model-") and f.endswith(".zip")
        ]
        
        if not model_files:
            raise FileNotFoundError(f"No model files found in {repo_id}")
        
        # Extract the date from filename
        date_from_file = None
        
        if model_date is None:
            # Find the latest model if no specific date requested
            # Sort by date in filename
            model_files.sort(reverse=True)
            model_file = model_files[0]
            # Extract date from filename (e.g., finetuned-embedding-model-20250925.zip -> 20250925)
            date_from_file = model_file.split('-')[-1].split('.')[0]
            print(f"Using latest model: {model_file} (date: {date_from_file})")
        else:
            # Find the model with the specified date
            date_str = str(model_date)
            matching_files = [f for f in model_files if date_str in f]
            if not matching_files:
                raise FileNotFoundError(f"No model found for date {model_date}")
            model_file = matching_files[0]
            date_from_file = date_str
            print(f"Using model from {model_date}: {model_file}")
        
        # Create extract directory with date
        base_dir = cache_dir if not use_temp else "./latest_model"
        extract_dir = os.path.join(base_dir, date_from_file)
        
        # Create the directory if it doesn't exist
        os.makedirs(extract_dir, exist_ok=True)
        
        # Define the final model directory path
        model_dir = os.path.join(extract_dir, "finetuned-embedding-model")
        
        # Skip download if model already exists
        if os.path.exists(model_dir) and os.path.isdir(model_dir):
            print(f"Model already exists at {model_dir}, skipping download")
        else:
            # Download the specific file from the repo
            print(f"Downloading model from Hugging Face Hub...")
            repo_dir = snapshot_download(
                repo_id=repo_id,
                allow_patterns=[model_file],
                repo_type="model"
            )
            
            # Path to the downloaded zip file
            zip_path = os.path.join(repo_dir, model_file)
            
            # Extract the model
            print(f"Extracting model to {extract_dir}...")
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(extract_dir)
            
            # Clean up the download cache to save space
            if os.path.exists(repo_dir) and repo_dir != extract_dir:
                shutil.rmtree(repo_dir)
        
        # Load the model with SentenceTransformer
        print(f"Loading model from {model_dir}...")
        model = SentenceTransformer(model_dir)
        print("Model loaded successfully!")
        
        return model
        
    except Exception as e:
        print(f"Error loading model from Hugging Face: {e}")
        raise

# Example usage:
if __name__ == "__main__":
    # Load the latest model
    model = load_model_from_hf()
    
    # Or load a specific date's model
    # model = load_model_from_hf(model_date="20250925")

Using latest model: models/finetuned-embedding-model-20250926.zip (date: 20250926)
Downloading model from Hugging Face Hub...


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Fetching 1 files: 100%|██████████| 1/1 [02:04<00:00, 124.08s/it]


Extracting model to ./latest_model\20250926...


2025-09-26 13:29:58,997 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: cpu
2025-09-26 13:29:58,999 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: ./latest_model\20250926\finetuned-embedding-model


Loading model from ./latest_model\20250926\finetuned-embedding-model...
Model loaded successfully!
