In [None]:
import os
import sys
import pandas as pd
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.schema.document import Document
import argparse

# Model name handed to the embeddings class as ``model_name``.
EMBEDDING_MODEL_NAME = "BAAI/bge-m3"
# Default location of the car rate workbook (relative to the working directory).
# NOTE(review): the backslash is a Windows path separator; on other platforms
# this relative path will likely not resolve -- confirm the intended runtime.
EXCEL_FILE_PATH = r'Data real\Car rate book.xlsx'
# Default destination used when saving the FAISS vector store.
VECTOR_STORE_PATH = "car_rate_vectorstore"

def format_car_row(row):
    """Render one rate-book row as a single "Label: value" description line.

    Args:
        row: Any mapping-like object exposing ``.get(column)`` (for example a
            pandas Series or a dict) keyed by the rate-book column names.

    Returns:
        str: The labelled values joined with ", " in a fixed column order, or
        "Insufficient information" when no column yields a usable value.
    """
    # (source column, human-readable label), in output order.
    field_labels = (
        ('TYPECOD', 'Brand'),
        ('MODELCOD', 'Main Model'),
        ('MODELDESC', 'Sub Model'),
        ('MANUYR', 'Year'),
        ('GEAR', 'Transmission'),
        ('GCODE', 'Vehicle Type'),
        ('PRODUCT GROUP', 'Product Group'),
        ('RATE', 'Appraisal Price'),
    )
    integer_fields = ('RATE', 'MANUYR')

    fragments = []
    for column, caption in field_labels:
        raw = row.get(column)

        # Skip missing values and blank / whitespace-only text.
        if pd.isna(raw) or not str(raw).strip():
            continue

        if column not in integer_fields:
            fragments.append(f"{caption}: {raw}")
            continue

        try:
            number = int(raw)
        except (ValueError, TypeError):
            # Not convertible to an integer: emit the raw value unchanged.
            fragments.append(f"{caption}: {raw}")
            continue

        # Zero and negative numbers are treated as "no data" and left out.
        if number <= 0:
            continue

        # Only the price gets thousands separators; the year is printed plainly.
        rendered = format(number, ",") if column == 'RATE' else str(number)
        fragments.append(f"{caption}: {rendered}")

    if not fragments:
        return "Insufficient information"
    return ", ".join(fragments)

def load_car_data(file_path):
    """Load the car rate book from an Excel workbook into a typed DataFrame.

    Every cell is first read as text, with missing cells replaced by empty
    strings. ``MANUYR`` and ``RATE`` are then converted to nullable integers,
    and ``FDATEA`` / ``LDATEA`` (each only if present) to datetimes.

    Args:
        file_path: Path to the Excel file; its first row is used as the header.

    Returns:
        pandas.DataFrame: The sheet contents. Unparseable numeric cells become
        ``<NA>`` and unparseable date cells become ``NaT``.

    Raises:
        KeyError: If the ``MANUYR`` or ``RATE`` column is missing.

    Errors raised by ``pandas.read_excel`` (for example a missing file)
    propagate unchanged.
    """
    df = pd.read_excel(file_path, header=0, dtype=str).fillna('')

    # Empty strings and other non-numeric text are coerced to NaN, which the
    # nullable Int64 dtype stores as <NA>.
    for numeric_col in ('MANUYR', 'RATE'):
        df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce').astype('Int64')

    # The date columns are optional. Each one is handled independently so that
    # a missing column no longer silently prevents the other from being
    # parsed, and no unrelated exception is swallowed along the way.
    for date_col in ('FDATEA', 'LDATEA'):
        if date_col not in df.columns:
            continue
        # errors='coerce' turns values that do not match the format into NaT.
        df[date_col] = pd.to_datetime(df[date_col], format='%d-%b-%y', errors='coerce')

    return df

class SafeHuggingFaceBgeEmbeddings(HuggingFaceBgeEmbeddings):
    """BGE embeddings whose ``embed_query`` tolerates a ``None`` query."""

    def embed_query(self, text):
        """Embed *text*, substituting an empty string when it is ``None``."""
        query = "" if text is None else text
        return super().embed_query(query)

def create_embeddings_model():
    """Return the embeddings object used to vectorise rate-book documents.

    The model named by ``EMBEDDING_MODEL_NAME`` is configured for the CPU
    device with normalised embeddings and a fixed query instruction prefix.
    """
    settings = {
        'model_name': EMBEDDING_MODEL_NAME,
        'model_kwargs': {'device': 'cpu'},
        'encode_kwargs': {'normalize_embeddings': True},
        'query_instruction': "Represent this query for retrieving relevant documents: ",
    }
    return SafeHuggingFaceBgeEmbeddings(**settings)

def create_car_vector_store(file_path=EXCEL_FILE_PATH, vector_store_path=VECTOR_STORE_PATH):
    """Build a FAISS vector store from the car rate book and save it locally.

    Each workbook row is rendered to one text line with ``format_car_row`` and
    wrapped in a ``Document`` whose metadata carries the row's position (as a
    string ``id``) and the fixed source tag ``"car_rate_book"``.

    Args:
        file_path: Path to the car rate Excel file.
        vector_store_path: Destination passed to ``save_local``.

    Returns:
        bool: True once the store has been saved; False when the workbook
        contains no rows, in which case nothing is embedded or written.
    """
    car_data = load_car_data(file_path)

    documents = [
        Document(
            page_content=format_car_row(row),
            metadata={"id": str(position), "source": "car_rate_book"},
        )
        for position, (_, row) in enumerate(car_data.iterrows())
    ]

    # With no rows there is nothing to index. Report failure to the caller
    # before paying for the embedding model load.
    # NOTE(review): FAISS index construction is expected to fail on an empty
    # document list -- confirm against the installed langchain version.
    if not documents:
        return False

    embed_model = create_embeddings_model()
    vector_store = FAISS.from_documents(documents, embed_model)
    vector_store.save_local(vector_store_path)
    return True

def main():
    """Entry point: resolve input/output paths and build the vector store.

    When running inside a Jupyter kernel (``ipykernel`` is loaded) the module
    defaults are used, since the kernel's own command line must not be parsed.
    Otherwise ``--input/-i`` and ``--output/-o`` are read from the command
    line, falling back to the same defaults.
    """
    running_in_notebook = 'ipykernel' in sys.modules

    if not running_in_notebook:
        cli = argparse.ArgumentParser(description="Convert car rate book to FAISS vector store")
        cli.add_argument(
            "--input", "-i",
            default=EXCEL_FILE_PATH,
            help=f"Path to car rate Excel file (default: {EXCEL_FILE_PATH})",
        )
        cli.add_argument(
            "--output", "-o",
            default=VECTOR_STORE_PATH,
            help=f"Path to save FAISS vector store (default: {VECTOR_STORE_PATH})",
        )
        options = cli.parse_args()
        input_path, output_path = options.input, options.output
    else:
        input_path, output_path = EXCEL_FILE_PATH, VECTOR_STORE_PATH

    # Report the outcome based on the builder's boolean result.
    if create_car_vector_store(input_path, output_path):
        print(f"Vector store successfully created at {output_path}")
    else:
        print("Failed to create vector store")

# Run the conversion only when this file is executed directly (or as a
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

  from .autonotebook import tqdm as notebook_tqdm


Vector store successfully created at car_rate_vectorstore
