* Change Embedding model from nomic-> stella_en_1.5B_v5
* Modified JSON schema

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
from opensearchpy import OpenSearch
import uuid
import json
import torch

#### Local Machine - Windows

In [3]:
client = OpenSearch(
    hosts=[{'host': 'localhost', 'port': 9200}],
    http_auth=('admin', 'SamplePassword1!'),
    use_ssl=False,
    verify_certs=False
)

# Verify the connection
if client.ping():
    print("Successfully connected to OpenSearch!")
else:
    print("Failed to connect to OpenSearch.")

Successfully connected to OpenSearch!


In [4]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer(
    "dunzhang/stella_en_1.5B_v5",
    trust_remote_code=True,
    device="cpu", 
    config_kwargs={"use_memory_efficient_attention": False, "unpad_inputs": False, "cache_folder":"./"})

In [5]:
def CreateEmbed(input_text):
    return model.encode(input_text).tolist()

In [6]:
# %%time
# embed = CreateEmbed("Camry")

### Define EMBEDDING_DIM

In [7]:
EMBEDDING_DIM = model.encode(["Sample sentence"])[0].shape[0]
EMBEDDING_DIM

1024

### Data Formating

In [8]:
import json
with open("../Data.txt", "r") as f:
    # Read the entire file content and join lines into a single string
    cars_data = json.loads("".join(f.readlines()))

In [9]:
# cars_data['items'][0]['vin']

In [10]:
from tqdm.notebook import tqdm

def CreateMetadataandTextfromJSON(json_data):
    vehicle_data = {
        "vin": json_data['vin'],
        "vehicle_info": {
            "vehicle_serial_number": json_data['serialNbr'],
            "vehicle_body_style_description": json_data['bodyStyleDesc'],
            "vehicle_brand": json_data['brand'],
#             "marketing_grade_code": json_data['marketingGrade']['code'],
            "marketing_grade_title": json_data['marketingGrade']['title']
        },
        "engine_info": {
            "engine_code": json_data['engine']['engineCd'],
            "engine_number": json_data['engine']['engineNbr'],
#             "engine_name": json_data['engine']['name'],
#             "engine_fuel_type": json_data['engine']['fuelType'],
#             "engine_cylinders_count": json_data['engine']['noOfCylinders'],
#             "engine_horsepower": json_data['engine']['horsepower']
        },
        "price_info": {
#             "price_optional_total_msrp": json_data['price']['optTotalMsrp'],
            "price_total_msrp": json_data['price']['totalMsrp'],
#             "price_base_msrp": json_data['price']['baseMsrp'],
#             "price_ppo_holdback": json_data['price']['ppoHoldback']
        },
        "color_info": {
#             "interior_color_code": json_data['intColor']['colorCd'],
#             "interior_color_nvs_name": json_data['intColor']['nvsName'],
#             "exterior_color_code": json_data['extColor']['colorCd'],
#             "exterior_color_nvs_name": json_data['extColor']['nvsName'],
#             "exterior_color_hex_code": json_data['extColor']['colorHexCd'],
            "exterior_color_common_name_display": json_data['extColor']['commonName']['display'],
            "exterior_color_common_name_generic": json_data['extColor']['commonName']['generic'],
#             "exterior_color_common_name_specific": json_data['extColor']['commonName']['specific']
        },
        "drivetrain_info": {
            "drivetrain_code": json_data['drivetrain']['code'],
            "drivetrain_title": json_data['drivetrain']['title']
        },
        "year": json_data['year'],
        "saleClass": json_data['saleClassCode']
    }
    return vehicle_data

In [11]:
# cars_data['items'][105] #'extColor': {'colorCd': '02ZC', color json issue

# Ext Color issue
# cars_data['items'][104]
# cars_data['items'][105]
# cars_data['items'][106]
# cars_data['items'][477]

In [12]:
def CreateSingleString(data, parent_key='', sep='_'):
    items = []
    for k, v in data.items():
        new_key = f"{parent_key}{sep}{k}" if parent_key else k
        if isinstance(v, dict):
            # Recurse into dictionary
            items.extend(CreateSingleString(v, new_key, sep=sep).items())
        else:
            # Add the key-value pair to the items list
            items.append((new_key, v))
    return dict(items)

In [13]:
def extract_values(d):
    values = " "
    if isinstance(d, dict):
        for v in d.values():
            values += extract_values(v)
    else:
        values += str(d)  # Convert all values to strings
    return values

import re

def clean_text(text):
    # Step 1: Remove HTML tags
    text_cleaned = re.sub(r'<.*?>', '', text)

    # Step 2: Remove metadata/placeholder tags (anything in square brackets [])
    text_cleaned = re.sub(r'\[.*?\]', '', text_cleaned)

    # Step 3: Replace special characters or bullet points like • with regular text
    text_cleaned = text_cleaned.replace('•', '-')

    text_cleaned = re.sub(r'[^a-zA-Z0-9\s]', ' ', text_cleaned)

    # Step 4: Remove extra spaces or newlines
    text_cleaned = re.sub(r'\s+', ' ', text_cleaned).strip()

    # Step 5: Optional - further cleaning, e.g., replace specific phrases or patterns
    # For example: "Includes 1-year trial of Drive Connect." can be simplified or removed.
    text_cleaned = re.sub(r'Includes \d+-year trial of Drive Connect\.', '', text_cleaned)

    return text_cleaned

In [14]:
def CategoryField(data):
    res = {}
    for key in data.keys():
        # Initialize the string to accumulate data
        combined_string = ''
        
        # Check if the field is a dictionary, meaning it contains subfields
        if isinstance(data[key], dict):
            for sub_key in data[key].keys():
                # Add the subfield value to the combined string
                combined_string += " " + str(data[key][sub_key])
        
        # If combined_string has content, store it in the result dictionary
        if combined_string:
            res[key] = combined_string.strip()  # Strip to remove leading/trailing spaces
            
        # Optionally print the key and combined value for debugging
        # print(f"Key: {key}, Combined String: {combined_string}")
    
    return res  # Return the result dictionary after processing

#### Create Marketing - Single String

In [15]:
def CreateSingleMarketingString(data):
    try:
        text =  clean_text(data['model']['marketingTitle'])
    except:
        text =  clean_text(data['model']['marketingName'])
    return text

#### Define Index

In [22]:
index_body = {
  "settings": {
    "index": {
      "knn": True,
      "knn.algo_param.ef_search": 100
    }
  },
  "mappings": {
    "properties": {
      "text": { "type": "text" },
      "vin": { "type": "text" },
      "vehicle_info": { "type": "text" },
      "engine_info": { "type": "text" },
      "price_info": { "type": "text" },
      "color_info": { "type": "text" },
      "drivetrain_info": { "type": "text" },
      "marketingname": { "type": "text" },
      "year": { "type": "text" },
      "saleClass": { "type": "text" },
      
      "text_embedding": {
        "type": "knn_vector",
        "dimension": EMBEDDING_DIM,  # Example dimension of embeddings
        "method": {
          "name": "hnsw",
          "space_type": "cosinesimil",
          "engine": "nmslib",
          "parameters": {
            "ef_construction": 128,
            "m": 24
          }
        }
      },
      
      "vehicle_info_embedding": {
        "type": "knn_vector",
        "dimension": EMBEDDING_DIM,  # Example dimension
        "method": {
          "name": "hnsw",
          "space_type": "cosinesimil",
          "engine": "nmslib",
          "parameters": {
            "ef_construction": 128,
            "m": 24
          }
        }
      },
      
      "engine_info_embedding": {
        "type": "knn_vector",
        "dimension": EMBEDDING_DIM,  # Example dimension
        "method": {
          "space_type": "cosinesimil",
          "name": "hnsw",
          "engine": "nmslib",
          "parameters": {
            "ef_construction": 128,
            "m": 24
          }
        }
      },
      
      "price_info_embedding": {
        "type": "knn_vector",
        "dimension": EMBEDDING_DIM,  # Example dimension
        "method": {
          "space_type": "cosinesimil",
          "name": "hnsw",
          "engine": "nmslib",
          "parameters": {
            "ef_construction": 128,
            "m": 24
          }
        }
      },
      
      "color_info_embedding": {
        "type": "knn_vector",
        "dimension": EMBEDDING_DIM,  # Example dimension
        "method": {
          "space_type": "cosinesimil",
          "name": "hnsw",
          "engine": "nmslib",
          "parameters": {
            "ef_construction": 128,
            "m": 24
          }
        }
      },
      
      "drivetrain_info_embedding": {
        "type": "knn_vector",
        "dimension": EMBEDDING_DIM,  # Example dimension
        "method": {
          "space_type": "cosinesimil",
          "name": "hnsw",
          "engine": "nmslib",
          "parameters": {
            "ef_construction": 128,
            "m": 24
          }
        }
      },
      
      "marketingname_embedding": {
        "type": "knn_vector",
        "dimension": EMBEDDING_DIM,  # Example dimension
        "method": {
          "space_type": "cosinesimil",
          "name": "hnsw",
          "engine": "nmslib",
          "parameters": {
            "ef_construction": 128,
            "m": 24
          }
        }
      },
      
      "metadata": {
      "type": "object",
      "properties": {
        "vin": { "type": "keyword" },
        "vehicle_info": {
          "type": "object",
          "properties": {
            "vehicle_serial_number": { "type": "keyword" },
            "vehicle_body_style_description": { "type": "keyword" },
            "vehicle_brand": { "type": "keyword" },
            "marketing_grade_title": { "type": "keyword" }
          }
        },
        "engine_info": {
          "type": "object",
          "properties": {
            "engine_code": { "type": "keyword" },
            "engine_number": { "type": "keyword" }
          }
        },
        "price_info": {
          "type": "object",
          "properties": {
            "price_total_msrp": { "type": "float" }
          }
        },
        "color_info": {
          "type": "object",
          "properties": {
            "exterior_color_common_name_display": { "type": "keyword" },
            "exterior_color_common_name_generic": { "type": "keyword" }
          }
        },
        "drivetrain_info": {
          "type": "object",
          "properties": {
            "drivetrain_code": { "type": "keyword" },
            "drivetrain_title": { "type": "keyword" }
          }
        },
        "year": { "type": "integer" },
        "saleClass": { "type": "keyword" }
      }
    }
    }
  }
}

### Check Index exists

In [17]:
import requests
from requests.auth import HTTPBasicAuth
auth=HTTPBasicAuth('admin', 'SamplePassword1!')

payload = {}

headers = {'Content-Type': 'application/json'}
url = f"http://localhost:9200/"
response = requests.head(url, headers=headers, params=payload, auth=auth)
print(response.content.decode('utf-8'))




### Delete Index

In [18]:
index_name = "toyota-dvs-stella-reindex"
try:
    response = client.indices.delete(index=index_name)
    print(response)
except Exception as e:
    print(e)

{'acknowledged': True}


### Check Index List

In [19]:
import requests
from requests.auth import HTTPBasicAuth

In [20]:
auth=HTTPBasicAuth('admin', 'SamplePassword1!')
headers = {'Content-Type': 'application/json'}

In [21]:
route = "_list/indices"
url = f"http://localhost:9200/{route}"

payload = {}

response = requests.get(url, headers=headers,auth=auth, data=json.dumps(payload))
response.content.decode('utf-8').split("\n")

['green  open .opensearch-observability        fuQfTOsyRvGeNfMPVvUYHg 1 0    0 0    208b    208b',
 'green  open .opendistro_security             pPRVpS5BR7ODMOslQ7DQ1Q 1 0   10 0  82.5kb  82.5kb',
 'yellow open security-auditlog-2025.01.05     Vzx3XrZqQD-jbNHlloa-aQ 1 1 1462 0   1.6mb   1.6mb',
 'green  open .kibana_1                        EvcojlPIQDeppJTtkjawXA 1 0    1 0   5.3kb   5.3kb',
 'green  open .plugins-ml-config               RZqDFzd5RAa0wV3SViJZzg 1 0    1 0     4kb     4kb',
 'green  open .plugins-ml-model-group          RxuxKqOtT7y9G0YIKjQGmg 1 0    2 0   8.3kb   8.3kb',
 'green  open .plugins-ml-task                 BU4FpBhcT5ysJe5eGD0qpw 1 0   29 5  46.4kb  46.4kb',
 'green  open .plugins-ml-model                F9_O0QkEQP2J7JzEO_gyMg 1 0   97 0   1.1gb   1.1gb',
 'green  open .kibana_92668751_admin_1         LT6blP9LTnGVfCo4grC9_w 1 0    1 0   5.3kb   5.3kb',
 'green  open .ql-datasources                  d0jB_VZaS8CRYrB5DG8ZCQ 1 0    0 0    208b    208b',
 'yellow o

### Create Index

In [23]:
index_name = "toyota-dvs-stella-reindex"
try:
    response = client.indices.create(index=index_name, body=index_body)
    #  response = client.indices.upgrade(index=index_name)
    print(response)
except Exception as e:
    print(e)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'toyota-dvs-stella-reindex'}


In [24]:
import uuid
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

# Function to create embeddings in parallel
def _create_embeddings_parallel(category_field_text, marketingname, SingleString):
    # List of fields to generate embeddings for
    fields = [
        ("text_info_embedding", SingleString),
        ("vehicle_info_embedding", category_field_text.get('vehicle_info')),
        ("engine_info_embedding", category_field_text.get('engine_info')),
        ("price_info_embedding", category_field_text.get('price_info')),
        ("color_info_embedding", category_field_text.get('color_info')),
        ("drivetrain_info_embedding", category_field_text.get('drivetrain_info')),
        ("marketingname_embedding", marketingname),
    ]
    
    # Create embeddings using ThreadPoolExecutor to parallelize the tasks
    with ThreadPoolExecutor() as executor:
        futures = {executor.submit(CreateEmbed, value): key for key, value in fields}
        embeddings = {}
        
        # Collect results as they are completed
        for future in as_completed(futures):
            key = futures[future]
            embeddings[key] = future.result()

    return embeddings

actions = []

# Loop through car data
for data in tqdm(cars_data['items']):  # Adjust the slice as needed
    try:
        # Extract metadata and clean text fields
        metadata = CreateMetadataandTextfromJSON(data)
        SingleString = clean_text(extract_values(metadata))
        category_field_text = CategoryField(metadata)
        marketingname = CreateSingleMarketingString(data)
        saleClass = metadata['saleClass']
        year = metadata['year']

        # Create embeddings in parallel using the batch function
        embeddings = _create_embeddings_parallel(category_field_text, marketingname, SingleString)

        # Constructing the action to index the document
        action = {
            "_op_type": "index",  # Operation type (index or create)
            "_index": index_name,  # Index name
            "_id": uuid.uuid4().hex,  # Document ID
            "_source": {
                "id": uuid.uuid4().hex,  # Document ID field
                "vin": metadata.get('vin'),  # Storing VIN
                "text_info": SingleString,  # Storing the text data
                "vehicle_info": category_field_text.get('vehicle_info'),  # Storing vehicle_info as text
                "engine_info": category_field_text.get('engine_info'),  # Storing engine_info as text
                "price_info": category_field_text.get('price_info'),  # Storing price_info as text
                "color_info": category_field_text.get('color_info'),  # Storing color_info as text
                "drivetrain_info": category_field_text.get('drivetrain_info'),  # Storing drivetrain_info as text
                "marketingname_info": marketingname,
                "year": year,
                "saleClass": saleClass,
                # Embedding results added for each field
                **embeddings,
                # Adding metadata for each document
                "metadata": metadata,  # Metadata field
            }
        }
        actions.append(action)
    except Exception as e:
        print(e)

 17%|████████████████████                                                                                                | 104/600 [06:00<29:32,  3.57s/it]

'commonName'
'commonName'
'commonName'


 29%|█████████████████████████████████▊                                                                                  | 175/600 [09:54<24:34,  3.47s/it]

'marketingGrade'


 80%|████████████████████████████████████████████████████████████████████████████████████████████▏                       | 477/600 [27:16<07:37,  3.72s/it]

'commonName'


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 600/600 [34:27<00:00,  3.45s/it]


In [22]:
# actions[0]

#### Save & Load Embedding Data

In [25]:
filename = "../embed_stell-1.5B-09012025.json"

with open(filename,"w") as f:
    json.dump(actions,f, indent=4)

#### Indexing Data Count

In [26]:
print("Total Docs Count : {}".format(len(actions)))

Total Docs Count : 595


#### Single Doc Index

In [114]:
# for idx, data in enumerate(tqdm(FinalDocs[1:])):
#     res = client.index(
#         index=index_name,
#         body={"id":  data['ids'], "text": data['text'],"embedding": data['embedding']},
#         id = str(data['ids']),
#         refresh = True
#         )
#     print(f"Idx : {idx} Status : {res['_shards']}")
#     print("\n")

In [115]:
# actions[0]

### Bulk Index

#### With small datasets

In [45]:
from opensearchpy.helpers import bulk

# Bulk index documents
try:
    success, failed = bulk(client, actions)
    print(f"Successfully indexed {success} documents. {failed} failed.")
except Exception as e:
    print(f"Error during bulk indexing: {e}")

Error during bulk indexing: TransportError(429, 'rejected_execution_exception', 'rejected execution of coordinating operation [coordinating_and_primary_bytes=0, replica_bytes=0, all_bytes=0, coordinating_operation_bytes=70588183, max_coordinating_and_primary_bytes=53687091]')


In [9]:
# Open the file in write mode and dump the actions into it
# with open("embeddings_st_400.json", 'w') as json_file:
#     json.dump(actions, json_file, indent=4)
    
    
# with open("embeddings_st_400.json", 'r') as json_file:
#     actions = json.load(json_file)

#### Large Datasets

In [27]:
from opensearchpy.helpers import bulk

# Helper function to split actions into smaller batches
def chunk_actions(actions, chunk_size=100):
    for i in range(0, len(actions), chunk_size):
        yield actions[i:i + chunk_size]

# Bulk index documents in smaller batches
try:
    total_success = 0
    total_failed = 0
    
    # Process the bulk indexing in chunks
    for chunk in chunk_actions(actions):
        success, failed = bulk(client, chunk)
        
        # Add the number of successful documents and failed documents
        total_success += success
        total_failed += len(failed)  # Count the number of failed documents

    print(f"Successfully indexed {total_success} documents. {total_failed} failed.")
except Exception as e:
    print(f"Error during bulk indexing: {e}")

Successfully indexed 595 documents. 0 failed.


### Search Query

In [117]:
# index_name

In [118]:
# Execute the search query
# results = client.search(
#     body=query_body,
#     index=index_name
# )

### Price Filter from Query

In [30]:
import re
import requests
import json

def FilterPriceFromQuery(user_query):
    # Find all matching numbers that are 5 digits or more
    price_data = re.findall(r'\d{5,}', user_query)
    
    if len(price_data) == 0:
        return {'Min': 0, 'Max': 0}
    
    if len(price_data) == 1:
        price_data.append('0')  # Add a second price if there's only one
    
    # Convert to integers for comparison
    price_data = list(map(int, price_data))
    
    # Using ternary operator to find min and max
    min_value = price_data[0] if price_data[0] < price_data[1] else price_data[1]
    max_value = price_data[0] if price_data[0] > price_data[1] else price_data[1]
    
    return {"Min": min_value, "Max": max_value}

### Query Based on Filters

In [120]:
def GetQueryResponse(user_query, top_k):
    query_embedding = model.encode(user_query).tolist()
    query_body = {
        "size": top_k,  # Number of results to return
        "query": {
            "bool": {
                "should": [
                    # Knn search for embedding fields
                    {
                        "knn": {
                            "vehicle_info_embedding": {
                                "vector": query_embedding,
                                "k": 3  # Number of nearest neighbors
                            }
                        }
                    },
                    {
                        "knn": {
                            "engine_info_embedding": {
                                "vector": query_embedding,
                                "k": 3
                            }
                        }
                    },
                    {
                        "knn": {
                            "drivetrain_info_embedding": {
                                "vector": query_embedding,
                                "k": 3
                            }
                        }
                    },
                    {
                        "knn": {
                            "marketingname_embedding": {
                                "vector": query_embedding,
                                "k": 3
                            }
                        }
                    },
                    # Knn search for non-textual fields (color_info_embedding, price_info_embedding)
                    {
                        "knn": {
                            "color_info_embedding": {
                                "vector": query_embedding,
                                "k": 3
                            }
                        }
                    },
                    {
                        "knn": {
                            "price_info_embedding": {
                                "vector": query_embedding,
                                "k": 3
                            }
                        }
                    },
                    # Text-based search for vin with terms query
                    {
                        "match": {
                            "vin": {
                                "query": user_query,  # Match the full user query
                                "boost": 10  # Highest priority
                            }
                        }
                    },
                    # Match queries for other fields with minimum_should_match: -1
                    {
                        "match": {
                            "text": {
                                "query": user_query,
                                "minimum_should_match": "-1",
                                "boost": 4
                            }
                        }
                    },
                    {
                        "match": {
                            "vehicle_info": {
                                "query": user_query,
                                "minimum_should_match": "-1",
                                "boost": 8
                            }
                        }
                    },
                    {
                        "match": {
                            "engine_info": {
                                "query": user_query,
                                "minimum_should_match": "-1",
                                "boost": 2
                            }
                        }
                    },
                    {
                        "match": {
                            "drivetrain_info": {
                                "query": user_query,
                                "minimum_should_match": "-1",
                                "boost": 3
                            }
                        }
                    },
                    {
                        "match": {
                            "marketingname_info": {
                                "query": user_query,
                                "minimum_should_match": "-1",
                                "boost": 5
                            }
                        }
                    },
                    # Match_phrase for price_info and color_info (as per instruction)
                    {
                        "match_phrase": {
                            "price_info": {
                                "query": user_query,
                                "boost": 2.0
                            }
                        }
                    },
                    {
                        "match_phrase": {
                            "color_info": {
                                "query": user_query,
                                "boost": 1.0
                            }
                        }
                    }
                ]
            }
        },
        "_source": [
            "text", "id", "vin", "embedding", "metadata", "vehicle_info", "engine_info",
            "color_info", "drivetrain_info", "marketingname_info"
        ],
        "sort": [
            {
                "_score": {
                    "order": "desc"  # Sort by score in descending order
                }
            }
        ]
    }
    
    # Execute the search query
    results = client.search(
        body=query_body,
        index=index_name
    )
    
    return results

In [27]:
def GetQueryResponse_v2(user_query, top_k):
    # Generate the query embedding from the user query
    
    query_embedding = model.encode(user_query).tolist()
    
    Min = FilterPriceFromQuery(user_query).get('Min', 0)
    Max = FilterPriceFromQuery(user_query).get('Max', None)

    query_body = {
    "size": top_k,  # Number of results to return
    "query": {
        "bool": {
            "should": [
                # Knn search for embedding fields
                {
                    "knn": {
                        "vehicle_info_embedding": {
                            "vector": query_embedding,
                            "k": 3  # Number of nearest neighbors
                        }
                    }
                },
                {
                    "knn": {
                        "engine_info_embedding": {
                            "vector": query_embedding,
                            "k": 3
                        }
                    }
                },
                {
                    "knn": {
                        "drivetrain_info_embedding": {
                            "vector": query_embedding,
                            "k": 3
                        }
                    }
                },
                {
                    "knn": {
                        "marketingname_embedding": {
                            "vector": query_embedding,
                            "k": 3
                        }
                    }
                },
                # Knn search for non-textual fields (color_info_embedding, price_info_embedding)
                {
                    "knn": {
                        "color_info_embedding": {
                            "vector": query_embedding,
                            "k": 3
                        }
                    }
                },
                {
                    "knn": {
                        "price_info_embedding": {
                            "vector": query_embedding,
                            "k": 3
                        }
                    }
                },
                # Text-based search for vin with terms query
                {
                    "match": {
                        "vin": {
                            "query": user_query,  # Match the full user query
                            "boost": 10  # Highest priority
                        }
                    }
                },
                # Match queries for other fields with minimum_should_match: -1
                {
                    "match": {
                        "text": {
                            "query": user_query,
                            "minimum_should_match": "-1",
                            "boost": 4
                        }
                    }
                },
                {
                    "match": {
                        "vehicle_info": {
                            "query": user_query,
                            "minimum_should_match": "-1",
                            "boost": 8
                        }
                    }
                },
                {
                    "match": {
                        "engine_info": {
                            "query": user_query,
                            "minimum_should_match": "-1",
                            "boost": 2
                        }
                    }
                },
                {
                    "match": {
                        "drivetrain_info": {
                            "query": user_query,
                            "minimum_should_match": "-1",
                            "boost": 3
                        }
                    }
                },
                {
                    "match": {
                        "marketingname_info": {
                            "query": user_query,
                            "minimum_should_match": "-1",
                            "boost": 5
                        }
                    }
                },
                # Match_phrase for price_info and color_info (as per instruction)
                {
                    "match_phrase": {
                        "price_info": {
                            "query": user_query,
                            "boost": 2.0
                        }
                    }
                },
                {
                    "match_phrase": {
                        "color_info": {
                            "query": user_query,
                            "boost": 1.0
                        }
                    }
                },
                # Range filter for price_total_msrp (Optional filter)
                {
                    "range": {
                        "metadata.price_info.price_total_msrp": {
                            "gte": Min,  # Replace <Min> with the minimum value
                            "lte": Max   # Replace <Max> with the maximum value
                        }
                    }
                }
            ],
            "minimum_should_match": 0  # Ensure at least one "should" condition matches
            }
        },
        "_source": [
            "text", "id", "vin", "embedding", "metadata", "vehicle_info", "engine_info",
            "color_info", "drivetrain_info", "marketingname_info","saleClass","year","price_info"
        ],
        "sort": [
            {
                "_score": {
                    "order": "desc"  # Sort by score in descending order
                }
            }
        ]
    }
    
    # Execute the search query using the client and return the results
    results = client.search(
        body=query_body,
        index=index_name
    )
    return results

In [9]:
index_name = "toyota-dvs-stella-reindex"

### Follow-up Query

In [29]:
# Initialize context as an empty string
context = ''

# Begin an infinite loop to accept user input and process queries
while True:
    # Take user input
    user_query = input("Enter your query (or type 'exit' to quit): ")

    # Exit condition for the loop
    if user_query.lower() == 'exit':
        print("Exiting the program.")
        break

    # Accumulate the user query into the context (with a space before the new query)
    context = context + " " + user_query  

    # Print the current accumulated context
    print(f"Current Context: {context}")

    # Define the number of results to fetch from the query
    top_k = 30

    # Fetch the results based on the accumulated context
    try:
        results = GetQueryResponse_v2(context, top_k)

        # Check if results contain 'hits' and print them
        if 'hits' in results and 'hits' in results['hits']:
            for hit in results['hits']['hits']:
                print(f"Document ID: {hit['_id']}")
                
                # Safely retrieve fields from _source using .get() to avoid key errors
                print(f"Vin: {hit['_source'].get('vin', 'No VIN found')}")
                print(f"Score: {hit['_score']}")
                print(f"vehicle_info: {hit['_source'].get('vehicle_info', 'No vehicle_info found')}")
                print(f"Marketing Name: {hit['_source'].get('marketingname_info', 'No marketing name found')}")
                print(f"engine_info: {hit['_source'].get('engine_info', 'No engine_info found')}")
                print(f"color_info: {hit['_source'].get('color_info', 'No color_info found')}")
                print(f"price_info: {hit['_source'].get('price_info', 'No price_info found')}")
                print(f"Year: {hit['_source'].get('year', 'No year found')}")
                print(f"saleClass: {hit['_source'].get('saleClass', 'No saleClass found')}")
                print("\n")
        else:
            print("No results found.")  # If no results, inform the user
    except Exception as e:
        print(f"Error occurred while fetching results: {e}")
                      
    print("*"*100)

Enter your query (or type 'exit' to quit): Camy
Current Context:  Camy
Document ID: 4e4d3e89f8ac4d538a301118107628a9
Vin: JTEABFAJ6RK008464
Score: 0.81444585
vehicle_info: RK008464 4WD WAGON HYBRID TOYOTA First Edition
Marketing Name: Land Cruiser First Edition 2 4L 4 Cyl i FORCE MAX Hybrid Engine 4 Wheel Drive
engine_info: IFMH T24A5A19508
color_info: Tan Tan
price_info: 77275
Year: 2024
saleClass: new


Document ID: 8894bb346f6a4e008664f6efbbc1c177
Vin: JTDACACU9R3022350
Score: 0.7999408
vehicle_info: R3022350 XSE Premium TOYOTA XSE Premium
Marketing Name: Prius Prime XSE Premium 2 0L 4 Cyl Plug in Hybrid Engine Front Wheel Drive
engine_info: 24CPIH M20A6086709
color_info: Gray Gray
price_info: 40765
Year: 2024
saleClass: new


Document ID: 136711285abe4bfa839687327cf9aa93
Vin: JTDB4MEE6RJ016851
Score: 0.7999408
vehicle_info: RJ016851 LE TOYOTA LE
Marketing Name: Corolla LE 2 0L 4 Cyl Engine Front Wheel Drive
engine_info: 2L4C M20A6073468
color_info: Gray Gray
price_info: 23987
Year:

### Single Query

In [31]:
def GetQueryResponse_v3(user_query, top_k):
    # Generate the query embedding from the user query
    query_embedding = model.encode(user_query).tolist()

    # Get the min and max prices from the query (if available)
    price_filters = FilterPriceFromQuery(user_query)
    Min = price_filters.get('Min', 0)
    Max = price_filters.get('Max', None)

    query_body = {
        "size": top_k,  # Number of results to return
        "query": {
            "bool": {
                "should": [
                    {
                        "knn": {
                            "marketingname_embedding": {
                                "vector": query_embedding,
                                "k": top_k
                            }
                        }
                    },
                    {
                        "match": {
                            "vin": {
                                "query": user_query,  # Match the full user query
                                "boost": 2  # Highest priority
                            }
                        }
                    },
                ],
                
                "filter": []
            }
        },
        "_source": [
            "text", "id", "vin", "embedding", "metadata", "vehicle_info", "engine_info",
            "color_info", "drivetrain_info", "marketingname_info", "saleClass", "year", "price_info"
        ],
        "sort": [
            {
                "_score": {
                    "order": "desc"  # Sort by score in descending order
                }
            }
        ]
    }
    
    # Execute the search query using the client and return the results
    results = client.search(
        body=query_body,
        index=index_name
    )
    
    return results

In [32]:
user_query = "XSE Camry FWD"
top_k = 30
results = GetQueryResponse_v3(user_query, top_k)

In [33]:
print(f"Doc Count : {len(results['hits']['hits'])}")

Doc Count : 30


In [34]:
if 'hits' in results and 'hits' in results['hits']:
    # Iterate over the search results and print the relevant fields
    for hit in results['hits']['hits']:
        print(f"Document ID: {hit['_id']}")
        # Safely retrieve fields from _source using .get() to avoid key errors
        print(f"Vin: {hit['_source'].get('vin', 'No VIN found')}")
        print(f"Score: {hit['_score']}")
#         print(f"Text: {hit['_source'].get('text_info', 'No text found')}")
        print(f"vehicle_info: {hit['_source'].get('vehicle_info', 'No vehicle_info found')}")
        print(f"Marketing Name: {hit['_source'].get('marketingname_info', 'No marketing name found')}")
        print(f"engine_info: {hit['_source'].get('engine_info', 'No engine_info found')}")
        print(f"color_info: {hit['_source'].get('color_info', 'No color_info found')}")
        print(f"price_info: {hit['_source'].get('price_info', 'No price_info found')}")
        print(f"Year: {hit['_source'].get('year', 'No drivetrain_info found')}")
        print(f"saleClass: {hit['_source'].get('saleClass', 'No drivetrain_info found')}")
        print("\n")
else:
    print("No results found.")

Document ID: 8c01c58341074b21abd0ba463d1b1674
Vin: 4T1K61AK8RU241495
Score: 0.9493288
vehicle_info: RU241495 XSE SEDAN TOYOTA XSE
Marketing Name: Camry XSE 2 5L 4 Cyl Engine Front Wheel Drive
engine_info: 25l4cylxse A25AB374153
color_info: Black Black
price_info: 37943
Year: 2024
saleClass: new


Document ID: e2e8099fc44e4efd834262b301503206
Vin: 4T1K61AK1RU908651
Score: 0.9493288
vehicle_info: RU908651 XSE SEDAN TOYOTA XSE
Marketing Name: Camry XSE 2 5L 4 Cyl Engine Front Wheel Drive
engine_info: 25l4cylxse A25AH187567
color_info: Black Black
price_info: 37814
Year: 2024
saleClass: new


Document ID: ad6d5f8e8df64364b192967bbe85b9ed
Vin: 4T1K61BK8RU126670
Score: 0.9389044
vehicle_info: RU126670 XSE AWD TOYOTA XSE
Marketing Name: Camry XSE 2 5L 4 Cyl Engine All Wheel Drive
engine_info: 25l4cylxse A25AB386631
color_info: Gray Gray
price_info: 39214
Year: 2024
saleClass: new


Document ID: 108025296b004ccebaf20801f822ba43
Vin: 4T1K61BK9RU119890
Score: 0.9389044
vehicle_info: RU119890 XSE

In [55]:
# client.get(index_name,"bb6696c0fa354a78ae9a42500e9fa887")['_source']

#### With NMS

In [20]:
# thres = 10

# if 'hits' in results and 'hits' in results['hits']:
#     # Iterate over the search results and print the relevant fields
#     for hit in results['hits']['hits']:
#         if hit['_score'] >thres:
#             print(f"Document ID: {hit['_id']}")
#             # Safely retrieve fields from _source using .get() to avoid key errors
#             print(f"Vin: {hit['_source'].get('vin', 'No VIN found')}")
#             print(f"Score: {hit['_score']}")
#     #         print(f"Text: {hit['_source'].get('text_info', 'No text found')}")
#             print(f"vehicle_info: {hit['_source'].get('vehicle_info', 'No vehicle_info found')}")
#             print(f"Marketing Name: {hit['_source'].get('marketingname_info', 'No marketing name found')}")
#             print(f"engine_info: {hit['_source'].get('engine_info', 'No engine_info found')}")
#             print(f"color_info: {hit['_source'].get('color_info', 'No color_info found')}")
#             print(f"drivetrain_info: {hit['_source'].get('drivetrain_info', 'No drivetrain_info found')}")
#             print(f"Year: {hit['_source'].get('year', 'No drivetrain_info found')}")
#             print(f"saleClass: {hit['_source'].get('saleClass', 'No drivetrain_info found')}")
#             print("\n")
# else:
#     print("No results found.")

In [44]:
# client.get(index_name,"d084c17bea1c43a1bea0bf8fce48f511")['_source'].keys()

#### Future Scope
* Fine-tune keyword search based on query to replace filter.
* Retrieve the actual query for domain knowledge.
* Evaluate indexing parameters based on the actual query and re-index if required.
* Try incorporating re-ranking.
* Test with OpenAI embeddings.
* Experiment with indexing algorithms.
* Further R&D is needed

In [45]:
# Model
# Make
# 1. "vin": "JTDACACU2R3025624","serialNbr": "R3025624",
#             "bodyStyleDesc": "XSE",
#             "brand": "TOYOTA",
#             "grade": "xse", "drivetrain": {
#                 "code": "FWD",
#                 "title": "Front-Wheel Drive"
#             },
                    
# 2.               ExtColor "commonName": {
#             "display": "White",
#             "generic": "White",
#         }           
    
#     "model": {
#                 "marketingTitle": "Prius Prime XSE 2.0L 4-Cyl. Plug-in Hybrid Engine Front-Wheel Drive"
#             },
        
#         "price": {
#                 "totalMsrp": 40369, #Market 
#             },
            
#     "year": 2024,
        
#         "engine": {
#                 "name": "2.0L 4-cyl. Plug-in Hybrid Engine",
#             },
                
# 3. "unitId": 142858295, 
    
# "saleClassCode": "new"