In [1]:
# !pip  install -U docarray
# !pip  install pydantic
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from docarray import BaseDoc, DocList
from docarray.typing import NdArray
from vectordb import HNSWVectorDB
services = [
    ("Web Hosting", "Reliable and secure web hosting services with 99.9% uptime guarantee and 24/7 support."),
    ("Cloud Storage", "Scalable cloud storage solutions with advanced security features and easy access from anywhere."),
    ("SEO Optimization", "Comprehensive SEO services to boost your website's search engine ranking and drive more traffic."),
    ("Digital Marketing", "Targeted digital marketing campaigns to increase your brand visibility and reach more customers."),
    ("E-commerce Platform", "Robust e-commerce platform with integrated payment gateways, inventory management, and analytics."),
    ("Data Analytics", "Advanced data analytics services to help you make data-driven decisions and improve business outcomes."),
    ("Cybersecurity", "Comprehensive cybersecurity solutions to protect your business from threats and ensure data integrity."),
    ("Mobile App Development", "Custom mobile app development services for iOS and Android platforms with user-friendly interfaces."),
    ("Customer Support", "24/7 customer support services to help your customers with any queries or issues they may have."),
    ("Blockchain Solutions", "Innovative blockchain solutions to streamline your business processes and enhance security."),
]

queries = [
    "I need a reliable service to store my files and access them from anywhere.",
    "Looking for services to improve my website's search engine ranking.",
    "How can I protect my business from cyber threats?",
    "I'm interested in developing a mobile app for my business.",
    "I need help with my online marketing strategy to reach more customers.",
    "What are some solutions for hosting my website with high uptime?",
    "Can I get a service to analyze my business data for better decisions?",
    "Looking for an online platform to sell my products.",
    "How can I implement blockchain technology in my business?",
    "Need customer support solutions for my e-commerce website."
]


# Define constants
vector_dimension = 768

# Define the ServiceDoc class
class ServiceDoc(BaseDoc):
    text: str = ''
    embedding: NdArray[vector_dimension]

# Mean Pooling - Takes attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

# Embedding Model
def embedding_model(doc: str):
    tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
    model = AutoModel.from_pretrained('sentence-transformers/all-mpnet-base-v2')

    # Tokenize the document
    encoded_input = tokenizer(doc, padding=True, truncation=True, return_tensors='pt')

    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Perform pooling
    doc_embedding = mean_pooling(model_output, encoded_input['attention_mask'])

    # Normalize embeddings
    doc_embedding = F.normalize(doc_embedding, p=2, dim=1)

    return doc_embedding.numpy()[0]

# Push Service Descriptions to the Vector Database
def vector_db_push(service_name:str,service_description:str, workspace: str):
#     doc_embedding = embedding_model(service_description)

    db = HNSWVectorDB[ServiceDoc](workspace=workspace)
    # Index a list of documents with random embeddings
    doc_list = [ServiceDoc(text=service_name, embedding=embedding_model(service_description))]
    db.index(inputs=DocList[ServiceDoc](doc_list))
#     db.index(inputs=DocList[ServiceDoc](record))


# Perform a Similarity Search Query
def vector_search(user_query: str, limit: int, workspace: str):
    db = HNSWVectorDB[ServiceDoc](workspace=workspace)
    
    # Generate embedding for the query
    query_embedding = embedding_model(user_query)
    len(query_embedding)
    query_doc = ServiceDoc(text=user_query, embedding=query_embedding)
    
    # Perform a search query
    results = db.search(inputs=DocList[ServiceDoc]([query_doc]), limit=limit)
    
    # Print out the matches
    print(f"Search results for query: '{user_query}'")
    serivces=[]
    for match in results[0].matches:
        print(match.text)
        serivces.append(match.text)
    return serivces

        
def parse_services(file_content):
    services = []
    # Split the content by '---' to separate each service
    service_blocks = file_content.strip().split('---')
    
    for block in service_blocks:
        lines = block.strip().split('\n', 1)  # Split each block into service name and description
        if len(lines) == 2:
            service_name = lines[0].strip()
            service_description = lines[1].strip().replace("\n", "")
            services.append((service_name, service_description))
    
    return services
 

def test_query(query):
    results=vector_search(query, limit=3, workspace=workspace_path)
    print(results)
    return results[0]



# vector_db_push(services=services, workspace=workspace_path)
from tqdm import tqdm
# [ vector_db_push(service_name=service_name, service_description=service_description,workspace=workspace_path) for service_name,service_description in services]
# Assuming 'services' is a list of tuples containing service_name and service_description
# For example: services = [('service1', 'description1'), ('service2', 'description2'), ...]




# Load the CSV file to inspect its contents
file_path="./assets/Services_description_V2.txt"

with open(file_path, 'r', encoding='utf-8') as file:
    file_content = file.read()

services = parse_services(file_content)


    
workspace_path = "./vector_db_v2"
    
# Adding a progress bar to the loop
[vector_db_push(service_name=service_name, service_description=service_description, workspace=workspace_path) 
 for service_name, service_description in tqdm(services, desc="Pushing to vector DB")]


print("Service descriptions have been pushed to the vector database.")




# import pandas as pd

# # Load the CSV file to inspect its contents
# file_path = './assets/queries.csv'
# df = pd.read_csv(file_path)
    


# # Apply the mock LLM function to each row and create a new column 'LLM_Response'
# df['RAG_Response'] = df['Query'].apply(test_query)




Pushing to vector DB: 100%|██████████████████████████████████████████████████████████| 500/500 [47:06<00:00,  5.65s/it]


Service descriptions have been pushed to the vector database.
Search results for query: 'I want to take my dog for walking and playing catch the ball, so I can unleash him'
dog park
dog walker
dog trainer
None
Search results for query: 'I'm missing my home country, mmmm! I'm hungry. Oh, I want to eat shawarma. Can you suggest any nearby place that serves shawarma?'
shawarma restaurant
middle eastern restaurant
syrian restaurant
None
Search results for query: 'I'm planning to move to a new place do you know any moving agency close to me'
moving and storage service
car rental agency
travel agency
None
Search results for query: 'I have moved here recently, and I'm looking for a gym with a good reputation.'
gym
fitness center
rock climbing gym
None
Search results for query: 'I'm travelling tomorrow, and I want to rent a car. Do you know any car rental close to me?'
car rental agency
recreational vehicle rental agency
truck rental agency
None
Search results for query: 'My son loves hockey s

In [6]:
def vector_search(user_query: str, limit: int, workspace: str):
    db = HNSWVectorDB[ServiceDoc](workspace=workspace)
    
    # Generate embedding for the query
    query_embedding = embedding_model(user_query)
    len(query_embedding)
    query_doc = ServiceDoc(text=user_query, embedding=query_embedding)
    
    # Perform a search query
    results = db.search(inputs=DocList[ServiceDoc]([query_doc]), limit=limit)
    
    # Print out the matches
    print(f"Search results for query: '{user_query}'")
    serivces=[]
    for match in results[0].matches:
        print(match.text)
        serivces.append(match.text)
    return serivces

def test_query(query):
    results=vector_search(query, limit=3, workspace=workspace_path)
    return results
file_path = './assets/queries.csv'
df = pd.read_csv(file_path)
    
# Apply the mock LLM function to each row and create a new column 'LLM_Response'
df['RAG_Response'] = df['Query'].apply(test_query)
df.to_csv("evaluation.csv")

Search results for query: 'I want to take my dog for walking and playing catch the ball, so I can unleash him'
dog park
dog walker
dog trainer
Search results for query: 'I'm missing my home country, mmmm! I'm hungry. Oh, I want to eat shawarma. Can you suggest any nearby place that serves shawarma?'
shawarma restaurant
middle eastern restaurant
syrian restaurant
Search results for query: 'I'm planning to move to a new place do you know any moving agency close to me'
moving and storage service
car rental agency
travel agency
Search results for query: 'I have moved here recently, and I'm looking for a gym with a good reputation.'
gym
fitness center
rock climbing gym
Search results for query: 'I'm travelling tomorrow, and I want to rent a car. Do you know any car rental close to me?'
car rental agency
recreational vehicle rental agency
truck rental agency
Search results for query: 'My son loves hockey sport and I want him to start with professional practice playing it. do you know any hoc

In [None]:
vector_search("'Looking for having a hot drink.", limit=3, workspace=workspace_path)

In [None]:
vector_db_push(service_name="Coffee shop", service_description="it's a coffe shop where it serve hot coffee and other drinks", workspace=workspace_path)

In [None]:
def parse_services(file_content):
    services = []
    # Split the content by '---' to separate each service
    service_blocks = file_content.strip().split('---')
    
    for block in service_blocks:
        lines = block.strip().split('\n', 1)  # Split each block into service name and description
        if len(lines) == 2:
            service_name = lines[0].strip()
            service_description = lines[1].strip().replace("\n", "")
            services.append((service_name, service_description))
    
    return services

# Example usage:
file_content = """
fruit and vegetable store
A marrow is a fruit used as a vegetable, the mature fruit of certain Cucurbita pepo cultivars.
A marrow is a fruit used as a vegetable, the mature fruit of certain Cucurbita pepo cultivars...
A marrow is a fruit used as a vegetable, the mature fruit of certain Cucurbita pepo cultivars...
A marrow is a fruit used as a vegetable, the mature fruit of certain Cucurbita pepo cultivars...
A marrow is a fruit used as a vegetable, the mature fruit of certain Cucurbita pepo cultivars...




A marrow is a fruit used as a vegetable, the mature fruit of certain Cucurbita pepo cultivars...
A marrow is a fruit used as a vegetable, the mature fruit of certain Cucurbita pepo cultivars...
A marrow is a fruit used as a vegetable, the mature fruit of certain Cucurbita pepo cultivars...
---
produce market
An Agricultural Produce Market Committee (APMC) is a marketing board established by state governments in India...
---
grocery store
A grocery store (AE), grocery shop (BE) or simply grocery is a foodservice retail store that primarily retails a general range of food products...
"""
file_path="./assets/Services_description_V2.txt"
services = []
with open(file_path, 'r', encoding='utf-8') as file:
    file_content = file.read()

services_list = parse_services(file_content)
for service in services_list[0:10]:
    print(service)


In [4]:
import pandas as pd

# Load the CSV file to inspect its contents
file_path = './assets/queries.csv'
df = pd.read_csv(file_path)

# Display the DataFrame to understand its structure
df.head()


Unnamed: 0,intent,Query
0,dog Park,I want to take my dog for walking and playing ...
1,shawarma restaurant,"I'm missing my home country, mmmm! I'm hungry...."
2,moving and storage service,I'm planning to move to a new place do you kno...
3,gym,"I have moved here recently, and I'm looking fo..."
4,car rental,"I'm travelling tomorrow, and I want to rent a ..."


In [6]:
# Define a mock LLM function to simulate processing
def test_query(query):
    return f"Processed query: {query[:30]}..."  # Simulating a response

# Apply the mock LLM function to each row and create a new column 'LLM_Response'
df['LLM_Response'] =  df['Query'].apply(test_query)

# # Display the resulting dataframe with the new column
# import ace_tools as tools; tools.display_dataframe_to_user(name="Processed Queries", dataframe=df)


In [7]:
df.head()

Unnamed: 0,intent,Query,LLM_Response
0,dog Park,I want to take my dog for walking and playing ...,Processed query: I want to take my dog for wal...
1,shawarma restaurant,"I'm missing my home country, mmmm! I'm hungry....","Processed query: I'm missing my home country, ..."
2,moving and storage service,I'm planning to move to a new place do you kno...,Processed query: I'm planning to move to a new...
3,gym,"I have moved here recently, and I'm looking fo...","Processed query: I have moved here recently, a..."
4,car rental,"I'm travelling tomorrow, and I want to rent a ...","Processed query: I'm travelling tomorrow, and ..."
