In [1]:
from openai import OpenAI
from elasticsearch import Elasticsearch
from tqdm.auto import tqdm
import json

  from .autonotebook import tqdm as notebook_tqdm


Getting data

In [253]:
# Load the article collection from documents.json (adjust the path if the file
# lives elsewhere). UTF-8 is requested explicitly because the article text
# contains non-ASCII punctuation (curly quotes, em dashes) and the platform's
# default encoding is not guaranteed to be UTF-8.
with open('documents.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

In [254]:
# Flatten the per-article structure into one list of section documents,
# tagging every section with the title of the article it belongs to.
documents = []

for article in data:
    sections = article['documents']
    for section in sections:
        section['Title'] = article['Title']
    documents.extend(sections)

# Peek at one flattened entry.
documents[1]

{'body': 'A typical data team consists of the following roles: All these people work to create a data product. To explain the core responsibilities of each role, we will use a case scenario: Suppose we work at an online classifieds company. It’s a platform where users can go to sell things they don’t need (like OLX, where I work). If a user has an iPhone they want to sell — they go to this website, create a listing and sell their phone. On this platform, sellers sometimes have problems with identifying the correct category for the items they are selling. To help them, we want to build a service that suggests the best category. To sell their iPhone, the user creates a listing and the site needs to automatically understand that this iPhone has to go in the “mobile phones” category. Let’s start with the first role: product manager.',
 'Title': 'Roles in a Data Team'}

Connecting to Elasticsearch

In [94]:
# Create the client for the Elasticsearch node at localhost:9200 and show its
# info response as the cell output (doubles as a quick connectivity check).
es_client = Elasticsearch('http://localhost:9200') 
es_client.info()

ObjectApiResponse({'name': '12128a6410b3', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'QuuLk9-uSW28udPq3EYo1w', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [256]:
# Drop any previous copy of the index so the cells below start from a clean slate.
# ignore_unavailable=True keeps this cell from raising when the index does not
# exist yet (e.g. on the first run against a fresh cluster).
es_client.indices.delete(index="data-roles", ignore_unavailable=True)

ObjectApiResponse({'acknowledged': True})

In [257]:
# Describe the index layout and create the (still empty) index;
# the documents themselves are added in the cells that follow.
index_name = "data-roles"

index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            # Full-text fields.
            "heading": {"type": "text"},
            "body": {"type": "text"},
            # Keyword field, used for exact-match filtering by article title.
            "Title": {"type": "keyword"},
        }
    },
}

es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'data-roles'})

In [198]:
# NOTE(review): this loop indexes the raw article-level dicts from `data`
# (each carrying a 'Title' plus a nested 'documents' list) into the same index
# that the next cell fills with the flattened per-section documents. Its
# execution count is also out of sequence with the surrounding cells, so it
# looks like a leftover experiment — confirm whether it should still run.
for doc in tqdm(data):
    es_client.index(index=index_name, document=doc)

100%|██████████| 6/6 [00:00<00:00, 60.30it/s]


In [258]:
# Index every flattened section as its own Elasticsearch document.
for section in tqdm(documents):
    es_client.index(document=section, index=index_name)

100%|██████████| 42/42 [00:00<00:00, 122.35it/s]


Building an Elasticsearch search function

In [264]:
def elastic_search(query, title, index_name="data-roles", size=2):
    """Keyword search over the sections of a single article.

    Args:
        query: Free-text question to match against the 'heading' and 'body' fields.
        title: Exact article title; only documents whose 'Title' keyword equals
            this value are considered.
        index_name: Name of the Elasticsearch index to search.
        size: Maximum number of hits to return. Defaults to 2, the value that
            was previously hard-coded.

    Returns:
        A list with the `_source` dict of each hit, in the order Elasticsearch
        returned them. The list is empty when nothing matches.
    """
    search_query = {
        "size": size,  # Upper bound on the number of returned hits
        "query": {
            "bool": {
                # Because this bool query also has a `filter` clause, the
                # `should` clause is optional (minimum_should_match defaults
                # to 0): it drives the ranking, but sections of the article
                # that do not match the text remain eligible.
                "should": [
                    {
                        "multi_match": {
                            "query": query,
                            "fields": ["heading^4", "body"],  # Boost 'heading' field
                            "type": "best_fields",  # Best field matching strategy
                            "fuzziness": "AUTO"  # Allow slight variations in text
                        }
                    }
                ],
                "filter": [
                    {
                        "term": {
                            "Title": title  # Exact match; relies on 'Title' being mapped as keyword
                        }
                    }
                ]
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in response['hits']['hits']]

Building a prompt

In [260]:
def build_prompt(query, search_results):
    """Build the LLM prompt from a question and the retrieved sections.

    Args:
        query: The user's question, inserted as QUESTION.
        search_results: Iterable of section dicts. Each must contain 'Title'
            and 'body'; 'heading' is optional.

    Returns:
        The prompt text with surrounding whitespace stripped.
    """
    prompt_template = """
You're a career assistant that helps people know more about data careers. Answer the QUESTION based on the CONTEXT from the knowledge.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

    context_parts = []
    for doc in search_results:
        part = f"title: {doc['Title']}\n"
        # Some sections carry no 'heading' key (only 'body' and 'Title'), so
        # the heading line is emitted only when the key is present instead of
        # raising KeyError.
        if 'heading' in doc:
            part += f", heading: {doc['heading']}\n"
        part += f", content: {doc['body']}\n"
        context_parts.append(part)
    context = "".join(context_parts)

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt



In [265]:
# Try retrieval and prompt construction on a sample question.
query = 'difference between data analyst and data scientist'
s = elastic_search(query, title='Roles in a Data Team')
prompt = build_prompt(query, search_results=s)

In [266]:
# print() is used instead of a bare expression so the prompt's newlines are
# rendered as line breaks rather than as escaped "\n" in the string repr.
print(prompt)

You're a career assistant that helps people know more about data careers. Answer the QUESTION based on the CONTEXT from the knowledge.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: difference between data analyst and data scientist

CONTEXT: 
title: Roles in a Data Team
, heading: Data Analyst
, content: Data analysts know how to analyze the data available in the company. They discover insights in the data and then explain their findings to others. So, analysts need to know: Data analysts are also often responsible for defining key metrics and building different dashboards. This includes things like showing the company’s profits, displaying the number of listings, or how many contacts buyers made with sellers.  Thus, data analysts should know how to calculate all the important business metrics, and how to present them in a way that is understandable to others. When it comes to skills, data analysts should know: For our example, product managers turn to dat

Building the LLM function

In [136]:
# OpenAI-compatible client pointed at a server on localhost:11434 instead of
# the hosted OpenAI API. Presumably a local Ollama instance, with 'ollama' as
# a placeholder key rather than a real secret — TODO confirm.
client = OpenAI(
    base_url='http://localhost:11434/v1/',
    api_key='ollama',
)

In [157]:
def llm(prompt):
    """Send `prompt` as a single user message to the chat model and return the reply text."""
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        messages=[user_message],
        model='gemma:2b',
    )

    # Only the first returned choice is used.
    first_choice = completion.choices[0]
    return first_choice.message.content

Building RAG

In [227]:
def rag(query, title="Roles in a Data Team"):
    """Answer `query` with retrieval-augmented generation over one article.

    Args:
        query: The user's question.
        title: Title of the article to retrieve context from. Defaults to
            "Roles in a Data Team", the value that was previously hard-coded.

    Returns:
        The text answer produced by `llm` for the prompt built from the
        retrieved context.
    """
    search_results = elastic_search(query, title)
    prompt = build_prompt(query, search_results)
    return llm(prompt)

In [268]:
# End-to-end run of the RAG pipeline; the returned answer is the cell output.
query = 'what is the article about'
rag(query)

'The article is about the roles of a data scientist and data analyst in a data team. The data scientist focuses more on predicting rather than explaining, creating machine learning services. The data analyst focuses more on fetching the data, looking at it, explaining what’s going on to the team, and giving recommendations on things to do.'

Semantic search with Elasticsearch

Step 1: Prepare the documents

In [205]:
# Reload the article collection from documents.json (adjust the path if the
# file lives elsewhere). UTF-8 is requested explicitly because the article text
# contains non-ASCII punctuation and the platform's default encoding is not
# guaranteed to be UTF-8.
with open('documents.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

In [208]:
# Rebuild the flat list of sections, stamping each one with its article title.
documents = []

for article in data:
    for section in article['documents']:
        section.update(Title=article['Title'])
        documents.append(section)

# Show one entry as a sanity check.
documents[1]

{'body': 'A typical data team consists of the following roles: All these people work to create a data product. To explain the core responsibilities of each role, we will use a case scenario: Suppose we work at an online classifieds company. It’s a platform where users can go to sell things they don’t need (like OLX, where I work). If a user has an iPhone they want to sell — they go to this website, create a listing and sell their phone. On this platform, sellers sometimes have problems with identifying the correct category for the items they are selling. To help them, we want to build a service that suggests the best category. To sell their iPhone, the user creates a listing and the site needs to automatically understand that this iPhone has to go in the “mobile phones” category. Let’s start with the first role: product manager.',
 'Title': 'Roles in a Data Team'}

Step 2: Create Embeddings using Pretrained Models

In [209]:
# Load the pretrained sentence-embedding model; it is used below to embed both
# the document bodies and the search terms.
# NOTE(review): the index mapping in Step 4 declares 768 dims for body_vector,
# which presumably matches this model's output size — confirm if the model changes.
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-mpnet-base-v2")



In [211]:
# Attach a dense embedding of the body text to every document.
operations = []
for section in documents:
    # Encode the body with the pretrained model and store the vector as a list.
    embedding = model.encode(section["body"])
    section["body_vector"] = embedding.tolist()
    operations.append(section)

Step 3: Setup ElasticSearch connection

In [None]:
# (Re)create the client for the Elasticsearch node at localhost:9200.
es_client = Elasticsearch('http://localhost:9200') 

Step 4: Create Mappings and Index

In [212]:
# Index layout for semantic search: the text/keyword fields plus a dense_vector
# field that holds the body embeddings.
index_settings = dict(
    settings=dict(number_of_shards=1, number_of_replicas=0),
    mappings=dict(
        properties=dict(
            heading=dict(type="text"),
            body=dict(type="text"),
            Title=dict(type="keyword"),
            # 768-dimensional vector, indexed and compared with cosine similarity.
            body_vector=dict(type="dense_vector", dims=768, index=True, similarity="cosine"),
        )
    ),
)

In [213]:
index_name = "data-roles"

# Drop and recreate the index so this cell can be re-run; ignore_unavailable=True
# keeps the delete from failing when the index does not exist yet.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'data-roles'})

Step 5: Add documents into index

In [214]:
# Index each embedded document; a failure on one document is printed and the
# loop carries on with the next one.
for section in tqdm(operations):
    try:
        es_client.index(document=section, index=index_name)
    except Exception as err:
        print(err)

100%|██████████| 42/42 [00:00<00:00, 82.17it/s]


Step 6: Performing the search

In [221]:
# Sample query and its embedding, produced with the same model that was used
# for the document bodies; used by the stand-alone kNN cells further down.
search_term = "data scientist vs data analyst?"
vector_search_term = model.encode(search_term)

In [244]:
def elastic_search(search_term, title, index_name="data-roles"):
    """Semantic (kNN) search restricted to the sections of one article.

    Args:
        search_term: Free-text question; embedded with `model` and compared
            against the stored 'body_vector' field.
        title: Exact article title; only sections whose 'Title' keyword equals
            this value are candidates.
        index_name: Name of the Elasticsearch index to search.

    Returns:
        The `_source` dict of the best-matching section, or None when the
        search returns no hits.
    """
    # Embed the query with the same model used for the document bodies;
    # .tolist() turns the vector into a plain list for the request body.
    query_vector = model.encode(search_term).tolist()

    knn_query = {
        "field": "body_vector",
        "query_vector": query_vector,
        "k": 5,
        "num_candidates": 10000,
        # Restrict the nearest-neighbour candidates to the requested article.
        # A top-level `query` next to `knn` is combined with the kNN hits as a
        # disjunction rather than filtering them, so it could let sections of
        # other articles through; the kNN `filter` option does the filtering.
        "filter": {"term": {"Title": title}},
    }

    response = es_client.search(index=index_name, knn=knn_query, size=1)

    # size=1 yields at most one hit; return None instead of failing on an
    # unbound variable when there is none.
    hits = response['hits']['hits']
    return hits[0]['_source'] if hits else None

In [247]:
def build_prompt(query, search_results):
    """Build the LLM prompt from a question and a single retrieved section.

    Args:
        query: The user's question, inserted as QUESTION.
        search_results: The section dict to use as context (its 'body' is
            inserted), or None / an empty dict when nothing was retrieved, in
            which case the CONTEXT is left empty.

    Returns:
        The prompt text with surrounding whitespace stripped.
    """
    prompt_template = """
You're a career assistant that helps people know more about data careers. Answer the QUESTION based on the CONTEXT from the knowledge.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

    # Guard against a missing result so the caller gets a prompt with an empty
    # context instead of a TypeError/KeyError.
    context = f"content: {search_results['body']}\n" if search_results else ""

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [239]:
# Stand-alone kNN clause for experimenting outside elastic_search(): compares
# the sample query embedding (vector_search_term) against the 'body_vector'
# field, asking for k=5 neighbours out of up to 10000 candidates.
knn_query = {
    "field": "body_vector",
    "query_vector": vector_search_term,
    "k": 5,
    "num_candidates": 10000
}

In [242]:
# Run the kNN search restricted to one article. The restriction is applied via
# the kNN `filter` option: a top-level `query` next to `knn` is combined with
# the kNN hits as a disjunction rather than filtering them, so it could let
# sections of other articles through.
response = es_client.search(
    index=index_name,
    knn={**knn_query, "filter": {"term": {"Title": "Roles in a Data Team"}}},
    size=1,
)

In [241]:
# Print the stored document (`_source`) of every returned hit.
for hit in response["hits"]["hits"]:
    print(hit['_source'])

{'heading': 'Data Scientist', 'body': 'The roles of a data scientist and data analyst are pretty similar. In some companies, it’s the same person who does both jobs. However, data scientists typically focus more on predicting rather than explaining. A data analyst fetches the data, looks at it, explains what’s going on to the team, and gives some recommendations on what to do about it. A data scientist, on the other hand, focuses more on creating machine learning services. For example, one of the questions that a data scientist would want to answer is “How can we use this data to build a machine learning model for predicting something?” In other words, data scientists incorporate the data into the product. Their focus is more on engineering than analysis. Data scientists work more closely with engineers on integrating data solutions into the product. The skills of data scientists include: For our example, the data scientists are the people who develop the model used for predicting the 