In [None]:
# Perform Google Colab installs (if running in Google Colab)
import os

if "COLAB_GPU" in os.environ:
    print("[INFO] Running in Google Colab, installing requirements.")
    !pip install -U torch # requires torch 2.1.1+ (for efficient sdpa implementation)
    !pip install PyMuPDF # for reading PDFs with Python
    !pip install tqdm # for progress bars
    !pip install sentence-transformers # for embedding models
    !pip install accelerate # for quantization model loading
    !pip install bitsandbytes # for quantizing models (less storage space)
    !pip install flash-attn --no-build-isolation # for faster attention mechanism = faster LLM inference
    !pip install transformers # for faster LLM inference

In [2]:
import fitz # (pymupdf, found this is better than pypdf for our use case, note: licence is AGPL-3.0, keep that in mind if you want to use any code commercially)
from tqdm.auto import tqdm # for progress bars, requires !pip install tqdm"
import re
import pandas as pd
import random
import numpy as np
import torch

### Load in Embeddings (better ways to store)

In [4]:
text_chunks_and_embedding_df = pd.read_csv("text_chunks_and_embeddings_df.csv")


# Step 5: Sematic Search Pipeline

see hugging face link for all-mpnet-base-2 https://huggingface.co/sentence-transformers/all-mpnet-base-v2

In [5]:
# Convert embedding column back to np.array (it got converted to string when it got saved to CSV)
text_chunks_and_embedding_df["embedding"] = text_chunks_and_embedding_df["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))

# Convert texts and embedding df to list of dicts
pages_and_chunks = text_chunks_and_embedding_df.to_dict(orient="records")

# Convert embeddings to torch tensor and send to device (note: NumPy arrays are float64, torch tensors are float32 by default)
embeddings = torch.tensor(np.array(text_chunks_and_embedding_df["embedding"].tolist()), dtype=torch.float32).to('cuda')
embeddings.shape

torch.Size([4475, 768])

## It's important to embed your query with the same model you embedded your examples with.

In [6]:
from sentence_transformers import util, SentenceTransformer

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                      device='cuda') # choose the device to load the model to

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

## Define a query, run semantic search

In [7]:
# 1. Define the query
query = "Environmental, Social, Governance Reporting"
print(f"Query: {query}")

# 2. Embed the query to the same numerical space as the text examples

query_embedding = embedding_model.encode(query, convert_to_tensor=True)

# 3. Get similarity scores with the dot product (we'll time this for fun)
from time import perf_counter as timer

start_time = timer()
dot_scores = util.dot_score(a=query_embedding, b=embeddings)[0]
end_time = timer()

print(f"Time take to get scores on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")

# 4. Get the top-k results
top_results_dot_product = torch.topk(dot_scores, k=5)
top_results_dot_product

Query: Environmental, Social, Governance Reporting
Time take to get scores on 4475 embeddings: 0.00377 seconds.


torch.return_types.topk(
values=tensor([0.5620, 0.5384, 0.5225, 0.5114, 0.5058], device='cuda:0'),
indices=tensor([2113,  799, 3792, 1493,  639], device='cuda:0'))

In [8]:
# Define helper function to print wrapped text
import textwrap

def print_wrapped(text, wrap_length=80):
    wrapped_text = textwrap.fill(text, wrap_length)
    print(wrapped_text)

In [9]:
print(f"Query: '{query}'\n")
print("Results:")
# Loop through zipped together scores and indicies from torch.topk
for score, idx in zip(top_results_dot_product[0], top_results_dot_product[1]):
    print(f"Score: {score:.4f}")
    # Print relevant sentence chunk (since the scores are in descending order, the most relevant chunk will be first)
    print("Text:")
    print_wrapped(pages_and_chunks[idx]["sentence_chunk"])
    # Print the page number too so we can reference the textbook further (and check the results)
    print(f"Page number: {pages_and_chunks[idx]['page_number']}")
    print(f"Stock Ticker: {pages_and_chunks[idx]['ticker']}")
    print("\n")

Query: 'Environmental, Social, Governance Reporting'

Results:
Score: 0.5620
Text:
If we are unable to enter into favorable contracts or to obtain the necessary
regulatory and land use approvals on favorable terms, we may not be able to
construct and operate our assets as anticipated, or at all, which could
negatively affect our business, results of operations and financial condition.
We could be negatively impacted by environmental, social, and governance (ESG)
and sustainability-related matters. Governments, investors, customers, employees
and other stakeholders are increasingly focusing on corporate ESG practices and
disclosures, and expectations in this area are rapidly evolving. We have
announced, and may in the future announce, sustainability-focused goals,
initiatives, investments and partnerships. These initiatives, aspirations,
targets or objectives reflect our current plans and aspirations and are not
guarantees that we will be able to achieve them. Our efforts to accomplish 

## Step 5: Fucntions

In [10]:
def retrieve_relevant_resources(query: str,
                                embeddings: torch.tensor,
                                model: SentenceTransformer=embedding_model,
                                n_resources_to_return: int=5,
                                print_time: bool=True):
    """
    Embeds a query with model and returns top k scores and indices from embeddings.
    """

    # Embed the query
    query_embedding = model.encode(query,
                                   convert_to_tensor=True)

    # Get dot product scores on embeddings
    start_time = timer()
    dot_scores = util.dot_score(query_embedding, embeddings)[0]
    end_time = timer()

    if print_time:
        print(f"[INFO] Time taken to get scores on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")

    scores, indices = torch.topk(input=dot_scores,
                                 k=n_resources_to_return)

    return scores, indices

def print_top_results_and_scores(query: str,
                                 embeddings: torch.tensor,
                                 pages_and_chunks: list[dict]=pages_and_chunks,
                                 n_resources_to_return: int=5):
    """
    Takes a query, retrieves most relevant resources and prints them out in descending order.

    Note: Requires pages_and_chunks to be formatted in a specific way (see above for reference).
    """

    scores, indices = retrieve_relevant_resources(query=query,
                                                  embeddings=embeddings,
                                                  n_resources_to_return=n_resources_to_return)

    print(f"Query: {query}\n")
    print("Results:")
    # Loop through zipped together scores and indicies
    for score, index in zip(scores, indices):
        print(f"Score: {score:.4f}")
        # Print relevant sentence chunk (since the scores are in descending order, the most relevant chunk will be first)
        print_wrapped(pages_and_chunks[index]["sentence_chunk"])
        # Print the page number too so we can reference the textbook further and check the results
        print(f"Page number: {pages_and_chunks[index]['page_number']}")
        print("\n")

# Step 6: Load LLM locally

In [11]:
# Get GPU available memory
import torch
gpu_memory_bytes = torch.cuda.get_device_properties(0).total_memory
gpu_memory_gb = round(gpu_memory_bytes / (2**30))
print(f"Available GPU memory: {gpu_memory_gb} GB")

Available GPU memory: 40 GB


## See which model to use

In [12]:
# Note: the following is Gemma focused, however, there are more and more LLMs of the 2B and 7B size appearing for local use.
if gpu_memory_gb < 5.1:
    print(f"Your available GPU memory is {gpu_memory_gb}GB, you may not have enough memory to run a Gemma LLM locally without quantization.")
elif gpu_memory_gb < 8.1:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in 4-bit precision.")
    use_quantization_config = True
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb < 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in float16 or Gemma 7B in 4-bit precision.")
    use_quantization_config = False
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb > 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommend model: Gemma 7B in 4-bit or float16 precision.")
    use_quantization_config = False
    model_id = "google/gemma-7b-it"

print(f"use_quantization_config set to: {use_quantization_config}")
print(f"model_id set to: {model_id}")

GPU memory: 40 | Recommend model: Gemma 7B in 4-bit or float16 precision.
use_quantization_config set to: False
model_id set to: google/gemma-7b-it


## Speed up attention block for faster inference

In [13]:
from transformers.utils import is_flash_attn_2_available

if (is_flash_attn_2_available()) and (torch.cuda.get_device_capability(0)[0] >= 8):
  attn_implementation = "flash_attention_2"
else:
  attn_implementation = "sdpa"
print(f"[INFO] Using attention implementation: {attn_implementation}")


[INFO] Using attention implementation: flash_attention_2


## Log into hugging face to access gemma-7b-it

see link: https://huggingface.co/google/gemma-7b-it

In [18]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [19]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils import is_flash_attn_2_available
from transformers import BitsAndBytesConfig

# 1. What percesion to load the model in (less bits less space)
quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                         bnb_4bit_compute_dtype=torch.float16)

# Bonus: Setup Flash Attention 2 for faster inference, default to "sdpa" or "scaled dot product attention" if it's not available
# Flash Attention 2 requires NVIDIA GPU compute capability of 8.0 or above, see: https://developer.nvidia.com/cuda-gpus
# Requires !pip install flash-attn, see: https://github.com/Dao-AILab/flash-attention
if (is_flash_attn_2_available()) and (torch.cuda.get_device_capability(0)[0] >= 8):
  attn_implementation = "flash_attention_2"
else:
  attn_implementation = "sdpa"
print(f"[INFO] Using attention implementation: {attn_implementation}")

# 2. Pick a model we'd like to use (this will depend on how much GPU memory you have available)
#model_id = "google/gemma-7b-it"
model_id = model_id # (we already set this above)
print(f"[INFO] Using model_id: {model_id}")

# 3. Instantiate tokenizer (tokenizer turns text into numbers ready for the model)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id)

# 4. Instantiate the model
llm_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id,
                                                 torch_dtype=torch.float16, # datatype to use, we want float16
                                                 quantization_config=quantization_config if use_quantization_config else None,
                                                 low_cpu_mem_usage=False, # use full memory
                                                 attn_implementation=attn_implementation) # which attention version to use

if not use_quantization_config: # quantization takes care of device setting automatically, so if it's not used, send model to GPU
    llm_model.to("cuda")

[INFO] Using attention implementation: flash_attention_2
[INFO] Using model_id: google/gemma-7b-it


Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/2.11G [00:00<?, ?B/s]

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [20]:
input_text = "what are the Enivormental Risk factors, and what roles do they play in the utility sector?"
print(f"Input text:\n{input_text}")

# Create prompt template for instruction-tuned model
dialogue_template = [
    {"role": "user",
     "content": input_text}
]

# Apply the chat template
prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                       tokenize=False, # keep as raw text (not tokenized)
                                       add_generation_prompt=True)
print(f"\nPrompt (formatted):\n{prompt}")

Input text:
what are the Enivormental Risk factors, and what roles do they play in the utility sector?

Prompt (formatted):
<bos><start_of_turn>user
what are the Enivormental Risk factors, and what roles do they play in the utility sector?<end_of_turn>
<start_of_turn>model



In [None]:
# Tokenize the input text (turn it into numbers) and send it to GPU
input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")
print(f"Model input (tokenized):\n{input_ids}\n")

# Generate outputs passed on the tokenized input
# See generate docs: https://huggingface.co/docs/transformers/v4.38.2/en/main_classes/text_generation#transformers.GenerationConfig
outputs = llm_model.generate(**input_ids,
                             max_new_tokens=365) # define the maximum number of new tokens to create
print(f"Model output (tokens):\n{outputs[0]}\n")

In [22]:
# Decode the output tokens to text
outputs_decoded = tokenizer.decode(outputs[0])
print(f"Model output (decoded):\n{outputs_decoded}\n")

Model output (decoded):
<bos><bos><start_of_turn>user
what are the Enivormental Risk factors, and what roles do they play in the utility sector?<end_of_turn>
<start_of_turn>model
**Environmental Risk Factors**

Environmental risk factors are factors that are derived from the natural environment and have the potential to cause harm to human health and the environment. These factors include air pollution, water pollution, noise pollution, and climate change.

**Roles of Environmental Risk Factors in the Utility Sector:**

**1. Air Pollution:**

* Air pollution from utility facilities, such as power plants and industrial processes, can contribute to respiratory problems, heart disease, cancer, and other health issues.
* Air pollution can also damage crops, forests, and other ecosystems.

**2. Water Pollution:**

* Water pollution from utility operations, such as wastewater treatment plants and cooling water systems, can contaminate water sources and make them unsafe for drinking, swimming

In [43]:
# Nutrition-style questions generated with GPT4
gpt4_questions = [
"Which utility companies reported an increase in net income last year?",
"Which companies have set specific targets for reducing their carbon emissions in the next five years?",
"What are the major technological challenges facing utility companies according to their 10-K filings?",
"How have employment numbers changed in the utility sector over the last year?",
"What new markets are utility companies aiming to enter in the next year?",
"Which utility companies have plans to expand into solar energy production?",
"Which utility companies have made changes to their governance structures to improve transparency?"
]

# Manually created question list
manual_questions = [
  #"Total number of employees at the company?"
  #"Which companies mention renewable engergy?"
  #"What are specific State/Federal Regulations in the utility sector?"
  #"What are common Risk factors, found within the text"
  #"Who mentioned Enviromental, Social, Governance in the 10-K text"
  #"Who uses coal as an energy source?"
  #"Who would be negatively impacted by environmental, social, and governance (ESG)"
  #"Which companies mention ESG?"
]

query_list = gpt4_questions + manual_questions

In [24]:
import random
query = random.choice(query_list)

print(f"Query: {query}")

# Get just the scores and indices of top related results
scores, indices = retrieve_relevant_resources(query=query,
                                              embeddings=embeddings)
scores, indices

Query: How many new power plants did utility companies commission in the last year?How does the regulatory environment in California impact utility operations compared to Texas?Which utility companies reported an increase in net income last year?
[INFO] Time taken to get scores on 4475 embeddings: 0.00006 seconds.


(tensor([0.5691, 0.5672, 0.5652, 0.5496, 0.5487], device='cuda:0'),
 tensor([3971, 1307,  431, 1598, 3942], device='cuda:0'))

In [25]:
def prompt_formatter(query: str,
                     context_items: list[dict]) -> str:
    """
    Augments query with text-based context from context_items.
    """
    # Join context items into one dotted paragraph
    context = "- " + "\n- ".join([item["sentence_chunk"] for item in context_items])

    # Create a base prompt with examples to help the model
    # Note: this is very customizable, I've chosen to use 3 examples of the answer style we'd like.
    # We could also write this in a txt file and import it in if we wanted.
    base_prompt = """Based on the following context items, please answer the query.
Give yourself room to think by extracting relevant passages from the context before answering the query.
Don't return the thinking, only return the answer.
Make sure your answers are as explanatory as possible.
Use the following examples as reference for the ideal answer style.
\nExample 1:
Query: What are specific State/Federal Regulations in the utility sector?
Answer: Sure, here is the answer to the query:  The utility sector is subject to
regulation by a number of federal, state, and local agencies. The federal
government regulates wholesale sales of electricity rates and interstate
transmission of electricity, including System Energys sales of capacity and
energy from Grand Gulf to Entergy Arkansas, Entergy Louisiana, Entergy
Mississippi, and Entergy New Orleans pursuant to the Unit Power Sales Agreement.
State public utility commissions have jurisdiction over services and facilities,
rates and charges, accounting, valuation of property, depreciation rates and
various other matters.
≈
Query: Which companies mention renewable engergy?
Answer: The text mentions several companies that are involved in renewable energy,
including AES, Asus, AWR, GSWC, BVESI, and Air Products. AES Clean Energy is
specifically highlighted as a company that is actively developing and
implementing renewable energy solutions.
\nExample 3:
Query: How many employees do utility companies have?
Answer: The text describes various companies and their employee counts as of December 31, 2022:

- **ASUS:** Had a total of 264 employees.
- **AWR:** Had a total of 811 employees.
- **GSWC:** Had 501 employees.
- **BVESI:** Had 46 employees.

\nExample 4:
Query: Who uses coal as an energy source?
Answer: The text mentions several companies that use coal as an energy source, including
Consumers, Avista Utilities, and NorthWestern. Consumers' coal-fueled generating
units burned six million tons of coal and produced a combined total of 10,217
GWh of electricity in 2022. Avista Utilities owns the following thermal
generating resources that use coal as fuel: the combined cycle natural gas-fired
CT, known as Coyote Springs 2, located near Boardman, Oregon, a 15 percent
interest in Units 3 and 4 of Colstrip, a coal-fired boiler generating facility
located in southeastern Montana, and the Kettle Falls GS in northeastern
Washington.


\nExample 5:
Query: Who would be negatively impacted by environmental, social, and governance (ESG)?
Answer: The text describes the potential negative impacts of rapidly
changing stakeholder expectations and standards with respect to PGEs
environmental, social, and governance (ESG) programs on various parties.
According to the text, individuals and organizations that would be negatively
impacted include Consumers, Avista Utilities, and PGE.  Consumers' coal-fueled
generating units burned six million tons of coal and produced a combined total
of 10,217 GWh of electricity in 2022. Avista Utilities owns thermal generating
resources that use coal as fuel, including the combined cycle natural gas-fired
CT, known as Coyote Springs 2, located near Boardman, Oregon, and the Kettle
Falls GS in northeastern Washington. Therefore, Consumers and Avista Utilities
are directly impacted by the environmental impact of coal-fired generating
units.  PGE, on the other hand, faces the risk of increased costs and reduced
access to capital due to rapidly changing stakeholder expectations and standards
with respect to ESG programs. Investors, lenders, rating agencies, customers,
regulators, employees, and other stakeholders are increasingly evaluating
companies based on their ESG programs and metrics. Based on PGEs ESG profile,
investors and lenders may elect to increase their required returns on capital
offered to the Company, reallocate capital, or not commit capital as a result of
their assessment of the Companys ESG profile. Such actions by investors and
lenders could increase PGEs cost of, or access to, capital and financing.


\nNow use the following context items to answer the user query:
{context}
\nRelevant passages: <extract relevant passages from the context here>
User query: {query}
Answer:"""

    # Update base prompt with context items and query
    base_prompt = base_prompt.format(context=context, query=query)

    # Create prompt template for instruction-tuned model
    dialogue_template = [
        {"role": "user",
        "content": base_prompt}
    ]

    # Apply the chat template
    prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                          tokenize=False,
                                          add_generation_prompt=True)
    return prompt

In [26]:
query = random.choice(query_list)
print(f"Query: {query}")

# Get relevant resources
scores, indices = retrieve_relevant_resources(query=query,
                                              embeddings=embeddings)

# Create a list of context items
context_items = [pages_and_chunks[i] for i in indices]

# Format prompt with context items
prompt = prompt_formatter(query=query,
                          context_items=context_items)
print(prompt)

Query: How many new power plants did utility companies commission in the last year?How does the regulatory environment in California impact utility operations compared to Texas?Which utility companies reported an increase in net income last year?
[INFO] Time taken to get scores on 4475 embeddings: 0.00006 seconds.
<bos><start_of_turn>user
Based on the following context items, please answer the query.
Give yourself room to think by extracting relevant passages from the context before answering the query.
Don't return the thinking, only return the answer.
Make sure your answers are as explanatory as possible.
Use the following examples as reference for the ideal answer style.

Example 1:
Query: What are specific State/Federal Regulations in the utility sector?
Answer: Sure, here is the answer to the query:  The utility sector is subject to
regulation by a number of federal, state, and local agencies. The federal
government regulates wholesale sales of electricity rates and interstate
tra

In [28]:
%%time

input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

# Generate an output of tokens
outputs = llm_model.generate(**input_ids,
                             temperature=0.7, # lower temperature = more deterministic outputs, higher temperature = more creative outputs
                             do_sample=True, # whether or not to use sampling, see https://huyenchip.com/2024/01/16/sampling.html for more
                             max_new_tokens=256) # how many new tokens to generate from prompt

# Turn the output tokens into text
output_text = tokenizer.decode(outputs[0])

print(f"Query: {query}")
print(f"RAG answer:\n{output_text.replace(prompt, '')}")

Query: How many new power plants did utility companies commission in the last year?How does the regulatory environment in California impact utility operations compared to Texas?Which utility companies reported an increase in net income last year?
RAG answer:
<bos>The text does not provide information about new power plant commissioning or information about the regulatory environment in California and Texas, therefore I cannot answer the query.<eos>
CPU times: user 1.39 s, sys: 0 ns, total: 1.39 s
Wall time: 1.39 s


In [40]:
def ask(query,
        temperature=0.7,
        max_new_tokens=512,
        format_answer_text=True,
        return_answer_only=True):
    """
    Takes a query, finds relevant resources/context and generates an answer to the query based on the relevant resources.
    """

    # Get just the scores and indices of top related results
    scores, indices = retrieve_relevant_resources(query=query,
                                                  embeddings=embeddings)

    # Create a list of context items
    context_items = [pages_and_chunks[i] for i in indices]

    # Add score to context item
    for i, item in enumerate(context_items):
        item["score"] = scores[i].cpu() # return score back to CPU

    # Format the prompt with context items
    prompt = prompt_formatter(query=query,
                              context_items=context_items)

    # Tokenize the prompt
    input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

    # Generate an output of tokens
    outputs = llm_model.generate(**input_ids,
                                 temperature=temperature,
                                 do_sample=True,
                                 max_new_tokens=max_new_tokens)

    # Turn the output tokens into text
    output_text = tokenizer.decode(outputs[0])

    if format_answer_text:
        # Replace special tokens and unnecessary help message
        output_text = output_text.replace(prompt, "").replace("<bos>", "").replace("<eos>", "").replace("Sure, here is the answer to the user query:\n\n", "")

    # Only return the answer without the context items
    if return_answer_only:
        return output_text

    return output_text, context_items

In [48]:
query = random.choice(query_list)
print(f"Query: {query}")

# Answer query with context and return context
answer, context_items = ask(query=query,
                            temperature=0.3,
                            max_new_tokens=512,
                            return_answer_only=False)

print(f"Answer:\n")
print_wrapped(answer)
print(f"Context items:")
context_items

Query: What new markets are utility companies aiming to enter in the next year?
[INFO] Time taken to get scores on 4475 embeddings: 0.00006 seconds.
Answer:

The text does not mention any new markets that utility companies are aiming to
enter in the next year, therefore I cannot answer this query.
Context items:


[{'page_number': 887,
  'ticker': 'NWE',
  'sector': 'Utilities',
  'filing_date': '2023-02-16T18:48:32-05:00',
  'sentence_chunk': 'Ticker: NWE, Sector: Utilities, Filed At: 2023-02-16T18:48:32-05:00 technologies that produce power, including fuel cells, micro-turbines, wind turbines and solar cells, may reduce the cost of alternative methods of producing power to a level competitive with central power station electric production. Customer-owned generation itself reduces the amount of electricity purchased from utilities and may have the effect of inappropriately increasing rates generally and increasing rates for customers who do not own generation, unless retail rates are designed to collect distribution grid costs across all customers in a manner that reflects the benefit from their use. Such developments could affect the price of energy, could affect energy deliveries as customer-owned generation becomes more cost-effective, could require further improvements to our distribution s