In [1]:
!pip install llama_index llama-index-llms-huggingface llama-index-embeddings-huggingface accelerate bitsandbytes



In [2]:
# setup prompts - specific to StableLM
from llama_index.core import PromptTemplate
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core import Settings
import torch
from llama_index.core.prompts.prompts import SimpleInputPrompt




In [3]:
system_prompt = "You are a Q&A assistant. Your goal is to answer questions as accurately as possible based on the instructions and context provided."

# This will wrap the default prompts that are internal to llama-index
query_wrapper_prompt = SimpleInputPrompt("<|USER|>{query_str}<|ASSISTANT|>")

model_name="microsoft/phi-2"
model_name="microsoft/Phi-3-mini-4k-instruct"


llm = HuggingFaceLLM(
    context_window=1024,
    max_new_tokens=64,
    generate_kwargs={"temperature": 0.25, "do_sample": True},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name=model_name,
    model_name=model_name,
    #device_map="cuda",
    # uncomment this if using CUDA to reduce memory usage
    model_kwargs={"torch_dtype": "auto"}
)

from llama_index.embeddings.huggingface import HuggingFaceEmbedding

embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")


from llama_index.core import Settings
Settings.chunk_size = 512
Settings.llm = llm
Settings.embed_model = embed_model


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device device because they were offloaded to the disk and cpu.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
from llama_index.core import SimpleDirectoryReader

reader = SimpleDirectoryReader(
    input_files=["events.txt"],
    required_exts=[".txt"],
    num_files_limit=100, # Maximum number of files to load
    encoding="latin-1",
)
documents = reader.load_data()

In [5]:
%env OPENAI_API_KEY=<your key>

env: OPENAI_API_KEY=<your key>


In [6]:
import os
print(os.environ['OPENAI_API_KEY'])

<your key>


In [7]:
from llama_index.core import VectorStoreIndex

# 1. Load VectorStoreIndex directly from Documents
index = VectorStoreIndex.from_documents(documents, show_progress=True)

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/120 [00:00<?, ?it/s]

#Load VectorStoreIndex by selecting the splitter(chunk_size, chunk_overlap) and embedded model directly

from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex

node_parser = SentenceSplitter(chunk_size=256, chunk_overlap=20)
nodes = node_parser.get_nodes_from_documents(documents)
index = VectorStoreIndex(nodes, embed_model=embed_model, show_progress=True)

In [8]:
query_engine = index.as_query_engine()

In [9]:
response = query_engine.query("When Rotational speed is about 1342 what can be the anomaly?")
print(f"{response}")

You are not running the flash-attention implementation, expect numerical differences.


Based on the provided context, when the Rotational speed is about 1342 rpm, the anomaly that can be observed is a Heat dissipation failure (HDF). This anomaly occurs when heat dissipation causes a process failure, if the difference between air- and process temperature
