In [1]:
from llama_index import GPTVectorStoreIndex, SimpleDirectoryReader, LLMPredictor, LangchainEmbedding, PromptHelper, ServiceContext
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.llms import GPT4All
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

In [2]:
embed_model = LangchainEmbedding(HuggingFaceEmbeddings())

In [3]:
%ls ./models

ggml-gpt4all-j-v1.3-groovy.bin  ggml-vicuna-7b-1.1-q4_2.bin


In [None]:
local_path = './models/ggml-vicuna-7b-1.1-q4_2.bin'
callbacks = [StreamingStdOutCallbackHandler()]
llm = GPT4All(model=local_path, callbacks=callbacks, streaming=True, verbose=True)

In [5]:
llm_predictor = LLMPredictor(llm=llm)

In [6]:
# Configure prompt parameters and initialise helper
max_input_size = 4096
num_output = 256
max_chunk_overlap = 20

prompt_helper = PromptHelper(
    max_input_size=max_input_size,
    num_output=num_output,
    max_chunk_overlap=max_chunk_overlap
)

In [7]:
service_context = ServiceContext.from_defaults(
    llm_predictor=llm_predictor,
    embed_model=embed_model,
    prompt_helper=prompt_helper
)

In [8]:
documents = SimpleDirectoryReader('data').load_data()
new_index = GPTVectorStoreIndex.from_documents(
    documents, 
    service_context=service_context,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [10]:
# query will use the same embed_model
query_engine = new_index.as_query_engine(
    streaming=True,
    verbose=True
)

In [None]:
prompt = "What did Paul do while growing up?"
response = query_engine.query(prompt)
response.get_response()