In [1]:
from llama_index import GPTVectorStoreIndex, SimpleDirectoryReader, LLMPredictor, LangchainEmbedding, PromptHelper, ServiceContext
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.llms import GPT4All
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

In [2]:
embed_model = LangchainEmbedding(HuggingFaceEmbeddings())

In [3]:
local_path = './models/ggml-vicuna-7b-1.1-q4_2.bin'
callbacks = [StreamingStdOutCallbackHandler()]
llm = GPT4All(model=local_path, callbacks=callbacks, streaming=True, verbose=True)

Found model file.


llama.cpp: loading model from ./models/ggml-vicuna-7b-1.1-q4_2.bin
llama_model_load_internal: format     = ggjt v1 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 2048
llama_model_load_internal: n_embd     = 4096
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_layer    = 32
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: ftype      = 5 (mostly Q4_2)
llama_model_load_internal: n_ff       = 11008
llama_model_load_internal: n_parts    = 1
llama_model_load_internal: model size = 7B
llama_model_load_internal: ggml ctx size =  59.11 KB
llama_model_load_internal: mem required  = 5809.32 MB (+ 1026.00 MB per state)
llama_init_from_file: kv self size  = 1024.00 MB


In [4]:
llm_predictor = LLMPredictor(llm=llm)

In [5]:
# Configure prompt parameters and initialise helper
max_input_size = 4096
num_output = 256
max_chunk_overlap = 20

prompt_helper = PromptHelper(
    max_input_size=max_input_size,
    num_output=num_output,
    max_chunk_overlap=max_chunk_overlap
)

In [6]:
service_context = ServiceContext.from_defaults(
    llm_predictor=llm_predictor,
    embed_model=embed_model,
#     prompt_helper=prompt_helper
)

In [7]:
documents = SimpleDirectoryReader('data').load_data()
new_index = GPTVectorStoreIndex.from_documents(
    documents, 
    service_context=service_context,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [8]:
# query will use the same embed_model
query_engine = new_index.as_query_engine(
#     streaming=True,
    verbose=True,
)

In [9]:
%%time
prompt = "What did Paul do while growing up?"
response = query_engine.query(prompt)

5372
5372

Token indices sequence length is longer than the specified maximum sequence length for this model (1841 > 1024). Running this sequence through the model will result in indexing errors


CPU times: user 1h 11min 16s, sys: 11.9 s, total: 1h 11min 28s
Wall time: 21min 11s


In [10]:
%%time
response.get_response()

In [11]:
response

Response(response='5372', source_nodes=[NodeWithScore(node=Node(text='\t\t\n\nWhat I Worked On\n\nFebruary 2021\n\nBefore college the two main things I worked on, outside of school, were writing and programming. I didn\'t write essays. I wrote what beginning writers were supposed to write then, and probably still are: short stories. My stories were awful. They had hardly any plot, just characters with strong feelings, which I imagined made them deep.\n\nThe first programs I tried writing were on the IBM 1401 that our school district used for what was then called "data processing." This was in 9th grade, so I was 13 or 14. The school district\'s 1401 happened to be in the basement of our junior high school, and my friend Rich Draves and I got permission to use it. It was like a mini Bond villain\'s lair down there, with all these alien-looking machines — CPU, disk drives, printer, card reader — sitting up on a raised floor under bright fluorescent lights.\n\nThe language we used was an 