# load data

In [39]:
import os
os.environ["HUGGINGFACEHUB_API_TOKEN"] = "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"

In [42]:
from llama_index import(
    GPTVectorStoreIndex,
    ServiceContext,
    LLMPredictor,
    PromptHelper,
    LangchainEmbedding,
    StorageContext,
    load_index_from_storage,
    )

from langchain.document_loaders import DirectoryLoader
from langchain.docstore.document import Document as LangchainDocument
from llama_index.node_parser import SimpleNodeParser

# upload model 
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain import HuggingFaceHub

In [45]:
def load_llm(model_path):      
    llm = HuggingFaceHub(repo_id = model_path, model_kwargs = {"temperature":0, "max_length":512}) #770M parameters			
    return llm   

In [46]:
def load_document_to_gpt_vectorstore(url, model_path, model_emb_path):
    from llama_index import download_loader 

    documents = SimpleDirectoryReader('omnisciencom', recursive = True).load_data()
    parser = SimpleNodeParser()

    nodes = parser.get_nodes_from_documents(documents)

    llm = load_llm(model_path)
    llm_predictor = LLMPredictor(llm = llm)
    embed_model = LangchainEmbedding(HuggingFaceEmbeddings(model_name=model_emb_path))


    max_input_size = 4096
    num_output = 512
    max_chunk_overlap = 0.20
    prompt_helper = PromptHelper(max_input_size, num_output, max_chunk_overlap)
    service_context = ServiceContext.from_defaults(
    llm_predictor=llm_predictor,
    embed_model=embed_model,
    prompt_helper=prompt_helper,
    )

    index = GPTVectorStoreIndex(nodes, service_context=service_context) 
    #index.save_to_disk("./gpt_index_docs_api_remotion_v2.json") #cant use save_to_disk replace with storage_context
    index.storage_context.persist(persist_dir="./llama_index_docs_api_v1") # create json file for index
    return index, service_context

In [47]:
model_path = "declare-lab/flan-alpaca-large"
model_emb_path = "sentence-transformers/all-mpnet-base-v2"

In [48]:
from llama_index import SimpleDirectoryReader

documents = SimpleDirectoryReader('omnisciencom', recursive = True).load_data()

In [49]:
parser = SimpleNodeParser()
nodes = parser.get_nodes_from_documents(documents) #take long time

In [51]:
llm = load_llm(model_path)
llm_predictor = LLMPredictor(llm = llm)
embed_model = LangchainEmbedding(HuggingFaceEmbeddings(model_name=model_emb_path))

In [52]:
max_input_size = 4096
num_output = 512
max_chunk_overlap = 0.20
prompt_helper = PromptHelper(max_input_size, num_output, max_chunk_overlap)
service_context = ServiceContext.from_defaults(
llm_predictor=llm_predictor,
embed_model=embed_model,
prompt_helper=prompt_helper,
)

In [53]:
# index = GPTVectorStoreIndex(nodes, service_context=service_context) 
# index.storage_context.persist(persist_dir="./llama_index_docs_api_v1") #take long time

In [84]:
storage_context = StorageContext.from_defaults(persist_dir="./llama_index_docs_api_v1")
index = load_index_from_storage(storage_context, service_context=service_context)

query_engine = index.as_query_engine(streaming=False, similarity_top_k=1, service_context=service_context)

In [72]:
response_stream

Response(response='Dion is a computer scientist who is a pioneer in the field of statistical machine translation and neural machine translation research. He is a former holder of a US O1 Extraordinary Ability Visa.', source_nodes=[NodeWithScore(node=TextNode(id_='1412b9ab-0003-44ee-8c0a-7144f2791444', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='d028cdd3-1f14-4aca-b1e7-6ffb998ed996', node_type=None, metadata={}, hash='1af4e54898b7d653f386d686dc99e4c4f125679ec4d0e953fc6448b51fc22cfb'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='22a46e29-5e10-4a7b-9a78-2b41298490f0', node_type=None, metadata={}, hash='0bedd1ca0c44e784d921551ad1280cc4e34f1220156b6554496169ce6110d3d6'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='d3ce9409-74c8-4ce5-98d2-7a71db2aac4e', node_type=None, metadata={}, hash='376ca3b4ea669c603a95b54192590cb1b3290bef2a96a6e1f8c3ad32e749

In [85]:
query = "What is NMT"
response_stream = query_engine.query(query)
response_stream

Llama.generate: prefix-match hit


Translation technology that has been developed to improve accuracy and translation quality?</p><p>&nbsp;</p></div></div><div class="et_pb_column et_pb_text et_pb_text_46 2nd-last-child"><div class="et_pb_text_inner"><h2>Deep Neural Machine Translation Technology</h2><p><a href="../../products/language-studio/index.html">Language Studio</a> utilizes the latest in state-of-the-art translation technologies. Our custom machine translation engines utilize by <a href="https://github.com/Microsoft/BERT" target="_blank" rel="noopener noreferrer">Google's BERT model and its pre-trained task-specific models</a> to provide high quality translations.</p><h3>BERT model  (<a href="https://github.com/Microsoft/BERT" target="_blank" rel="noopener noreferrer">Russakovsky et al. 2017</a>) </h3><p><span style="font-weight: 400


llama_print_timings:        load time =   641.80 ms
llama_print_timings:      sample time =   139.07 ms /   256 runs   (    0.54 ms per token,  1840.75 tokens per second)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time = 41530.51 ms /   256 runs   (  162.23 ms per token,     6.16 tokens per second)
llama_print_timings:       total time = 42532.16 ms


Response(response='Translation technology that has been developed to improve accuracy and translation quality?</p><p>&nbsp;</p></div></div><div class="et_pb_column et_pb_text et_pb_text_46 2nd-last-child"><div class="et_pb_text_inner"><h2>Deep Neural Machine Translation Technology</h2><p><a href="../../products/language-studio/index.html">Language Studio</a> utilizes the latest in state-of-the-art translation technologies. Our custom machine translation engines utilize by <a href="https://github.com/Microsoft/BERT" target="_blank" rel="noopener noreferrer">Google\'s BERT model and its pre-trained task-specific models</a> to provide high quality translations.</p><h3>BERT model \xa0(<a href="https://github.com/Microsoft/BERT" target="_blank" rel="noopener noreferrer">Russakovsky et al. 2017</a>)\xa0</h3><p><span style="font-weight: 400', source_nodes=[NodeWithScore(node=TextNode(id_='0a9571f1-d5b6-436c-8342-a6427c151285', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excl

In [64]:
response_stream.response

'NMT is a type of machine translation technology that uses neural networks to process text. It is based on Shallow NMT, with fewer layers, and is used for text classification tasks such as text summarization and text summarization. Deep NMT is a type of RNN that uses a network of connected nodes to process text. It is used for text classification tasks such as text summarization and text summarization.'

In [65]:
query = "Who is dion"
response_stream = query_engine.query(query)
response_stream

Response(response='Dion is a computer scientist who is a pioneer in the field of statistical machine translation and neural machine translation research. He is a former holder of a US O1 Extraordinary Ability Visa.', source_nodes=[NodeWithScore(node=TextNode(id_='1412b9ab-0003-44ee-8c0a-7144f2791444', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='d028cdd3-1f14-4aca-b1e7-6ffb998ed996', node_type=None, metadata={}, hash='1af4e54898b7d653f386d686dc99e4c4f125679ec4d0e953fc6448b51fc22cfb'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='22a46e29-5e10-4a7b-9a78-2b41298490f0', node_type=None, metadata={}, hash='0bedd1ca0c44e784d921551ad1280cc4e34f1220156b6554496169ce6110d3d6'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='d3ce9409-74c8-4ce5-98d2-7a71db2aac4e', node_type=None, metadata={}, hash='376ca3b4ea669c603a95b54192590cb1b3290bef2a96a6e1f8c3ad32e749

In [66]:
response_stream.response

'Dion is a computer scientist who is a pioneer in the field of statistical machine translation and neural machine translation research. He is a former holder of a US O1 Extraordinary Ability Visa.'

In [60]:
response_stream = query_engine.query("What is NMT")
print(response_stream)

None


In [69]:
str(response_stream.get_formatted_sources)

'<bound method Response.get_formatted_sources of Response(response=\'Dion is a computer scientist who is a pioneer in the field of statistical machine translation and neural machine translation research. He is a former holder of a US O1 Extraordinary Ability Visa.\', source_nodes=[NodeWithScore(node=TextNode(id_=\'1412b9ab-0003-44ee-8c0a-7144f2791444\', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: \'1\'>: RelatedNodeInfo(node_id=\'d028cdd3-1f14-4aca-b1e7-6ffb998ed996\', node_type=None, metadata={}, hash=\'1af4e54898b7d653f386d686dc99e4c4f125679ec4d0e953fc6448b51fc22cfb\'), <NodeRelationship.PREVIOUS: \'2\'>: RelatedNodeInfo(node_id=\'22a46e29-5e10-4a7b-9a78-2b41298490f0\', node_type=None, metadata={}, hash=\'0bedd1ca0c44e784d921551ad1280cc4e34f1220156b6554496169ce6110d3d6\'), <NodeRelationship.NEXT: \'3\'>: RelatedNodeInfo(node_id=\'d3ce9409-74c8-4ce5-98d2-7a71db2aac4e\', node_type=None, metadata={

# Local model 

In [73]:
from llama_index import(
    GPTVectorStoreIndex,
    ServiceContext,
    LLMPredictor,
    PromptHelper,
    Document,
    VectorStoreIndex,
    LangchainEmbedding,
    StorageContext,
    load_index_from_storage,
    )


from langchain import OpenAI
from langchain.docstore.document import Document as LangchainDocument
from llama_index.node_parser import SimpleNodeParser

#scrap website
from bs4 import BeautifulSoup
import requests

# upload model 
from langchain.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from llama_index.llms import LangChainLLM
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

In [76]:
model_path = "orca-mini-3b.ggmlv3.q4_0.bin"
model_emb_path = "sentence-transformers/all-mpnet-base-v2"
def load_llm(model_path):
    callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
    llm_langchain = LlamaCpp(
    model_path= model_path, 
    callback_manager=callback_manager, 
    verbose=True, 
    n_ctx=2048) #define n-ctx for prevent exceed token error
    llm = LangChainLLM(llm=llm_langchain)
    return llm


In [77]:
llm = load_llm(model_path)
llm_predictor = LLMPredictor(llm = llm)
embed_model = LangchainEmbedding(HuggingFaceEmbeddings(model_name=model_emb_path))

llama.cpp: loading model from orca-mini-3b.ggmlv3.q4_0.bin
llama_model_load_internal: format     = ggjt v3 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 2048
llama_model_load_internal: n_embd     = 3200
llama_model_load_internal: n_mult     = 240
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_layer    = 26
llama_model_load_internal: n_rot      = 100
llama_model_load_internal: ftype      = 2 (mostly Q4_0)
llama_model_load_internal: n_ff       = 8640
llama_model_load_internal: model size = 3B
llama_model_load_internal: ggml ctx size =    0.06 MB
llama_model_load_internal: mem required  = 2862.72 MB (+  682.00 MB per state)
llama_new_context_with_model: kv self size  =  650.00 MB
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | 


In [78]:
max_input_size = 4096
num_output = 512
max_chunk_overlap = 0.20
prompt_helper = PromptHelper(max_input_size, num_output, max_chunk_overlap)
service_context = ServiceContext.from_defaults(
llm_predictor=llm_predictor,
embed_model=embed_model,
prompt_helper=prompt_helper,
)

In [79]:
index = GPTVectorStoreIndex(nodes, service_context=service_context) 
index.storage_context.persist(persist_dir="./orca_index_v1") 

In [82]:
storage_context = StorageContext.from_defaults(persist_dir="./orca_index_v1")
index = load_index_from_storage(storage_context, service_context=service_context)

query_engine = index.as_query_engine(streaming=False, similarity_top_k=1, service_context=service_context)

In [83]:
query = "What is NMT"
response_stream = query_engine.query(query)

---------------------
Neural Machine Translation (NMT) is a type of machine translation technology that uses deep neural networks to translate languages. NMT systems were based on Shallow NMT, with fewer layers. As the technology advanced it became possible to process with more layers and further improve accuracy and translation quality.</p><p>&nbsp;</p></div></div><div class="et_pb_image_16"><img decoding="async" width="800" height="293" src="https://omniscien.com/wp-content/uploads/2020/10/ShallowNMTDeepNMT.png" alt="Shallow NMT vs Deep NMT" title="Shallow NMT vs Deep NMT" srcset="https://omniscien.com/wp-content/uploads/2020/10/ShallowNMTDeepNMT.png 800w, https://omniscien.com/wp-content/uploads/2020/10/ShallowNMTDeepNMT-480x176.png 480w" sizes="


llama_print_timings:        load time =   641.80 ms
llama_print_timings:      sample time =   137.78 ms /   256 runs   (    0.54 ms per token,  1857.99 tokens per second)
llama_print_timings: prompt eval time = 101689.43 ms /  1127 tokens (   90.23 ms per token,    11.08 tokens per second)
llama_print_timings:        eval time = 40816.86 ms /   255 runs   (  160.07 ms per token,     6.25 tokens per second)
llama_print_timings:       total time = 143851.30 ms


In [88]:
index = GPTVectorStoreIndex(nodes, service_context=service_context) 

In [87]:
index.storage_context.persist(persist_dir="./orca_index_test") 