# Method 1: LangChain

https://github.com/PradipNichite/Youtube-Tutorials/blob/main/Langchain_Semnatic_Serach_Pinecone.ipynb

In [1]:
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains.question_answering import load_qa_chain
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DirectoryLoader

#for Huggingface
from langchain import HuggingFaceHub

#for openai
from langchain.llms import OpenAI
import openai
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

import re

In [2]:
import os
from getpass import getpass

# The Hugging Face Hub token is a secret and must never be stored in the
# notebook. Reuse the value already exported in the environment and only
# prompt (without echo) when it is missing.
# The token that used to be hardcoded in this cell has been exposed and
# should be revoked.
if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass("Hugging Face Hub API token: ")

In [4]:
def load_docs(docs_path):
    """Load every file matching ``**/*.html`` below ``docs_path``.

    Returns whatever ``DirectoryLoader(...).load()`` produces for that glob.
    """
    return DirectoryLoader(docs_path, glob="**/*.html").load()

def split_docs(documents, chunk_size=1000, chunk_overlap=20):
    """Split ``documents`` into chunks with a ``RecursiveCharacterTextSplitter``.

    ``chunk_size`` and ``chunk_overlap`` are forwarded unchanged to the
    splitter; the result of its ``split_documents`` call is returned.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)

# Load the HTML pages from the local 'omniscien.com' folder, split them into
# chunks, and print how many chunks were produced.
# NOTE(review): later cells read from a folder named 'omnisciencom' — confirm
# which folder name is the intended one.
documents = load_docs('omniscien.com')
sp_docs = split_docs(documents)
print(len(sp_docs))

3133


In [5]:
# Number of documents loaded, before splitting into chunks.
len(documents)

168

In [7]:
# Create the open-source (sentence-transformers) embedding function used by the Chroma store.
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# First run only: build the vector store from the chunks and persist it to ./chroma_db.
# (This saves the vector index, not a model.)
# db1 = Chroma.from_documents(sp_docs, embedding_function, persist_directory="./chroma_db")
# db1.persist()

In [5]:
# Re-open the Chroma vector store persisted in ./chroma_db (this loads the index, not a model).
db = Chroma(persist_directory="./chroma_db", embedding_function=embedding_function)

In [6]:
def get_similiar_docs(query,k=1,score=False):
  """Return the ``k`` entries of the module-level ``db`` store closest to ``query``.

  With a truthy ``score`` the result comes from
  ``db.similarity_search_with_score``; otherwise from ``db.similarity_search``.
  """
  search = db.similarity_search_with_score if score else db.similarity_search
  return search(query, k=k)

In [7]:
# Hugging Face Hub model used as the LLM for the QA chains in this section.
repo_id = "declare-lab/flan-alpaca-large"
llm_hug = HuggingFaceHub(repo_id=repo_id, model_kwargs={"temperature": 0.1, "max_length": 768})

In [8]:
# QA-with-sources chain on the Hugging Face LLM, using the "stuff" chain type.
chain = load_qa_with_sources_chain(llm_hug,  chain_type="stuff")

In [9]:
# Retrieve the closest chunk for the question and run the QA chain on it.
# With return_only_outputs=False the printed result also echoes the input documents.
query = "What is NMT"  
similar_docs = get_similiar_docs(query)
print(chain({"input_documents": similar_docs, "question": query}, return_only_outputs=False))

{'input_documents': [Document(page_content='Search\n\nOmniscien » FAQ » What is Neural Machine Translation (NMT)?\n\nWhat is Neural Machine Translation (NMT)?\n\nNeural Machine Translation (also known\xa0as Neural MT, NMT, Deep Neural Machine Translation, Deep NMT, or DNMT) is a state-of-the-art machine translation approach that utilizes neural network techniques to predict the likelihood of a set of words in sequence. This can be a text fragment, complete sentence, or with the latest advances an entire document.\n\nNMT is a\xa0radically different approach to solving the problem of language translation and localization that uses deep neural networks and artificial intelligence to train neural models.\xa0NMT has quickly become the dominant approach to machine translation with a major transition from SMT to NMT in just 3 years. Neural Machine Translation typically produces much higher quality translations that Statistical Machine Translation approaches, with better fluency and adequacy.'

In [25]:
# Inspect the documents retrieved for the most recent query.
similar_docs

[Document(page_content='Search\n\nOmniscien » FAQ » What is Neural Machine Translation (NMT)?\n\nWhat is Neural Machine Translation (NMT)?\n\nNeural Machine Translation (also known\xa0as Neural MT, NMT, Deep Neural Machine Translation, Deep NMT, or DNMT) is a state-of-the-art machine translation approach that utilizes neural network techniques to predict the likelihood of a set of words in sequence. This can be a text fragment, complete sentence, or with the latest advances an entire document.\n\nNMT is a\xa0radically different approach to solving the problem of language translation and localization that uses deep neural networks and artificial intelligence to train neural models.\xa0NMT has quickly become the dominant approach to machine translation with a major transition from SMT to NMT in just 3 years. Neural Machine Translation typically produces much higher quality translations that Statistical Machine Translation approaches, with better fluency and adequacy.', metadata={'source'

In [86]:
# Same flow for a second question; return_only_outputs=True prints only the chain's output.
query = "Who is Dion"  
similar_docs = get_similiar_docs(query)
print(chain({"input_documents": similar_docs, "question": query}, return_only_outputs=True))

{'output_text': " Dion is the Vice President and Research Director for Gartner based in Hong Kong and is a well-known pioneer of the Asian Internet Industry. He is the founder of one of Asia's first ISPs (Asia Online in Hong Kong) and has been recognized by the US Government as being in the top 5% of his field worldwide.\nSOURCES: omnisciencom/about-us/company/Index.html"}


In [89]:
# Keep the chain result so the answer text can be post-processed in the next cells.
y = chain({"input_documents": similar_docs, "question": query}, return_only_outputs=True)

In [92]:
# Raw answer text; in the recorded output it ends with a "SOURCES:" line.
y["output_text"]

" Dion is a pioneer of the Asian Internet Industry, founder of one of Asia's first ISPs (Asia Online in Hong Kong), and was Vice President and Research Director for Gartner. He was the recipient of the Chairman's Commendation Award presented by Microsoft's Bill Gates for the best showcase of software developed in the Philippines, is recognized by the US Government as being in the top 5% of his field worldwide, and is a former holder of a US O1 Extraordinary Ability Visa.\nSOURCES: omnisciencom/about-us/company/Index.html"

In [93]:
def strip_sources(output_text):
    """Return the answer part of a QA-with-sources ``output_text``.

    Everything from the first line that starts with "SOURCES:" onward is
    dropped and surrounding whitespace is trimmed. Text without such a line
    is returned trimmed but otherwise intact.
    """
    # Cut at the SOURCES trailer instead of deleting every line after the
    # first one, so answers that span several lines are kept in full.
    return re.split(r'\n\s*SOURCES:', output_text, maxsplit=1)[0].strip()

text = strip_sources(y["output_text"])

print(text)

 Dion is a pioneer of the Asian Internet Industry, founder of one of Asia's first ISPs (Asia Online in Hong Kong), and was Vice President and Research Director for Gartner. He was the recipient of the Chairman's Commendation Award presented by Microsoft's Bill Gates for the best showcase of software developed in the Philippines, is recognized by the US Government as being in the top 5% of his field worldwide, and is a former holder of a US O1 Extraordinary Ability Visa.


In [84]:
# Full chain result (input documents included) for the current query.
x = chain({"input_documents": similar_docs, "question": query}, return_only_outputs=False)

In [85]:
# Display the full chain result.
x

{'input_documents': [Document(page_content="Available as two Platform Editions specifically designed to match different business needs.\n\nProduct Overview\n\nFeatures\n\nBenefits of Media Studio (White Paper)\n\nSubtitle Optimized Machine Translation\n\nData Security & Privacy\n\nSecure by Design\n\nProject Management and Editing PlatformProject, People, Resource, Video, and Subtitle Management\n\nDetailed Features\n\nData Processing PlatformData Creation, Analysis, Cleaning, and Organization\n\nDetailed Features\n\nRequest a Demo\n\nFeature   Overview\n\nEach feature is built on a core of Artificial Intelligence, Machine Learning and Natural Language Processing\n\nMachine learning enables machines to work more like humans so that humans don't have to work more like machines. Each feature is designed to augment human intelligence, enhance productivity, increase quality, and reduce cost. Artificial intelligence enables processing and organization of data that simply not be cost-effecti

# Retrieval QA

https://python.langchain.com/docs/modules/chains/popular/vector_db_qa

In [27]:
# RetrievalQA over the persisted Chroma store `db`; source documents are returned with each answer.
# Alternative (disabled): build an in-memory store from the chunks instead of using `db`.
#docsearch = Chroma.from_documents(sp_docs, embedding_function)
qa = RetrievalQA.from_chain_type(llm=llm_hug, chain_type="stuff", retriever=db.as_retriever(), return_source_documents=True)

In [28]:
# Ask the RetrievalQA chain; retrieval and answering happen in a single call.
query = "What is NMT"
result = qa({"query": query})

In [29]:
# Answer text produced by the RetrievalQA chain.
result["result"]

'Neural Machine Translation is a powerful tool for language translation and localization. It is able to accurately translate text into different languages, with better fluency and adequacy. It is also able to generate text that is more readable and understandable.'

In [46]:
# Documents the retriever supplied for the answer (present because return_source_documents=True).
result["source_documents"]

[Document(page_content="Available as two Platform Editions specifically designed to match different business needs.\n\nProduct Overview\n\nFeatures\n\nBenefits of Media Studio (White Paper)\n\nSubtitle Optimized Machine Translation\n\nData Security & Privacy\n\nSecure by Design\n\nProject Management and Editing PlatformProject, People, Resource, Video, and Subtitle Management\n\nDetailed Features\n\nData Processing PlatformData Creation, Analysis, Cleaning, and Organization\n\nDetailed Features\n\nRequest a Demo\n\nFeature   Overview\n\nEach feature is built on a core of Artificial Intelligence, Machine Learning and Natural Language Processing\n\nMachine learning enables machines to work more like humans so that humans don't have to work more like machines. Each feature is designed to augment human intelligence, enhance productivity, increase quality, and reduce cost. Artificial intelligence enables processing and organization of data that simply not be cost-effective or feasible with 

# Document QA

https://python.langchain.com/docs/modules/chains/additional/question_answering

In [33]:
# Retrieve documents for the question straight from the Chroma store (store default for k).
#docsearch = Chroma.from_documents(sp_docs, embedding_function)
query = "What is NMT"
docs_qa = db.similarity_search(query)

In [34]:
# Rebuild the QA-with-sources chain and repeat the retrieval for the same question.
# NOTE(review): this cell never calls `chain`, yet the output recorded under it is a
# chain result — a call such as chain({...}, return_only_outputs=True) appears to be missing.
chain = load_qa_with_sources_chain(llm_hug, chain_type="stuff")
query = "What is NMT"
docs_qa = db.similarity_search(query)

{'output_text': 'SOURCES: Neural Machine Translation (NMT) is a state-of-the-art machine translation approach that utilizes deep neural networks and artificial intelligence to train neural models. It is a radically different approach to solving the problem of language translation and localization that uses deep neural networks and artificial intelligence to train neural models. It has quickly become the dominant approach to machine translation with a major transition from SMT to NMT in just 3 years.'}

In [35]:
# Run the chain on the retrieved documents; the result includes the input documents.
chain({"input_documents": docs_qa, "question": query}, return_only_outputs=False)

{'input_documents': [Document(page_content='Search\n\nOmniscien » FAQ » What is Neural Machine Translation (NMT)?\n\nWhat is Neural Machine Translation (NMT)?\n\nNeural Machine Translation (also known\xa0as Neural MT, NMT, Deep Neural Machine Translation, Deep NMT, or DNMT) is a state-of-the-art machine translation approach that utilizes neural network techniques to predict the likelihood of a set of words in sequence. This can be a text fragment, complete sentence, or with the latest advances an entire document.\n\nNMT is a\xa0radically different approach to solving the problem of language translation and localization that uses deep neural networks and artificial intelligence to train neural models.\xa0NMT has quickly become the dominant approach to machine translation with a major transition from SMT to NMT in just 3 years. Neural Machine Translation typically produces much higher quality translations that Statistical Machine Translation approaches, with better fluency and adequacy.'

In [36]:
# Retrieve documents for a second question.
query = "Who is Dion"
docs_qa = db.similarity_search(query)

In [37]:
# Run the chain on the documents retrieved for the second question.
chain({"input_documents": docs_qa, "question": query}, return_only_outputs=False)

{'input_documents': [Document(page_content="Available as two Platform Editions specifically designed to match different business needs.\n\nProduct Overview\n\nFeatures\n\nBenefits of Media Studio (White Paper)\n\nSubtitle Optimized Machine Translation\n\nData Security & Privacy\n\nSecure by Design\n\nProject Management and Editing PlatformProject, People, Resource, Video, and Subtitle Management\n\nDetailed Features\n\nData Processing PlatformData Creation, Analysis, Cleaning, and Organization\n\nDetailed Features\n\nRequest a Demo\n\nFeature   Overview\n\nEach feature is built on a core of Artificial Intelligence, Machine Learning and Natural Language Processing\n\nMachine learning enables machines to work more like humans so that humans don't have to work more like machines. Each feature is designed to augment human intelligence, enhance productivity, increase quality, and reduce cost. Artificial intelligence enables processing and organization of data that simply not be cost-effecti

# OpenAI 

In [38]:
import os
from getpass import getpass

# The OpenAI key is a secret and must never be stored in the notebook. Reuse
# the value already exported in the environment and only prompt (without
# echo) when it is missing.
# The key that used to be hardcoded in this cell has been exposed and should
# be revoked.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Embed the chunks with OpenAI embeddings into a separate Chroma store.
embeddings = OpenAIEmbeddings()
db_openai = Chroma.from_documents(sp_docs, embeddings)

# Completion model used by the OpenAI-backed chain; alternatives are kept for reference.
# NOTE(review): OpenAI has retired "text-davinci-003" — confirm a currently available model.
model_name = "text-davinci-003"
# model_name = "gpt-3.5-turbo"
#model_name = "gpt-4"
llm = OpenAI(model_name=model_name)

In [39]:
def get_similiar_docs(query,k=1,score=False):
  """Return the ``k`` entries of the module-level ``db_openai`` store closest to ``query``.

  With a truthy ``score`` the result comes from
  ``db_openai.similarity_search_with_score``; otherwise from
  ``db_openai.similarity_search``.

  NOTE: this replaces the earlier ``get_similiar_docs`` in this notebook,
  which searched ``db``.
  """
  search = db_openai.similarity_search_with_score if score else db_openai.similarity_search
  return search(query, k=k)

In [40]:
# Rebind `chain` to a QA-with-sources chain backed by the OpenAI LLM.
chain = load_qa_with_sources_chain(llm, chain_type="stuff")

def get_answer(query):
  """Answer ``query`` using the documents retrieved for it.

  The documents returned by ``get_similiar_docs(query)`` are handed to
  ``chain.run`` together with the question; its result is returned.
  """
  retrieved = get_similiar_docs(query)
  return chain.run(input_documents=retrieved, question=query)


In [32]:
# Retrieve the closest chunk from the OpenAI-embedded store and answer with the OpenAI chain.
query = "What is NMT"  
similar_docs = get_similiar_docs(query)
print(chain({"input_documents": similar_docs, "question": query}, return_only_outputs=True))

{'output_text': ' Neural Machine Translation (NMT) is a state-of-the-art machine translation approach that utilizes neural network techniques to predict the likelihood of a set of words in sequence. It uses deep neural networks and artificial intelligence to train neural models, and typically produces much higher quality translations that Statistical Machine Translation approaches. \nSOURCES: omnisciencom/faq/what-is-neural-machine-translation/index.html'}


In [33]:
# Same flow with only the single closest chunk (get_similiar_docs defaults to k=1).
query = "Who is Dion wiggins"  
similar_docs = get_similiar_docs(query)
print(chain({"input_documents": similar_docs, "question": query}, return_only_outputs=True))

{'output_text': " I don't know.\nSOURCES: omnisciencom/lsev6/ocr/optical-character-recognition-overview/index.html"}


In [41]:
# Same question, but querying the store directly so its default number of results is used
# instead of the k=1 applied by get_similiar_docs.
query = "Who is Dion wiggins"  
similar_docs_openai = db_openai.similarity_search(query)
print(chain({"input_documents": similar_docs_openai, "question": query}, return_only_outputs=True))

{'output_text': ' Dion Wiggins is a highly experienced ICT industry visionary, entrepreneur, analyst, and consultant. He is an accomplished speaker and has a high media profile for his perceptive analysis of ICT in Asia/Pacific. He was previously Vice President and Research Director for Gartner based in Hong Kong and is a founder of The ActiveX Factory. He is a former holder of a US O1 Extraordinary Ability Visa.\n\nSOURCES: omnisciencom/about-us/company/Index.html'}


In [20]:
# Inspect the documents from the earlier k=1 retrieval.
similar_docs

[Document(page_content='Click here...', metadata={'source': 'omnisciencom/category/faq/index.html'})]

# Method 2: LlamaIndex (not working yet)

In [24]:
from llama_index import(
    GPTVectorStoreIndex,
    ServiceContext,
    LLMPredictor,
    PromptHelper,
    Document,
    VectorStoreIndex,
    LangchainEmbedding,
    StorageContext,
    load_index_from_storage,
    )


from langchain import OpenAI
from langchain.docstore.document import Document as LangchainDocument
from llama_index.node_parser import SimpleNodeParser
from llama_index import download_loader 

#scrap website
from bs4 import BeautifulSoup
import requests

# upload model 
from langchain.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from llama_index.llms import LangChainLLM
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.embeddings.huggingface import HuggingFaceEmbeddings


In [25]:
def load_llm(model_path):
    """Load the llama.cpp model at ``model_path`` and wrap it in ``LangChainLLM``.

    The ``LlamaCpp`` model is created verbose, with a callback manager holding
    a ``StreamingStdOutCallbackHandler`` and a context size of 2048.
    """
    handlers = [StreamingStdOutCallbackHandler()]
    llama_cpp_model = LlamaCpp(
        model_path=model_path,
        callback_manager=CallbackManager(handlers),
        verbose=True,
        n_ctx=2048,  # set explicitly to prevent "exceed token" errors
    )
    return LangChainLLM(llm=llama_cpp_model)

In [61]:
def load_document_to_gpt_vectorstore(folder_path, model_path, model_emb_path):
    """Index the documents under ``folder_path`` and persist the index to disk.

    Args:
        folder_path: Directory to read documents from (sub-directories included).
        model_path: Path handed to ``load_llm`` to create the LLM.
        model_emb_path: Model name handed to ``HuggingFaceEmbeddings``.

    Returns:
        ``(index, service_context)``: the ``GPTVectorStoreIndex`` that was
        built and the ``ServiceContext`` it was built with. The index is also
        persisted to ``./llama_index_docs_api_v1``.
    """
    # Imported here because no earlier cell of the notebook imports it; without
    # this the function raises NameError on a fresh kernel.
    from llama_index import SimpleDirectoryReader

    # The site folder is nested (e.g. faq/<topic>/index.html), so the reader
    # must recurse; a flat read would index only the top-level files.
    documents = SimpleDirectoryReader(folder_path, recursive=True).load_data()
    parser = SimpleNodeParser()

    nodes = parser.get_nodes_from_documents(documents)

    llm = load_llm(model_path)
    llm_predictor = LLMPredictor(llm = llm)
    embed_model = LangchainEmbedding(HuggingFaceEmbeddings(model_name=model_emb_path))

    # Must not exceed the context size the model is loaded with (n_ctx=2048
    # in the llama.cpp load_llm), otherwise prompts can overflow the model.
    max_input_size = 2048
    num_output = 512
    max_chunk_overlap = 0.20
    prompt_helper = PromptHelper(max_input_size, num_output, max_chunk_overlap)
    service_context = ServiceContext.from_defaults(
        llm_predictor=llm_predictor,
        embed_model=embed_model,
        prompt_helper=prompt_helper,
    )

    index = GPTVectorStoreIndex(nodes, service_context=service_context)
    # save_to_disk is not available; persist through the storage context instead.
    index.storage_context.persist(persist_dir="./llama_index_docs_api_v1")
    return index, service_context

In [62]:
# Inputs for the LlamaIndex pipeline: local llama.cpp model file, embedding model name,
# and the folder holding the site pages.
# NOTE: `url` is not used anywhere in this cell.
url = "https://anaconda.org/conda-forge/attrs"
model_path = "orca-mini-3b.ggmlv3.q4_0.bin"
model_emb_path = "sentence-transformers/all-mpnet-base-v2"
folder_path = 'omnisciencom'

# Build the index and persist it to disk; keep the service context for the query cells.
index, service_context = load_document_to_gpt_vectorstore(folder_path= folder_path, 
                                         model_path= model_path,
                                         model_emb_path=model_emb_path)

llama.cpp: loading model from orca-mini-3b.ggmlv3.q4_0.bin
llama_model_load_internal: format     = ggjt v3 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 2048
llama_model_load_internal: n_embd     = 3200
llama_model_load_internal: n_mult     = 240
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_layer    = 26
llama_model_load_internal: n_rot      = 100
llama_model_load_internal: ftype      = 2 (mostly Q4_0)
llama_model_load_internal: n_ff       = 8640
llama_model_load_internal: model size = 3B
llama_model_load_internal: ggml ctx size =    0.06 MB
llama_model_load_internal: mem required  = 2862.72 MB (+  682.00 MB per state)
llama_new_context_with_model: kv self size  =  650.00 MB
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | 
  from .autonotebook import tqdm as notebook_tqdm


In [63]:
# Reload the index from the directory it was persisted to, reusing the same service context.
storage_context = StorageContext.from_defaults(persist_dir="./llama_index_docs_api_v1")
index = load_index_from_storage(storage_context, service_context=service_context)

In [64]:
# Streaming query engine that retrieves only the single most similar node per question.
query_engine = index.as_query_engine(streaming=True, similarity_top_k=1, service_context=service_context)

In [66]:
# Ask a question and print the answer as it is streamed.
response_stream = query_engine.query("What is Language studio?")
response_stream.print_response_stream()

Llama.generate: prefix-match hit


What is the purpose of this tool? 

My understanding is that this question cannot be answered without additional context. Can you please provide more information or clarify the question further?


llama_print_timings:        load time = 17716.62 ms
llama_print_timings:      sample time =    21.25 ms /    37 runs   (    0.57 ms per token,  1741.34 tokens per second)
llama_print_timings: prompt eval time =   455.64 ms /     4 tokens (  113.91 ms per token,     8.78 tokens per second)
llama_print_timings:        eval time =  5620.15 ms /    36 runs   (  156.12 ms per token,     6.41 tokens per second)
llama_print_timings:       total time =  8675.38 ms


# Test ChromaDB with LlamaIndex

In [24]:
# import
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext, LLMPredictor
from llama_index.node_parser import SimpleNodeParser
from llama_index.vector_stores import ChromaVectorStore
from llama_index.storage.storage_context import StorageContext
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.embeddings import LangchainEmbedding
from IPython.display import Markdown, display
import chromadb


In [13]:
def load_llm(model_path):
    """Create a ``HuggingFaceHub`` LLM for the hub repository ``model_path``.

    The model is configured with temperature 0 and ``max_length`` 512.

    NOTE: this replaces the llama.cpp-based ``load_llm`` defined earlier in
    this notebook.
    """
    # Original author's note on the model used with this helper: 770M parameters.
    generation_kwargs = {"temperature": 0, "max_length": 512}
    return HuggingFaceHub(repo_id=model_path, model_kwargs=generation_kwargs)

In [27]:
# create client and a new collection
chroma_client = chromadb.Client()
chroma_collection = chroma_client.create_collection("quickstart")

# load documents
url = "omnisciencom"
documents = SimpleDirectoryReader(url, recursive = True).load_data()

model_path = "declare-lab/flan-alpaca-large"
llm = load_llm(model_path)
llm_predictor = LLMPredictor(llm = llm)

# define embedding function
embed_model = LangchainEmbedding(
    HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
)

# set up ChromaVectorStore and load in data
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
service_context = ServiceContext.from_defaults(embed_model=embed_model, llm_predictor = llm_predictor)
index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context, service_context=service_context, show_progress=True
)

# Query Data
# query_engine = index.as_query_engine()
# response = query_engine.query("What did the author do growing up?")
# display(Markdown(f"<b>{response}</b>"))

Parsing documents into nodes: 100%|██████████| 169/169 [04:17<00:00,  1.52s/it]
Generating embeddings:   4%|▍         | 410/10273 [02:29<53:06,  3.10it/s]  

In [22]:
# Query the Chroma-backed index and render the answer in bold Markdown.
query_engine = index.as_query_engine()
response = query_engine.query("what is Omniscien technology")
display(Markdown(f"<b>{response}</b>"))

<b>Omniscien is a company that specializes in language processing, voice recognition, OCR, data mining, and data automation.</b>

# VectorstoreIndexCreator

In [None]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.indexes import VectorstoreIndexCreator
# Build a Chroma-backed index from `documents` using 1000-character chunks without overlap,
# storing it under ./vector_index_save_dir_path.
# NOTE(review): `documents` is reassigned by the LlamaIndex SimpleDirectoryReader cell earlier
# in the notebook — confirm it holds LangChain documents when this cell runs.
index = VectorstoreIndexCreator(
    text_splitter = CharacterTextSplitter(chunk_size = 1000, chunk_overlap = 0),
    embedding = embedding_function, vectorstore_cls = Chroma,
    vectorstore_kwargs={"persist_directory":"./vector_index_save_dir_path"}).from_documents(documents)


In [24]:
# Re-open the store saved under ./vector_index_save_dir_path.
# NOTE: this rebinds `index` — previously the object returned by
# VectorstoreIndexCreator(...).from_documents — to a plain Chroma vector store.
index = Chroma(persist_directory="./vector_index_save_dir_path", embedding_function=embedding_function)

In [25]:
# Hugging Face Hub LLM for this section (same settings as the `llm_hug` created earlier in the notebook).
repo_id = "declare-lab/flan-alpaca-large"
llm_hug = HuggingFaceHub(repo_id=repo_id, model_kwargs={"temperature": 0.1, "max_length": 768})

In [26]:
from langchain.indexes.vectorstore import VectorStoreIndexWrapper

query = "Who is Dion"
# `index` is a plain Chroma store at this point (re-opened from disk), and Chroma has no
# `query(llm=..., question=...)` method; wrap it so the QA shortcut is available.
VectorStoreIndexWrapper(vectorstore=index).query(llm=llm_hug, question=query, chain_type = "stuff")

In [27]:
# RetrievalQA over the re-opened Chroma store; source documents are returned with each answer.
qa = RetrievalQA.from_chain_type(llm=llm_hug, chain_type="stuff", retriever=index.as_retriever(), return_source_documents=True)

In [28]:
# Ask the RetrievalQA chain built on the re-opened store.
query = "What is NMT"
result = qa({"query": query})

In [29]:
# Answer text for the query above.
result["result"]

'NMT is a type of machine learning algorithm that uses natural language processing to generate text. It is used to generate text that is grammatically correct and is able to understand and respond to human language. It is useful for tasks such as text summarization, text summarization, and text summarization.'

In [30]:
# Second question for the same RetrievalQA chain.
query = "Who is dion"
result = qa({"query": query})

In [31]:
# Answer text for the query above.
result["result"]

'Dion is an excellent helper.'

In [16]:
from langchain.indexes.vectorstore import VectorStoreIndexWrapper

query = "Who is Philipp"
# `index` is a plain Chroma store at this point (re-opened from disk), and Chroma has no
# `query(llm=..., question=...)` method; wrap it so the QA shortcut is available.
VectorStoreIndexWrapper(vectorstore=index).query(llm=llm_hug, question=query, chain_type = "stuff")

'Philipp Koehn is a Chief Scientist.'

In [13]:
from langchain.indexes.vectorstore import VectorStoreIndexWrapper

query = "What is language studio"
# `index` is a plain Chroma store at this point (re-opened from disk), and Chroma has no
# `query(llm=..., question=...)` method; wrap it so the QA shortcut is available.
VectorStoreIndexWrapper(vectorstore=index).query(llm=llm_hug, question=query, chain_type = "stuff")

'Language Studio is helpful because it provides a powerful and flexible translation platform that can be used to automate tasks, provide data privacy and security, and provide a wide range of features that are usually found only on large web-based public providers.'

In [None]:
from langchain.indexes.vectorstore import VectorStoreIndexWrapper

query = "What is media studio"
# `index` is a plain Chroma store at this point (re-opened from disk), and Chroma has no
# `query(llm=..., question=...)` method; wrap it so the QA shortcut is available.
VectorStoreIndexWrapper(vectorstore=index).query(llm=llm_hug, question=query, chain_type = "stuff")