In [9]:
from llama_index import(
    ServiceContext,
    StorageContext,
    SimpleDirectoryReader,
    LangchainEmbedding,
    VectorStoreIndex,
    load_index_from_storage,
    load_graph_from_storage,
    LLMPredictor,
    PromptHelper
    )

# upload model
from llama_index.llms import LangChainLLM
from llama_index.graph_stores import SimpleGraphStore
from llama_index import (KnowledgeGraphIndex)
from llama_index.storage.storage_context import StorageContext
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.llms import LlamaCpp
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from IPython.display import Markdown, display
import re

In [19]:
def load_llm():
    n_gpu_layers = 32 
    n_batch = 512  
    callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
    llm = LlamaCpp(
        model_path="/home/sira/sira_project/meta-Llama2/llama-2-7b-chat.ggmlv3.q8_0.bin",
        callback_manager=callback_manager,
        n_gpu_layers=n_gpu_layers,
        n_batch=n_batch,
        verbose=True,
        n_ctx = 4096, 
        temperature = 0.1, 
        max_tokens = 4096
    )
    return llm


llm_predictor = LLMPredictor(llm=LangChainLLM(llm = load_llm()))
embed_model = LangchainEmbedding(HuggingFaceEmbeddings(
        model_name = "sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs = {'device': 'cpu'}))
service_context = ServiceContext.from_defaults(
        llm_predictor=llm_predictor, 
        chunk_size=1000, 
        embed_model = embed_model)

#Graph Vector
storage_context_graph = StorageContext.from_defaults(persist_dir="./llama7b_graph_index_removeHTML")
graph_index = load_index_from_storage(storage_context = storage_context_graph, service_context=service_context)
#Index vector
storage_context_vector = StorageContext.from_defaults(persist_dir="./llama7b_vector_index_removeHTML")
vector_index = load_index_from_storage(storage_context = storage_context_vector, service_context=service_context)

llama.cpp: loading model from /home/sira/sira_project/meta-Llama2/llama-2-7b-chat.ggmlv3.q8_0.bin
llama_model_load_internal: format     = ggjt v3 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 4096
llama_model_load_internal: n_embd     = 4096
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_head_kv  = 32
llama_model_load_internal: n_layer    = 32
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: n_gqa      = 1
llama_model_load_internal: rnorm_eps  = 1.0e-06
llama_model_load_internal: n_ff       = 11008
llama_model_load_internal: freq_base  = 10000.0
llama_model_load_internal: freq_scale = 1
llama_model_load_internal: ftype      = 7 (mostly Q8_0)
llama_model_load_internal: model size = 7B
llama_model_load_internal: ggml ctx size =    0.08 MB
llama_model_load_internal: mem required  = 7354.73 MB (+ 2048.00 MB per state)
llama_new_context_with_model: kv s

In [17]:
# import QueryBundle
from llama_index import QueryBundle

# import NodeWithScore
from llama_index.schema import NodeWithScore

# Retrievers
from llama_index.retrievers import BaseRetriever, VectorIndexRetriever, KGTableRetriever

from typing import List


class CustomRetriever(BaseRetriever):
    """Custom retriever that performs both Vector search and Knowledge Graph search"""

    def __init__(
        self,
        vector_retriever: VectorIndexRetriever,
        kg_retriever: KGTableRetriever,
        mode: str = "OR",
    ) -> None:
        """Init params."""

        self._vector_retriever = vector_retriever
        self._kg_retriever = kg_retriever
        if mode not in ("AND", "OR"):
            raise ValueError("Invalid mode.")
        self._mode = mode

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Retrieve nodes given query."""

        vector_nodes = self._vector_retriever.retrieve(query_bundle)
        kg_nodes = self._kg_retriever.retrieve(query_bundle)

        vector_ids = {n.node.node_id for n in vector_nodes}
        kg_ids = {n.node.node_id for n in kg_nodes}

        combined_dict = {n.node.node_id: n for n in vector_nodes}
        combined_dict.update({n.node.node_id: n for n in kg_nodes})

        if self._mode == "AND":
            retrieve_ids = vector_ids.intersection(kg_ids)
        else:
            retrieve_ids = vector_ids.union(kg_ids)

        retrieve_nodes = [combined_dict[rid] for rid in retrieve_ids]
        return retrieve_nodes

In [27]:
from llama_index import get_response_synthesizer
from llama_index.query_engine import RetrieverQueryEngine

# create custom retriever
vector_retriever = VectorIndexRetriever(index=vector_index, similarity_top_k = 5)
kg_retriever = KGTableRetriever(
    index=graph_index, retriever_mode="keyword", include_text=True, similarity_top_k = 5
)
custom_retriever = CustomRetriever(vector_retriever, kg_retriever)

# create response synthesizer
response_synthesizer = get_response_synthesizer(
    service_context=service_context,
    response_mode="tree_summarize",
)

In [31]:
custom_query_engine = RetrieverQueryEngine(
    retriever=custom_retriever,
    response_synthesizer=response_synthesizer,
)

vector_query_engine = vector_index.as_query_engine(similarity_top_k=5)

kg_keyword_query_engine = graph_index.as_query_engine(
    # setting to false uses the raw triplets instead of adding the text from the corresponding nodes
    include_text=True,
    retriever_mode="hybrid",
    response_mode="tree_summarize",
    similarity_top_k=5
)

# KG Method

In [30]:
response = kg_keyword_query_engine.query("Who is Dion Wiggins")
display(Markdown(f"<b>{response}</b>"))

Llama.generate: prefix-match hit



My answer:
KEYWORDS: Dion, Wiggins, person, biography


llama_print_timings:        load time = 23650.94 ms
llama_print_timings:      sample time =    12.40 ms /    21 runs   (    0.59 ms per token,  1693.00 tokens per second)
llama_print_timings: prompt eval time = 23439.40 ms /    83 tokens (  282.40 ms per token,     3.54 tokens per second)
llama_print_timings:        eval time =  6253.69 ms /    20 runs   (  312.68 ms per token,     3.20 tokens per second)
llama_print_timings:       total time = 29778.20 ms
Llama.generate: prefix-match hit




Subject: Dion Wiggins
Predicate: has_job
Object: software engineer at Google
Predicate next hop: works_at
Object next hop: Google

Subject: Dion Wiggins
Predicate: has_education
Object: Bachelor of Science in Computer Science from University of California, Los Angeles (UCLA)
Predicate next hop: graduated_from
Object next hop: University of California, Los Angeles (UCLA)


llama_print_timings:        load time = 23650.94 ms
llama_print_timings:      sample time =    56.68 ms /   102 runs   (    0.56 ms per token,  1799.45 tokens per second)
llama_print_timings: prompt eval time = 19676.23 ms /    73 tokens (  269.54 ms per token,     3.71 tokens per second)
llama_print_timings:        eval time = 31389.22 ms /   101 runs   (  310.78 ms per token,     3.22 tokens per second)
llama_print_timings:       total time = 51463.02 ms


<b>

Subject: Dion Wiggins
Predicate: has_job
Object: software engineer at Google
Predicate next hop: works_at
Object next hop: Google

Subject: Dion Wiggins
Predicate: has_education
Object: Bachelor of Science in Computer Science from University of California, Los Angeles (UCLA)
Predicate next hop: graduated_from
Object next hop: University of California, Los Angeles (UCLA)</b>

In [32]:
response.get_formatted_sources

<bound method Response.get_formatted_sources of Response(response='\n\nSubject: Dion Wiggins\nPredicate: has_job\nObject: software engineer at Google\nPredicate next hop: works_at\nObject next hop: Google\n\nSubject: Dion Wiggins\nPredicate: has_education\nObject: Bachelor of Science in Computer Science from University of California, Los Angeles (UCLA)\nPredicate next hop: graduated_from\nObject next hop: University of California, Los Angeles (UCLA)', source_nodes=[NodeWithScore(node=TextNode(id_='d3b9a7cd-763e-4137-946e-219010aa2b34', embedding=None, metadata={'kg_rel_texts': [], 'kg_rel_map': {'My': [], 'KEYWORDS': [], 'answer': [], 'person': [], 'Wiggins': [], 'My answer:\nKEYWORDS: Dion': [], 'biography': [], 'Dion': []}}, excluded_embed_metadata_keys=['kg_rel_map', 'kg_rel_texts'], excluded_llm_metadata_keys=['kg_rel_map', 'kg_rel_texts'], relationships={}, hash='881dae6d82e1071e7c4737eb6d5f9fb2bb7776f2d51672bf75412a3aa6f4668f', text='The following are knowledge triplets in max de

# Vector Method 

In [33]:
response = vector_query_engine.query("Tell me events about Dion Wiggins")
display(Markdown(f"<b>{response}</b>"))

Llama.generate: prefix-match hit


and Philipp Koehn that are relevant to the webinar.





llama_print_timings:        load time = 23650.94 ms
llama_print_timings:      sample time =    10.16 ms /    17 runs   (    0.60 ms per token,  1673.72 tokens per second)
llama_print_timings: prompt eval time = 1039753.10 ms /  3707 tokens (  280.48 ms per token,     3.57 tokens per second)
llama_print_timings:        eval time =  7103.06 ms /    16 runs   (  443.94 ms per token,     2.25 tokens per second)
llama_print_timings:       total time = 1046939.09 ms
Llama.generate: prefix-match hit









llama_print_timings:        load time = 23650.94 ms
llama_print_timings:      sample time =     2.97 ms /     5 runs   (    0.59 ms per token,  1684.07 tokens per second)
llama_print_timings: prompt eval time = 482379.35 ms /  1872 tokens (  257.68 ms per token,     3.88 tokens per second)
llama_print_timings:        eval time =  1411.45 ms /     4 runs   (  352.86 ms per token,     2.83 tokens per second)
llama_print_timings:       total time = 483811.54 ms


<b>



</b>

In [35]:
response.get_formatted_sources

<bound method Response.get_formatted_sources of Response(response='\n\n\n\n', source_nodes=[NodeWithScore(node=TextNode(id_='720a1017-8d9a-40fc-bc92-a6ddc931cac6', embedding=None, metadata={'file_name': 'omniscien.com/resources/webinars.html'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='773551c9-d229-40c0-b40d-41b4567870a4', node_type=None, metadata={'file_name': 'omniscien.com/resources/webinars.html'}, hash='33eba177902eb2a9c79ad4313cce9d9817cd115b14725d0266b8fb241a9ab4cd'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='995c40f4-62b7-440e-91f4-2f489d346293', node_type=None, metadata={'file_name': 'omniscien.com/resources/webinars.html'}, hash='4576a6856c60aec9bf65b6b5665afdc32ea61cfab0dea41eb0b57be89490bd16'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='b5b93aa1-6e8b-4ba5-b819-3fe9a1f742e6', node_type=None, metadata={'file_name': 'omniscien.com/resources/webinars.html'

In [36]:
response

Response(response='\n\n\n\n', source_nodes=[NodeWithScore(node=TextNode(id_='720a1017-8d9a-40fc-bc92-a6ddc931cac6', embedding=None, metadata={'file_name': 'omniscien.com/resources/webinars.html'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='773551c9-d229-40c0-b40d-41b4567870a4', node_type=None, metadata={'file_name': 'omniscien.com/resources/webinars.html'}, hash='33eba177902eb2a9c79ad4313cce9d9817cd115b14725d0266b8fb241a9ab4cd'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='995c40f4-62b7-440e-91f4-2f489d346293', node_type=None, metadata={'file_name': 'omniscien.com/resources/webinars.html'}, hash='4576a6856c60aec9bf65b6b5665afdc32ea61cfab0dea41eb0b57be89490bd16'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='b5b93aa1-6e8b-4ba5-b819-3fe9a1f742e6', node_type=None, metadata={'file_name': 'omniscien.com/resources/webinars.html'}, hash='1982a4dc5a09b9de78763c41f5dfb391fe4c5eb

# Custom query 

In [37]:
response = custom_query_engine.query("Tell me events about Dion Wiggins")
display(Markdown(f"<b>{response}</b>"))

Llama.generate: prefix-match hit



My answer:
KEYWORDS: Dion Wiggins, events, biography, life story, history, achievements, awards, career, personal life.


llama_print_timings:        load time = 23650.94 ms
llama_print_timings:      sample time =    24.05 ms /    36 runs   (    0.67 ms per token,  1497.13 tokens per second)
llama_print_timings: prompt eval time = 46701.33 ms /    86 tokens (  543.04 ms per token,     1.84 tokens per second)
llama_print_timings:        eval time = 35147.42 ms /    35 runs   ( 1004.21 ms per token,     1.00 tokens per second)
llama_print_timings:       total time = 82037.84 ms
Llama.generate: prefix-match hit






































































































































































































































































llama_print_timings:        load time = 23650.94 ms
llama_print_timings:      sample time =   184.05 ms /   257 runs   (    0.72 ms per token,  1396.38 tokens per second)
llama_print_timings: prompt eval time = 1683674.91 ms /  3838 tokens (  438.69 ms per token,     2.28 tokens per second)
llama_print_timings:        eval time = 115811.36 ms /   256 runs   (  452.39 ms per token,     2.21 tokens per second)
llama_print_timings:       total time = 1800574.54 ms
Llama.generate: prefix-match hit




1. What is Dion Wiggins's role at Omniscien Technologies?
2. What are some of the technologies that use Transformer models?
3. How does domain adaptation work in MT and ASR?
4. What are some limitations of traditional approaches to creating training data for AI systems?
5. How can AI be used to create many millions of sentences of training data for domain adaptation?


llama_print_timings:        load time = 23650.94 ms
llama_print_timings:      sample time =    61.06 ms /    95 runs   (    0.64 ms per token,  1555.90 tokens per second)
llama_print_timings: prompt eval time = 440161.19 ms /  1672 tokens (  263.25 ms per token,     3.80 tokens per second)
llama_print_timings:        eval time = 33748.30 ms /    94 runs   (  359.02 ms per token,     2.79 tokens per second)
llama_print_timings:       total time = 474263.37 ms
Llama.generate: prefix-match hit


Answer: 
1. Dion Wiggins is a research scientist at Omniscien Technologies.
2. Some technologies that use Transformer models include chatbots, language translation systems, and text summarization tools.
3. In MT and ASR, domain adaptation involves adapting machine learning models to new domains or languages by using data from the target domain or language to fine-tune the model.
4. Traditional approaches to creating training data for AI systems can be limited by the quality and quantity of available data, as well as the cost and time required to collect and label it.
5. AI can be used to create many millions of sentences of training data for domain adaptation by using natural language processing techniques to generate synthetic data that mimics real-world language use. This can help to augment existing training data and improve the performance of machine learning models in new domains or languages.


llama_print_timings:        load time = 23650.94 ms
llama_print_timings:      sample time =   114.61 ms /   194 runs   (    0.59 ms per token,  1692.77 tokens per second)
llama_print_timings: prompt eval time = 31036.96 ms /   120 tokens (  258.64 ms per token,     3.87 tokens per second)
llama_print_timings:        eval time = 59415.47 ms /   193 runs   (  307.85 ms per token,     3.25 tokens per second)
llama_print_timings:       total time = 91127.95 ms


<b>Answer: 
1. Dion Wiggins is a research scientist at Omniscien Technologies.
2. Some technologies that use Transformer models include chatbots, language translation systems, and text summarization tools.
3. In MT and ASR, domain adaptation involves adapting machine learning models to new domains or languages by using data from the target domain or language to fine-tune the model.
4. Traditional approaches to creating training data for AI systems can be limited by the quality and quantity of available data, as well as the cost and time required to collect and label it.
5. AI can be used to create many millions of sentences of training data for domain adaptation by using natural language processing techniques to generate synthetic data that mimics real-world language use. This can help to augment existing training data and improve the performance of machine learning models in new domains or languages.</b>

In [2]:
query_engine = index_load.as_query_engine(
    include_text=True,
    response_mode="tree_summarize",
    embedding_mode="hybrid",
    similarity_top_k=5,
)
response = query_engine.query(
    "Tell me more about Dion Wiggins",
)


My answer:
KEYWORDS: Dion Wiggins, bio, information, personal details, career, achievements, awards, interests, hobbies, social media.


llama_print_timings:        load time = 23491.94 ms
llama_print_timings:      sample time =    23.18 ms /    39 runs   (    0.59 ms per token,  1682.78 tokens per second)
llama_print_timings: prompt eval time = 23491.88 ms /    87 tokens (  270.02 ms per token,     3.70 tokens per second)
llama_print_timings:        eval time = 11838.98 ms /    38 runs   (  311.55 ms per token,     3.21 tokens per second)
llama_print_timings:       total time = 35496.52 ms
Llama.generate: prefix-match hit


---------------------
Dion Wiggins is a researcher at Omniscien Technologies. His work focuses on developing and applying machine learning techniques to various applications, including natural language processing, computer vision, and recommender systems. He has published several papers in top-tier conferences and journals and has received awards for his contributions to the field.







llama_print_timings:        load time = 23491.94 ms
llama_print_timings:      sample time =    51.09 ms /    80 runs   (    0.64 ms per token,  1565.83 tokens per second)
llama_print_timings: prompt eval time = 476897.76 ms /  1645 tokens (  289.91 ms per token,     3.45 tokens per second)
llama_print_timings:        eval time = 30066.48 ms /    79 runs   (  380.59 ms per token,     2.63 tokens per second)
llama_print_timings:       total time = 507347.48 ms


In [5]:
display(Markdown(f"<b>{response}</b>"))

<b>---------------------
Dion Wiggins is a researcher at Omniscien Technologies. His work focuses on developing and applying machine learning techniques to various applications, including natural language processing, computer vision, and recommender systems. He has published several papers in top-tier conferences and journals and has received awards for his contributions to the field.




</b>

In [6]:
response

Response(response='---------------------\nDion Wiggins is a researcher at Omniscien Technologies. His work focuses on developing and applying machine learning techniques to various applications, including natural language processing, computer vision, and recommender systems. He has published several papers in top-tier conferences and journals and has received awards for his contributions to the field.\n\n\n\n\n', source_nodes=[NodeWithScore(node=TextNode(id_='7e5d38ce-de8a-44fc-9986-484850f1a22b', embedding=None, metadata={'file_name': 'omniscien.com/faq/what-is-real-time-captioning/index.html'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='ba307834-171e-47fd-b2a5-d2c90f8148f6', node_type=None, metadata={'file_name': 'omniscien.com/faq/what-is-real-time-captioning/index.html'}, hash='6891b8ae60fb547d4d1fae9169fc0206f91488d417ec2da3a0b39f52b92b107d'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeI

In [11]:
def check_duplicate(source_list):
    res = []
    for i in source_list:
        if i not in res:
            res.append(i)
    return res

def convert_to_website_format(urls):
    convert_urls = []
    for url in urls:
        # Remove any '.html' at the end of the URL
        url = re.sub(r'\.html$', '', url)
        # Check if the URL starts with 'www.' or 'http://'
        if not re.match(r'(www\.|http://)', url):
            url = 'www.' + url
        if '/index' in url:
            url = url.split('/index')[0] 
        convert_urls.append(url)
    return convert_urls

def regex_source(answer):
    pattern = r"'file_name': '(.*?)'"
    matchs = re.findall(pattern, str(answer))
    convert_urls = convert_to_website_format(matchs)
    res_urls = check_duplicate(source_list=convert_urls)
    return res_urls

urls = regex_source(response.get_formatted_sources)

In [12]:
urls

['www.omniscien.com/faq/what-is-real-time-captioning',
 'www.omniscien.com/blog/speech-recognition-speech-synthesis-glossary-a-g',
 'www.omniscien.com/machine-translation/ways-to-translate']

In [13]:
response.get_formatted_sources

<bound method Response.get_formatted_sources of Response(response='---------------------\nDion Wiggins is a researcher at Omniscien Technologies. His work focuses on developing and applying machine learning techniques to various applications, including natural language processing, computer vision, and recommender systems. He has published several papers in top-tier conferences and journals and has received awards for his contributions to the field.\n\n\n\n\n', source_nodes=[NodeWithScore(node=TextNode(id_='7e5d38ce-de8a-44fc-9986-484850f1a22b', embedding=None, metadata={'file_name': 'omniscien.com/faq/what-is-real-time-captioning/index.html'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='ba307834-171e-47fd-b2a5-d2c90f8148f6', node_type=None, metadata={'file_name': 'omniscien.com/faq/what-is-real-time-captioning/index.html'}, hash='6891b8ae60fb547d4d1fae9169fc0206f91488d417ec2da3a0b39f52b92b107d')