In [1]:
import os
# Blank out the CA bundle path that HTTP clients read from the environment.
# NOTE(review): an empty CURL_CA_BUNDLE is a known workaround that makes older
# `requests` releases skip TLS certificate verification — confirm this is
# intentional and only ever used on a trusted network; prefer pointing the
# variable at the correct CA bundle instead.
os.environ['CURL_CA_BUNDLE'] = ""

In [2]:
import requests
# Echo the installed `requests` version as the cell output.
# NOTE(review): presumably recorded because the CURL_CA_BUNDLE override set in
# the first cell depends on the requests release in use — confirm.
requests.__version__

'2.27.0'

In [3]:
import logging
import sys

from llama_index.callbacks import CallbackManager, LlamaDebugHandler
from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import messages_to_prompt, completion_to_prompt
from llama_index.embeddings import HuggingFaceEmbedding

import os
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index import StorageContext, load_index_from_storage

from llama_index.node_parser import SimpleNodeParser
from llama_index import VectorStoreIndex, set_global_service_context
from llama_index.evaluation import DatasetGenerator, RelevancyEvaluator

from IPython.display import Markdown, display
from llama_index.prompts import PromptTemplate

from accelerate import Accelerator

from llama_index.vector_stores.types import ExactMatchFilter, MetadataFilters, MetadataFilter
from llama_index.vector_stores import FilterOperator, FilterCondition


import os
import re
import glob
import fitz


In [None]:
# Route log records to stdout so they appear in the notebook output.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)  # Change INFO to DEBUG if you want more extensive logging
# NOTE(review): together with basicConfig() above this may attach a second
# stdout handler to the root logger (duplicated log lines) — confirm it is wanted.
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

# Debug handler that prints a timing trace at the end of every llama_index operation.
llama_debug = LlamaDebugHandler(print_trace_on_end=True)
callback_manager = CallbackManager([llama_debug])

llm = LlamaCPP(
    # Local quantised Llama-2 chat model in GGUF format (path is relative to the notebook).
    model_path=r"llama-2-7b-chat.Q2_K.gguf",

    temperature=0.1,
    max_new_tokens=1024,

    # llama2 has a context window of 4096 tokens, but we set it lower to allow for some wiggle room.
    # (The previous value of 5000 exceeded the model's window instead of staying below it.)
    context_window=3900,  # note, this sets n_ctx in the model_kwargs below, so you don't need to pass it there.

    # kwargs to pass to __call__()
    # The repetition penalty is a sampling option, so it belongs here; llama.cpp
    # names it `repeat_penalty`.
    generate_kwargs={"repeat_penalty": 1.5},

    # kwargs to pass to __init__()
    # set to at least 1 to use GPU
    model_kwargs={"n_gpu_layers": 100},

    # transform inputs into Llama2 format
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    # verbose=False,  # uncomment to silence llama.cpp timing output
)

In [11]:
# Pass the LLM wrapper through Hugging Face Accelerate.
# NOTE(review): Accelerator.prepare() is documented for torch models, optimizers,
# dataloaders and schedulers; a LlamaCPP wrapper is presumably returned unchanged,
# which would make this cell a no-op — confirm and consider removing it.
accelerator = Accelerator()
llm = accelerator.prepare(llm)

In [None]:
# Embedding model used to vectorise chunks and queries.
# NOTE(review): device=0 presumably selects the first GPU, and the cache folder
# is rooted at the drive root ("\cache_temp") — confirm both are intended.
embeddings = HuggingFaceEmbedding(
    device=0,
    cache_folder=r"\cache_temp",
)

# chunk_size - the size of the chunks (nodes) that documents are broken into
# when they are indexed by LlamaIndex.
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embeddings,
    chunk_size=1024,
    callback_manager=callback_manager,
)

# Register the context globally so later index / query calls pick it up by default.
set_global_service_context(service_context)


In [13]:
%%time
# Smoke test: run one plain completion through the local model and show the
# generated text; %%time reports how long the call takes.
llm.complete('Tell me something about color Red').text


llama_print_timings:        load time =    4917.89 ms
llama_print_timings:      sample time =      21.77 ms /    91 runs   (    0.24 ms per token,  4181.02 tokens per second)
llama_print_timings: prompt eval time =    4917.61 ms /    71 tokens (   69.26 ms per token,    14.44 tokens per second)
llama_print_timings:        eval time =   23239.14 ms /    90 runs   (  258.21 ms per token,     3.87 tokens per second)
llama_print_timings:       total time =   28523.14 ms /   161 tokens


CPU times: total: 27.2 s
Wall time: 28.5 s


'  Of course! The color red is a vibrant and powerful hue that is often associated with energy, passion, and emotion. It is often used to represent love, anger, and excitement. Red is also a warm and inviting color that can evoke feelings of comfort and warmth. In many cultures, red is considered a symbol of good luck and prosperity. Did you have any other questions about the color red?'

In [42]:
transcript_directory = r"Sandy_blogspot_pdf"


def filename_fn(filename):
    """Build the metadata attached to every chunk of a document from its file name.

    Files are expected to be named ``<post title>__<post year>.<ext>``.

    Args:
        filename: Path (or bare name) of the file being loaded.

    Returns:
        A dict with ``post_title`` (text before the first ``__``) and
        ``post_year`` (text between the first and second ``__``). When the name
        contains no ``__`` separator, ``post_year`` is ``'unknown'`` instead of
        raising ``IndexError``.
    """
    # Strip the directory and the extension once, then split on the separator.
    stem = os.path.splitext(os.path.basename(filename))[0]
    parts = stem.split('__')
    return {
        'post_title': parts[0],
        'post_year': parts[1] if len(parts) > 1 else 'unknown',
    }
# Load every file in the transcript directory; each file's path doubles as its
# document id and filename_fn supplies the extra per-file metadata.
documents = SimpleDirectoryReader(
    transcript_directory,
    file_metadata=filename_fn,
    filename_as_id=True,
).load_data()

In [44]:
# Spot-check the metadata attached to one loaded document (index 12 is just a sample).
documents[12].metadata
# ----------------------
# Expected output:
# {'page_label': '1',
#  'file_name': 'Sandy_blogspot_pdf\\A decade of transition__2020.pdf',
#  'post_title': 'A decade of transition',
#  'post_year': '2020'}

{'page_label': '1',
 'file_name': 'Sandy_blogspot_pdf\\A decade of transition__2020.pdf',
 'post_title': 'A decade of transition',
 'post_year': '2020'}

In [47]:
# Exclude 'post_title' from the metadata shown to the LLM, meaning it won't read
# it when generating a response.
# Future - consider looping over documents and setting the id_ to basename, instead of fullpath
for document in documents:
    # Guard against duplicates so re-running this cell does not keep appending the key.
    if 'post_title' not in document.excluded_llm_metadata_keys:
        document.excluded_llm_metadata_keys.append('post_title')

# Split the documents into nodes of up to 600 tokens with a 50-token overlap.
parser = SimpleNodeParser.from_defaults(chunk_size=600, chunk_overlap=50)
pdf_nodes = parser.get_nodes_from_documents(documents)

# Embed the nodes and build the vector index (uses the global service context).
index = VectorStoreIndex(nodes=pdf_nodes)

**********
Trace: index_construction
    |_embedding ->  2.540562 seconds
    |_embedding ->  2.269369 seconds
    |_embedding ->  2.20695 seconds
    |_embedding ->  2.264154 seconds
    |_embedding ->  2.350412 seconds
    |_embedding ->  1.731686 seconds
    |_embedding ->  1.939932 seconds
    |_embedding ->  2.271608 seconds
    |_embedding ->  2.352095 seconds
    |_embedding ->  2.387446 seconds
    |_embedding ->  2.258914 seconds
    |_embedding ->  2.272538 seconds
    |_embedding ->  2.272615 seconds
    |_embedding ->  2.328629 seconds
    |_embedding ->  2.282732 seconds
    |_embedding ->  2.282607 seconds
    |_embedding ->  2.269877 seconds
    |_embedding ->  2.337535 seconds
    |_embedding ->  2.348576 seconds
    |_embedding ->  2.297429 seconds
    |_embedding ->  2.241221 seconds
    |_embedding ->  2.13362 seconds
**********


In [48]:
# Show the post title parsed from the file name for two sample documents.
for sample_idx in (0, 12):
    print(documents[sample_idx].metadata['post_title'])

150 kms in 15 days
A decade of transition


In [49]:
# Compare how many documents were loaded with how many nodes the parser produced.
print(f'Number of documents:{len(documents)}')
print(f'Number of nodes:{len(pdf_nodes)}')

Number of documents:196
Number of nodes:218


In [50]:
# Exact-match filter: keep only chunks whose 'post_title' equals the given title.
title_match = ExactMatchFilter(key="post_title", value='A decade of transition')
filters = MetadataFilters(filters=[title_match])

retriever = index.as_retriever(filters=filters)
# For this post the query text itself is not critical.
docs = retriever.retrieve("Marathon running")

# Print the metadata of every retrieved chunk.
for hit in docs:
    print(f"title:{hit.metadata['post_title']}, Year:{hit.metadata['post_year']}")

**********
Trace: query
    |_retrieve ->  0.022596 seconds
      |_embedding ->  0.020071 seconds
**********
title:A decade of transition, Year:2020
title:A decade of transition, Year:2020


In [54]:
# Exact-match filter: keep only chunks whose 'post_year' equals '2013'.
year_match = ExactMatchFilter(key="post_year", value='2013')
filters = MetadataFilters(filters=[year_match])

retriever = index.as_retriever(filters=filters)
# For this post the query text itself is not critical.
docs = retriever.retrieve("Marathon running")

# Print the metadata of every retrieved chunk.
for hit in docs:
    print(f"title:{hit.metadata['post_title']}, Year:{hit.metadata['post_year']}")

**********
Trace: query
    |_retrieve ->  0.026319 seconds
      |_embedding ->  0.02432 seconds
**********
title:Congrats Sandeep !!!, Year:2013
title:Night before the Ride, Year:2013


In [55]:
# Two exact-match filters passed together: one on the year, one on the title.
year_match = ExactMatchFilter(key="post_year", value='2013')
title_match = ExactMatchFilter(key="post_title", value='Night before the Ride')
filters = MetadataFilters(filters=[year_match, title_match])

retriever = index.as_retriever(filters=filters)
# For this post the query text itself is not critical.
docs = retriever.retrieve("Marathon running")

# Print the metadata of every retrieved chunk.
for hit in docs:
    print(f"title:{hit.metadata['post_title']}, Year:{hit.metadata['post_year']}")

**********
Trace: query
    |_retrieve ->  0.025634 seconds
      |_embedding ->  0.024137 seconds
**********
title:Night before the Ride, Year:2013


In [56]:
# Same title filter as before, expressed with MetadataFilter instead of ExactMatchFilter.
title_filter = MetadataFilter(key="post_title", value="A decade of transition")
filters = MetadataFilters(filters=[title_filter])

retriever = index.as_retriever(filters=filters)
# For this post the query text itself is not critical.
docs = retriever.retrieve("Marathon running")

# Print the metadata of every retrieved chunk.
for hit in docs:
    print(f"title:{hit.metadata['post_title']}, Year:{hit.metadata['post_year']}")

**********
Trace: query
    |_retrieve ->  0.030039 seconds
      |_embedding ->  0.02754 seconds
**********
title:A decade of transition, Year:2020
title:A decade of transition, Year:2020


In [57]:
# Same year filter as before, expressed with MetadataFilter instead of ExactMatchFilter.
year_filter = MetadataFilter(key="post_year", value="2013")
filters = MetadataFilters(filters=[year_filter])

retriever = index.as_retriever(filters=filters)
# For this post the query text itself is not critical.
docs = retriever.retrieve("Marathon running")

# Print the metadata of every retrieved chunk.
for hit in docs:
    print(f"title:{hit.metadata['post_title']}, Year:{hit.metadata['post_year']}")

**********
Trace: query
    |_retrieve ->  0.027453 seconds
      |_embedding ->  0.024355 seconds
**********
title:Congrats Sandeep !!!, Year:2013
title:Night before the Ride, Year:2013


In [61]:
# Combine a year filter and a title filter with an explicit AND condition.
year_filter = MetadataFilter(key="post_year", value="2013")
title_filter = MetadataFilter(key="post_title", value="Night before the Ride")
filters = MetadataFilters(
    filters=[year_filter, title_filter],
    condition=FilterCondition.AND,
)

retriever = index.as_retriever(filters=filters)
# For this post the query text itself is not critical.
docs = retriever.retrieve("Marathon running")

# Print the metadata of every retrieved chunk.
for hit in docs:
    print(f"title:{hit.metadata['post_title']}, Year:{hit.metadata['post_year']}")

**********
Trace: query
    |_retrieve ->  0.02564 seconds
      |_embedding ->  0.024114 seconds
**********
title:Night before the Ride, Year:2013


In [64]:
# Combine a year filter and a title filter with an OR condition.
# NOTE(review): the output recorded for this cell is identical to the AND cell
# (a single 'Night before the Ride' hit), whereas the year-only filter returned
# two posts. Presumably the default in-memory vector store ignores `condition`
# and always ANDs the filters — verify before relying on OR semantics here.
filters = MetadataFilters(
    filters=[
        MetadataFilter(key="post_year", value="2013"),
        MetadataFilter(key="post_title", value="Night before the Ride"),
    ],
    # Use the enum (as the AND cell does) rather than the raw string 'or'.
    condition=FilterCondition.OR,
)

retriever = index.as_retriever(filters=filters)
docs = retriever.retrieve("Marathon running")  # for this post - the prompt I give here is not critical

# Print the metadata of every retrieved chunk.
for hit in docs:
    print("title:" + hit.metadata['post_title'] + ", Year:" + hit.metadata['post_year'])

**********
Trace: query
    |_retrieve ->  0.031205 seconds
      |_embedding ->  0.029205 seconds
**********
title:Night before the Ride, Year:2013


In [66]:
# Build one equality filter per year and OR them together.
# NOTE(review): the list holds '2015' twice, so this is effectively a single
# filter; if two different years were intended, check first that the vector
# store in use honours the OR condition.
year_filters = [
    MetadataFilter(
        key='post_year',
        value=year,  # the values are years, so name the loop variable accordingly
        operator=FilterOperator.EQ,
    )
    for year in ['2015', '2015']
]

# Keep the list and the MetadataFilters wrapper under separate names instead of
# rebinding `filters` to two different kinds of object.
filters = MetadataFilters(filters=year_filters, condition=FilterCondition.OR)

retriever = index.as_retriever(filters=filters)
docs = retriever.retrieve("Marathon running")

# Print the metadata of every retrieved chunk.
for hit in docs:
    print("title:" + hit.metadata['post_title'] + ", Year:" + hit.metadata['post_year'])

**********
Trace: query
    |_retrieve ->  0.032103 seconds
      |_embedding ->  0.029321 seconds
**********
title:First Half Marathon + 2015 Resolutions, Year:2015
title:Reflecting on First Half of 2015, Year:2015


In [14]:
# Disabled example: OR-combining a title filter with a page-label filter.
# This cell contains no executable code, so the query trace shown beneath it
# cannot have been produced by the cell as it stands.
# filters = MetadataFilters(
#     filters=[
#         MetadataFilter(key="post_title", value="150 kms in 15 days"),
#         MetadataFilter(key="page_label", value="1"),
#     ],
#     condition=FilterCondition.OR,
# )

**********
Trace: query
    |_retrieve ->  0.010475 seconds
      |_embedding ->  0.010475 seconds
**********


In [69]:
%%time

filters = MetadataFilters(
    filters=[
        MetadataFilter(key="post_year", value="2017"),
    ],
)
query_engine = index.as_query_engine(service_context=service_context,
                                     similarity_top_k=5,
                                         filters = filters,
                                       response_mode='tree_summarize')
    
response = query_engine.query("Summarise the theme of all documents")
print(response)

print('\n Metadata')


for i in range(len(docs)):
    print("title:"+docs[i].metadata['post_title']+", Year:"+docs[i].metadata['post_year'])

Llama.generate: prefix-match hit

llama_print_timings:        load time =    4917.89 ms
llama_print_timings:      sample time =      55.65 ms /   213 runs   (    0.26 ms per token,  3827.77 tokens per second)
llama_print_timings: prompt eval time =   62716.04 ms /  1486 tokens (   42.20 ms per token,    23.69 tokens per second)
llama_print_timings:        eval time =  156119.55 ms /   212 runs   (  736.41 ms per token,     1.36 tokens per second)
llama_print_timings:       total time =  219963.28 ms /  1698 tokens


**********
Trace: query
    |_query ->  220.005537 seconds
      |_retrieve ->  0.032903 seconds
        |_embedding ->  0.028785 seconds
      |_synthesize ->  219.972634 seconds
        |_templating ->  0.0 seconds
        |_llm ->  219.966011 seconds
**********
  Based on the information provided in the three documents, the theme that emerges is the importance of self-motivation and self-belief. The author of the first document made a promise to themselves to never doubt their capabilities and to always remember this promise when facing challenges. The author of the second document also made a promise to themselves to never give up on their dreams and to keep pushing themselves, despite feeling discouraged or defeated. The author of the third document is currently trying to accomplish a task that they have been struggling with for some time, and they are using their own experiences and the power of self-belief to motivate themselves and push through their challenges.
Overall, the th