In [1]:
# !pip install llama-index -qq
import qdrant_client
from datetime import datetime
from llama_index import GPTVectorStoreIndex

from llama_index.vector_stores.qdrant import QdrantVectorStore
from pathlib import Path
from llama_index import GPTVectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.indices.postprocessor import (
    FixedRecencyPostprocessor,
    EmbeddingRecencyPostprocessor,
)
# load documents
from llama_index.storage.storage_context import StorageContext
def _ls(self):
    """Return the direct children of this directory as a list of paths."""
    return [child for child in self.iterdir()]


# Notebook convenience: expose the helper as `Path.ls()` on every Path object.
Path.ls = _ls

In [2]:
# In-memory Qdrant: fast and light-weight for experiments, with no Qdrant
# deployment needed (requires qdrant-client >= 1.1.1).
# To target a running instance instead, pass url="http://<host>:<port>",
# plus api_key="<qdrant-api-key>" for Qdrant Cloud.
client = qdrant_client.QdrantClient(location=":memory:")

In [3]:
import json

# The dataset is parsed one JSON object per line. JSON text is UTF-8, so the
# encoding is stated explicitly instead of relying on the platform default.
with open("../data/News_Category_Dataset_v3.json", "r", encoding="utf-8") as f:
    # Iterate the file lazily and skip blank lines, which json.loads rejects.
    records = [json.loads(line) for line in f if line.strip()]

# Side lists kept for reference; read without mutating the parsed records.
links = [record["link"] for record in records]
authors = [record["authors"] for record in records]

# Reduce each article to the text that gets indexed plus its publication date.
data = [
    {
        "text": f"{record['headline']} under the category: {record['category']}\n {record['short_description']}",
        "date": record["date"],
    }
    for record in records
]

In [4]:
write_dir = Path("../data/dump").resolve()
write_dir.mkdir(exist_ok=True, parents=True)

# Files are named after the article date, so articles sharing a date map to
# the same file and only the last one survives. Collapse in memory first
# (later entries replace earlier ones) so each file is written exactly once
# instead of being rewritten for every article.
# NOTE(review): this keeps one article per date — confirm that is intended.
text_by_date = {element["date"]: element["text"] for element in data}
for date, text in text_by_date.items():
    (write_dir / f"{date}.txt").write_text(text)

# The next cell prints `file_path.stem`; keep it pointing at the file of the
# last article in `data`, as the original per-article loop left it.
if data:
    file_path = write_dir / f"{data[-1]['date']}.txt"

In [5]:
# Sanity check: last entry of the dump listing next to the stem of `file_path`.
last_listed = write_dir.ls()[-1]
print(last_listed, file_path.stem)

/Users/nirantk/Desktop/wip/data/dump/2014-12-11.txt 2012-01-28


In [6]:
def get_file_metadata(file_name: str) -> dict:
    """Build the metadata dict for one file.

    The file name without its directory and final suffix is returned under
    the "date" key.
    """
    date_stem = Path(file_name).stem
    return {"date": date_stem}


# Read every file in the dump directory; get_file_metadata supplies the
# per-file metadata (the date taken from the file name).
dump_reader = SimpleDirectoryReader(
    file_metadata=get_file_metadata,
    input_files=write_dir.ls(),
)
documents = dump_reader.load_data()

# Service context (wrapper container around current classes), created with a
# chunk size limit of 512.
service_context = ServiceContext.from_defaults(chunk_size_limit=512)

In [7]:
# Display one loaded document to inspect its text and attached metadata.
documents[1]

Document(text='Mozambique Devises National Plan To End Child Marriage under the category: IMPACT\n The institution affects nearly 1 in 2 girls in the African nation.', doc_id='637edd94-0f9d-4350-9223-3f6d89297046', embedding=None, doc_hash='36789ba23876ca964ccba0d40257174f4ecbd975def60266e18eda045ffe6c22', extra_info={'date': '2016-04-13'})

In [8]:
# len(documents)

In [9]:
# use node parser in service context to parse into nodes
nodes = service_context.node_parser.get_nodes_from_documents(documents)
vector_store = QdrantVectorStore(client=client, collection_name="huffpostnews")

In [10]:
%%time
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = GPTVectorStoreIndex.from_documents(documents, storage_context=storage_context)

## Define Recency Postprocessors

In [11]:
# Recency postprocessor built on the shared service context.
# NOTE(review): presumably it orders nodes by the "date" metadata attached at
# load time — confirm the postprocessor's default date key.
recency_postprocessor = FixedRecencyPostprocessor(service_context=service_context)

In [12]:
# Embedding-based recency postprocessor built on the shared service context.
# NOTE(review): not passed to any query engine in the cells visible here —
# confirm it is still needed.
node_postprocessor_emb = EmbeddingRecencyPostprocessor(service_context=service_context)

#### Retrieve top 10 most relevant nodes, then filter with Cohere Rerank

In [13]:
# !pip install cohere

In [14]:
import os
from llama_index.indices.postprocessor.cohere_rerank import CohereRerank

# The API key is read from the environment (KeyError if COHERE_API_KEY is
# unset) rather than hard-coded; the reranker is configured with top_n=2.
cohere_rerank = CohereRerank(
    top_n=2,
    api_key=os.environ["COHERE_API_KEY"],
)

In [18]:
# "Recency" engine: fetch the 10 most similar nodes, then apply only the
# recency postprocessor.
recency_query_engine = index.as_query_engine(
    node_postprocessors=[recency_postprocessor],
    similarity_top_k=10,
)

In [19]:
# "Reranking" engine: fetch the 10 most similar nodes, then apply only the
# Cohere reranker. The recency postprocessor is deliberately left out here:
# with both postprocessors this engine would be identical to `query_engine`
# (the "Both" case) and the comparison would not isolate reranking.
reranking_query_engine = index.as_query_engine(
    similarity_top_k=10,
    node_postprocessors=[cohere_rerank],
)

In [15]:
# "Both" engine: fetch the 10 most similar nodes, run the Cohere reranker
# first and the recency postprocessor second.
query_engine = index.as_query_engine(
    node_postprocessors=[cohere_rerank, recency_postprocessor],
    similarity_top_k=10,
)

In [25]:
question = "Who is the current US President?"

# Ask every engine the same question and print each answer under its label,
# in the order: both postprocessors, recency only, reranking only.
engines_by_label = {
    "Both:": query_engine,
    "Recency:": recency_query_engine,
    "Reranking:": reranking_query_engine,
}
for label, engine in engines_by_label.items():
    response = engine.query(question)
    print(label, response)

Both: 
The current US President is Barack Obama.
Recency: 
The current US President is Joe Biden.
Reranking: 
The current US President is Barack Obama.
