# Cohere Rerank

In [77]:
# !pip install llama-index -qq
import qdrant_client
from datetime import datetime
from llama_index import GPTVectorStoreIndex
from llama_index.readers import JSONReader

from llama_index import GPTVectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.indices.postprocessor import (
    FixedRecencyPostprocessor,
    EmbeddingRecencyPostprocessor
)
from llama_index.node_parser import SimpleNodeParser
from llama_index.storage.docstore import SimpleDocumentStore

In [91]:
import json

# Load the news dataset. Each line of the file is parsed as one JSON object.
# The encoding is given explicitly so the read does not depend on the platform
# default (assumes the dump is UTF-8 — confirm against the dataset source).
with open("../data/News_Category_Dataset_v3.json", "r", encoding="utf-8") as f:
    # Iterate the handle lazily and skip blank lines (e.g. a trailing newline),
    # which json.loads would otherwise reject.
    data = [json.loads(line) for line in f if line.strip()]

# The file is closed at this point; the rest is pure in-memory reshaping.
# pop() removes the field from each record in place and returns its value.
links = [record.pop("link") for record in data]
authors = [record.pop("authors") for record in data]

# Rebind `data` to the trimmed records: one text blob plus the publication
# date, which is used as the file name when the records are dumped to disk.
data = [
    {
        "text": f"{record['headline']} under the category: {record['category']}\n {record['short_description']}",
        "date": record["date"],
    }
    for record in data
]

In [96]:
from pathlib import Path
# Convenience helper patched onto Path: list the entries of a directory.
# It is used again when the dumped files are handed to the directory reader.
Path.ls = lambda x: list(x.iterdir())

# Absolute target directory for the per-record text files; created on demand.
write_dir = Path("../data/dump").resolve()
write_dir.mkdir(exist_ok=True, parents=True)

# NOTE(review): the file name is derived from the record's date alone, and the
# file is truncated on every write. If several records share a date, only the
# last one written survives on disk. The name is left unchanged here because
# get_file_metadata recovers the date from the file stem.
for element in data:
    file_path = write_dir / f"{element['date']}.txt"
    # Write as UTF-8 explicitly: the text may contain non-ASCII characters that
    # a platform-default encoding cannot represent.
    file_path.write_text(element["text"], encoding="utf-8")

In [98]:
# write_dir.ls()
# file_path.stem

'2012-01-28'

In [106]:
# load documents
from llama_index.storage.storage_context import StorageContext


def get_file_metadata(file_name: str):
    """Build the metadata dict attached to a document loaded from ``file_name``.

    The dump step names each file ``<date>.txt``, so the file name without its
    directory and extension is reported as the document's date.

    Args:
        file_name: Path to the file, as a string.

    Returns:
        A dict with a single ``"date"`` key holding the file's stem.
    """
    date_stem = Path(file_name).stem
    return {"date": date_stem}

# Read every dumped file into a document. get_file_metadata is called per file
# and tags each document with the date taken from its file name.
reader = SimpleDirectoryReader(
    input_files=write_dir.ls(),
    file_metadata=get_file_metadata,
)
documents = reader.load_data()

# Service context built from the library defaults, overriding only
# chunk_size_limit (presumably the maximum chunk size used when splitting
# documents into nodes — confirm against the llama_index docs).
service_context = ServiceContext.from_defaults(chunk_size_limit=512)

In [107]:
# Spot-check one loaded document: its text and the date stored in extra_info.
documents[1]

Document(text='Mozambique Devises National Plan To End Child Marriage under the category: IMPACT\n The institution affects nearly 1 in 2 girls in the African nation.', doc_id='08d3aae1-bd9a-4131-b585-f16763ebc87e', embedding=None, doc_hash='36789ba23876ca964ccba0d40257174f4ecbd975def60266e18eda045ffe6c22', extra_info={'date': '2016-04-13'})

In [108]:
# Split the loaded documents into nodes with the node parser that the
# service context carries.
node_parser = service_context.node_parser
nodes = node_parser.get_nodes_from_documents(documents)

In [109]:
# Create an empty document store and register every parsed node in it.
# The store is wrapped in a storage context in the next cell.
docstore = SimpleDocumentStore()
docstore.add_documents(nodes)

In [110]:
# Storage context backed by the docstore populated above.
# NOTE(review): no later cell visible in this notebook passes storage_context
# to the index — confirm whether it is still needed.
storage_context = StorageContext.from_defaults(docstore=docstore)

In [113]:
# Keep only the first 51 documents (presumably to keep the index build below
# quick — confirm). This rebinds `documents`; `nodes` and `docstore` were
# built from the full list before this cell and are not affected.
documents = documents[:51]

In [114]:
# Spot-check the first document of the truncated list.
documents[0]

Document(text="'Hero Of The Year' Reunites Soldiers With Stray Dogs They Befriended Overseas under the category: IMPACT\n ", doc_id='c41de604-3491-4455-bc8b-2040bfe5a4a7', embedding=None, doc_hash='5c8e684e7fe149aafbdd18b25ee59ce336fdef3d23a417e0f25b08cdd8a19a7a', extra_info={'date': '2014-12-08'})

In [115]:
%%time
# Build the vector index over the truncated document list.
# NOTE(review): only `documents` is passed, so the `service_context` defined
# earlier is not used for this index — confirm this is intended.
index = GPTVectorStoreIndex.from_documents(documents=documents)

CPU times: user 80 ms, sys: 18.5 ms, total: 98.5 ms
Wall time: 4.96 s


In [123]:
# Hard-coded duration in seconds; it is not measured anywhere in this
# notebook (presumably copied from an earlier full indexing run — confirm).
elapsed_seconds = 2318
throughput = len(data) / elapsed_seconds
f"{throughput:6f} docs per second"

'90.391286 docs per second'

## Define Recency Postprocessors

In [125]:
# Recency postprocessor built with the notebook's service context; it is
# applied after Cohere Rerank in the query engine configured further down.
recency_postprocessor = FixedRecencyPostprocessor(service_context=service_context)

In [126]:
# Second recency postprocessor (EmbeddingRecencyPostprocessor), built with the
# same service context.
# NOTE(review): no later cell visible in this notebook references this object.
node_postprocessor_emb = EmbeddingRecencyPostprocessor(service_context=service_context)

### Retrieve top 3 most relevant nodes, then filter with Cohere Rerank (top 2) and the recency postprocessor

In [127]:
# !pip install cohere

In [128]:
import os
from llama_index.indices.postprocessor.cohere_rerank import CohereRerank


# Read the Cohere key from the environment so it is never hard-coded in the
# notebook; this raises KeyError if COHERE_API_KEY is not set.
api_key = os.environ["COHERE_API_KEY"]
# top_n=2: presumably the number of nodes kept after reranking — confirm
# against the CohereRerank documentation.
cohere_rerank = CohereRerank(api_key=api_key, top_n=2)

In [129]:
# Retrieve the 3 most similar nodes, then run the postprocessors in list
# order: Cohere Rerank first, the fixed recency postprocessor second.
postprocessors = [cohere_rerank, recency_postprocessor]
query_engine = index.as_query_engine(
    similarity_top_k=3,
    node_postprocessors=postprocessors,
)

response = query_engine.query("Who won the latest Superbowl?")

In [122]:
# Show the answer produced by the reranked query.
print(response)


The latest Superbowl was won by the Kansas City Chiefs in 2020.


### Directly retrieve top 2 most similar nodes

In [74]:
# Baseline without any postprocessor: take the 2 most similar nodes as-is.
query_engine = index.as_query_engine(similarity_top_k=2)

response = query_engine.query("What did Sam Altman do in this essay?")

The retrieved context is irrelevant and the response is hallucinated.

In [75]:
# Show the answer produced by the baseline (no rerank) query.
print(response)


In this essay, Sam Altman was interviewed about his place within the gay community and the Black queer community. He discussed his views on the importance of self-awareness and collaboration in leadership.
