# Semantic Search and Retrieval-Augmented Generation

## Dense Retrieval

In [1]:
import cohere # semantic search library
import numpy as np
import pandas as pd 
from tqdm import tqdm
from dotenv import load_dotenv
import os

# Pull settings from a .env file into the process environment so the API key
# does not have to be written into the notebook.
load_dotenv()
api_key = os.environ.get("COHERE_API_KEY")
# Shared Cohere client; the embed / rerank / chat cells below all go through it.
co = cohere.Client(api_key)

In [2]:
text = """
    Interstellar is a 2014 epic science fiction film directed by Christopher Nolan, who co-wrote the screenplay with his brother Jonathan Nolan. It features an ensemble cast led by Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, and Michael Caine. Set in a dystopian future where Earth is suffering from catastrophic blight and famine, the film follows a group of astronauts who travel through a wormhole near Saturn in search of a new home for mankind.
    The screenplay had its origins in a script that Jonathan had developed in 2007 and was originally set to be directed by Steven Spielberg. Theoretical physicist Kip Thorne was an executive producer and scientific consultant on the film, and wrote the tie-in book The Science of Interstellar. It was Lynda Obst's final film as producer before her death. Cinematographer Hoyte van Hoytema shot it on 35 mm film in the Panavision anamorphic format and IMAX 70 mm. Filming began in late 2013 and took place in Alberta, Klaustur, and Los Angeles. Interstellar uses extensive practical and miniature effects, and the company DNEG created additional visual effects.
    Interstellar premiered at the TCL Chinese Theatre on October 26, 2014, and was released in theaters in the United States on November 5, and in the United Kingdom on November 7. In the United States, it was first released on film stock, expanding to venues using digital projectors. It was a commercial success, grossing $681 million worldwide during its initial theatrical run, and $771 million worldwide with subsequent releases, making it the 10th-highest-grossing film of 2014. The film received generally positive reviews from critics. Among its various accolades, Interstellar was nominated for five awards at the 87th Academy Awards, winning Best Visual Effects."""
# Naive sentence segmentation: cut on every '.', trim the spaces/newlines left
# around each fragment, and keep only fragments that are non-empty after trimming
# (the piece following the final '.' is empty and gets dropped).
texts = [
    sentence
    for sentence in (fragment.strip(' \n') for fragment in text.split('.'))
    if sentence
]

In [3]:
# Embed every sentence in a single request. "search_document" marks these as the
# corpus side of a search; queries are embedded with "search_query" further down.
response = co.embed(texts=texts, input_type="search_document").embeddings
# Stack the per-sentence vectors into one (num_sentences, embed_dim) matrix.
embeds = np.array(response)
print(embeds.shape)

(14, 4096)


In [4]:
# Search index = in-memory storage for the sentence embeddings.
# Install with: conda install -c conda-forge faiss-cpu
import faiss

dim = embeds.shape[1]           # width of one embedding vector
index = faiss.IndexFlatL2(dim)  # flat index compared by L2 distance
print(index.is_trained)
# The vectors are handed to FAISS as float32.
index.add(embeds.astype(np.float32))

True


In [5]:
# Show the index object's repr to confirm which FAISS index type was created.
print(index)

<faiss.swigfaiss.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x792c944c3450> >


In [6]:
def search(query, number_of_results=3):
    """Return the sentences in `texts` that are closest to `query` in the FAISS index.

    The query is embedded with Cohere (input_type="search_query") and looked up
    in `index`, which holds the sentence embeddings.

    Args:
        query: Search text.
        number_of_results: Maximum number of neighbours to return.

    Returns:
        pandas.DataFrame with columns 'texts' (the matched sentence) and
        'distance' (distance reported by the index; smaller means closer).
        It has fewer than `number_of_results` rows when the index stores
        fewer vectors than requested.
    """
    query_embed = co.embed(
        texts=[query],
        input_type="search_query"
    ).embeddings[0]  # one flat list of floats for the single query
    query_embed = np.float32([query_embed])  # 2-D float32 batch of one query: (1, dim)
    # Both outputs have shape (1, number_of_results); row 0 belongs to our only query.
    distances, similar_item_ids = index.search(query_embed, number_of_results)
    neighbor_ids = similar_item_ids[0]
    # When fewer vectors are stored than were requested, FAISS pads the ids with -1.
    # Indexing `texts` with -1 would silently return the LAST sentence as a bogus
    # neighbour, so keep only the slots that hold a real id.
    found = neighbor_ids >= 0
    texts_np = np.array(texts)  # array form allows indexing with an array of ids
    results = pd.DataFrame(data={
        'texts': texts_np[neighbor_ids[found]],
        'distance': distances[0][found],
    })

    print(f"Query:'{query}'\nNearest neighbors:")
    return results
    
# Run a sample question through the dense (embedding-based) search.
query = "how precise was the science"
results = search(query)
# Bare last expression so the notebook renders the returned DataFrame.
results

Query:'how precise was the science'
Nearest neighbors:


Unnamed: 0,texts,distance
0,Interstellar uses extensive practical and mini...,11992.240234
1,Theoretical physicist Kip Thorne was an execut...,12403.667969
2,Cinematographer Hoyte van Hoytema shot it on 3...,12689.907227


In [7]:
# compare with a keyword search algorithm called BM25
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction import _stop_words
import string 

def bm25_tokenizer(text):
    """Turn a piece of text into the keyword tokens used for BM25 scoring.

    The text is lower-cased and split on whitespace, and each word loses its
    leading/trailing punctuation. Words that end up empty, or that appear in
    scikit-learn's English stop-word list, are discarded.

    Returns:
        list[str]: the surviving words in their original order,
        e.g. ["precise", "science"].
    """
    words = (raw.strip(string.punctuation) for raw in text.lower().split())
    # The emptiness check comes first so the stop-word lookup only sees real words.
    return [word for word in words if word and word not in _stop_words.ENGLISH_STOP_WORDS]

In [8]:
# Tokenize each of the 14 sentences on its own: the result holds one list of
# keyword tokens per sentence, and that list of lists is what BM25Okapi is built from.
tokenized_scopus = [bm25_tokenizer(passage) for passage in tqdm(texts)]
bm25 = BM25Okapi(tokenized_scopus)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 72944.42it/s]


In [9]:
def keyword_search(query, top_k=3, num_candidates=5):
    """Print the best BM25 (lexical) matches for `query` among the sentences in `texts`.

    Args:
        query: Search text; tokenized with `bm25_tokenizer` before scoring.
        top_k: Number of hits to print.
        num_candidates: Number of highest-scoring sentences to shortlist before
            the final sort. Values above the corpus size are clamped to it;
            values of 0 or less produce no hits.

    Returns:
        None. Each hit is printed as a tab-indented line: score, tab, sentence.
        Shortlisted sentences with a score of 0 (no shared keyword) are still listed.
    """
    print("Input question:", query)
    # BM25 (lexical) scoring: one score per sentence in the corpus.
    bm25_scores = bm25.get_scores(
        bm25_tokenizer(query)  # significant words only, e.g. ["precise", "science"]
    )
    # np.argpartition raises ValueError when kth lies outside the array, and a
    # [-0:] slice would select every element, so clamp to [0, corpus size] first.
    n_candidates = max(0, min(num_candidates, len(bm25_scores)))
    if n_candidates > 0:
        # argpartition only guarantees that the last n entries are the INDICES of
        # the n largest scores (in no particular order); argsort would sort everything.
        top_n = np.argpartition(bm25_scores, -n_candidates)[-n_candidates:]
    else:
        top_n = []
    bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
    # Order the shortlist from highest to lowest score.
    bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)

    # The header reports the requested top_k rather than a hard-coded 3.
    print(f"Top-{top_k} lexical search (BM25) hits")
    for hit in bm25_hits[:top_k]:
        print("\t{:.3f}\t{}".format(hit["score"], texts[hit["corpus_id"]].replace("\n"," ")))

# Same question as the dense search earlier, answered by keyword matching instead.
keyword_search(query="how precise was the science")

Input question: how precise was the science
Top-3 lexical search (BM25) hits
	1.548	Theoretical physicist Kip Thorne was an executive producer and scientific consultant on the film, and wrote the tie-in book The Science of Interstellar
	1.548	Interstellar is a 2014 epic science fiction film directed by Christopher Nolan, who co-wrote the screenplay with his brother Jonathan Nolan
	0.000	Set in a dystopian future where Earth is suffering from catastrophic blight and famine, the film follows a group of astronauts who travel through a wormhole near Saturn in search of a new home for mankind


## Reranking 

In [10]:
# Rerank takes a search query plus a list of candidate documents and returns
# the candidates ordered by relevance to the query.
query = "how precise was the science"
results = co.rerank(
    query=query,
    documents=texts,        # here every sentence is a candidate (no shortlist)
    top_n=3,
    return_documents=True,  # so each result carries its document text
)
# Print rank, relevance score and text of each returned document.
for idx, result in enumerate(results.results):
    print(idx, result.relevance_score, result.document.text)

0 0.05418382 It was a commercial success, grossing $681 million worldwide during its initial theatrical run, and $771 million worldwide with subsequent releases, making it the 10th-highest-grossing film of 2014
1 0.041684147 Theoretical physicist Kip Thorne was an executive producer and scientific consultant on the film, and wrote the tie-in book The Science of Interstellar
2 0.03588415 It features an ensemble cast led by Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, and Michael Caine


In [11]:
def keyword_and_reranking_search(query, top_k=3, num_candidates=10):
    """Two-stage search: shortlist sentences with BM25, then rerank the shortlist with Cohere.

    Reranking is meant to run on a shortlist, so keyword search first picks the
    `num_candidates` best sentences and the rerank endpoint then selects the
    final `top_k` from them.

    Args:
        query: Search text.
        top_k: Number of hits to print for each stage.
        num_candidates: Size of the BM25 shortlist passed to the reranker.
            Values above the corpus size are clamped to it; values of 0 or
            less give an empty shortlist and skip the rerank call.

    Returns:
        None. Both rankings are printed as tab-indented lines: score, tab, sentence.
    """
    print("Input question:", query)

    ## Scores based on BM25 lexical search
    bm25_scores = bm25.get_scores(bm25_tokenizer(query))  # one score per sentence
    # np.argpartition raises ValueError when kth lies outside the array, and a
    # [-0:] slice would select every element, so clamp to [0, corpus size] first.
    n_candidates = max(0, min(num_candidates, len(bm25_scores)))
    if n_candidates > 0:
        # indices of the n highest-scoring sentences, not yet ordered
        top_n = np.argpartition(bm25_scores, -n_candidates)[-n_candidates:]
    else:
        top_n = []
    bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
    # sort the shortlist in descending order of score
    bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)
    # The header reports the requested top_k rather than a hard-coded 3.
    print(f"Top-{top_k} lexical search (BM25) hits")
    for hit in bm25_hits[:top_k]:
        print("\t{:.3f}\t{}".format(hit['score'], texts[hit['corpus_id']].replace("\n", " ")))

    ## Scores based on Cohere reranking
    docs = [texts[hit['corpus_id']] for hit in bm25_hits]
    print(f"\nTop-{top_k} hits by rank-API ({len(bm25_hits)} BM25 hits re-ranked)")
    if not docs:
        # Nothing was shortlisted, so there is nothing to send to the rerank endpoint.
        return
    results = co.rerank(query=query, documents=docs, top_n=top_k, return_documents=True)
    for hit in results.results:
        print("\t{:.3f}\t{}".format(hit.relevance_score, hit.document.text.replace("\n", " ")))

# Run the two-stage pipeline (BM25 shortlist, then rerank) on the sample question.
keyword_and_reranking_search(query="how precise was the science")

Input question: how precise was the science
Top-3 lexical search (BM25) hits
	1.548	Theoretical physicist Kip Thorne was an executive producer and scientific consultant on the film, and wrote the tie-in book The Science of Interstellar
	1.548	Interstellar is a 2014 epic science fiction film directed by Christopher Nolan, who co-wrote the screenplay with his brother Jonathan Nolan
	0.000	Cinematographer Hoyte van Hoytema shot it on 35 mm film in the Panavision anamorphic format and IMAX 70 mm

Top-3 hits by rank-API (10 BM25 hits re-ranked)
	0.042	Theoretical physicist Kip Thorne was an executive producer and scientific consultant on the film, and wrote the tie-in book The Science of Interstellar
	0.036	It features an ensemble cast led by Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, and Michael Caine
	0.035	Interstellar is a 2014 epic science fiction film directed by Christopher Nolan, who co-wrote the screenplay with his brother Jonathan Nolan


## Retrieval Augmented Generation (RAG)

### RAG with LLM API

In [12]:
query = "income generated"
# 1/ Retrieval: fetch the sentences closest to the query from the FAISS index.
# `search` returns a DataFrame with columns 'texts' and 'distance' (3 rows by default).
results = search(query)
print(results)

Query:'income generated'
Nearest neighbors:
                                               texts      distance
0  It was a commercial success, grossing $681 mil...  14051.174805
1  Interstellar uses extensive practical and mini...  15098.763672
2  Theoretical physicist Kip Thorne was an execut...  15362.929688


In [13]:
# 2/ Grounded generation: hand the retrieved sentences to the chat endpoint
# together with the query.
# Each document is wrapped as {'text': <sentence>}, e.g. [{'text': 'abc'}, {'text': 'xyz'}].
docs_dict = [{'text': passage} for passage in results['texts']]
response = co.chat(message=query, documents=docs_dict)
print(response.text)

The film generated $681 million worldwide during its initial theatrical run, and $771 million worldwide with subsequent releases.


### RAG with local models

In [14]:
# One-time download of the GGUF model file that the LlamaCpp cell below loads (uncomment to run):
# !wget https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-fp16.gguf

In [21]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Embedding model loaded through Hugging Face (used here instead of the Cohere embed endpoint).
embedding_model = HuggingFaceEmbeddings(model_name='thenlper/gte-small')
# Build the vector database: the sentences are embedded with that model and stored via FAISS.
db = FAISS.from_texts(texts, embedding_model)

In [22]:
# Load the generation model (a local GGUF file run through llama.cpp)
from langchain import LlamaCpp
llm = LlamaCpp(
    model_path="Phi-3-mini-4k-instruct-fp16.gguf",  # relative path: the file must sit in the working directory
    n_gpu_layers=1,   # presumably the number of layers offloaded to the GPU -- confirm against the LlamaCpp docs
    max_tokens=500,   # presumably the cap on generated tokens per call -- confirm against the LlamaCpp docs
    n_ctx=2048,       # context window; the log printed by this cell notes it is below the model's training context (4096)
    seed=42,          # fixed seed for the sampler
    verbose=False
)

llama_context: n_batch is less than GGML_KQ_MASK_PAD - increasing to 64
llama_context: n_ctx_per_seq (2048) < n_ctx_train (4096) -- the full capacity of the model will not be utilized


In [23]:
from langchain import PromptTemplate

# Prompt layout with <|user|> / <|end|> / <|assistant|> markers. {context} is filled
# with the retrieved text and {question} with the user's query.
# NOTE(review): "answer the following question" probably lacks a "to"; it is left
# untouched because editing the prompt changes what the model receives.
template = """
<|user|>
    Relevant information: {context}
    Provide a concise answer the following question using the relevant information provided above: {question}
<|end|>
<|assistant|>
"""
prompt = PromptTemplate(input_variables=["context", "question"], template=template)

from langchain.chains import RetrievalQA

# RAG pipeline: the retriever pulls passages from the vector database, and the
# custom prompt wraps them with the question before the local LLM answers.
rag = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=db.as_retriever(),
    chain_type='stuff',
    chain_type_kwargs={"prompt": prompt},
    verbose=True,
)

# Run the pipeline; the returned dict (keys 'query' and 'result') is shown as the cell output.
rag.invoke('Income generated')



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


{'query': 'Income generated',
 'result': " Interstellar generated a worldwide income of $771 million with subsequent releases.\n\nThe information about Hoyte van Hoytema shooting the film on 35mm film in anamorphic format and IMAX, as well as the use of practical effects and DNEG's contribution to VFX, does not directly relate to the income generated but provides context to its production quality. The initial gross of $681 million is also relevant, showing the film's strong opening performance in theaters using traditional projection methods before transitioning to digital formats."}