# API notebook
Here we deploy the three API endpoints that serve the embedding model, the reranker and the generative model.

Install necessary dependencies:


*   pyngrok - for tunneling the API endpoints
*   vllm - for serving the large language model in production
*   uvicorn & fastapi - for creating the API
*   sentence-transformers - for performing sentence-level semantic search using embedders and cross-encoders



In [None]:
!pip install pyngrok

In [None]:
!pip install -q vllm fastapi uvicorn ngrok pyngrok nest_asyncio

In [None]:
!pip install sentence-transformers

In [None]:
from pyngrok import ngrok

# Ask pyngrok to update the ngrok binary it manages before any tunnel is opened
# (see the pyngrok documentation of `ngrok.update` for the exact behaviour).
ngrok.update()

Add your ngrok token here:

In [None]:
#!ngrok config add-authtoken <add token here>

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
from pyngrok import conf, ngrok

# Tell pyngrok explicitly where the ngrok binary is located.
conf.get_default().ngrok_path = "/usr/local/bin/ngrok"

# <NgrokTunnel: "https://<public_sub>.ngrok.io" -> "http://localhost:80">
# NOTE(review): `connect()` is called without a port here, whereas the API server
# started further down listens on port 8000 and opens its own tunnel. `ngrok_tunnel`
# is not read again in this notebook - confirm whether this extra tunnel is needed.
ngrok_tunnel = ngrok.connect()

In [None]:
!pip install faiss-gpu



In [None]:
import time
import faiss
import numpy as np

def load_dataset_in_index(model, data):
    """
    Encode the "context" column of a dataset and load it into a FAISS index.

    The vectors are stored in an inner-product flat index wrapped in an ID map,
    so each vector is registered under the positional row number of its context
    in ``data``. The finished index is also written to ``context.index`` inside
    the directory named by the module-level variable ``file_location``.

    Args:
        model (object): The model used to encode the data, expected to have an 'encode' method.
        data (pandas.DataFrame): DataFrame containing the data to be indexed, should have a "context" column.

    Returns:
        faiss.Index: The FAISS index with the encoded data.
    """
    # Imported locally because no cell of the notebook imports `os` globally.
    import os

    contexts = data["context"].tolist()
    # `np.asarray(..., dtype=...)` also accepts encoders that return plain lists,
    # and guarantees the float32 layout that FAISS works with.
    embeddings = np.asarray(model.encode(contexts), dtype="float32")

    # Take the vector size from the embeddings instead of hard-coding 768, so
    # encoders with a different output size can be indexed as well.
    dimension = embeddings.shape[1] if embeddings.ndim == 2 else 768
    index = faiss.IndexIDMap(faiss.IndexFlatIP(dimension))

    # IDs are the positional row numbers, matching the `df.iloc` lookups done at query time.
    row_ids = np.arange(len(contexts), dtype="int64")
    index.add_with_ids(embeddings, row_ids)

    # NOTE(review): `file_location` is not defined in this cell; it must exist as a
    # global before this function is called.
    faiss.write_index(index, os.path.join(file_location, "context.index"))
    return index

def fetch_info(dataframe_idx, df):
    """
    Look up a single row by position and return its context and page.

    Args:
        dataframe_idx (int): Positional row index, used with ``df.iloc``.
        df (pandas.DataFrame): DataFrame with at least a "context" and a "page" column.

    Returns:
        dict: A dictionary with the keys "context" and "page" taken from the selected row.
    """
    row = df.iloc[dataframe_idx]
    # Only these two columns are exposed to the caller, in this order.
    return {column: row[column] for column in ("context", "page")}


def search(query, top_k, index, model, df):
    """
    Searches the FAISS index with a given query and returns the top K results.

    The query is encoded with ``model``, the index is asked for ``top_k``
    neighbours, and every returned ID is resolved to a row of ``df`` through
    ``fetch_info``. Results keep the order in which the index returned them.
    IDs that are negative (FAISS pads the result with -1 when it finds fewer
    than ``top_k`` vectors) are dropped, and repeated IDs are returned once.

    Args:
        query (str): Query string to search for.
        top_k (int): Maximum number of results to return.
        index (faiss.Index): The FAISS index to search in.
        model (object): The model used to encode the query, expected to have an 'encode' method.
        df (pandas.DataFrame): DataFrame containing the data; row positions must match the index IDs.

    Returns:
        list: A list of dictionaries containing the "context" and "page" information of the top results.
    """
    started = time.time()
    # FAISS works on float32 matrices; the conversion is a no-op if the encoder
    # already returns float32.
    query_vector = np.asarray(model.encode([query]), dtype="float32")
    _scores, neighbour_ids = index.search(query_vector, top_k)
    print(">>>> Results in Total Time: {}".format(time.time() - started))

    # Only one query was submitted, so only the first result row is relevant.
    ranked_ids = neighbour_ids.tolist()[0]
    # Skip the -1 padding: `df.iloc[-1]` would silently return the last row.
    valid_ids = [row_id for row_id in ranked_ids if row_id >= 0]
    # De-duplicate while keeping the ranking order (dicts preserve insertion order).
    unique_ids = list(dict.fromkeys(valid_ids))
    return [fetch_info(row_id, df) for row_id in unique_ids]

def rerank(model, query, embedding_results):
    """
    Score every candidate against the query and return the best one.

    Args:
        model (object): The re-ranking model, expected to have a 'predict' method
            that takes a list of [query, candidate] pairs and returns one score per pair.
        query (str): The query string.
        embedding_results (list): Candidates to be re-ranked.

    Returns:
        The candidate with the highest score; on ties the earlier candidate wins.
        The winning score is printed as a side effect.
    """
    pairs = [[query, candidate] for candidate in embedding_results]
    scores = model.predict(pairs)
    # Sort (candidate, score) tuples by score, best first; the sort is stable,
    # so candidates with equal scores keep their input order.
    by_score = sorted(
        zip(embedding_results, scores),
        key=lambda pair: pair[1],
        reverse=True,
    )
    top_candidate, top_score = by_score[0]
    print("Best result score: " + str(top_score))
    return top_candidate



def get_results(model, df, index, question, top_k=50, reranker=None):
    """
    Gets the final re-ranked result for a given query.

    The question is first matched against the FAISS index with the embedding
    model, then the retrieved contexts are scored by a cross-encoder and the
    best one is returned together with its page and UID.

    Args:
        model (object): The embedding model used to encode the question for the index search.
        df (pandas.DataFrame): DataFrame containing the data; needs the columns
            "context", "page" and "context_uid".
        index (faiss.Index): The FAISS index to search in.
        question (str): The query string.
        top_k (int): Number of candidates to retrieve from the index before re-ranking.
        reranker (object): Cross-encoder with a 'predict' method. When omitted, the
            module-level ``cross_encoder`` is used, as before.

    Returns:
        list: A list containing a dictionary with "context", "page", and "context_uid" of the best result,
        or an empty list when the index search returns no candidates.
    """
    if reranker is None:
        # Backward-compatible default: the global model created at API startup.
        reranker = cross_encoder

    candidates = search(question, top_k=top_k, index=index, model=model, df=df)
    if not candidates:
        # Nothing to re-rank; picking a best element of an empty list would raise.
        return []

    candidate_contexts = [item["context"] for item in candidates]
    best_context = rerank(reranker, question, candidate_contexts)

    # Recover the metadata of the winning passage by matching its text in the
    # DataFrame; if the same text occurs in several rows the first one is used.
    matching_rows = df[df["context"] == best_context]
    return [
        {
            "context": best_context,
            "page": matching_rows.page.values[0],
            "context_uid": matching_rows.context_uid.values[0],
        }
    ]

Create a variable holding the generative model, so that it is not loaded every time we restart the application:

In [None]:
# Placeholder for the generative model: the API cell only constructs the model
# while this variable is None, so re-running the API cell reuses a loaded model.
# NOTE(review): re-running THIS cell resets the variable to None and therefore
# triggers a fresh model load on the next run of the API cell.
llm = None

Run the following cell, then copy the ngrok tunnel URL it prints into the client-side application:

In [None]:
from fastapi import FastAPI
from pydantic import BaseModel
from vllm import LLM, SamplingParams
import uvicorn
from pyngrok import ngrok
import nest_asyncio
from starlette.responses import StreamingResponse
import asyncio
from sentence_transformers import SentenceTransformer, CrossEncoder


# Presumably needed so that `uvicorn.run` can start its event loop from inside the
# notebook, which already runs one - see the nest_asyncio documentation.
nest_asyncio.apply()

# The application object on which the routes and the startup hook below are registered.
app = FastAPI()
# Construct the vLLM model only when no model is held in `llm` yet, so that
# re-running this cell does not load the model a second time.
if llm is None:
  llm = LLM(model="Sreenington/Phi-3-mini-4k-instruct-AWQ", quantization="AWQ")

class CompletionRequest(BaseModel):
    """Request body of the `/v1/completions` endpoint."""
    # Text handed to the LLM as the single prompt of the generation call.
    prompt: str
    # Forwarded to `SamplingParams(max_tokens=...)`.
    max_tokens: int
    # Forwarded to `SamplingParams(temperature=...)`.
    temperature: float

class SearchRequest(BaseModel):
  """Request body of the `/v1/search` endpoint."""
  # Query text that the endpoint encodes with the embedding model.
  query: str

class RerankRequest(BaseModel):
  """Request body of the `/v1/rerank` endpoint."""
  # Passed unchanged to `cross_encoder.predict` by the endpoint.
  # NOTE(review): declared as a plain string although the cross-encoder is used
  # with [query, passage] pairs elsewhere in this notebook - confirm the format
  # the client sends. Pydantic v2 may also warn about the `model_` field prefix.
  model_args: str

@app.on_event('startup')
def load_search_models():
  """
  Startup hook: create the embedding model and the cross-encoder on the GPU.

  Both models are stored in the module-level globals `embedder` and
  `cross_encoder`, which the endpoints read.
  """
  global embedder, cross_encoder

  embedder_name = "sentence-transformers/msmarco-bert-base-dot-v5"
  cross_encoder_name = "cross-encoder/ms-marco-MiniLM-L-12-v2"

  embedder = SentenceTransformer(embedder_name, device="cuda")
  cross_encoder = CrossEncoder(cross_encoder_name, device="cuda")

@app.post("/v1/completions")
async def completions(request: CompletionRequest):
  """
  Generate a completion for the prompt in the request.

  The token limit and temperature of the request are turned into vLLM sampling
  parameters; the text of the first output for the single prompt is returned.
  """
  params = SamplingParams(
      temperature=request.temperature,
      max_tokens=request.max_tokens,
  )
  # One prompt is submitted, so the result list has exactly one entry to read.
  request_outputs = llm.generate([request.prompt], params)
  first_output = request_outputs[0].outputs[0]
  return first_output.text

@app.post("/v1/search")
def embed(request: SearchRequest):
  """
  Encode the query of the request with the embedding model.

  Returns:
      list: A nested list with one embedding (a list of floats) for the single query.
  """
  # `embedder` is only read here, so no `global` declaration is required.
  query_vector = embedder.encode([request.query])
  # FastAPI cannot serialise a numpy array to JSON; convert it to plain Python
  # lists so the response can actually be sent.
  return query_vector.tolist()

@app.post("/v1/rerank")
async def rerank_endpoint(request: RerankRequest):
  """
  Score the request payload with the cross-encoder and return the prediction.

  The handler is deliberately not named `rerank`: this notebook already defines
  a helper function `rerank(model, query, embedding_results)` that `get_results`
  calls, and reusing the name would overwrite that helper once this cell runs.
  The route itself (`/v1/rerank`) is unchanged.

  Returns:
      The prediction of the cross-encoder, converted to plain Python numbers/lists.
  """
  # NOTE(review): `model_args` is forwarded as received (a string); confirm that
  # this is the input format the cross-encoder is meant to get from the client.
  prediction = cross_encoder.predict(request.model_args)
  # The prediction was previously computed but never returned, so clients got
  # an empty response. Numpy values are converted because FastAPI cannot
  # serialise them to JSON.
  return prediction.tolist() if hasattr(prediction, "tolist") else prediction

# Open an ngrok tunnel to local port 8000 - the same port uvicorn is started on below -
# and print it so the public address can be copied to the client.
public_url = ngrok.connect(8000)
print(f"Public URL: {public_url}")
# Start serving the API on all interfaces, port 8000.
# NOTE(review): this call is expected to keep the cell busy while the server runs.
uvicorn.run(app, host="0.0.0.0", port=8000)

In [None]:
# Drop the global references to the two search models (presumably to release
# their GPU memory - confirm). The LLM is kept: its `del` is commented out.
# Each `del` raises NameError if the corresponding global was never created.
#del llm
del embedder
del cross_encoder