### Dependencies Installation

In [None]:
# Install the HuggingFace model stack (transformers, accelerate, bitsandbytes),
# the LlamaIndex HuggingFace LLM integration and python-docx.
%pip install -qqq transformers accelerate bitsandbytes llama-index-llms-huggingface python-docx

In [None]:
%pip install -qqq  docx2txt llama_index.embeddings.huggingface llama-index-readers-file llama_index.llms.huggingface_api llama-index-postprocessor-rankgpt-rerank

### Library Imports

In [None]:
from llama_index.core import ServiceContext
from llama_index.core import set_global_service_context
from llama_index.core import VectorStoreIndex,SimpleDirectoryReader,ServiceContext

import torch
from transformers import BitsAndBytesConfig
from llama_index.core.prompts import PromptTemplate
from llama_index.llms.huggingface import HuggingFaceLLM

#### Loading the model in 4-bit precision to accommodate hardware constraints - this compresses the weights at the cost of some accuracy

In [None]:
# bitsandbytes settings handed to the model loader further down:
# 4-bit loading, "nf4" quantization type, double quantization enabled,
# and float16 as the compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

#### Loading the LLM Model

In [None]:

# Load StableLM Zephyr 3B through LlamaIndex's HuggingFace wrapper, forwarding
# the quantization config defined earlier to the underlying model loader.
llm = HuggingFaceLLM(
    model_name="stabilityai/stablelm-zephyr-3b",
    tokenizer_name="stabilityai/stablelm-zephyr-3b",
    # NOTE(review): a chat-style wrapper prompt was left disabled by the
    # original author; re-enable only after confirming the model's chat format.
    # query_wrapper_prompt=PromptTemplate("<|system|>\n<|endoftext|>\n<|user|>\n{query_str}<|endoftext|>\n<|assistant|>\n"),
    context_window=3900,
    max_new_tokens=256,
    model_kwargs={"quantization_config": quantization_config},
    # `temperature` is only honoured in sampling mode. Without `do_sample=True`
    # generation falls back to greedy decoding and the temperature is ignored
    # (transformers emits a warning), so sampling is enabled explicitly.
    generate_kwargs={"do_sample": True, "temperature": 0.8},
    device_map="auto",
)

#### Document Ingestion begins..

In [None]:
# Read everything under the Kaggle input folder into LlamaIndex documents,
# then show how many documents were produced.
resume_reader = SimpleDirectoryReader("/kaggle/input/resume23")
documents = resume_reader.load_data()
len(documents)

#### Registering the LLM and the embedding model as global defaults

In [None]:
from llama_index.core import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Register the LLM and a BGE-small embedding model on the global Settings
# object so that later cells do not need to pass them explicitly.
embedding_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
Settings.llm = llm
Settings.embed_model = embedding_model

#### Storage of documents and their respective embeddings

In [None]:
# Build a vector store index from the loaded documents.
# NOTE(review): presumably embeds with the globally registered
# Settings.embed_model — confirm against the LlamaIndex version in use.
vector_index = VectorStoreIndex.from_documents(documents)

In [None]:
# Bare expression: the notebook displays the index object's repr as cell output.
vector_index

#### Retrieval and re-ranking of information after running the query against the vector index; the re-ranking is done by the LLM

In [None]:
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core import QueryBundle
import pandas as pd
from IPython.display import display, HTML
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
from llama_index.llms.huggingface import HuggingFaceLLM

from llama_index.postprocessor.rankgpt_rerank import RankGPTRerank


def get_retrieved_nodes(
    query_str, vector_top_k=5, reranker_top_n=3, with_reranker=False
):
    """Retrieve nodes for ``query_str`` from ``vector_index``, optionally reranked.

    Args:
        query_str: The query text; it is wrapped in a ``QueryBundle``.
        vector_top_k: Passed to the retriever as ``similarity_top_k``.
        reranker_top_n: Passed to ``RankGPTRerank`` as ``top_n``; only used
            when ``with_reranker`` is true.
        with_reranker: When true, the retrieved nodes are post-processed by a
            ``RankGPTRerank`` driven by the notebook-level ``llm``.

    Returns:
        The retriever's nodes, or the reranker's output when reranking is on.
    """
    bundle = QueryBundle(query_str)

    # Similarity search against the notebook-level vector index.
    candidates = VectorIndexRetriever(
        index=vector_index,
        similarity_top_k=vector_top_k,
    ).retrieve(bundle)

    if not with_reranker:
        return candidates

    # Hand the candidates to the LLM-based reranker.
    llm_reranker = RankGPTRerank(llm=llm, top_n=reranker_top_n, verbose=True)
    return llm_reranker.postprocess_nodes(candidates, bundle)

In [None]:
# Ask the user for the question interactively; the same `query` is reused by
# the retrieval demo and by both query-engine runs further down.
query = input("Question:")

#### An example of how information related to query is retrieved

In [None]:
# Retrieve with vector_top_k=10 and rerank with reranker_top_n=3.
new_nodes = get_retrieved_nodes(
    query, vector_top_k=10, reranker_top_n=3, with_reranker=True
)

In [None]:
def visualize_retrieved_nodes(nodes) -> None:
    """Show the score and text of every node in ``nodes`` as a table.

    Each element must expose ``.score`` and ``.node.get_text()``; the rows are
    collected into a DataFrame and handed to ``pretty_print``.
    """
    rows = [
        {"Score": hit.score, "Text": hit.node.get_text()} for hit in nodes
    ]
    pretty_print(pd.DataFrame(rows))

def pretty_print(df):
    """Display ``df`` as an HTML table with full, untruncated cell text.

    Args:
        df: The pandas DataFrame to render.

    Returns:
        Whatever ``IPython.display.display`` returns for the rendered HTML.
    """
    # pandas shortens long strings to `display.max_colwidth` characters
    # (50 by default) when rendering HTML, which would cut node text off with
    # "...". Lift the limit only for this rendering call.
    with pd.option_context("display.max_colwidth", None):
        table_html = df.to_html()
    # Replace the two-character sequence backslash + "n" found in the rendered
    # HTML with a real line break.
    return display(HTML(table_html.replace("\\n", "<br>")))


#### Results of the related information

In [None]:
# Render the reranked nodes (score + text) as an HTML table.
visualize_retrieved_nodes(new_nodes)

In [None]:
from llama_index.core.response.notebook_utils import display_response
from IPython.display import Markdown, display

#### Zero-shot inference (without a custom prompt template)

In [None]:
import time

# Query engine in "compact" response mode, using the default prompt templates.
query_engine = vector_index.as_query_engine(response_mode="compact")

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
response = query_engine.query(query)
end_time = time.perf_counter()
display_response(response)

print("time taken : ",(end_time-start_time))

In [None]:
def display_prompt_dict(prompts_dict):
    """Show every prompt in ``prompts_dict``.

    For each ``(key, prompt)`` pair the key is rendered as a Markdown header,
    the prompt's template text is printed verbatim, and a Markdown spacer
    follows.
    """
    for prompt_key, prompt in prompts_dict.items():
        header_md = f"**Prompt Key**: {prompt_key}<br>**Text:** <br>"
        display(Markdown(header_md))
        # Plain print keeps the template's own formatting intact.
        print(prompt.get_template())
        display(Markdown("<br><br>"))

#### Displaying the default prompt templates - one for processing the text and the other for making the response more brief/compact

In [None]:
# Fetch the prompts currently used by the query engine and display each one.
prompts_dict = query_engine.get_prompts()
display_prompt_dict(prompts_dict)

### Customization of the prompt template

In [None]:
from llama_index.core import Prompt

# Custom question-answering prompt. `{context_str}` and `{query_str}` are the
# two placeholders in the template; the instructions ask for a structured JSON
# answer about candidate resumes.
template = """
Context information is below.
---------------------
{context_str}
---------------------
Given the context information and not prior knowledge,
answer the query asking about candidate resumes and their suitability for specific job roles.
Please provide your answer in the form of a structured JSON format containing
a list of candidate's resumes along with their qualifications, experience, skills, and relevance to the job role.

Query: {query_str}
Answer:
"""

### Creating the prompt template compatible with query engine from LlamaIndex

In [None]:
# Wrap the raw template string in a LlamaIndex prompt object.
# NOTE(review): `Prompt` is presumably a legacy alias of `PromptTemplate`
# (already imported at the top of the notebook) — confirm and prefer
# `PromptTemplate` if so.
qa_template = Prompt(template)

### New Query engine with customized prompt template

In [None]:
# Rebuild the query engine in "compact" mode, this time with the custom
# question-answering template instead of the default one.
query_engine = vector_index.as_query_engine(
    response_mode="compact",
    text_qa_template=qa_template,
)

### Inference test

In [None]:
# Time the query with a monotonic, high-resolution clock; time.time() follows
# the wall clock and can jump if the system time is adjusted.
start_time = time.perf_counter()
response = query_engine.query(query)
end_time = time.perf_counter()
display_response(response)

print("time taken : ",(end_time-start_time))