# Introduction

This HR chatbot is powered by Retrieval-Augmented Generation (RAG). It can analyze resumes supplied as documents (.pdf, .doc, .docx) or as images (.png, .jpg, .jpeg), and it is designed to help identify the most suitable candidates for a specific role based on the content of those documents.

# Preparation


1.  Install necessary packages and mount Google Drive data

2.  <font color='red'>Create a folder called "RAG" in your Google Drive, and upload your resumes into a subfolder of it named "Resumes" (i.e. RAG/Resumes) </font>

3.  Prepare your OpenAI API key and folder ID


In [None]:
## Mount Google Drive Data (If using Google Colaboratory)
try:
    from google.colab import drive
    drive.mount('/content/drive')
except Exception as exc:
    # Best-effort: outside Colab the import fails, and inside Colab the mount
    # itself can fail. Catch Exception rather than using a bare except so that
    # KeyboardInterrupt/SystemExit still propagate, and report the reason.
    print("Mounting Failed.")
    print(f"Reason: {exc!r}")

In [None]:
# Install the document/OCR libraries used by the text-extraction cell.
!pip install pymupdf python-docx docx2txt pytesseract pillow
# Install the tokenizer, vector store and LLM client libraries (quiet mode).
# NOTE(review): the PyPI package named `fitz` is presumably not PyMuPDF and may
# be what triggers the conflict worked around below -- confirm, and consider
# removing it from this list.
!pip install -q PyPDF2 nltk faiss-cpu openai langchain langchain_openai langchain_community tqdm fitz
# pytesseract needs the Tesseract OCR binary installed on the system.
!sudo apt install tesseract-ocr
# to solve some version conflicts of pymupdf
!pip uninstall pymupdf --yes
!pip install pymupdf

import glob
import os
import sys
import io
import pytesseract
from PIL import Image
import textwrap
from PyPDF2 import PdfReader
from nltk.tokenize import sent_tokenize
import nltk
import pickle
import warnings
from tqdm import tqdm
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
import pickle as pkl
import fitz  # PyMuPDF
import docx
import docx2txt
warnings.filterwarnings("ignore", category=UserWarning)
nltk.download('punkt')

In [None]:
# Supply your own OpenAI API key in the marked assignment below.

#################
openai_key = "YOUR OPENAI KEY"  ## Put your Open_AI Key here
#################
# Export the key through the process environment, where this notebook sets
# OPENAI_API_KEY for the cells that follow.
os.environ.update({"OPENAI_API_KEY": openai_key})


# Extract text from pdf/doc/docx and image files

In [None]:
import nltk
nltk.download('punkt_tab')
# set the path
# glob returns matches in a filesystem-dependent order. The position of each
# file in this list becomes its document number further down, and that number
# names the on-disk vector index for the document, so sort the paths to keep
# the numbering stable from one run to the next.
filepaths = sorted(glob.glob("/content/drive/MyDrive/RAG/Resumes/*.*"))


def extract_text_from_image(filepath):
    """Run OCR on an image file and return the recognized text.

    Args:
        filepath: Path of the image to read.

    Returns:
        The string produced by ``pytesseract.image_to_string`` for the image.
    """
    # Open the image in a context manager so the underlying file handle is
    # released as soon as OCR finishes, even if pytesseract raises.
    with Image.open(filepath) as image:
        return pytesseract.image_to_string(image)

def extract_text_from_pdf(filepath):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        filepath: Path of the PDF to read.

    Returns:
        A single string made of each page's ``get_text()`` output, with no
        separator added between pages.
    """
    # The context manager closes the document even when a page fails to
    # parse; joining avoids rebuilding the string once per page.
    with fitz.open(filepath) as pdf_reader:
        return ''.join(page.get_text() for page in pdf_reader)

def extract_text_from_doc(filepath):
    """Return the paragraph texts of a Word document joined by newlines.

    Args:
        filepath: Path of the document to read.

    Returns:
        One string with each paragraph's text on its own line.

    NOTE(review): this is dispatched for ``.doc`` files but reads them with
    python-docx; confirm that library can open legacy ``.doc`` files.
    """
    document = docx.Document(filepath)
    return '\n'.join(para.text for para in document.paragraphs)

def extract_text_from_docx(filepath):
    """Return the text that ``docx2txt`` extracts from a .docx file.

    Args:
        filepath: Path of the .docx file to read.

    Returns:
        Whatever ``docx2txt.process`` returns for the file.
    """
    return docx2txt.process(filepath)

def load_from_text(fstarter: str, text: str, split_size: int, overlap: int):
    """Split text into overlapping chunks of sentences.

    The text is tokenized into sentences. Each chunk holds up to
    ``split_size`` consecutive sentences, prefixed with ``fstarter`` and
    joined with single spaces. Consecutive chunks start
    ``split_size - overlap`` sentences apart, so they share ``overlap``
    sentences.

    Args:
        fstarter: Header string placed at the start of every chunk.
        text: Raw text to split.
        split_size: Maximum number of sentences per chunk.
        overlap: Number of sentences shared by consecutive chunks.

    Returns:
        A list of chunk strings; empty when the text has no sentences.

    Raises:
        ValueError: If ``overlap`` is not smaller than ``split_size``.
    """
    # A non-positive stride would never advance through the sentences, so
    # reject it up front instead of looping forever.
    step = split_size - overlap
    if step <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than split_size ({split_size})"
        )

    sentences = sent_tokenize(text)
    return [
        " ".join([fstarter, *sentences[start:start + split_size]])
        for start in range(0, len(sentences), step)
    ]

# Maps a document number (1-based position in `filepaths`) to its text chunks.
chunked_dataset = {}
# Sentences per chunk, and sentences shared between consecutive chunks.
split_size = 15
overlap = 1

print(f"{filepaths}")
print(f"Number of files: {len(filepaths)}")

# Extractor for each supported file extension (compared in lower case).
_extractors = {
    '.pdf': extract_text_from_pdf,
    '.doc': extract_text_from_doc,
    '.docx': extract_text_from_docx,
    '.png': extract_text_from_image,
    '.jpg': extract_text_from_image,
    '.jpeg': extract_text_from_image,
}

# Keep `i` defined even when there are no files at all.
i = 0

# Every file consumes a document number, including unsupported ones, so the
# number always equals the file's 1-based position in `filepaths`.
for i, filepath in enumerate(tqdm(filepaths), start=1):
    file_name = os.path.basename(filepath)
    extension = os.path.splitext(filepath)[1].lower()

    extractor = _extractors.get(extension)
    if extractor is None:
        # Unsupported file type: nothing to extract.
        continue

    data = extractor(filepath)

    fstarter = f"Information from {file_name}, the {i}th document."
    chunked_data = load_from_text(fstarter, data, split_size, overlap)
    if not chunked_data:
        # No sentences were extracted (for example a scanned PDF with no text
        # layer). Do not register an empty chunk list, because there is
        # nothing to embed for this document.
        print(f"Skipped {file_name}, the {i}th document: no text extracted")
        continue

    chunked_dataset[i] = chunked_data
    print(f"Processed {file_name}, the {i}th document")
    print(f"Number of chunks: {len(chunked_data)}")


# Build a local vector database from the chunked data

In [None]:
# Embedding model used to vectorize every chunk.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Build one FAISS index per document and save it under the document's number.
for doc_number, chunks in tqdm(chunked_dataset.items()):
    folder_path = f"/content/drive/MyDrive/RAG/Resumes_vectordatabase/{doc_number}"
    if os.path.exists(folder_path):
        # A folder for this document number already exists; keep it and
        # skip re-embedding.
        continue
    vectorstore = FAISS.from_texts(chunks, embedding=embeddings)
    vectorstore.save_local(folder_path=folder_path, index_name="Resumes")


In [None]:
# Document numbers whose per-document indexes are merged into one store.
ordinal = list(chunked_dataset.keys())

base_folder_path = "/content/drive/MyDrive/RAG/Resumes_vectordatabase"
save_folder_path = "/content/drive/MyDrive/RAG/Vectorstores"

# Fail with a clear message instead of an IndexError when no document was
# processed in the extraction step.
if not ordinal:
    raise ValueError(
        "No processed documents found; there is nothing to merge into a vector store."
    )

vectorstore = None
for doc_number in ordinal:
    # allow_dangerous_deserialization enables pickle loading. These folders
    # are the ones written by the index-building cell of this notebook; do
    # not point this at index folders from an untrusted source.
    _vectorstore = FAISS.load_local(
        folder_path=f"{base_folder_path}/{doc_number}",
        embeddings=embeddings,
        index_name="Resumes",
        allow_dangerous_deserialization=True
    )
    if vectorstore is None:
        # The first index becomes the base that the others are merged into.
        vectorstore = _vectorstore
    else:
        vectorstore.merge_from(_vectorstore)

vectorstore.save_local(folder_path=save_folder_path, index_name="Resumes")

-----


# Ask the LLM questions



In [None]:
from langchain.prompts import PromptTemplate

# Number of chunks retrieved from the vector store for each question.
top_k_docs = 10

persona = "You are an HR manager. Answer my question concisely and reasonably. \
If there are fewer than expected or no suitable candidates, \
clearly state that there are not enough candidates available without offering unrelated options. \
Only list candidates directly related to the specified field. \
Do not include candidates from unrelated expertise or specialties, as experience in other specialties is irrelevant. \
 If no suitable candidates are available, state this directly. \
 If multiple candidates are listed, provide a brief summary of the best candidate after listing them."


llm = ChatOpenAI(model="gpt-4", temperature=0.7, max_retries=10)  ### Change it as you wish

embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# allow_dangerous_deserialization enables pickle loading; only load index
# folders that this notebook produced itself.
vectorstore = FAISS.load_local(folder_path="/content/drive/MyDrive/RAG/Vectorstores",
                               embeddings=embeddings,
                               index_name="Resumes",
                               allow_dangerous_deserialization=True)


retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": top_k_docs})

# RetrievalQA forwards only the question and the retrieved documents to its
# prompt, so a "persona" key passed at invoke time never reaches the model.
# Put the persona into the prompt itself. It is supplied as a partial variable
# so that any braces in the persona text are not treated as placeholders.
qa_prompt = PromptTemplate(
    template=(
        "{persona}\n\n"
        "Use the following pieces of context to answer the question at the end. "
        "If you don't know the answer, just say that you don't know, "
        "don't try to make up an answer.\n\n"
        "{context}\n\n"
        "Question: {question}\n"
        "Helpful Answer:"
    ),
    input_variables=["context", "question"],
    partial_variables={"persona": persona},
)

qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    verbose=True,
    return_source_documents=True,
    chain_type_kwargs={"prompt": qa_prompt},
)



In [None]:
import contextlib

# Running transcript of the conversation, sent back with every new query.
query_history = ""

while True:
    query = input('Enter your query: (type "exit" to exit) \n')

    if query.strip().lower() == "exit":
        print("Answer from AI HR manager: \n")
        print("goodbye")
        break

    if not query.strip():
        # Nothing to ask: prompt again instead of sending an empty query.
        continue

    print("\n \033[1mloading... \033[0m\n")

    # Capture the chain's verbose output. redirect_stdout restores sys.stdout
    # even when the retrieval or LLM call raises, so a failed request does not
    # leave the notebook without visible output.
    captured_output = io.StringIO()
    with contextlib.redirect_stdout(captured_output):
        # Top matches with relevance scores, kept for inspection.
        sources = vectorstore.similarity_search_with_relevance_scores(query, k=5)

        # NOTE(review): RetrievalQA reads only the "query" input; verify that
        # the persona actually reaches the model through the chain's prompt.
        result = qa.invoke({
            "persona": persona,
            "query": "<history>" +query_history+"</history> \n <query>"+query+"</query>",
        })

    output = captured_output.getvalue()
    # Captured output with the chain's banner lines removed.
    filtered_output = output.replace("Entering new RetrievalQA chain...\n", "").replace("Finished chain.\n", "").replace("loading...\n", "")


    query_history += f"User: {query}\n"
    query_history += f"Bot: {result['result']}\n"

    formatted_response = textwrap.dedent(result['result'])
    wrapped_string = textwrap.fill(formatted_response, width=60)

    print("Answer from AI HR manager:")
    print(wrapped_string)
    print("\n\n")