In [2]:
import os

from dotenv import load_dotenv

# Populate os.environ from a local .env file *before* the keys are read below.
# os.environ.get() returns None for a missing variable, so reading first and
# loading .env in a later cell would leave these as None for .env-only keys.
load_dotenv()

llamaparse_api_key = os.environ.get('LLAMA_CLOUD_API_KEY')
openai_api_key = os.environ.get("OPENAI_API_KEY")

In [3]:
##### LLAMAPARSE #####
from llama_parse import LlamaParse

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from groq import Groq
from langchain_groq import ChatGroq
import joblib
import os
import nest_asyncio  # noqa: E402
nest_asyncio.apply()
from langchain.chat_models import ChatOpenAI
from dotenv import load_dotenv
load_dotenv()

True

In [5]:
def load_or_parse_data():
    """Return the parsed HR-policy documents, using a pickle file as a cache.

    If ``./data/parsed_data.pkl`` exists, its content is loaded with joblib and
    returned. Otherwise ``./data/HR_Policy_Manual_KFSLnew.pdf`` is parsed to
    markdown with LlamaParse, the result is written to the pickle file for later
    runs, and the freshly parsed documents are returned.

    An empty parse result is returned to the caller but is NOT written to the
    cache, so that a failed/empty parse is retried on the next call instead of
    being served from disk forever.

    Returns:
        The object stored in the cache file, or the value returned by
        ``parser.load_data`` when no cache file exists.
    """
    data_file = "./data/parsed_data.pkl"

    if os.path.exists(data_file):
        # NOTE(review): joblib.load unpickles the file, which can execute
        # arbitrary code -- only point this at a cache file you created yourself.
        parsed_data = joblib.load(data_file)
        print("loading the data")
    else:
        print("I'm in else")
        # Instruction text handed to LlamaParse for this HR policy manual.
        parsing_instruction = """The provided document is a HR policies
        of an organization.
        Try to be precise while answering the questions"""
        parser = LlamaParse(
            api_key=llamaparse_api_key,
            result_type="markdown",
            parsing_instruction=parsing_instruction,
            max_timeout=5000,
        )
        parsed_data = parser.load_data("./data/HR_Policy_Manual_KFSLnew.pdf")

        if parsed_data:
            # Persist the parse so later runs can skip the remote parsing step.
            print("Saving the parse results in .pkl format ..........")
            joblib.dump(parsed_data, data_file)
        else:
            # Presumably an empty result means the parse did not succeed;
            # caching it would hide that on every later run.
            print("Parsing returned no documents; not caching the result.")

    return parsed_data

In [6]:
import chardet

def convert_to_utf8(file_path):
    """
    Convert the given file to UTF-8 encoding if it's not already in that format.

    The raw bytes are first decoded strictly as UTF-8 (a leading BOM is dropped).
    Only when that fails is chardet asked to guess the encoding, so a file that
    is already valid UTF-8 can never be re-decoded with a wrongly guessed codec.
    If the guessed codec is unknown to Python or cannot decode the bytes, the
    data is decoded as UTF-8 with undecodable bytes replaced by U+FFFD.

    Args:
        file_path: Path of the file to rewrite in place as UTF-8.
    """
    with open(file_path, 'rb') as f:
        raw_data = f.read()

    try:
        # 'utf-8-sig' behaves like strict UTF-8 but also strips a BOM if present.
        text = raw_data.decode('utf-8-sig')
        encoding = 'utf-8'
    except UnicodeDecodeError:
        text = None
        # Not valid UTF-8: fall back to statistical detection.
        result = chardet.detect(raw_data)
        encoding = result['encoding'] or 'utf-8'  # Default to utf-8 if detection fails

    print(f"Detected encoding: {encoding}")

    if text is None:
        try:
            text = raw_data.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            # The guess was wrong or names a codec Python does not have;
            # keep going rather than aborting the whole pipeline.
            print(f"Could not decode as {encoding}; replacing undecodable bytes")
            text = raw_data.decode('utf-8', errors='replace')

    # newline='' writes line endings exactly as decoded instead of translating
    # '\n' to the platform separator (which would turn '\r\n' into '\r\r\n').
    with open(file_path, 'w', encoding='utf-8', newline='') as f:
        f.write(text)

    print(f"File converted to UTF-8 successfully: {file_path}")

# convert_to_utf8("data/output.md")

In [9]:
def create_vector_database(persist_directory="chroma_db_llamaparse1", collection_name="rag"):
    """Export the parsed documents to markdown, chunk them, and build the Chroma store.

    Steps:
      1. Get the parsed documents from ``load_or_parse_data``.
      2. Write each document's ``text`` to ``./data/output.md`` (overwriting any
         previous export) and normalise the file with ``convert_to_utf8``.
      3. Load the markdown with ``UnstructuredMarkdownLoader`` and split it into
         chunks of 2000 characters with 100 characters of overlap.
      4. Create the FastEmbed embedding model.
      5. If ``persist_directory`` does not exist yet, embed the chunks into a
         Chroma collection persisted there; an existing directory is left alone
         so re-running the cell does not insert the same chunks again.

    Args:
        persist_directory: Directory of the persisted Chroma store.
        collection_name: Name of the Chroma collection to create.

    Returns:
        The ``FastEmbedEmbeddings`` instance, so callers can re-open the store
        with the same embedding function.
    """
    # Call the function to either load or parse the data
    llama_parse_documents = load_or_parse_data()

    markdown_path = "./data/output.md"

    # Write mode, not append: appending made every re-run add another full copy
    # of the corpus to the file (and therefore duplicate chunks downstream).
    with open(markdown_path, 'w', encoding='utf-8', errors='ignore') as f:
        for doc in llama_parse_documents:
            f.write(doc.text + '\n')

    convert_to_utf8(markdown_path)
    print("utf conversion done")

    loader = UnstructuredMarkdownLoader(markdown_path)
    documents = loader.load()

    # Split loaded documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
    docs = text_splitter.split_documents(documents)

    # Initialize Embeddings
    embed_model = FastEmbedEmbeddings(model_name="BAAI/bge-base-en-v1.5")

    if os.path.isdir(persist_directory):
        # NOTE(review): an existing directory is assumed to hold a populated
        # collection; delete it to force a rebuild after the source PDF changes.
        print(f"Vector DB already present at {persist_directory!r}; reusing it")
    else:
        # Create and persist a Chroma vector database from the chunked documents
        Chroma.from_documents(
            documents=docs,
            embedding=embed_model,
            persist_directory=persist_directory,
            collection_name=collection_name,
        )
        print('Vector DB created successfully !')

    return embed_model

In [10]:
embed_model = create_vector_database()

loading the data
Detected encoding: utf-8
File converted to UTF-8 successfully: data/output.md
utf conversion done


  from .autonotebook import tqdm as notebook_tqdm
Fetching 5 files: 100%|██████████| 5/5 [00:00<?, ?it/s]


Vector DB created successfully !


In [11]:
# OpenAI chat model (temperature 0) shared by the chains built further down.
llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)

In [12]:
# Open the "rag" collection stored under chroma_db_llamaparse1, using the
# embedding model returned by create_vector_database().
vectorstore = Chroma(
    collection_name="rag",
    persist_directory="chroma_db_llamaparse1",
    embedding_function=embed_model,
)

# search_kwargs k=1: ask the store for a single match per query.
retriever = vectorstore.as_retriever(search_kwargs={'k': 1})

  vectorstore = Chroma(embedding_function=embed_model,


In [13]:
# Prompt text wrapped by set_custom_prompt(). It exposes exactly two
# placeholders, {context} and {question}; there is no placeholder for prior
# conversation turns.
custom_prompt_template = """Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below in a complete sentence and nothing else.
format the answers in bullets wherever required and prettify the text. Mention all the nested points that are truly required.
Helpful answer:
"""

In [14]:
def set_custom_prompt():
    """Wrap ``custom_prompt_template`` in a PromptTemplate.

    Returns:
        A PromptTemplate built from ``custom_prompt_template`` whose declared
        input variables are ``context`` and ``question``.
    """
    return PromptTemplate(
        input_variables=['context', 'question'],
        template=custom_prompt_template,
    )

prompt = set_custom_prompt()
prompt


PromptTemplate(input_variables=['context', 'question'], template="Use the following pieces of information to answer the user's question.\nIf you don't know the answer, just say that you don't know, don't try to make up an answer.\n\nContext: {context}\nQuestion: {question}\n\nOnly return the helpful answer below in a complete sentence and nothing else.\nformat the answers in bullets wherever required and prettify the text. Mention all the nested points that are truly required.\nHelpful answer:\n")

In [17]:
# Conversation buffer exposed to the chain under the 'chat_history' key,
# returned as message objects rather than one concatenated string.
memory = ConversationBufferMemory(return_messages=True, memory_key='chat_history')

# Conversational retrieval chain: same LLM and retriever as above, with the
# custom prompt handed to the combine-documents step.
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    combine_docs_chain_kwargs={'prompt': prompt},
)

In [23]:
# Use the chain's .invoke() entry point (as the RetrievalQA cells in this
# notebook do) instead of the deprecated direct call on the chain object.
response = conversation_chain.invoke({
    "question": "what documents are required to carry on the date of joining?"
})

In [24]:
print(response['answer'])

The candidate is expected to carry the following documents on the date of joining:
- Certificates supporting academic/professional qualifications
  - SSC/10th Class - Certificate along with the mark sheets
  - Intermediate/12th Standard - Certificate along with the mark sheets
  - Bachelors Degree - Certificates along with the semester/year wise mark sheets
  - Masters / Professional Degree - Certificates along with the semester/year wise mark sheets
  - Diploma / PG Diploma - Certificate along with the mark sheets
  - Any other relevant academic certificates
- Latest salary payslip / Salary Certificate
- Relieving letter from last employer
- Service Certificate / Proof of Employment from present & all previous employers
- Form 16 (OR) Taxable Income Statement duly certified by the previous employer
- Statement showing the deductions & Taxable Income with break-up
- 6 recent passport colour photographs & 1 stamp size colour photograph
- Valid Passport: Photocopy of valid passport inclu

In [25]:
response

{'question': 'what documents are required to carry on the date of joining?',
 'chat_history': [HumanMessage(content='how are you?'),
  AIMessage(content="I don't know."),
  HumanMessage(content='what is the dress code of an organization?'),
  AIMessage(content='The dress code of the organization is as follows:\n- Men:\n  - Monday to Thursday: Formals with Tie\n  - Friday: Smart casuals\n  - Saturday: Casuals\n- Women:\n  - Monday to Friday: Formals'),
  HumanMessage(content='what documents are required to carry on the date of joining?'),
  AIMessage(content='The candidate is expected to carry the following documents on the date of joining:\n- Certificates supporting academic/professional qualifications\n  - SSC/10th Class - Certificate along with the mark sheets\n  - Intermediate/12th Standard - Certificate along with the mark sheets\n  - Bachelors Degree - Certificates along with the semester/year wise mark sheets\n  - Masters / Professional Degree - Certificates along with the semest

In [33]:
# With return_source_documents=True the chain produces two outputs ('result'
# and 'source_documents'), so the memory has to be told which one to store;
# without output_key it cannot pick and saving the turn fails.
# RetrievalQA also gets its own buffer instead of sharing the 'chat_history'
# memory that belongs to conversation_chain, so the two histories don't mix.
# NOTE(review): the prompt has no history placeholder, so these stored turns
# are returned with the response but are not shown to the LLM.
qa_memory = ConversationBufferMemory(memory_key='history',
                                     input_key='query',
                                     output_key='result',
                                     return_messages=True)

qa = RetrievalQA.from_chain_type(llm=llm,
                               chain_type="stuff",
                               retriever=retriever,
                               return_source_documents=True,
                               memory=qa_memory,
                               chain_type_kwargs={"prompt": prompt})

In [35]:
response = qa.invoke({
    "query": "what is the dress code of an organization?"
    })

In [36]:
response

{'query': 'what is the dress code of an organization?',
 'history': [HumanMessage(content='what is the grade of assistant manager?'),
  AIMessage(content='- The grade of Assistant Manager is 4.'),
  HumanMessage(content='what is the dress code of an organization?'),
  AIMessage(content='The dress code for men in the organization is:\n- Monday to Thursday: Formals with Tie\n- Friday: Smart casuals\n- Saturday: Casuals\n\nThe dress code for women in the organization is:\n- Monday to Friday: Formals')],
 'result': 'The dress code for men in the organization is:\n- Monday to Thursday: Formals with Tie\n- Friday: Smart casuals\n- Saturday: Casuals\n\nThe dress code for women in the organization is:\n- Monday to Friday: Formals',
 'source_documents': [Document(metadata={'source': './data/output.md'}, page_content='Index\n\n9.0 Working Hours / Days\n\nAll employees will follow the normal working hours as follows:\n\nHO - Monday to Friday - 9:00 am to 6:00 pm\n\nOther Officer - Monday to Frida

In [169]:
print(response['result'])

- The grade of Assistant Manager is 4.


In [198]:
response = qa.invoke({
    "query": "what is the dress code of an organization?"
    })

In [199]:
response

{'query': 'what is the dress code of an organization?',
 'result': 'The dress code for men in the organization is:\n- Monday to Thursday: Formals with Tie\n- Friday: Smart casuals\n- Saturday: Casuals\n\nThe dress code for women in the organization is:\n- Monday to Friday: Formals',
 'source_documents': [Document(metadata={'source': './data/output.md'}, page_content='Index\n\n9.0 Working Hours / Days\n\nAll employees will follow the normal working hours as follows:\n\nHO - Monday to Friday - 9:00 am to 6:00 pm\n\nOther Officer - Monday to Friday - 9:00 am to 6:00 pm, Saturday ‚Äì 9:00 am to 1:30 pm\n\nThe working hours are designed to meet and exceed customer requirements. Employees may be required to extend working hours accordingly.\n\nIndex\n\n10.0 Dress Code\n\nThe following dress code has been prescribed for all employees of the company:\n\nMen\n\nMonday to Thursday ‚Äì Formals with Tie\n\nFriday ‚Äì Smart casuals\n\nSaturday ‚Äì Casuals\n\nWomen\n\nMonday to Friday ‚Äì Formals\n\

In [200]:
print(response['result'])

The dress code for men in the organization is:
- Monday to Thursday: Formals with Tie
- Friday: Smart casuals
- Saturday: Casuals

The dress code for women in the organization is:
- Monday to Friday: Formals


In [201]:
response = qa.invoke({
    "query": "and what do women wear on saturday?"
    })

In [202]:
response

{'query': 'and what do women wear on saturday?',
 'result': '- Women wear casuals on Saturdays.',
 'source_documents': [Document(metadata={'source': './data/output.md'}, page_content='Index\n\n9.0 Working Hours / Days\n\nAll employees will follow the normal working hours as follows:\n\nHO - Monday to Friday - 9:00 am to 6:00 pm\n\nOther Officer - Monday to Friday - 9:00 am to 6:00 pm, Saturday ‚Äì 9:00 am to 1:30 pm\n\nThe working hours are designed to meet and exceed customer requirements. Employees may be required to extend working hours accordingly.\n\nIndex\n\n10.0 Dress Code\n\nThe following dress code has been prescribed for all employees of the company:\n\nMen\n\nMonday to Thursday ‚Äì Formals with Tie\n\nFriday ‚Äì Smart casuals\n\nSaturday ‚Äì Casuals\n\nWomen\n\nMonday to Friday ‚Äì Formals\n\nThis document is a proprietary information of KFSL and should not be reproduced or altered without requisite permissions.\n\nConfidential\n\nPage 18 of 28\n\nHR Policy Manual - Karvy Fi

In [203]:
print(response['result'])

- Women wear casuals on Saturdays.


In [204]:
response = qa.invoke({
    "query": "what documents are required to carry on the date of joining?"
    })

In [205]:
response

{'query': 'what documents are required to carry on the date of joining?',
 'result': 'On the date of joining, the candidate is expected to carry the following documents:\n- Certificates supporting academic/professional qualifications:\n  - SSC/10th Class - Certificate along with the mark sheets\n  - Intermediate/12th Standard - Certificate along with the mark sheets\n  - Bachelors Degree - Certificates along with the semester/year wise mark sheets\n  - Masters / Professional Degree - Certificates along with the semester/year wise mark sheets\n  - Diploma / PG Diploma - Certificate along with the mark sheets\n  - Any other relevant academic certificates\n- Latest salary payslip / Salary Certificate\n- Relieving letter from last employer\n- Service Certificate / Proof of Employment from present & all previous employers\n- Form 16 (OR) Taxable Income Statement duly certified by the previous employer\n- Statement showing the deductions & Taxable Income with break-up\n- 6 recent passport co

In [206]:
print(response['result'])

On the date of joining, the candidate is expected to carry the following documents:
- Certificates supporting academic/professional qualifications:
  - SSC/10th Class - Certificate along with the mark sheets
  - Intermediate/12th Standard - Certificate along with the mark sheets
  - Bachelors Degree - Certificates along with the semester/year wise mark sheets
  - Masters / Professional Degree - Certificates along with the semester/year wise mark sheets
  - Diploma / PG Diploma - Certificate along with the mark sheets
  - Any other relevant academic certificates
- Latest salary payslip / Salary Certificate
- Relieving letter from last employer
- Service Certificate / Proof of Employment from present & all previous employers
- Form 16 (OR) Taxable Income Statement duly certified by the previous employer
- Statement showing the deductions & Taxable Income with break-up
- 6 recent passport colour photographs & 1 stamp size colour photograph
- Valid Passport:
  - Photocopy of valid passport

In [207]:
response = qa.invoke({
    "query": "what is the notice period of manager on probation and confirmation?"
    })

In [208]:
response

{'query': 'what is the notice period of manager on probation and confirmation?',
 'result': '- The notice period for a manager on probation is 15 days.\n- The notice period for a manager on confirmation is 30 days.',
 'source_documents': [Document(metadata={'source': './data/output.md'}, page_content='Step 4: Based on the discussion, the supervisor needs to inform HR either on confirmation of services or extension of probation for the appraisee. Extension can be granted for a span of 1 to 3 months depending on the discussion of the supervisor, skip level manager, and HR. In case of extension of probation, the supervisor should share a performance improvement plan with the appraisee along with defined timelines for the achievement of the same. Whichever is the case, probation extension and confirmations have to be finally approved by the CEO.\n\nStep 5: Confirmation/probation extension letter has to be issued to the appraisee. In case a probation extension letter is issued, it has to be

In [209]:
print(response['result'])

- The notice period for a manager on probation is 15 days.
- The notice period for a manager on confirmation is 30 days.


In [210]:
# NOTE(review): this is phrased as a follow-up to the notice-period question,
# but the prompt used by `qa` only has {context} and {question} placeholders,
# so the query is presumably retrieved and answered on its own -- confirm
# whether that is the behaviour being demonstrated here.
follow_up = {"query": "for associates?"}
response = qa.invoke(follow_up)

In [211]:
response

{'query': 'for associates?',
 'result': 'The interview process for Associate Trainees includes three tiers of interviewers: \n- Tier 1 & 2: CEO and KFSL HR\n- Tier 3: VP level vertical head and KFSL HR',
 'source_documents': [Document(metadata={'source': './data/output.md'}, page_content='Recruitment ñ Associate Trainee Program\n\nObjective\n\nTo provide business with resources for specific short-term projects simultaneously providing exposure to students who have to undertake projects as a part of their curriculum.\n\nScope\n\nAll trainees hired.\n\nProcess\n\nIdentification and grading of management institutes has to be an ongoing exercise. Based on this exercise, KFSL shall embark on identification of institutes from where associate trainees can be hired.\n\nThe plan should contain the following:\n\nProject guide\n\nTitle of the project\n\nDepartment in which the project will be undertaken\n\nStipend payable (if any)\n\nDate of start and tenure of the project\n\nOn approval of this 

In [212]:
print(response['result'])

The interview process for Associate Trainees includes three tiers of interviewers: 
- Tier 1 & 2: CEO and KFSL HR
- Tier 3: VP level vertical head and KFSL HR
