In [None]:
import os
import openai
from langchain.document_loaders import PyPDFLoader
import re
import json
import nltk

In [None]:
from dotenv import load_dotenv
# Pull variables from a local .env file into the process environment before any key is read.
load_dotenv()
# The openai module reads its credential from `api_key`; assigning to `openai.key`
# only created an unused attribute and left the module-level client unconfigured.
openai.api_key = os.environ["OPENAI_API_KEY"]

### Document Loading

In [None]:
# Read the resume PDF from a path relative to the notebook's working directory.
loader = PyPDFLoader('documents/Resume_Divya_Prasanth_Paraman.pdf')
# `pages` is a list of Documents; the rendered outputs show each one carrying
# `source` and `page` metadata.
pages = loader.load()

In [None]:
# Peek at the raw text extracted from the first page.
# NOTE(review): the rendered output contains personal contact details (e-mail, phone);
# clear or redact it before sharing the notebook.
pages[0].page_content

'DIVYA PRASANTH PARAMAN\nBloomington, Indiana. 47404\ndparaman@iu.edu ⋄(812)·974·3423⋄LinkedIn: Divya Prasanth Paraman ⋄GitHub\nEDUCATION\nLuddy School of Informatics, Computing, and Engineering, Indiana University Bloomington May 2025\nMaster of Science in Data Science CGPA: 3.95/4\nKumaraguru College of Technology, Coimbatore, India\nBachelor of Engineering in Electronics and Communication Engineering\nPROFESSIONAL EXPERIENCE\nIndiana University May 2024 - August 2024\nFADS Research Fellow - NLP - Python, PyTorch, TensorFlow, Dockers Bloomington, IN\n·Leveraged OpenAI’s Whisper for speech-to-text analysis, implemented speaker identification using Praat/n-gram models, and\nperformed NLP analysis on vowel sounds via MFCC, resulting in an 80% reduction in manual data annotation; developed\nmicroservices to streamline the entire process.\nBounteous India September 2021 - July 2023\nSenior Data Analyst - Python, Tableau, SQL, Hadoop, ETL, Apache Airﬂow, Supervised Machine Learning Chennai

### Document Splitting

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Case-insensitive regexes, one per common resume section name.
section_keywords = [
    re.compile(section_name, re.IGNORECASE)
    for section_name in (
        "experience",
        "summary",
        "education",
        "projects",
        "skills",
        "achievements",
    )
]

In [None]:
# Split text into chunks of at most 500 characters (measured with len) and no
# overlap, trying progressively finer separators: blank line, newline,
# sentence break, space, and finally single characters.
doc_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=0,
    # Separators are matched literally unless is_separator_regex=True is passed,
    # so the sentence break is written as ". ". The previous "\. " was an invalid
    # escape sequence and searched for a literal backslash that never occurs.
    separators=["\n\n", "\n", ". ", " ", ""],
    length_function=len
)

In [None]:
# Chunk every loaded page with the recursive character splitter.
# NOTE(review): `docs` is reassigned by the header-based split in a later cell,
# so these chunks are exploratory only.
docs = doc_splitter.split_documents(pages)
docs

[Document(metadata={'source': 'documents/Resume_Divya_Prasanth_Paraman.pdf', 'page': 0}, page_content='DIVYA PRASANTH PARAMAN\nBloomington, Indiana. 47404\ndparaman@iu.edu ⋄(812)·974·3423⋄LinkedIn: Divya Prasanth Paraman ⋄GitHub\nEDUCATION\nLuddy School of Informatics, Computing, and Engineering, Indiana University Bloomington May 2025\nMaster of Science in Data Science CGPA: 3.95/4\nKumaraguru College of Technology, Coimbatore, India\nBachelor of Engineering in Electronics and Communication Engineering\nPROFESSIONAL EXPERIENCE\nIndiana University May 2024 - August 2024'),
 Document(metadata={'source': 'documents/Resume_Divya_Prasanth_Paraman.pdf', 'page': 0}, page_content='FADS Research Fellow - NLP - Python, PyTorch, TensorFlow, Dockers Bloomington, IN\n·Leveraged OpenAI’s Whisper for speech-to-text analysis, implemented speaker identification using Praat/n-gram models, and\nperformed NLP analysis on vowel sounds via MFCC, resulting in an 80% reduction in manual data annotation; deve

In [None]:
# Number of chunks produced by the recursive character split.
len(docs)

8

In [None]:
# let us try splitting using markdown header splitter
from langchain.text_splitter import MarkdownHeaderTextSplitter

In [None]:
# (section title as printed in the resume, metadata key) pairs handed to the
# header splitter; insertion order of the mapping is the order of the pairs.
markdown_splitters = list(
    {
        "PROFESSIONAL EXPERIENCE": "experience",
        "EDUCATION": "education",
        "ACADEMIC PROJECTS": "projects",
        "TECHNICAL STRENGTHS": "skills",
    }.items()
)

In [None]:
# Reuse the markdown header splitter with the resume's plain-text section titles
# as the "headers" to split on.
splitter_markdown = MarkdownHeaderTextSplitter(headers_to_split_on=markdown_splitters)

In [None]:
# Split the text of every loaded page rather than only pages[0], so sections on
# later pages of a multi-page resume are not silently dropped. For a single-page
# PDF the joined text is identical to pages[0].page_content.
docs = splitter_markdown.split_text(
    "\n".join(page.page_content for page in pages)
)

In [None]:
# Number of section-level chunks produced by the header-based split.
len(docs)

5

In [None]:
# Collapse each chunk's metadata into a single 'header' list: the first chunk is
# tagged 'contact', every other chunk keeps the section keys assigned by the
# header splitter.
for index, each in enumerate(docs):
    # Skip chunks that were already normalised, so re-running this cell does not
    # overwrite the section names with ['header'].
    if 'header' in each.metadata:
        continue
    each.metadata = {'header': ['contact'] if index==0 else list(each.metadata.keys())}

In [None]:
# Inspect the section chunks together with their normalised 'header' metadata.
docs

[Document(metadata={'header': ['contact']}, page_content='DIVYA PRASANTH PARAMAN\nBloomington, Indiana. 47404\ndparaman@iu.edu ⋄(812)·974·3423⋄LinkedIn: Divya Prasanth Paraman ⋄GitHub'),
 Document(metadata={'header': ['education']}, page_content='Luddy School of Informatics, Computing, and Engineering, Indiana University Bloomington May 2025\nMaster of Science in Data Science CGPA: 3.95/4\nKumaraguru College of Technology, Coimbatore, India\nBachelor of Engineering in Electronics and Communication Engineering'),
 Document(metadata={'header': ['experience']}, page_content='Indiana University May 2024 - August 2024\nFADS Research Fellow - NLP - Python, PyTorch, TensorFlow, Dockers Bloomington, IN\n·Leveraged OpenAI’s Whisper for speech-to-text analysis, implemented speaker identification using Praat/n-gram models, and\nperformed NLP analysis on vowel sounds via MFCC, resulting in an 80% reduction in manual data annotation; developed\nmicroservices to streamline the entire process.\nBount

In [None]:
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore

# Pinecone client authenticated from the PINECONE_API_KEY environment variable
# (os.environ[...] raises KeyError when the variable is unset).
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

In [None]:
# Start from a clean index, but only delete it when it actually exists: on a
# first run the unconditional delete fails because there is nothing to remove.
if "pinecone-index-resume-chatbot" in pc.list_indexes().names():
    pc.delete_index("pinecone-index-resume-chatbot")

In [None]:
# Serverless deployment target for the index.
serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")

# Cosine-similarity index with 1536-dimensional vectors.
# NOTE(review): the dimension has to equal the embedding vector size - presumably
# 1536 for the OpenAIEmbeddings model used later; confirm if the model changes.
pc.create_index(
    name='pinecone-index-resume-chatbot',
    dimension=1536,
    metric='cosine',
    spec=serverless_spec,
)

In [None]:
# Handle to the Pinecone index, used here only to read its statistics.
# NOTE(review): the name `index` is rebound to a FAISS index in a later cell;
# a more specific name would avoid confusion when cells are re-run out of order.
index = pc.Index("pinecone-index-resume-chatbot")
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 5}},
 'total_vector_count': 5}

In [None]:
from langchain_openai import OpenAIEmbeddings
# Embedding client shared by the Pinecone and FAISS vector stores in this
# notebook; the API key is passed explicitly from the environment.
embedding_model = OpenAIEmbeddings(openai_api_key=os.environ["OPENAI_API_KEY"])

In [None]:
# Embed the section chunks and upsert them into the Pinecone index; the returned
# store is what the retrievers and QA chains in this notebook query.
vectorstore = PineconeVectorStore.from_documents(
    documents=docs,
    embedding=embedding_model,
    index_name="pinecone-index-resume-chatbot",
)

### Self-Query Retriever

In [None]:
from langchain.chains import RetrievalQA
from langchain_openai import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever

In [None]:
# NOTE(review): `document_description` is not referenced by any later cell shown here.
document_description = 'resume'
# Completion-style model at temperature 0; used by the self-query retriever and
# by the conversational chain.
llm = OpenAI(model='gpt-3.5-turbo-instruct', temperature=0)

In [None]:
from langchain.chains.query_constructor.base import AttributeInfo

# Describe the metadata the self-query retriever may filter on. The attribute
# name has to match the key actually stored on each chunk, which is 'header'
# (singular); with 'headers' any generated filter pointed at a missing field.
metadata_field_info = [
    AttributeInfo(
        name="header",
        description="A list of headers indicating the topics or sections within the document chunk",
        type="list[string]"
    )
]

In [None]:
# Short description of the stored documents, given to the query-constructing LLM.
document_content_description = "A chat with resume"

# Self-query retriever over the Pinecone store: the LLM turns a question into a
# search string plus an optional metadata filter and result limit.
retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=vectorstore,
    document_contents=document_content_description,
    metadata_field_info=metadata_field_info,
    enable_limit=True,
    verbose=True,
)

In [None]:
# Query through the self-query retriever and display the returned chunks.
retriever.invoke("project he worked on related to recommendation system")

[Document(metadata={'header': ['projects']}, page_content='Movies Recommendation System - Graph Neural Networks, PyTorch, TensorFlow, PySpark May 2024\n·Implemented and trained the state-of-the-art LightGCN model on 100k and 1M datasets using IU’s GPU services to achieve\nparallelization, achieving 25% Recall@10 and 48% Precision@10.\nHouse Price Prediction - Python, Pandas, Scikit-learn, Matplotlib, Seaborn, XGBoost March 2024\n·Developed a model achieving an RMSE of 42K USD and MAE of 15% with proximity to ocean being the number 1 predictor.\nCredit Risk Analysis - Python, Scikit-learn, Seaborn, Matplotlib, XGBoost, RandomForest, SVM December 2023\n·Developed a model addressing class imbalance and achieving test accuracy of 67% and F1 score of 58%.'),
 Document(metadata={'header': ['education']}, page_content='Luddy School of Informatics, Computing, and Engineering, Indiana University Bloomington May 2025\nMaster of Science in Data Science CGPA: 3.95/4\nKumaraguru College of Technolo

### Let's try using the FAISS vector store

In [None]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
import uuid

# Embed a throwaway string only to discover the embedding length, then size a
# flat L2 FAISS index to it.
# NOTE(review): this rebinds `index`, which earlier referred to the Pinecone index.
index = faiss.IndexFlatL2(len(embedding_model.embed_query("hello world")))

In [None]:
# In-memory FAISS store: starts empty, with a fresh docstore and an empty
# index-position -> document-id mapping.
vector_store_faiss = FAISS(
    embedding_function=embedding_model,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [None]:
# One random UUID string per chunk, used as that chunk's id in the FAISS store.
uuids = [str(uuid.uuid4()) for _ in docs]

# Index the chunks; the call's return value (the ids) is shown as the cell output.
vector_store_faiss.add_documents(documents=docs, ids=uuids)

['6862ba30-ca59-4f1a-b286-e569d07c701e',
 'b7b99550-e639-408a-a3f8-61ccf1296ea5',
 'c00efb4f-2fdb-422f-b240-910c10dced76',
 '68af6ba8-53f1-4338-acc4-4b44758219cf',
 '25fa2428-cfb6-4e6c-8b80-95355cd49a75']

In [None]:
# Fetch the single closest chunk from the FAISS store for the query.
results = vector_store_faiss.similarity_search("provide his experience", k=1)

# Print the text of each hit on its own.
for result in results:
    print(result.page_content)

Indiana University May 2024 - August 2024
FADS Research Fellow - NLP - Python, PyTorch, TensorFlow, Dockers Bloomington, IN
·Leveraged OpenAI’s Whisper for speech-to-text analysis, implemented speaker identification using Praat/n-gram models, and
performed NLP analysis on vowel sounds via MFCC, resulting in an 80% reduction in manual data annotation; developed
microservices to streamline the entire process.
Bounteous India September 2021 - July 2023
Senior Data Analyst - Python, Tableau, SQL, Hadoop, ETL, Apache Airﬂow, Supervised Machine Learning Chennai, India
·Freed up about 50% of manual labor hours for higher value tasks by developing an AI model that performs data taxonomy
within the proof of concept project.
·Scaled the business impact report generation across 1000+ customers by completely automating the process by building end-
to-end pipelines and Business Intelligence dashboards achieving a reduction in report development effort by 80%.
·Managed a team of three analytics prof

### Building a QA Chain

In [None]:
# Same model and temperature as `llm`; this separate instance is the one passed
# to the RetrievalQA chain.
llm2 = OpenAI(model='gpt-3.5-turbo-instruct', temperature=0)

In [None]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# Prompt for the QA chain. It has two placeholders: {context} and {question}.
# NOTE(review): presumably the chain fills {context} with the retrieved chunks;
# confirm against the chain type in use.
template = """
Use the following to answer the question at the end. If you don't know the answer, just say that you don't know the answer, don't try to make up an answer. Use three sentences maximum and keep the answer as concise as possible. Always end with "Thanks for asking!".
{context}
Question: {question}
Helpful answer:"""
qa_prompt = PromptTemplate.from_template(template)

In [None]:
# Retrieval QA over the Pinecone-backed `vectorstore` (not the FAISS store or the
# self-query retriever), answering with the custom prompt and returning the
# retrieved chunks alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm2,
    retriever=vectorstore.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": qa_prompt}
)

In [None]:
question = "how many years experience does he have excluding education?"
# Run the chain through .invoke(): calling a chain object directly is deprecated
# in LangChain, and .invoke() is what the retriever cell in this notebook uses.
qa_chain.invoke({"query": question})

{'query': 'how many years experience does he have excluding education?',
 'result': ' Divya Prasanth Paraman has approximately 4 years of experience excluding education. Thanks for asking!',
 'source_documents': [Document(metadata={'header': ['education']}, page_content='Luddy School of Informatics, Computing, and Engineering, Indiana University Bloomington May 2025\nMaster of Science in Data Science CGPA: 3.95/4\nKumaraguru College of Technology, Coimbatore, India\nBachelor of Engineering in Electronics and Communication Engineering'),
  Document(metadata={'header': ['skills']}, page_content='Professional Certifications TensorFlow Developer Certificate\nIndustry Skills Scrum, Agile, Software Development Life Cycle Practices, Git\nAI Frameworks LangChain, TensorFlow, PyTorch\nProgramming Languages Python, SQL, R, C\nDatabases PostgreSQL, MySQL, Hadoop, Hive\nVisualisation Tools Tableau, Power BI, Microsoft Excel, Python(Matplotlib, Seaborn, Plotly)\nStatistical Techniques Hypothesis Te

### Including Memory

In [None]:
from langchain.memory import ConversationBufferMemory

# Conversation buffer exposed under the "chat_history" key, returned as message
# objects rather than one concatenated string.
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True
)

In [None]:
from langchain.chains import ConversationalRetrievalChain

# Conversational chain over the Pinecone-backed retriever; `memory` carries the
# earlier turns so follow-up questions can be asked without resending history.
# NOTE(review): this uses `llm`, not `llm2`; both are configured identically.
qa = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=vectorstore.as_retriever(),
    memory=memory
)

In [None]:
question = "Does he have software engineering experience?"
# Use .invoke() instead of the deprecated direct call on the chain object.
result = qa.invoke({"question": question})
result['answer']

'\n\nBased on the context provided, it appears that Divya Prasanth Paraman has experience in software engineering through his roles at Soliton Technologies and Luddy School of Informatics, Computing, and Engineering. At Soliton Technologies, he worked as a Software Engineer and developed a Python framework that resulted in a 2x acceleration in software development for GUI applications. He also built modular code and enabled continuous integration and continuous deployment practices. As a Software Engineer Intern at Soliton Technologies, he worked on web development and developed a web repository called SMILE that encourages cross-functional team collaboration. Additionally, at Luddy School of Informatics, Computing, and Engineering, he is pursuing a Master of Science in Data Science, which may also involve software engineering coursework and projects.'

In [None]:
# Follow-up question; the chain's memory supplies the previous turn. Use
# .invoke() instead of the deprecated direct call on the chain object.
result = qa.invoke({"question": "What kind of relevant tools has he used?"})
result['answer']

" In his software engineering experience, he has used Python, PyTorch, TensorFlow, Dockers, Praat, n-gram models, MFCC, OpenAI's Whisper, Tableau, SQL, Hadoop, ETL, Apache Airflow, Supervised Machine Learning, Advanced OOPS/Data Structures, SDLC, Agile, CI/CD, Angular, Pandas, Scikit-learn, Matplotlib, Seaborn, XGBoost, and RandomForest."