# Loading the file

In [1]:
import fitz

def load_pdf(pdf_file_path):
    """Extract the text of every page of a PDF into a single string.

    Args:
        pdf_file_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        The text of all pages in page order. Each page's text is prefixed
        with a newline and the pages are joined with another newline, so
        consecutive pages are separated by a blank line.
    """
    # Open the document as a context manager so it is closed even when text
    # extraction raises; previously the document was never closed.
    with fitz.open(pdf_file_path) as doc:
        page_texts = ["\n" + page.get_text() for page in doc]

    return "\n".join(page_texts)

In [2]:
# NOTE: absolute, machine-specific path; point it at your own PDF before running.
docs = load_pdf("C:/Users/vasan/Downloads/resume.pdf")

In [3]:
docs

'\nVasanth\nProfile:\nDetail-oriented and innovative Data Scientist with a specialization in \nNatural Language Processing (NLP) and Large Language Models (LLM). Excels \nin leveraging advanced machine learning techniques to derive actionable \ninsights from textual data. Passionate about pushing the boundaries of NLP \nto solve real-world problems and drive business value.\nProfessional Experience:\nSenior Data Scientist\nTechGenius AI, Silicon Valley\nJuly 2020 - Present\nLed a team of data scientists in developing cutting-edge NLP models, \nresulting in a X% improvement in accuracy over previous solutions.\nImplemented sentiment analysis and entity recognition algorithms to \nextract key insights from customer feedback, driving product improvements.\nCollaborated with software engineers to deploy NLP models into production \nsystems, ensuring scalability and real-time performance.\nConducted research into novel approaches to language modeling, including \ntransformer-based architect

# Splitter

In [7]:
def chunk_documents(text, chunk_size, overlap):
    """Split ``text`` into fixed-size character windows with optional overlap.

    Consecutive windows start ``chunk_size - overlap`` characters apart, so
    each window repeats the last ``overlap`` characters of the previous one.
    The final windows may be shorter than ``chunk_size``.

    Examples, using the text "Vasanth is saying":
        chunk_size=5, overlap=0 ->
            ["Vasan", "th is", " sayi", "ng"]
        chunk_size=5, overlap=3 ->
            ["Vasan", "santh", "nth i", "h is ", "is sa", " sayi",
             "aying", "ing", "g"]

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        List of chunk strings in order; empty when ``text`` is empty.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # overlap >= chunk_size would make the step zero or negative, which
    # previously caused an endless loop; a negative overlap would silently
    # skip text between chunks.
    if overlap < 0 or overlap >= chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    # Slicing past the end of the string is clamped, so no explicit min() is needed.
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]

In [10]:
# Split the extracted text into chunks of up to 500 characters; each chunk starts 450 characters after the previous one (50-character overlap).
chunks = chunk_documents(docs, 500, 50)

In [11]:
chunks

['\nVasanth\nProfile:\nDetail-oriented and innovative Data Scientist with a specialization in \nNatural Language Processing (NLP) and Large Language Models (LLM). Excels \nin leveraging advanced machine learning techniques to derive actionable \ninsights from textual data. Passionate about pushing the boundaries of NLP \nto solve real-world problems and drive business value.\nProfessional Experience:\nSenior Data Scientist\nTechGenius AI, Silicon Valley\nJuly 2020 - Present\nLed a team of data scientists in d',
 ' 2020 - Present\nLed a team of data scientists in developing cutting-edge NLP models, \nresulting in a X% improvement in accuracy over previous solutions.\nImplemented sentiment analysis and entity recognition algorithms to \nextract key insights from customer feedback, driving product improvements.\nCollaborated with software engineers to deploy NLP models into production \nsystems, ensuring scalability and real-time performance.\nConducted research into novel approaches to l

# Simple Vector DB

In [14]:
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import torch

# Vector store pipeline: compute an embedding for each chunk, collect them in a {text: embedding} dict, and save that dict to disk with torch.save.

def store_embeddings_as_dict(embeddings_list, text_list):
    """Build a mapping from each text to the embedding at the same position.

    Args:
        embeddings_list: Embeddings, ordered like ``text_list``.
        text_list: Texts used as dictionary keys.

    Returns:
        Dict of ``text -> embedding``. Pairing stops at the shorter input,
        and a repeated text keeps the embedding of its last occurrence.
    """
    return dict(zip(text_list, embeddings_list))

def save_embeddings_to_pt(embeddings_dict, filename):
    """Serialize ``embeddings_dict`` to ``filename`` using ``torch.save``.

    Args:
        embeddings_dict: The object to persist (here a text -> embedding dict).
        filename: Destination handed to ``torch.save``.
    """
    torch.save(obj=embeddings_dict, f=filename)

def create_vectorstore(embed_model, chunks, embeddings_path="embeddings.pt"):
    """Embed every chunk and persist a ``{chunk_text: embedding}`` dict to disk.

    Args:
        embed_model: Model name or path passed to ``SentenceTransformer``.
        chunks: Iterable of text chunks to embed.
        embeddings_path: Destination file for the saved dictionary. Defaults
            to ``"embeddings.pt"``, the location that was previously fixed.

    Returns:
        None. The result is the file written at ``embeddings_path``.
    """
    embedder = SentenceTransformer(embed_model)
    # Each embedding is moved to the CPU before it is stored, so the saved
    # file contains CPU tensors regardless of the device used for encoding.
    chunk_embeddings = [
        embedder.encode(chunk, convert_to_tensor=True).to("cpu")
        for chunk in tqdm(chunks, desc="Vasanth Embeddings...")
    ]
    embeddings_dict = store_embeddings_as_dict(chunk_embeddings, chunks)
    save_embeddings_to_pt(embeddings_dict, embeddings_path)

In [15]:
# Embed all chunks with the all-MiniLM-L6-v2 model and save the text -> embedding dict to embeddings.pt.
create_vectorstore("sentence-transformers/all-MiniLM-L6-v2", chunks)

Vasanth Embeddings...: 100%|██████████| 7/7 [00:00<00:00, 24.50it/s]


# Retrieve top k docs

In [16]:
from sentence_transformers import util

def retrieve_relevant_docs(embed_model, embeddings_path, query, top_k):
    """Return the stored chunks whose embeddings are most similar to ``query``.

    Args:
        embed_model: Model name or path passed to ``SentenceTransformer``.
            NOTE(review): presumably this must be the model that produced
            the stored embeddings; confirm against the caller.
        embeddings_path: File holding a ``{chunk_text: embedding}`` dict
            written with ``torch.save``.
        query: Text to search for.
        top_k: Maximum number of chunks to return; clamped to the number of
            stored chunks.

    Returns:
        List of chunk texts ordered from most to least cosine-similar.
        Empty when nothing is stored or ``top_k`` is less than 1.
    """
    # NOTE(review): torch.load unpickles the file, so only load embedding
    # files from a trusted source.
    # map_location="cpu" keeps every stored tensor on the CPU so it can be
    # compared with the query on any machine.
    embeddings_dict = torch.load(embeddings_path, map_location="cpu")
    chunks = list(embeddings_dict.keys())

    top_k = min(top_k, len(chunks))
    if top_k <= 0:
        # torch.topk rejects a negative k; there is nothing to rank anyway.
        return []

    embedder = SentenceTransformer(embed_model)
    # The encoder may run on a GPU; move the query to the CPU so it lives on
    # the same device as the stored embeddings.
    query_encoded = embedder.encode(query, convert_to_tensor=True).to("cpu")

    # Score all chunks in one call instead of one cos_sim call per chunk.
    chunk_matrix = torch.stack([torch.as_tensor(embeddings_dict[chunk]) for chunk in chunks])
    scores = util.cos_sim(query_encoded, chunk_matrix)[0]

    top_k_chunk_indices = torch.topk(scores, k=top_k).indices.tolist()
    return [chunks[i] for i in top_k_chunk_indices]

In [17]:
# Retrieve the 3 stored chunks most similar to the question from embeddings.pt.
top_k_docs = retrieve_relevant_docs("sentence-transformers/all-MiniLM-L6-v2", "embeddings.pt", "What are the skills specified", 3)

Computing Similarity...: 100%|██████████| 7/7 [00:00<00:00, 6992.17it/s]


In [18]:
top_k_docs

['l as part of a document analysis pipeline, automating the \nextraction of key information for legal professionals.\nCertifications:\nDeep Learning Specialization (Coursera)\nNatural Language Processing with Deep Learning (Stanford University)\nLanguages:\nFluent in English and Spanish\n',
 '\nVasanth\nProfile:\nDetail-oriented and innovative Data Scientist with a specialization in \nNatural Language Processing (NLP) and Large Language Models (LLM). Excels \nin leveraging advanced machine learning techniques to derive actionable \ninsights from textual data. Passionate about pushing the boundaries of NLP \nto solve real-world problems and drive business value.\nProfessional Experience:\nSenior Data Scientist\nTechGenius AI, Silicon Valley\nJuly 2020 - Present\nLed a team of data scientists in d',
 'ds, informing business strategy.\nContributed to the development of automated text summarization algorithms, \nreducing manual effort and improving efficiency.\nEducation:\nMaster of Scienc

# Stuff docs

In [19]:
def stuff_docs(relevant_chunks):
    """Concatenate chunks into one string, labelling each with its 1-based rank.

    Args:
        relevant_chunks: Iterable of chunk values to combine.

    Returns:
        A single string in which chunk ``n`` appears as ``"Document<n>: <chunk>"``
        followed by a blank line; empty string when no chunks are given.
    """
    labelled_parts = []
    for position, chunk in enumerate(relevant_chunks, start=1):
        labelled_parts.append(f"Document{position}: {chunk}\n\n")
    return "".join(labelled_parts)

In [20]:
# Merge the retrieved chunks into one labelled context string; the bare name below displays it.
stuffed_doc = stuff_docs(top_k_docs)
stuffed_doc

'Document1: l as part of a document analysis pipeline, automating the \nextraction of key information for legal professionals.\nCertifications:\nDeep Learning Specialization (Coursera)\nNatural Language Processing with Deep Learning (Stanford University)\nLanguages:\nFluent in English and Spanish\n\n\nDocument2: \nVasanth\nProfile:\nDetail-oriented and innovative Data Scientist with a specialization in \nNatural Language Processing (NLP) and Large Language Models (LLM). Excels \nin leveraging advanced machine learning techniques to derive actionable \ninsights from textual data. Passionate about pushing the boundaries of NLP \nto solve real-world problems and drive business value.\nProfessional Experience:\nSenior Data Scientist\nTechGenius AI, Silicon Valley\nJuly 2020 - Present\nLed a team of data scientists in d\n\nDocument3: ds, informing business strategy.\nContributed to the development of automated text summarization algorithms, \nreducing manual effort and improving efficiency.

# Create prompt

In [21]:
# Prompt template for the LLM; qa() fills the {context} and {question}
# placeholders with str.format.
prompt = """Answer the question based on the given context alone.
context: {context}
question: {question}
answer:"""

# Answer with LLM

In [22]:
import os
from groq import Groq
def qa(llm_name, context, question):
    """Ask a Groq-hosted chat model to answer ``question`` from ``context``.

    The module-level ``prompt`` template is filled with the context and the
    question, then sent as a single user message.

    Args:
        llm_name: Model identifier passed as ``model`` to the chat completion.
        context: Text the answer should be based on. It is interpolated with
            ``str.format``, so a non-string is inserted as its ``str()`` form.
        question: The question to answer.

    Returns:
        The message content of the first completion choice.

    Raises:
        RuntimeError: If the ``GROQ_API_KEY`` environment variable is not set.
    """
    # The key is read from the environment instead of being embedded in the
    # notebook. Any key that was ever committed in source should be revoked.
    api_key = os.environ.get("GROQ_API_KEY")
    if not api_key:
        raise RuntimeError("Set the GROQ_API_KEY environment variable before calling qa().")

    llm_inp = prompt.format(context=context, question=question)
    client = Groq(api_key=api_key)
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": llm_inp,
            }
        ],
        model=llm_name,
    )

    return chat_completion.choices[0].message.content

In [23]:
# Pass the stuffed context string, not the raw list of chunks: a list would be
# interpolated into the prompt as its Python repr (brackets, quotes, escaped newlines).
qa("llama3-70b-8192", stuffed_doc, "What are the skills specified")

'According to the context, the skills specified are:\n\n* Programming Languages: Python, R, SQL\n* Machine Learning Libraries: TensorFlow, PyTorch, scikit-learn\n* NLP Frameworks: NLTK, spaCy, Transformers\n* Data Visualization: Matplotlib, Seaborn, Tableau'