In [1]:
pip install chromaDB

Collecting chromaDB
  Downloading chromadb-0.6.3-py3-none-any.whl.metadata (6.8 kB)
Collecting build>=1.0.3 (from chromaDB)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromaDB)
  Downloading chroma_hnswlib-0.7.6-cp311-cp311-win_amd64.whl.metadata (262 bytes)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromaDB)
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromaDB)
  Downloading posthog-3.21.0-py2.py3-none-any.whl.metadata (2.9 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromaDB)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.31.0-py3-none-any.whl.metadata (2.5 kB)
Collecting opentelemetry-instrumentation-fastapi>=0.41b0 (from chromaDB)
  Downloading opentelemetry_instrumentation_fastapi-0.52b0-py3-none-any.whl.metadata (2.2 kB)
Collecting opentelemetry-sdk>=1.2.0 (from chromaDB)
  Downloading opentelemetry_sdk-1.31.0-py3-none

  You can safely remove it manually.
  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-api-core 2.17.1 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0.dev0,>=3.19.5, but you have protobuf 5.29.4 which is incompatible.
streamlit 1.33.0 requires protobuf<5,>=3.20, but you have protobuf 5.29.4 which is incompatible.
streamlit 1.33.0 requires tenacity<9,>=8.1.0, but you have tenacity 9.0.0 which is incompatible.


In [11]:
import os 
import pdfplumber
import pytesseract
import re
import spacy
import torch
import chromadb
import numpy as np
import uuid

In [4]:
from PIL import Image
from transformers import BertTokenizer, BertModel, BertForQuestionAnswering
from sklearn.metrics.pairwise import cosine_similarity

In [15]:
# spaCy pipeline used by extract_keywords for part-of-speech tagging.
nlp = spacy.load("en_core_web_sm")
# On-disk Chroma client; data is persisted under ./chroma_db relative to the working directory.
chroma_client = chromadb.PersistentClient(path = "./chroma_db")
# Collection that holds the resume chunk embeddings (created on first use, reused afterwards).
collection = chroma_client.get_or_create_collection(name='resume_embeddings')

# Extract Text

In [6]:
def extract_text(file_path:str)->str:
    """
    Extract the text content of a file (PDF or image).

    PDFs are read page by page with pdfplumber; images are run through
    pytesseract OCR.

    Parameters:
        file_path(str): Path to file. Supported formats: "pdf", "jpg", "jpeg", "png"
            (the extension check is case-insensitive).
    Returns:
        str: Extracted text with leading/trailing whitespace stripped. If the file
            cannot be parsed, a message starting with "Error in parsing" is
            returned instead of an exception being raised.
    Raises:
        ValueError: when the file type is unsupported
    """
    _, file_extension = os.path.splitext(file_path)
    file_extension = file_extension.lower()

    if file_extension == ".pdf":
        try:
            with pdfplumber.open(file_path) as pdf:
                # extract_text() may yield None/"" for pages without a text layer;
                # treat those as empty instead of failing the whole document.
                page_texts = [page.extract_text() or "" for page in pdf.pages]
        except Exception as e:
            return f"Error in parsing the pdf: {e}"
        # Separate pages with a newline so the last word of one page is not
        # glued to the first word of the next.
        text = "\n".join(page_text for page_text in page_texts if page_text)
    elif file_extension in (".jpg", ".jpeg", ".png"):
        try:
            # Context manager closes the underlying file handle after OCR.
            with Image.open(file_path) as image:
                text = pytesseract.image_to_string(image)
        except Exception as e:
            return f"Error in parsing in image: {e}"
    else:
        raise ValueError("unsupported file type. Please use 'pdf', 'jpg', 'jpeg' or 'png'")
    return text.strip()

# Pre-Process Text

In [7]:
def clean_text(text:str)->str:
    """
    Cleans the extracted text by:
     - Expanding typographic ligatures (e.g. 'ﬁ' to 'fi'), a common PDF/OCR artefact.
     - Collapsing every run of whitespace (spaces, tabs, new lines) into one space.
     - Dropping any remaining non-ASCII characters.
    Parameters:
        text(str): Text to be cleaned.
    Returns:
        str: Cleaned text.
    """
    # Ligatures must be expanded BEFORE the non-ASCII strip below, otherwise
    # they are deleted outright and words like "proficient" lose letters.
    ligatures = {
        "\ufb00": "ff",
        "\ufb01": "fi",
        "\ufb02": "fl",
        "\ufb03": "ffi",
        "\ufb04": "ffl",
    }
    for glyph, replacement in ligatures.items():
        text = text.replace(glyph, replacement)

    text = re.sub(r'\s+', ' ', text) # replace runs of whitespace with a single space
    text = re.sub(r'[^\x00-\x7F]+', '', text) # remove remaining non-ASCII characters
    return text
def segment_into_sentences(text: str)->list[str]:
    """
    Segments text into individual sentences.

    The text is split on periods, new lines and tabs; each piece is stripped
    and empty pieces are discarded. The delimiters themselves are not returned.

    Parameters:
        text(str): Cleaned text to be segmented
    Returns:
        list[str]: list of sentences extracted from the text.
    """
    # A character class (no capturing group) keeps the delimiters out of the
    # result; with a capturing group re.split would return "." as a "sentence".
    sentence_endings = re.compile(r'[.\n\t]')
    pieces = sentence_endings.split(text)
    return [piece.strip() for piece in pieces if piece.strip()]
def preProcess_chunk_text(text: str, chunk_size=10) -> list[str]:
    """
    Clean the text, split it into sentences and group the sentences into chunks.

    Sentences are appended to the current chunk until its running word count
    exceeds chunk_size; the chunk is then closed and a new one is started.
    Any sentences left over at the end form a final, possibly shorter, chunk.

    Parameters:
        text (str): Parsed text obtained from the file that is to be chunked.
        chunk_size (int): Word-count threshold a chunk must exceed before it
            is closed (Default: 10).

    Returns:
        list[str]: List of chunks, each chunk being its sentences joined by a
            single space. Every chunk except possibly the last contains more
            than chunk_size words.
    """
    text = clean_text(text)
    sentences = segment_into_sentences(text)

    chunks = []
    current_sentences = []
    word_count = 0

    for sentence in sentences:
        # Skip bare delimiter tokens in case the segmenter returns them.
        if sentence == ".":
            continue
        current_sentences.append(sentence)
        word_count += len(sentence.split())
        if word_count > chunk_size:
            chunks.append(' '.join(current_sentences))
            current_sentences = []
            word_count = 0

    # Flush whatever is left, even if it is below the threshold.
    if current_sentences:
        chunks.append(' '.join(current_sentences))

    return chunks
    

# Form Embedding

In [8]:
# Location of the BERT checkpoint used for sentence embeddings.
# NOTE(review): presumably a local directory holding a fine-tuned model — confirm it exists next to the notebook.
model_path = "BERT_FineTuned_Model2"
# Tokenizer and encoder are loaded from the same checkpoint.
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertModel.from_pretrained(model_path)

In [118]:
def extract_keywords(text: str):
    """
    Pick the content words out of a piece of text.

    The text is run through the spaCy pipeline `nlp`; tokens tagged as noun,
    proper noun, adjective or verb are kept, lower-cased, in document order.

    Parameters:
        text(str): text from which keywords will be extracted
    Returns:
        List[str]: lower-cased keywords (duplicates are preserved).
    """
    wanted_pos = ('NOUN', 'PROPN', 'ADJ', 'VERB')
    found = []
    for token in nlp(text):
        if token.pos_ in wanted_pos:
            found.append(token.text.lower())
    return found
    
def get_sentence_embedding(text:str):
    """
    Turn a piece of text into a dense vector with the module-level
    `tokenizer` / `model` pair.

    The text is tokenized (truncated to at most 512 tokens), passed through
    the model without gradient tracking, and the last hidden states are
    averaged over the token dimension.

    Parameters:
        text(str): the text to be converted into embedding.
    Returns
        embedding: tensor holding the mean of the last hidden states, with
            size-1 dimensions squeezed out.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        model_output = model(**encoded)
    token_vectors = model_output.last_hidden_state
    # Mean-pool across tokens (dim 1), then drop the singleton batch dimension.
    return token_vectors.mean(dim=1).squeeze()

In [119]:
def store_embeddings(text_chunks:list[str]):
    """
    Embed every text chunk and write it to the Chroma `collection`.

    Each chunk is stored as its own record with a freshly generated UUID as
    id, its embedding as a plain list, and the chunk text under the 'text'
    metadata key.

    Parameters:
        text_chunks(list[str]): list of extracted text chunks.
    Returns:
        None(Stores the embedding in chromaDB)
    """
    for piece in text_chunks:
        vector = get_sentence_embedding(piece).tolist()
        record = {
            "ids": [str(uuid.uuid4())],
            "embeddings": [vector],
            "metadatas": [{'text': piece}],
        }
        # One add() call per chunk, each with single-element lists.
        collection.add(**record)
    print("text_chunk was stored as embedding")

In [120]:
def get_most_relevant_sentences(question: str, top_k = 7, keyword_weight: float = 0.54, verbose: bool = True)->list[str]:
    """
    Find the stored resume chunks most relevant to a question.

    Candidates are fetched from the Chroma `collection` by embedding
    similarity, then re-ranked with a keyword bonus: every keyword of a chunk
    that also appears among the question's keywords adds `keyword_weight` to
    the chunk's score.

    Parameters:
        question(str): the user's query.
        top_k: maximum number of relevant chunks to return (default 7).
        keyword_weight(float): score bonus per matching keyword (default 0.54).
        verbose(bool): when True (default) print the retrieved candidates and
            the per-chunk keyword diagnostics.
    Returns:
        List[str]: up to top_k chunk texts, best score first. Empty when the
            collection holds no records or top_k is not positive.
    """
    if top_k <= 0:
        return []

    stored_count = collection.count()
    if stored_count == 0:
        return []

    question_keywords = set(extract_keywords(question))
    # Plain list, the same representation store_embeddings writes.
    question_embedding = get_sentence_embedding(question).tolist()

    # Over-fetch 5x candidates for re-ranking, but never ask for more records
    # than the collection holds (Chroma otherwise warns and lowers n_results).
    n_candidates = max(1, min(int(top_k * 5), stored_count))
    results = collection.query(
        query_embeddings = [question_embedding],
        n_results = n_candidates
    )

    retrieved_chunks = results['metadatas'][0]
    distances = results['distances'][0]
    if verbose:
        print("Retrieved Chunks:")
        for i, chunk in enumerate(retrieved_chunks):
            print(f"{i+1}. {chunk['text']} (Distance: {distances[i]})")

    scored_chunks = []
    for chunk_meta, distance in zip(retrieved_chunks, distances):
        chunk = chunk_meta['text']
        chunk_keywords = extract_keywords(chunk)
        # Repeated keywords in the chunk count once per occurrence.
        match_count = sum(1 for keyword in chunk_keywords if keyword in question_keywords)
        if verbose:
            print(chunk)
            print(chunk_keywords)
            print(match_count)

        # Map distance to a similarity that shrinks as distance grows.
        similarity = 1/(1+distance)
        scored_chunks.append((similarity + match_count*keyword_weight, chunk))

    # Stable sort: ties keep the order in which Chroma returned them.
    scored_chunks.sort(key = lambda pair: pair[0], reverse = True)
    return [chunk for _, chunk in scored_chunks[:top_k]]

# Model Development

In [121]:
# Extractive question-answering model: identifier of a BERT-large checkpoint fine-tuned on SQuAD.
qa_model_path = "bert-large-uncased-whole-word-masking-finetuned-squad"
# Tokenizer and QA head are loaded from the same checkpoint.
qa_tokenizer = BertTokenizer.from_pretrained(qa_model_path)
qa_model = BertForQuestionAnswering.from_pretrained(qa_model_path)

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [122]:
def generate_answer(question: str, relevant_sentences:list[str])->str:
    """
    Extract an answer span for the question from the most relevant sentences.

    The sentences are joined into one context, encoded together with the
    question (truncated to 512 tokens) and passed to the extractive QA model.
    The answer is the token span from the highest-scoring start position to
    the highest-scoring end position at or after that start.

    Parameters:
        question(str):The user's query.
        relevant_sentences(list[str]): list of most relevant sentences.

    Returns:
        str: The extracted answer text, or an empty string when no context
            sentences are supplied.
    """
    context = " ".join(relevant_sentences)
    if not context.strip():
        # Nothing to extract an answer from.
        return ""

    inputs = qa_tokenizer(question, context, return_tensors='pt', truncation = True, max_length = 512)
    with torch.no_grad():
        output = qa_model(**inputs)

    # Batch size is 1, so take the single row of logits.
    start_scores = output.start_logits[0]
    end_scores = output.end_logits[0]

    start_index = int(torch.argmax(start_scores))
    # Only consider end positions at or after the start: two independent
    # argmaxes can put the end before the start and yield an empty answer.
    end_index = start_index + int(torch.argmax(end_scores[start_index:])) + 1

    answer_ids = inputs["input_ids"][0][start_index:end_index]
    answer = qa_tokenizer.convert_tokens_to_string(qa_tokenizer.convert_ids_to_tokens(answer_ids))

    return answer.strip()

# Testing

In [123]:
# Use a literal question here: the `question` variable is only assigned in a
# later cell, so referencing it would fail on a fresh kernel (Restart & Run All).
print(extract_keywords("what are the skills?"))

['skills']


In [124]:
# End-to-end run: parse the resume, chunk it, rebuild the vector store, then answer one question.
# Raw string: in a normal literal "\R" and "\o" are invalid escape sequences
# (deprecated, and a SyntaxWarning on newer Python versions).
text  = extract_text(r"D:\Resumes\off_campus.pdf")
resume_chunks = preProcess_chunk_text(text)
# Start from an empty collection so embeddings from earlier runs do not accumulate.
chroma_client.delete_collection(name="resume_embeddings")
collection = chroma_client.get_or_create_collection(name="resume_embeddings")  # Recreate it
print("✅ Collection deleted and recreated!")
store_embeddings(resume_chunks)
question = "what are the skills?"
top_relevant_sentences = get_most_relevant_sentences(question)
answer = generate_answer(question, top_relevant_sentences)
answer

✅ Collection deleted and recreated!


Number of requested results 35 is greater than number of elements in index 17, updating n_results = 17


text_chunk was stored as embedding
Retrieved Chunks:
1. Utilized the GPT model to generate clear & concise response comparing the suggested with prior selections (Distance: 224.90476619830622)
2. Optimized response generation by assigning weights to sentences based on their alignment with the question (Distance: 226.94681803478935)
3. Recommended targeted corrective actions to mitigate consumption spikes & optimize consumption efficiency (Distance: 238.03595262031405)
4. Developed anomaly detection techniques to accurately identify irregular energy consumption patterns (Distance: 281.3277128566909)
5. Developed a user-friendly interface with Bootstrap and Flask, enabling seamless user interaction with system Predictive Optimization and Anomaly Detection in Energy Consumption Apr 2024 - May 2024 Used Facebook Prophet to forecast energy consumption trends by analyzing weather and household factors (Distance: 284.5439654890591)
6. Analyzed fund merits, user purchase patterns and risk prof

'problem solving , leadership , collaborative'

In [126]:
top_relevant_sentences


['EDUCATION B Tech in Civil Engineering 2021 - present Indian Institute Technology Roorkee ADDITIONAL INFORMATION Computer Languages: Python, C++, SQL Soft skills: Problem Solving, Leadership, Collaborative Additional Courses: DeepLearning',
 'Utilized the GPT model to generate clear & concise response comparing the suggested with prior selections',
 'Optimized response generation by assigning weights to sentences based on their alignment with the question',
 'Recommended targeted corrective actions to mitigate consumption spikes & optimize consumption efficiency',
 'Developed anomaly detection techniques to accurately identify irregular energy consumption patterns',
 'Developed a user-friendly interface with Bootstrap and Flask, enabling seamless user interaction with system Predictive Optimization and Anomaly Detection in Energy Consumption Apr 2024 - May 2024 Used Facebook Prophet to forecast energy consumption trends by analyzing weather and household factors',
 'Analyzed fund meri