In [17]:
import os 
import pdfplumber
import pytesseract
import re
import spacy
import torch

In [18]:
from PIL import Image
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

In [19]:
nlp = spacy.load("en_core_web_sm")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

# Extract Text

In [20]:
def extract_text(file_path:str)->str:
    """
    extract text from file(PDF or Image)
    Parameters:
        file_path(str): Path to file. Supported formats:  "pdf", "jpg", "jpeg", "png".
    Returns:
        str: extracted text form the file.
    Raises:
        ValueError: when the file type is unsupported
    """
    text = ""
    _,file_extension = os.path.splitext(file_path)
    file_extension= file_extension.lower()
    if file_extension in [".pdf"]:
        try:
            with pdfplumber.open(file_path) as pdf:
                for page in pdf.pages:
                    text+= page.extract_text()
        except Exception as e:
            return f"Error in parsing the pdf: {e}"
    elif file_extension in [".jpg", ".jpeg", ".png"]:
        try:
            image = Image.open(file_path)
            text = pytesseract.image_to_string(image)
        except Exception as e:
            return f"Error in parsing in image: {e}"
    else:
        raise ValueError("unsupported file type. Please use 'pdf', 'jpg', 'jpeg' or 'png'")
    return text.strip()

# Pre-Process Text

In [27]:
def clean_text(text:str)->str:
    """
    Cleans the extracted text including:
     -Removing extra spaces and new Lines.
     -Handling common ocr errors (e.g., 'ﬁ' to 'fi').
     -Normalizing punctuation.
    Parameters:
        text(str): Text to be cleaned.
    Returns:
        str: Cleaned text.
    """
    text = re.sub(r'\s+', ' ', text) # replace multiple spaces with single space
    text = re.sub(r'[^\x00-\x7F]+', '', text) # remove non-ASCII characters
    text = re.sub(r'ﬁ', 'fi', text) # common ocr mistake
    return text
def segment_into_sentences(text: str)->list[str]:
    """
    Segments cleaned text into individual sentences
    Parameters:
        text(str): Cleaned text to be segmented
    Returns:
        list[str]: list of sentences extracted from the text.
    """
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents]
    return sentences
def preProcess_chunk_text(text:str, chunk_size = 20)->list[list[str]]:
    """
    Divide the sentences from the cleaned text into manageable chunks. Each chunk contain sentences, and number of words in each chunk does not exceed 
    specified chunk_size
    Paramters:
        text(str): parsed text obtained from the file that is to be chunked.
        chunk_size(int): Maximum number of words per chunk (Default: 20).
    Returns:
        list[list[str]]: list of chunks, where each chunk contains sentences having number of words is less than or equal to chunk_size.|
        
    """
    text = clean_text(text)
    sentences= segment_into_sentences(text)
    
    chunks = []
    chunk = []
    word_count = 0
    for sentence in sentences: 
        words = sentence.split()
        word_count+= len(words)
        if word_count>chunk_size:
            word_count = len(words)
            chunks.append(" ".join(chunk))
            chunk = []
        chunk.append(sentence)
    if chunk:
        chunks.append(' '.join(chunk))
    return chunks
    

# Form Embedding

In [None]:
def extract_keywords(text: str):
    """
    extracts the key phases and noun from the question.
    Parametes:
        text(str): text form which keywords will be extracted
    Returns: 
        List[str]: a list of key phrases.
    """
    doc = nlp(question)
    keywords = [token.text.lower() for token in doc if token.pos_ in ['NOUN', 'PROPN', 'ADJ']]
    return keywords
    
def get_sentence_embedding(text:str):
    """
    Convert a sentence into its dense embedding usng RoBERTa.
    Parameters:
        text(str): the text to be converted into embedding.
    Returns
        embedding: the dense vector representing embedding.
    """
    inputs = tokenizer(text, return_tensors = "pt", padding = True, truncation = True, max_length = 512)
    with torch.no_grad():
        outputs = model(**inputs)
    hidden_states = outputs.last_hidden_state
    embedding = hidden_states.mean(dim = 1).squeeze()
    return embedding

In [52]:
def get_most_relevant_sentences(question: str, resume_chunks: list[str], top_k = 7)->list[str]:
    """
    find the most relevant sentences in the resume for a given question

    Parameters: 
        questions(str): the user's query.
        resume_chunks(list): list of resume text chunks
        top_k: maximum number of relvant sentences required(default 7)
    Returns:
        List[str]: the list of most relevant chunks
    """
    keywords = extract_keywords(question)
    question_embedding = get_sentence_embedding(question)
    chunk_embeddings = [get_sentence_embedding(chunk) for chunk in resume_chunks]

    similarities = cosine_similarity([question_embedding.numpy()], [chunks.numpy() for chunks in chunk_embeddings])

    keyword_boost = []
    for chunk, sim in zip(resume_chunks, similarities[0]):
        chunk_keywords = extract_keywords(chunk)
        match_count = len(set(keywords) & set(chunk_keywords))
        boosted_sim = sim + match_count*0.1
        keyword_boost.append((boosted_sim, chunk))

    keyword_boost.sort(key = lambda x:x[0], reverse = True)
    top_k_chunks = [chunk for _, chunk in keyword_boost[:top_k]]

    return top_k_chunks

# Testing

In [53]:
resume_chunks

['PULKIT RAWAT UG (IV Year I Semester) Email: rpulkit610@gmail.com B.Tech.',
 '(Civil Engineering) p_rawat@ce.iitr.ac.in Contact No: 6261360983 linkedin.com/in/pulkit-rawat/ AREAS OF INTEREST Data Analytics, Data Science, Natural Language Processing, Data Structure and Algorithm, Gymnasium WORK EXPERIENCE Data Scientist , BuildWealth Technologies Pvt Ltd Dec 2023 - Apr 2023 Leveraged customer support tickets data from freshdesk to built a chatbot that answered user queries Integrated an RAG model with GPT to generate responses from both text and images, enhancing accuracy.',
 'Analyzed fund merits, user purchase patterns and risk profiles to develop a robust fund suggestion model.',
 'Utilized the GPT model to generate clear & concise response comparing the suggested with prior selections.',
 'PERSONAL PROJECTS May 2024 - July 2024 Enhancement of haptic alert systems for distracted pedestrians Engineered a smart wearable device that vibrates based on vehicle proximity, alerting pedestr

In [56]:
text  = extract_text("D:\Resumes\off_campus.pdf")
resume_chunks = preProcess_chunk_text(text) 
question = "what are my AREAS OF INTEREST"
top_relevant_chunks = get_most_relevant_sentences(question, resume_chunks, 10)
print(top_relevant_chunks)

['PULKIT RAWAT UG (IV Year I Semester) Email: rpulkit610@gmail.com B.Tech.', 'PERSONAL PROJECTS May 2024 - July 2024 Enhancement of haptic alert systems for distracted pedestrians Engineered a smart wearable device that vibrates based on vehicle proximity, alerting pedestrians of traffic.', '(Civil Engineering) p_rawat@ce.iitr.ac.in Contact No: 6261360983 linkedin.com/in/pulkit-rawat/ AREAS OF INTEREST Data Analytics, Data Science, Natural Language Processing, Data Structure and Algorithm, Gymnasium WORK EXPERIENCE Data Scientist , BuildWealth Technologies Pvt Ltd Dec 2023 - Apr 2023 Leveraged customer support tickets data from freshdesk to built a chatbot that answered user queries Integrated an RAG model with GPT to generate responses from both text and images, enhancing accuracy.', 'Developed anomaly detection techniques to accurately identify irregular energy consumption patterns.', 'Extra curriculars: Head Research Analyst | Appetizer, Executive Member | STC', 'Analyzed fund merit