In [1]:
import os 
import pdfplumber
import pytesseract
import re
import spacy
import torch

In [2]:
from PIL import Image
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
nlp = spacy.load("en_core_web_sm")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

# Extract Text

In [4]:
def extract_text(file_path:str)->str:
    """
    extract text from file(PDF or Image)
    Parameters:
        file_path(str): Path to file. Supported formats:  "pdf", "jpg", "jpeg", "png".
    Returns:
        str: extracted text form the file.
    Raises:
        ValueError: when the file type is unsupported
    """
    text = ""
    _,file_extension = os.path.splitext(file_path)
    file_extension= file_extension.lower()
    if file_extension in [".pdf"]:
        try:
            with pdfplumber.open(file_path) as pdf:
                for page in pdf.pages:
                    text+= page.extract_text()
        except Exception as e:
            return f"Error in parsing the pdf: {e}"
    elif file_extension in [".jpg", ".jpeg", ".png"]:
        try:
            image = Image.open(file_path)
            text = pytesseract.image_to_string(image)
        except Exception as e:
            return f"Error in parsing in image: {e}"
    else:
        raise ValueError("unsupported file type. Please use 'pdf', 'jpg', 'jpeg' or 'png'")
    return text.strip()

# Pre-Process Text

In [151]:
def clean_text(text:str)->str:
    """
    Cleans the extracted text including:
     -Removing extra spaces and new Lines.
     -Handling common ocr errors (e.g., 'ﬁ' to 'fi').
     -Normalizing punctuation.
    Parameters:
        text(str): Text to be cleaned.
    Returns:
        str: Cleaned text.
    """
    text = re.sub(r'\s+', ' ', text) # replace multiple spaces with single space
    text = re.sub(r'[^\x00-\x7F]+', '', text) # remove non-ASCII characters
    text = re.sub(r'ﬁ', 'fi', text) # common ocr mistake
    return text
def segment_into_sentences(text: str)->list[str]:
    """
    Segments cleaned text into individual sentences
    Parameters:
        text(str): Cleaned text to be segmented
    Returns:
        list[str]: list of sentences extracted from the text.
    """
    sentence_endings = re.compile(r'(\.|\n|\t)')
    sentences = sentence_endings.split(text)
    sentences = [s.strip() for s in sentences if s.strip()]
    
    return sentences
def preProcess_chunk_text(text: str, chunk_size=10) -> list[list[str]]:
    """
    Divide the sentences from the cleaned text into manageable chunks. Each chunk contains sentences, and the number of words in each chunk does not exceed 
    the specified chunk_size. Additionally, chunks are divided at line breaks (\n) to ensure logical chunking.

    Parameters:
        text (str): Parsed text obtained from the file that is to be chunked.
        chunk_size (int): Maximum number of words per chunk (Default: 20).

    Returns:
        list[list[str]]: List of chunks, where each chunk contains sentences with a word count less than or equal to chunk_size.
    """
    text = clean_text(text)
    sentences = segment_into_sentences(text)

    chunks = []
    chunk = []
    word_count = 0
    
    for sentence in sentences:
        if(sentence == "."):
            continue
        chunk.append(sentence)
        word_count += len(sentence.split())
        if(word_count>chunk_size):
            chunks.append(' '.join(chunk))
            chunk = []
            word_count = 0
            
    if chunk:
        chunks.append(' '.join(chunk))

    return chunks
    

# Form Embedding

In [125]:
model_path = "BERT_FineTuned_Model2"
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertModel.from_pretrained(model_path)

In [138]:
def extract_keywords(text: str):
    """
    extracts the key phases and noun from the question.
    Parametes:
        text(str): text form which keywords will be extracted
    Returns: 
        List[str]: a list of key phrases.
    """
    doc = nlp(text)
    keywords = [token.text.lower() for token in doc if token.pos_ in ['NOUN', 'PROPN', 'ADJ']]
    return keywords
    
def get_sentence_embedding(text:str):
    """
    Convert a sentence into its dense embedding usng RoBERTa.
    Parameters:
        text(str): the text to be converted into embedding.
    Returns
        embedding: the dense vector representing embedding.
    """
    inputs = tokenizer(text, return_tensors = "pt", padding = True, truncation = True, max_length = 512)
    with torch.no_grad():
        outputs = model(**inputs)
    hidden_states = outputs.last_hidden_state
    embedding = hidden_states.mean(dim = 1).squeeze()
    return embedding

In [162]:
def get_most_relevant_sentences(question: str, resume_chunks: list[str], top_k = 7)->list[str]:
    """
    find the most relevant sentences in the resume for a given question

    Parameters: 
        questions(str): the user's query.
        resume_chunks(list): list of resume text chunks
        top_k: maximum number of relvant sentences required(default 7)
    Returns:
        List[str]: the list of most relevant chunks
    """
    keywords = extract_keywords(question)
    question_embedding = get_sentence_embedding(question).numpy()
    chunk_embeddings = [get_sentence_embedding(chunk).numpy() for chunk in resume_chunks]

    chunk_embeddings = [embedding.flatten() for embedding in chunk_embeddings]
    similarities = cosine_similarity([question_embedding], chunk_embeddings)

    keyword_boost = []
    for chunk, sim in zip(resume_chunks, similarities[0]):
        chunk_keywords = extract_keywords(chunk)
        match_count = len([chunk_keyword for chunk_keyword in chunk_keywords if chunk_keyword in keywords])
        boosted_sim = sim + match_count*0.54
        keyword_boost.append((boosted_sim, chunk))
    keyword_boost.sort(key = lambda x:x[0], reverse = True)
    top_k_chunks = [chunk for _, chunk in keyword_boost[:top_k]]

    return top_k_chunks

# Testing

In [163]:
text  = extract_text("D:\Resumes\off_campus.pdf")
resume_chunks = preProcess_chunk_text(text) 
question = "what are my soft skills."
top_relevant_chunks = get_most_relevant_sentences(question, resume_chunks, 10)
top_relevant_chunks

['EDUCATION B Tech in Civil Engineering 2021 - present Indian Institute Technology Roorkee ADDITIONAL INFORMATION Computer Languages: Python, C++, SQL Soft skills: Problem Solving, Leadership, Collaborative Additional Courses: DeepLearning',
 'Recommended targeted corrective actions to mitigate consumption spikes & optimize consumption efficiency',
 'Utilized the GPT model to generate clear & concise response comparing the suggested with prior selections',
 'Optimized response generation by assigning weights to sentences based on their alignment with the question',
 'Extra curriculars: Head Research Analyst | Appetizer, Executive Member | STC',
 'Enhanced BERT model, fine-tuned on a dataset of QnA pairs; generating response with weighted relevance',
 'Analyzed fund merits, user purchase patterns and risk profiles to develop a robust fund suggestion model',
 'Developed a user-friendly interface with Bootstrap and Flask, enabling seamless user interaction with system Predictive Optimizat