In [2]:
import gensim
from gensim import corpora
from gensim.models import LdaModel
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import pdfplumber as pdp
import fitz 
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

In [3]:
def pdf_to_text(pdf_path):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``pdp.open``.

    Returns:
        The page texts concatenated in page order (no separator is inserted
        between pages). Pages that yield no text contribute an empty string.
    """
    with pdp.open(pdf_path) as pdf:
        # extract_text() may return None for a page without a text layer
        # (e.g. a scanned image); `or ""` keeps the concatenation from
        # raising TypeError on such pages.
        return "".join(page.extract_text() or "" for page in pdf.pages)

# Example usage
# A forward slash is used as the separator: the previous "\R" was an invalid
# escape sequence and only resolved as a path on Windows.
pdf_text = pdf_to_text("unlabelled/Resume_2.pdf")

# Print the extracted text
print(pdf_text)

Resume
Name : Nirmal Cardoza
​
Email : nirmal.cardoza@gmail.com
​
Phone : 9972169059 Hyderabad, India
Objective:
To get a Technical Artist role to share and improve my technical and creative skills. And
to contribute those skills to a good and great game.
About me:
I am a passionate gamer wanting to work with a team to make quality games , and eager
to learn and share skills and knowledge. I have a good artistic sense as well as knowledge to
see Art Technically, as I evolved from an 3d artist to Technical Artist. And I love to discuss
games and make prototypes.
Skills:
● Strong experience in Unity Engine.
● Skilled to create Tools, Shaders,VFX and Game Prototypes.
● Scripts/Programing : C#, Python, MEL, C++
● Good understanding of game engines, rendering pipelines, and worked on
Unity NintendoWare,Vicious Engine , Phyre Engine and Cocos2d
● Skilled with Tools for Maya, Rigging, Animation , Fx, and Lighting
● Strong problem solving, troubleshooting skills and habit to keep myself learni

In [13]:

# The tokenizer and the encoder are loaded from the same checkpoint name.
checkpoint_name = 'sentence-transformers/all-mpnet-base-v2'
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModel.from_pretrained(checkpoint_name)

# Mean pooling: average the token vectors, ignoring masked-out positions
def mean_pooling(model_output, attention_mask):
    """Return the attention-mask-weighted mean of the last hidden states.

    Args:
        model_output: Object exposing a ``last_hidden_state`` tensor.
        attention_mask: Tensor that, after gaining a trailing axis, is
            expanded to the size of ``last_hidden_state``; positions with 0
            do not contribute to the average.

    Returns:
        Tensor holding the masked sum over axis 1 divided by the mask count
        (the count is clamped to at least 1e-9 to avoid division by zero).
    """
    hidden_states = model_output.last_hidden_state
    # Broadcast the mask over the hidden dimension so it can weight each token vector.
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_total = (hidden_states * weights).sum(dim=1)
    token_count = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_total / token_count

# Compute cosine similarity
def cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity of two embeddings.

    Args:
        embedding1: Array-like accepted by ``np.dot`` as the left operand.
        embedding2: Array-like accepted by ``np.dot`` as the right operand.

    Returns:
        ``np.dot(embedding1, embedding2)`` divided by the product of the two
        norms; the result has the shape of that dot product. When either
        input has zero norm the similarity is undefined, so zeros of the
        same shape are returned instead of NaN.
    """
    dot_product = np.dot(embedding1, embedding2)
    norm_product = np.linalg.norm(embedding1) * np.linalg.norm(embedding2)
    if norm_product == 0:
        # 0/0 would yield NaN (plus a RuntimeWarning) and poison any later sort.
        return np.zeros(np.shape(dot_product))
    return dot_product / norm_product

# (exclusive rank cutoff, outline RGB, fill RGB) for the highlighted tokens:
# ranks 0-2, 3-5 and 6-9 each get their own box style.
_HIGHLIGHT_TIERS = (
    (3, (1, 0.647, 0), (1, 0.9, 0.7)),
    (6, (1, 1, 0), (1, 1, 0.9)),
    (10, (1, 1, 0), (1, 1, 0.8)),
)


def update_highlighting(file_path, output_path, query_text, verbose=True):
    """Box the words on each PDF page that are most similar to a query.

    For every page, each distinct lower-cased word that is not itself one of
    the query's words is embedded with the module-level ``tokenizer`` /
    ``model`` / ``mean_pooling`` and scored against the query embedding with
    ``cosine_similarity``. The ten best-scoring words of the page are outlined
    with rectangles styled according to ``_HIGHLIGHT_TIERS``, and the
    annotated document is written to ``output_path``.

    Args:
        file_path: Path of the PDF to read.
        output_path: Path the annotated PDF is saved to.
        query_text: Free-text query the page words are compared with.
        verbose: When true (the default), print every ``(token, score)``
            pair of each page in descending score order.

    Returns:
        None. The result is the file written to ``output_path``.
    """
    query_words = set(query_text.lower().split())

    # The query embedding does not depend on the page, so it is computed once.
    query_input = tokenizer(query_text, return_tensors='pt', truncation=True)
    with torch.no_grad():
        query_output = model(**query_input)
    query_embedding = mean_pooling(query_output, query_input['attention_mask']).numpy()

    max_rank = _HIGHLIGHT_TIERS[-1][0]

    # The context manager closes the document even if a page fails mid-way.
    with fitz.open(file_path) as doc:
        for page in doc:
            page.clean_contents()

            # Map each distinct lower-cased word to the rectangles of its
            # occurrences. Using the per-word rectangles (instead of a text
            # search) avoids boxing substrings of unrelated, longer words.
            word_rects = {}
            for x0, y0, x1, y1, word, *_ in page.get_text("words"):
                token = word.lower()
                if token not in query_words:
                    word_rects.setdefault(token, []).append(fitz.Rect(x0, y0, x1, y1))

            # Blank page, or nothing left after removing the query words:
            # the tokenizer cannot encode an empty batch.
            if not word_rects:
                continue
            context_tokens = list(word_rects)

            # Compute embeddings for context tokens
            context_input = tokenizer(context_tokens, return_tensors='pt', padding=True, truncation=True)
            with torch.no_grad():
                context_output = model(**context_input)
            context_embeddings = mean_pooling(context_output, context_input['attention_mask'])

            # Compute similarity between query and context tokens
            token_similarity_scores = {
                token: cosine_similarity(query_embedding, context_embeddings[i].numpy())
                for i, token in enumerate(context_tokens)
            }
            sorted_tokens = sorted(token_similarity_scores.items(), key=lambda x: x[1], reverse=True)

            if verbose:
                for item in sorted_tokens:
                    print(item)

            for rank, (token, _similarity) in enumerate(sorted_tokens[:max_rank]):
                # rank < max_rank, so one tier always matches and the loop breaks.
                for cutoff, outline, fill in _HIGHLIGHT_TIERS:
                    if rank < cutoff:
                        break
                for rect in word_rects[token]:
                    page.draw_rect(rect, color=outline, fill=fill, width=1.5, overlay=False)

        doc.save(output_path)

# Example query texts to try (one per line):
# Open Source
# AI 

# Three dimensional modeling
# Creativity

# Office work
# Discounts

# Innovation
# Leadership experience


# Sample resume file stems (presumably PDFs under demo/ - confirm):
# harshibars-resume
# linkedin_resume_original
# modular-resume
# sales-resume
# A forward slash is used as the separator: the previous "\h" was an invalid
# escape sequence and only resolved as a path on Windows.
input_file = "demo/harshibars-resume.pdf"  # Please pass the path to the input PDF
output_file = "out_put_resume2.pdf"  # To be decided by you
query_text = "AI"  # Your query text here

update_highlighting(input_file, output_file, query_text)

output_file


('algorithms,', array([0.54972625], dtype=float32))
('machine', array([0.48757645], dtype=float32))
('python,', array([0.47610655], dtype=float32))
('python', array([0.46388483], dtype=float32))
('software', array([0.4595851], dtype=float32))
('arts', array([0.44606185], dtype=float32))
('tech', array([0.43967134], dtype=float32))
('computer', array([0.42757732], dtype=float32))
('learning', array([0.42133367], dtype=float32))
('engineer', array([0.4185858], dtype=float32))
('ui', array([0.4172641], dtype=float32))
('skills', array([0.415241], dtype=float32))
('automatically', array([0.4147169], dtype=float32))
('systems,', array([0.4126276], dtype=float32))
('engineering', array([0.40697452], dtype=float32))
('script', array([0.4044734], dtype=float32))
('javascript', array([0.40233168], dtype=float32))
('java', array([0.3971898], dtype=float32))
('analytics,', array([0.39395627], dtype=float32))
('languages', array([0.37655282], dtype=float32))
('tools', array([0.35563278], dtype=flo

'out_put_resume2.pdf'