# Extracting Text

In [1]:
import filetype, io, pytesseract, pdfplumber, docx
from PIL import Image

pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def extract_text(raw_text):   
   if isinstance(raw_text, memoryview):
    raw_text = raw_text.tobytes()  # Convert memoryview to bytes
       
   file_extension = filetype.guess_extension(raw_text)
   
   if file_extension == 'pdf':
         with pdfplumber.open(io.BytesIO(raw_text)) as pdf:
            resume_text = []
            for page in pdf.pages:
               resume_text.append(page.extract_text() or "")
            resume_text = ' '.join(resume_text)
            
      
   elif file_extension == 'docx':
      document = docx.Document(io.BytesIO(raw_text))
      resume_text = ' '.join([para.text for para in document.paragraphs])

   elif file_extension in ['png', 'jpg', 'jpeg']:
       image = Image.open(io.BytesIO(raw_text))
       resume_text = pytesseract.image_to_string(image)
       
   else:
      resume_text = raw_text.decode('utf-8', errors='ignore')
   
   return resume_text    

# Cleaning the text

In [2]:
import re
def clean_text(text):  
 text = re.sub(r'[\u2022\u2023\u25E6\u2043\u2219]', '-', text) #removing common bullets
 text = re.sub(r'[\s+]', ' ', text) #removing any whitespace character
 text = re.sub(r'[^\x00-\x7F]+', ' ', text) #match anything
 return text.strip()

# Sentence Splitting and Embedding

In [3]:
import logging, os, spacy, numpy as np
from sentence_transformers import SentenceTransformer, util
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity

logging.getLogger("transformers").setLevel(logging.ERROR)
logging.getLogger("sentence_transformers").setLevel(logging.ERROR)

model_path = 'models/mpnet_local'

if os.path.exists(model_path):
   model = SentenceTransformer(model_path, device='cpu')

else:
  model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
  model.save(model_path)

nlp = spacy.load('en_core_web_trf')

def sentence_embedding(text):
  doc = nlp(text)
  sentence_list = [sent.text.strip() for sent in doc.sents]
  sentence_vector = model.encode(
                      sentence_list, 
                      output_value='sentence_embedding',
                      convert_to_numpy=True,
                      convert_to_tensor=False,
                      batch_size=24,
                      normalize_embeddings=True,
                      device='cpu'
                    )
  return sentence_vector

def normalized_mean_of_sentence_vector(sentence_vector):
    mean_vector = np.mean(np.array(sentence_vector), axis=0)
    #print(mean_vector.shape)
    norm = np.linalg.norm(mean_vector)
    if norm == 0 or np.isnan(norm):
        return None
    normalized_vector = mean_vector / norm
    #print(np.linalg.norm(normalized_vector))
    return normalized_vector

# Defining Pipeline

In [4]:
def pipeline(raw_text):
    extracted_text = extract_text(raw_text) # extracting text
    if not extracted_text or not extracted_text.strip():
        return None, None
        
    cleaned_text = clean_text(extracted_text) # cleaning text
    if not cleaned_text or not cleaned_text.strip():
        return None, None
        
    sentence_vector = sentence_embedding(cleaned_text) # embedding sentence
    if sentence_vector is None or len(sentence_vector)==0:
        return None, None
        
    normalized_mean_vector = normalized_mean_of_sentence_vector(sentence_vector)
    return normalized_mean_vector, cleaned_text

# Just finding out source of resume

In [5]:
import os, hashlib
base_path = 'CVS'
clean_path = 'Clean_CV'

os.makedirs(base_path, exist_ok=True)
os.makedirs(clean_path, exist_ok=True)

hash_file = os.path.join(clean_path, 'file_hashes.txt')

if os.path.exists(hash_file):
    with open(hash_file, 'r') as f:
        existing_hashes = set(line.strip() for line in f)

else:
    existing_hashes = set()

folder_source = []
for folder in os.listdir(base_path):
    path = os.path.join(base_path, folder)
    if os.path.isdir(path):
        folder_source.append(folder)
folder_source = sorted(folder_source)
folder_source

['ACCOUNTANT',
 'ADVOCATE',
 'AGRICULTURE',
 'APPAREL',
 'ARTS',
 'AUTOMOBILE',
 'AVIATION',
 'BANKING',
 'BPO',
 'BUSINESS-DEVELOPMENT',
 'CHEF',
 'CONSTRUCTION',
 'CONSULTANT',
 'DESIGNER',
 'DIGITAL-MEDIA',
 'ENGINEERING',
 'FINANCE',
 'FITNESS',
 'HEALTHCARE',
 'HR',
 'INFORMATION-TECHNOLOGY',
 'PUBLIC-RELATIONS',
 'SALES',
 'TEACHER']

# Storing the vector

In [6]:
# import os, hashlib

# base_path = 'CVS'
# clean_path = 'Clean_CV'

# os.makedirs(base_path, exist_ok=True)
# os.makedirs(clean_path, exist_ok=True)

# hash_path = os.path.join(clean_path, 'file_hashes.txt')

# existing_hashes = set()
# if os.path.exists(hash_path):
#     with open(hash_path, 'r') as f:
#         for line in f:
#             existing_hashes.add(line.strip())

# for folder in folder_source:
#     path = os.path.join(base_path, folder)
#     save_path = os.path.join(clean_path, folder)
#     os.makedirs(save_path, exist_ok=True)
#     current_folder_vectors = []
#     current_folder_hashes = []
    
#     for file in sorted(os.listdir(path)): 
#         file_path = os.path.join(path, file)
#         with open(file_path, 'rb') as f:
#             text = f.read()
        
#         resume_vector, resume_text = pipeline(text)
#         resume_vector = np.array(resume_vector)
#         if resume_vector is None or resume_vector.size != 768:
#             actual_size = resume_vector.size if hasattr(resume_vector, 'size') else 'Unknown'
#             print(f"Skipped invalid resume (wrong shape or empty) : {file}, size: {actual_size}")
#             continue

#         file_hash = hashlib.sha256(resume_text.encode('utf-8')).hexdigest()
#         if file_hash in existing_hashes:
#             print(f'Skipped duplicate: {file}')
#             continue
            
#         existing_hashes.add(file_hash)   
#         current_folder_hashes.append(file_hash)
#         current_folder_vectors.append(resume_vector)
#         print(len(current_folder_vectors))

#     if current_folder_hashes:
#         with open(hash_path, 'a') as f:
#             f.write('\n'.join(current_folder_hashes) + '\n')
#             f.write(f"{folder}\n")
    
#     if current_folder_vectors:
#         np_vector = np.vstack(current_folder_vectors)
#         mean_vector = np.mean(np_vector, axis=0)
#         norm = np.linalg.norm(mean_vector)
#         if norm == 0 or np.isnan(norm):
#             print('Invalid mean vector')
#             continue
#         mean_vector = mean_vector / norm
#         final_vector_path = os.path.join(save_path, 'mean_vector.npz')
#         np.savez(final_vector_path, mean=mean_vector, all_vectors=np_vector)
#         print(f'Saved both mean and all vectors for {folder} to {final_vector_path}')

# Collecting mean embedding of each domain

In [7]:
import os
import pandas as pd

domain_dict ={}
all_vectors, all_vectors_labels = [], []
clean_path = 'Clean_CV'
folder_source = []
for folder in os.listdir(clean_path):
    path = os.path.join(clean_path, folder)
    if os.path.isdir(path):
        folder_source.append(folder)

folder_source = sorted(folder_source)
for folder in folder_source:
    path = os.path.join(clean_path, folder)
    for file in os.listdir(path):
        if file.endswith('.npz'):
            with np.load(os.path.join(path, file)) as data:
                vector = data['all_vectors']
                all_vectors.extend(vector)
                all_vectors_labels.extend([folder]*vector.shape[0])
                domain_dict[folder] = data['mean']


final_dict = {
    'domain_value': domain_dict,
    'each_vector': np.vstack(all_vectors),
    'each_label': np.array(all_vectors_labels)
}

In [8]:
import pickle
with open('final.pkl', 'wb') as f:
    pickle.dump(final_dict, f)
print('File created')

File created


In [23]:
from IPython.display import display
import ipywidgets as widgets

uploader = widgets.FileUpload(
    accept='.pdf, .docx, .png, .txt, .jpg, .jpeg'
)

display(uploader)

FileUpload(value=(), accept='.pdf, .docx, .png, .txt, .jpg, .jpeg', description='Upload')

In [25]:
if uploader.value:
    file = uploader.value[0]
    print('file uploaded')
else:
    print('file not uploaded')

file uploaded


In [26]:
new_resume_vector, new_resume_text = pipeline(file['content'])
new_resume_text

'MIT-Based Cryptography Roadmap Introduction & Prerequisites This roadmap outlines a structured path to learn cryptography following MIT-level standards. Basic algebra, discrete math, and Python are recommended prerequisites. Phase 1: Foundations (Math   Intro Crypto) (cid:127) Discrete mathematics (cid:127) Number theory (cid:127) Modular arithmetic (cid:127) Basic cryptographic primitives (cid:127) Hashing, symmetric encryption, one-way functions Phase 2: Applied Cryptography (cid:127) AES, RSA, ECC (cid:127) TLS, HTTPS, PKI (cid:127) Digital signatures (cid:127) Secure coding practices (cid:127) Realnworld protocol failures Phase 3: Advanced Theory (cid:127) Zeronknowledge proofs (cid:127) Secure multiparty computation (cid:127) Lattices & postnquantum crypto (cid:127) Complexity theory foundations Phase 4: Research Topics (cid:127) Homomorphic encryption (cid:127) Cryptanalysis (cid:127) Proof-carrying data (cid:127) Cuttingnedge postnquantum systems Recommended Textbooks (cid:127)

In [27]:
if new_resume_vector is None:
    print('Something is wrong')

In [28]:
import pickle
import numpy as np

# Use 'rb' (Read Binary)
with open('final.pkl', 'rb') as f:
    data = pickle.load(f)

In [29]:
from sklearn.metrics.pairwise import cosine_similarity
sim = {'labels': [], 'cosine_sim': []}
for label, vector in data['domain_value'].items():
    sim['labels'].append(label)
    sim['cosine_sim'].append(cosine_similarity(new_resume_vector.reshape(1,-1), vector.reshape(1,-1)))
print(sim['cosine_sim'])

[array([[0.21513158]], dtype=float32), array([[0.18158045]], dtype=float32), array([[0.22961742]], dtype=float32), array([[0.1494559]], dtype=float32), array([[0.23032983]], dtype=float32), array([[0.20712756]], dtype=float32), array([[0.20988353]], dtype=float32), array([[0.21220729]], dtype=float32), array([[0.22544411]], dtype=float32), array([[0.15642521]], dtype=float32), array([[0.11708594]], dtype=float32), array([[0.18746476]], dtype=float32), array([[0.2371529]], dtype=float32), array([[0.21156885]], dtype=float32), array([[0.19402468]], dtype=float32), array([[0.2592247]], dtype=float32), array([[0.20530562]], dtype=float32), array([[0.16421136]], dtype=float32), array([[0.17343241]], dtype=float32), array([[0.1751014]], dtype=float32), array([[0.29557395]], dtype=float32), array([[0.17048492]], dtype=float32), array([[0.13869867]], dtype=float32), array([[0.22874132]], dtype=float32)]


In [30]:
# max_vector = max(sim['cosine_sim'])
# i=0
# for vec in sim['cosine_sim']:
#     if vec == max_vector:
#         break
#     i+=1
# print(i)
# label = sim['labels'][i]
# label

max_index = sim['cosine_sim'].index(max(sim['cosine_sim']))
match_label = sim['labels'][max_index]
print(max(sim['cosine_sim']))
match_label
a = [2,4,5,3,5,1]
max(a)

[[0.29557395]]


5

In [31]:
import pandas as pd

df = pd.DataFrame(sim)
max_row = df.loc[df['cosine_sim'].idxmax()]
print(f'Best match: {max_row['labels']} with similarity {max_row['cosine_sim']}')
df

Best match: INFORMATION-TECHNOLOGY with similarity [[0.29557395]]


Unnamed: 0,labels,cosine_sim
0,ACCOUNTANT,[[0.21513158]]
1,ADVOCATE,[[0.18158045]]
2,AGRICULTURE,[[0.22961742]]
3,APPAREL,[[0.1494559]]
4,ARTS,[[0.23032983]]
5,AUTOMOBILE,[[0.20712756]]
6,AVIATION,[[0.20988353]]
7,BANKING,[[0.21220729]]
8,BPO,[[0.22544411]]
9,BUSINESS-DEVELOPMENT,[[0.15642521]]
