# Extracting Text

In [1]:
import filetype, io, pytesseract, pdfplumber, docx
from PIL import Image

pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def extract_text(raw_text):
    """Extract plain text from the raw content of a resume file.

    The file type is guessed from the content itself via
    ``filetype.guess_extension`` and dispatched as follows:

    * ``pdf``  - text of every page (pdfplumber), joined with single spaces.
    * ``docx`` - text of every paragraph (python-docx), joined with spaces.
    * ``png`` / ``jpg`` / ``jpeg`` - OCR of the image via pytesseract.
    * anything else - the bytes decoded as UTF-8, undecodable bytes dropped.

    Args:
        raw_text: File content as ``bytes`` or as a ``memoryview`` (which is
            converted to ``bytes`` first).

    Returns:
        The extracted text as a ``str``.
    """
    if isinstance(raw_text, memoryview):
        raw_text = raw_text.tobytes()  # Convert memoryview to bytes

    file_extension = filetype.guess_extension(raw_text)

    if file_extension == 'pdf':
        with pdfplumber.open(io.BytesIO(raw_text)) as pdf:
            # A falsy result from extract_text() is replaced by '' so the
            # join below always receives strings.
            resume_text = ' '.join(
                page.extract_text() or "" for page in pdf.pages
            )

    elif file_extension == 'docx':
        document = docx.Document(io.BytesIO(raw_text))
        resume_text = ' '.join(para.text for para in document.paragraphs)

    elif file_extension in ['png', 'jpg', 'jpeg']:
        # Use the image as a context manager so it is closed once OCR is
        # done instead of being left open until garbage collection.
        with Image.open(io.BytesIO(raw_text)) as image:
            resume_text = pytesseract.image_to_string(image)

    else:
        # Unknown type: treat the content as plain text.
        resume_text = raw_text.decode('utf-8', errors='ignore')

    return resume_text

# Cleaning the text

In [None]:
import re
def clean_text(text):
    """Normalise extracted resume text to compact ASCII.

    Steps, in order:

    1. Common bullet glyphs (U+2022, U+2023, U+25E6, U+2043, U+2219) become
       ``-``.
    2. Every run of non-ASCII characters becomes a single space.
    3. Every run of whitespace (spaces, tabs, newlines, ...) collapses to a
       single space.
    4. Leading and trailing whitespace is stripped.

    Args:
        text: The text to clean.

    Returns:
        The cleaned text.
    """
    # Replace common bullet characters with a plain dash.
    text = re.sub(r'[\u2022\u2023\u25E6\u2043\u2219]', '-', text)
    # Replace remaining non-ASCII runs first, so the spaces they introduce
    # are collapsed by the whitespace pass below.
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    # `\s+` (not the character class `[\s+]`, which also matches a literal
    # '+' and only one character at a time) collapses whitespace runs while
    # leaving plus signs such as the ones in "C++" intact.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Sentence Splitting and Embedding

In [None]:
import logging, os, spacy, numpy as np
from sentence_transformers import SentenceTransformer, util
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity

# Only show errors from the model libraries' loggers.
logging.getLogger("transformers").setLevel(logging.ERROR)
logging.getLogger("sentence_transformers").setLevel(logging.ERROR)

model_path = 'models/mpnet_local'

# Reuse the copy saved under model_path when it exists; otherwise load the
# model by name and save it there for later runs. Both branches pin the model
# to the CPU so the first run and later runs behave the same (encoding below
# is also requested on the CPU).
if os.path.exists(model_path):
    model = SentenceTransformer(model_path, device='cpu')
else:
    model = SentenceTransformer(
        'sentence-transformers/all-mpnet-base-v2',
        device='cpu',
    )
    model.save(model_path)

# spaCy pipeline used for sentence splitting.
nlp = spacy.load('en_core_web_trf')

def sentence_embedding(text):
    """Split ``text`` into sentences and embed each one.

    Sentences come from the module-level spaCy pipeline ``nlp``; each is
    stripped of surrounding whitespace and passed, in document order, to the
    module-level ``model``.

    Args:
        text: The text to split and embed.

    Returns:
        Whatever ``model.encode`` returns for the sentence list. NumPy output
        and normalised embeddings are requested, so this is presumably one
        unit-length row per sentence (confirm against the library docs).
    """
    sentences = []
    for span in nlp(text).sents:
        sentences.append(span.text.strip())

    encode_options = {
        'output_value': 'sentence_embedding',
        'convert_to_numpy': True,
        'convert_to_tensor': False,
        'batch_size': 24,
        'normalize_embeddings': True,
        'device': 'cpu',
    }
    return model.encode(sentences, **encode_options)

def normalized_mean_of_sentence_vector(sentence_vector):
    """Average a collection of vectors and scale the result to unit length.

    Args:
        sentence_vector: Array-like of vectors, averaged along axis 0.

    Returns:
        The mean vector divided by its Euclidean norm. If the mean has zero
        norm (the vectors cancel out) it is returned unscaled, since dividing
        by zero would only produce NaNs.

    Raises:
        ValueError: If ``sentence_vector`` is empty. The mean of nothing is
            undefined and would otherwise silently become NaN.
    """
    vectors = np.asarray(sentence_vector)
    if vectors.size == 0:
        raise ValueError("cannot average an empty set of sentence vectors")

    mean_vector = np.mean(vectors, axis=0)
    norm = np.linalg.norm(mean_vector)
    if norm == 0:
        return mean_vector
    return mean_vector / norm

# Defining Pipeline

In [None]:
def pipeline(raw_text):
    """Turn the raw content of a resume file into one embedding vector.

    Chains the steps defined in this file: text extraction, text cleaning,
    per-sentence embedding, then the normalised mean of those embeddings.

    Args:
        raw_text: Raw file content, as accepted by ``extract_text``.

    Returns:
        A ``(vector, cleaned_text)`` tuple: the normalised mean sentence
        vector and the cleaned text it was computed from.
    """
    cleaned_text = clean_text(extract_text(raw_text))
    sentence_vectors = sentence_embedding(cleaned_text)
    resume_vector = normalized_mean_of_sentence_vector(sentence_vectors)
    return resume_vector, cleaned_text

# Finding the source folders of the resumes

In [None]:
import os, hashlib
base_path = 'CVS'
clean_path = 'Clean_CV'

os.makedirs(base_path, exist_ok=True)
os.makedirs(clean_path, exist_ok=True)

hash_file = os.path.join(clean_path, 'file_hashes.txt')

# Load the hashes recorded so far (one per line), or start with none.
if os.path.exists(hash_file):
    with open(hash_file, 'r') as f:
        existing_hashes = set(line.strip() for line in f)

else:
    existing_hashes = set()

# Every sub-directory of base_path is one resume source. Plain files lying
# next to them (hash file, OS metadata, ...) are skipped because the later
# cells call os.listdir() on each entry. Sorting keeps the order stable
# across runs and platforms.
folder_source = []
for folder in sorted(os.listdir(base_path)):
    if folder == 'file_hashes.txt':
        continue
    if not os.path.isdir(os.path.join(base_path, folder)):
        continue
    folder_source.append(folder)
folder_source

# Storing the vectors

In [None]:
import os, hashlib

base_path = 'CVS'
clean_path = 'Clean_CV'

os.makedirs(base_path, exist_ok=True)
os.makedirs(clean_path, exist_ok=True)

hash_path = os.path.join(clean_path, 'file_hashes.txt')

# Hashes of the cleaned text of every resume stored by earlier runs,
# one hex digest per line.
existing_hashes = set()
if os.path.exists(hash_path):
    with open(hash_path, 'r') as f:
        existing_hashes.update(line.strip() for line in f)

for folder in folder_source:
    path = os.path.join(base_path, folder)
    save_path = os.path.join(clean_path, folder)
    os.makedirs(save_path, exist_ok=True)
    final_vector_path = os.path.join(save_path, 'mean_vector.npz')
    current_folder_vectors = []
    current_folder_hashes = []

    for file in sorted(os.listdir(path)):
        file_path = os.path.join(path, file)
        if not os.path.isfile(file_path):
            # Nested directories cannot be opened as a resume.
            continue
        with open(file_path, 'rb') as f:
            text = f.read()

        resume_vector, resume_text = pipeline(text)

        # Deduplicate on the cleaned text, so the same resume stored under
        # another name is embedded into the results only once.
        file_hash = hashlib.sha256(resume_text.encode('utf-8')).hexdigest()
        if file_hash in existing_hashes:
            print(f'Skipped duplicate: {file}')
            continue

        existing_hashes.add(file_hash)
        current_folder_hashes.append(file_hash)
        current_folder_vectors.append(resume_vector)

    print(len(current_folder_vectors))
    if not current_folder_vectors:
        continue

    np_vector = np.array(current_folder_vectors)
    if os.path.exists(final_vector_path):
        # Resumes stored by an earlier run are skipped above as duplicates,
        # so their vectors must be carried over; otherwise saving would
        # overwrite them with only the new ones.
        with np.load(final_vector_path) as saved:
            np_vector = np.vstack([saved['all_vectors'], np_vector])

    mean_vector = np.mean(np_vector, axis=0)
    mean_vector = mean_vector / np.linalg.norm(mean_vector)
    np.savez(final_vector_path, mean=mean_vector, all_vectors=np_vector)
    print(f'Saved both mean and all vectors for {folder} to {final_vector_path}')

    # Record the hashes only after the vectors are safely on disk, so a
    # failed save does not mark these resumes as already processed.
    with open(hash_path, 'a') as f:
        f.write('\n'.join(current_folder_hashes) + '\n')

In [None]:
# import matplotlib.pyplot as plt
# import seaborn as sns
# from sklearn.metrics.pairwise import cosine_similarity

# for folder in folder_source:
#     path = os.path.join(base_path, folder)
#     save_path = os.path.join(clean_path, folder)
#     os.makedirs(save_path, exist_ok=True)
#     vector = []
#     for file in os.listdir(save_path):
#         if file.endswith('.npy'):
#             vec = np.load(os.path.join(save_path, file))
#             vector.append(vec)
    
#     cos = cosine_similarity(vector, vector)
#     print(len(cos))
#     print(len(vector))
#     sns.heatmap(cos[:10], annot=True,  fmt=".2f", cmap="coolwarm")
#     plt.show() 

# Collecting all the embeddings

In [None]:
# Parallel lists: one embedding, its source folder and the file it was
# loaded from.
all_vectors, labels, filename = [], [], []
clean_path = 'Clean_CV'
print(folder_source)
for folder in folder_source:
    path = os.path.join(clean_path, folder)
    for file in sorted(os.listdir(path)):
        file_path = os.path.join(path, file)
        if file.endswith('.npz'):
            # The storing step writes one archive per folder whose
            # 'all_vectors' entry holds one row per resume.
            with np.load(file_path) as archive:
                folder_vectors = archive['all_vectors']
            all_vectors.extend(folder_vectors)
            labels.extend([folder] * len(folder_vectors))
            filename.extend([file] * len(folder_vectors))
        elif file.endswith('.npy'):
            # Still accept standalone single-vector files.
            vector = np.load(file_path)
            all_vectors.append(vector)
            labels.append(folder)
            filename.append(file)
all_vectors = np.array(all_vectors)
all_vectors.shape

In [None]:
import hdbscan

# Cluster the resumes directly on the full-dimensional embeddings
# (not on a reduced projection).
clusterer = hdbscan.HDBSCAN(metric='euclidean', min_cluster_size=5)
cluster_labels = clusterer.fit_predict(all_vectors)

# Show which distinct cluster ids were assigned.
unique_clusters = set(cluster_labels)
print("Unique clusters:", unique_clusters)


In [None]:
import umap
import matplotlib.pyplot as plt
import numpy as np

# Project the embeddings for plotting. The number of output dimensions is
# left at the library default (presumably 2, as the variable name and the
# plots below assume - confirm against the UMAP docs).
reducer = umap.UMAP(
    n_neighbors=15,
    min_dist=0.1,
    metric='cosine',
    random_state=42,  # fixed seed so the layout is repeatable
)
embedding_2d = reducer.fit_transform(all_vectors)


In [None]:
# Scatter plot of the 2-D projection, one colour per HDBSCAN cluster id.
x_coords = embedding_2d[:, 0]
y_coords = embedding_2d[:, 1]

plt.figure(figsize=(12, 8))
plt.scatter(x_coords, y_coords, c=cluster_labels, cmap='tab20', s=10)
plt.title("HDBSCAN clusters of resumes")
plt.show()


In [None]:
# Map each source folder to a colour index. Sorting the distinct labels makes
# the folder -> colour assignment identical on every run (plain set order can
# change between interpreter sessions), and the dict gives a constant-time
# lookup per point instead of a list scan.
unique_labels = sorted(set(labels))
label_to_color = {label: index for index, label in enumerate(unique_labels)}
colors = [label_to_color[label] for label in labels]

# Same 2-D projection as before, coloured by source folder instead of cluster.
plt.figure(figsize=(12, 8))
plt.scatter(embedding_2d[:, 0], embedding_2d[:, 1], c=colors, cmap='tab20', s=10)
plt.title("Resumes colored by folder")
plt.show()


In [None]:
# For every source folder, list the distinct cluster ids its resumes fell into.
for folder in folder_source:
    folder_clusters = set()
    for label, cluster in zip(labels, cluster_labels):
        if label == folder:
            folder_clusters.add(cluster)
    print(f"{folder}: clusters = {folder_clusters}")
