In [1]:
import re, logging, os
from sentence_transformers import SentenceTransformer, util
import matplotlib.pyplot as plt
import seaborn as sns

In [13]:
from IPython.display import display
import ipywidgets as widgets

# File picker for the CV documents. `multiple=True` lets several files be
# selected in one go; `accept` restricts the browser dialog to the listed
# extensions.
uploader = widgets.FileUpload(multiple=True, accept='.pdf, .docx, .png, .txt, .jpg, .jpeg')

# Render the widget as this cell's output so files can be chosen.
display(uploader)

FileUpload(value=(), accept='.pdf, .docx, .png, .txt, .jpg, .jpeg', description='Upload', multiple=True)

In [8]:
import os, hashlib
from datetime import datetime

# Where uploaded CVs are stored, and where the hash ledger lives.
CV_FOLDER = "CVS/Resume"
HASH_FOLDER = 'CVS'
HASH_FILE = os.path.join(HASH_FOLDER, "file_hashes.txt")

# Creating the CV folder also creates its parent, which is HASH_FOLDER.
os.makedirs(CV_FOLDER, exist_ok=True)

# Start with an empty set and fill it from the ledger when one exists:
# one stripped line per previously recorded hash.
existing_hashes = set()
if os.path.exists(HASH_FILE):
    with open(HASH_FILE, "r") as hash_fh:
        existing_hashes.update(entry.strip() for entry in hash_fh)

In [9]:
# Persist every newly uploaded file into CV_FOLDER, skipping any file whose
# SHA-256 content hash has already been recorded in HASH_FILE.
for uploaded_file in uploader.value:
    file_hash = hashlib.sha256(uploaded_file.content).hexdigest()
    if file_hash in existing_hashes:
        print(f"Skipped duplicate: {uploaded_file.name}")
        continue

    # The file name is supplied by the client: keep only its last path
    # component so it cannot point outside CV_FOLDER.
    safe_name = os.path.basename(uploaded_file.name.replace("\\", "/"))

    # Microsecond timestamp prefix keeps same-named uploads from colliding.
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S%f")
    unique_filename = f"{timestamp}_{safe_name}"
    path = os.path.join(CV_FOLDER, unique_filename)

    with open(path, 'wb') as f:
        f.write(uploaded_file.content)

    # Record the hash only AFTER the file was written successfully.
    # Recording it first would mark a file whose write failed as a
    # duplicate forever, so it would never be saved.
    with open(HASH_FILE, 'a') as f:
        f.write(file_hash + '\n')
    existing_hashes.add(file_hash)

    print(f"Saved: {path}")

In [5]:
# Only let errors through from the model libraries' loggers.
for noisy_logger in ("transformers", "sentence_transformers"):
    logging.getLogger(noisy_logger).setLevel(logging.ERROR)

model_path = 'models/mpnet_local'

# Use the local copy when it is present; otherwise load the model by its
# hub name and save it to `model_path` for subsequent runs.
if not os.path.exists(model_path):
    model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
    model.save(model_path)
else:
    model = SentenceTransformer(model_path)

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

In [6]:
# Dots that must NOT be treated as sentence terminators:
#   * dotted abbreviations made of single letters, e.g. "e.g." or "U.S.A."
#   * an integer followed by a dot that is not a decimal point, e.g. the
#     "1." of a numbered list
_PROTECTED_DOT_RE = re.compile(r'\b(?:[a-zA-Z]\.){2,}|[0-9]+\.(?!\d)')

# A sentence ends at '.', '?' or '!' followed by whitespace.
_SENTENCE_END_RE = re.compile(r'[.?!]\s+')


def text_to_sentence(text):
    """Split ``text`` into a list of sentence strings.

    The text is split wherever '.', '?' or '!' is followed by whitespace.
    The terminator and that whitespace are removed from the pieces; a
    terminator at the very end of the text (no whitespace after it) is kept
    on the last piece.

    Dots belonging to single-letter abbreviations ("e.g.", "U.S.A.") and to
    integers followed by a dot ("1. First item") are temporarily replaced by
    the placeholder ``[DOT]`` so they do not trigger a split, and restored
    afterwards. As a consequence, a sentence that *ends* with such an
    abbreviation or number is not split from the following one.

    Parameters
    ----------
    text : str
        Raw text to segment.

    Returns
    -------
    list of str
        The sentences in order. Empty or whitespace-only pieces (e.g. caused
        by trailing whitespace after the final terminator) are dropped, so an
        empty ``text`` yields an empty list.
    """
    protected = _PROTECTED_DOT_RE.sub(
        lambda m: m.group().replace('.', '[DOT]'), text
    )
    pieces = _SENTENCE_END_RE.split(protected)
    return [piece.replace('[DOT]', '.') for piece in pieces if piece.strip()]

In [7]:
def sentence_embedding(text):
  """Embed each sentence of ``text`` and visualise their pairwise similarity.

  The text is split with ``text_to_sentence``, the resulting list is encoded
  with the module-level ``model``, and the cosine similarity of the
  embeddings against themselves is printed and drawn as a seaborn heatmap.

  Parameters
  ----------
  text : str
      Text to analyse; it is passed unchanged to ``text_to_sentence``.

  NOTE(review): no ``return`` statement appears in the visible part of this
  function, so callers get the results only via the printed matrix and the
  heatmap -- confirm this is intended.
  """
  sentence_list = text_to_sentence(text)
  # Request sentence-level embeddings as NumPy output (not tensors), encoded
  # in batches of 24, with normalisation requested from the encoder.
  sentence_vector = model.encode(
                      sentence_list, 
                      output_value='sentence_embedding',
                      convert_to_numpy=True,
                      convert_to_tensor=False,
                      batch_size=24,
                      normalize_embeddings=True
                    )
  # print(sentence_vector)
  # print(len(sentence_vector))

  # Compare the embeddings with themselves to get all pairwise cosine
  # similarities between the sentences.
  similarity = util.cos_sim(sentence_vector, sentence_vector)
  print(similarity)
  sns.heatmap(similarity)