In [1]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import logging
import numpy as np
import pickle
from transformers import pipeline
import sqlite3


In [2]:
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [3]:
class ResumeFilter:
    def __init__(self, model_name='bert-base-uncased'):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name).to(self.device)
        self.embeddings = None

    def load_data(self, file_path):
        try:
            self.df = pd.read_excel(file_path)
            self.df['text'] = self.df.apply(self.combine_columns, axis=1)
            logger.info("Data loaded and combined successfully.")
        except Exception as e:
            logger.error(f"Error loading data: {e}")
            raise

    @staticmethod
    def combine_columns(row):
        return ' '.join([str(row['candidateName']), str(row['companyName']), str(row['designation']),
                         str(row['experienceMas']), str(row['qualificationMas']), str(row['qualificationMas2'])])

    def get_bert_embedding(self, text):
        try:
            inputs = self.tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512).to(self.device)
            with torch.no_grad():
                outputs = self.model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
            return embeddings
        except Exception as e:
            logger.error(f"Error generating BERT embedding: {e}")
            raise

    def generate_embeddings(self, batch_size=32, save_path='embeddings.pkl'):
        try:
            embeddings = []
            for i in tqdm(range(0, len(self.df), batch_size), desc="Generating embeddings"):
                batch_texts = self.df['text'][i:i+batch_size].tolist()
                batch_embeddings = [self.get_bert_embedding(text) for text in batch_texts]
                embeddings.extend(batch_embeddings)
            self.df['embedding'] = embeddings
            self.embeddings = np.array(embeddings)
            with open(save_path, 'wb') as f:
                pickle.dump(self.embeddings, f)
            logger.info("Embeddings generated and saved successfully.")
        except Exception as e:
            logger.error(f"Error generating embeddings: {e}")
            raise

    def load_embeddings(self, load_path='embeddings.pkl'):
        try:
            with open(load_path, 'rb') as f:
                self.embeddings = pickle.load(f)
            logger.info("Embeddings loaded successfully.")
        except Exception as e:
            logger.error(f"Error loading embeddings: {e}")
            raise

    def filter_resumes(self, job_description, top_n=5):
        try:
            job_embedding = self.get_bert_embedding(job_description)
            similarities = cosine_similarity(self.embeddings, [job_embedding]).flatten()
            self.df['similarity'] = similarities
            filtered_df = self.df.sort_values(by='similarity', ascending=False).head(top_n)
            return filtered_df[['candidateName', 'companyName', 'designation', 'similarity']]
        except Exception as e:
            logger.error(f"Error filtering resumes: {e}")
            raise


In [4]:
if __name__ == "__main__":
    file_path = '/content/drive/MyDrive/TN EMPLOYEE DATABASE.xlsx'  # Update this with your actual file path

    resume_filter = ResumeFilter()
    resume_filter.load_data(file_path)
    resume_filter.generate_embeddings()

    while True:
        job_description = input("Enter the job description (or type 'exit' to quit): ").strip()
        if job_description.lower() == 'exit':
            print("Exiting the program.")
            break
        filtered_resumes = resume_filter.filter_resumes(job_description)
        print(filtered_resumes)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Generating embeddings: 100%|██████████| 2900/2900 [15:43<00:00,  3.07it/s]


Enter the job description (or type 'exit' to quit): looking for a person with a 9 years experience in python 
      candidateName                                  companyName  \
45576   DAVID DASAN                              NATURE OF WORK    
50387  Rajesh James             Freelancer for multiple vendors    
46688   SHERWIN SAM  On The Lookout for an Exciting Opportunity    
37595       SHIVA A    Bluechip Corporate Investment Centre Ltd    
27845       N JANSI                            Datasyncsolution    

                                 designation  similarity  
45576                    Senior Cad Operator    0.756271  
50387  Communication and Soft skills trainer    0.753400  
46688                          Sales Manager    0.753091  
37595                         Branch Manager    0.747262  
27845             programmer in HTML and PHP    0.741049  
Enter the job description (or type 'exit' to quit): 5 + years experience in PMP 
      candidateName                      compa

In [6]:

# NER pipeline example
extractor = pipeline('ner', model='dbmdz/bert-large-cased-finetuned-conll03-english')
text = "Your job offer or resume text here"
extracted_info = extractor(text)
print(extracted_info)

config.json:   0%|          | 0.00/998 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

[]


In [7]:
from transformers import BertTokenizer, BertModel
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def get_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()


In [8]:
conn = sqlite3.connect('embeddings.db')
c = conn.cursor()
c.execute('''CREATE TABLE IF NOT EXISTS embeddings (id INTEGER PRIMARY KEY, embedding BLOB)''')

<sqlite3.Cursor at 0x7a95d53b6240>