## The Final function

In this notebook we'll put all the functions together into one ComprehensiveTalentRanker class, which, given the path to the data file, starred candidate ids and target string, will print the top 5 matching candidates info.

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from transformers import BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer 
import torch
import spacy
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Download required NLTK data
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')
try:
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    nltk.download('punkt_tab')   

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sirak\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
class ComprehensiveTalentRanker:
    def __init__(self, data_path, glove_path=None, fasttext_path=None):
        self.data_path = data_path
        self.df = None
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        self.glove_embeddings = self.load_glove_embeddings(glove_path) if glove_path else None
        self.fasttext_embeddings = self.load_fasttext_vectors(fasttext_path) if fasttext_path else None
        self.load_data()
        self.preprocess_data()

    def load_data(self):
        self.df = pd.read_csv(self.data_path)
        print(f"Loaded {len(self.df)} candidates")

    def preprocess_text(self, text):
        if pd.isna(text):
            return ""
        text = text.lower()
        text = re.sub(r'[^\w\s]', '', text)
        text = re.sub(r'\s+', ' ', text).strip()
        tokens = word_tokenize(text)
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stop_words]
        return ' '.join(tokens)

    def preprocess_data(self):
        print("Preprocessing data...")
        self.df['combined_string'] = self.df.apply(
            lambda row: f"{row['job_title']} {row['location']}" if pd.notna(row['job_title']) and pd.notna(row['location']) else "",
            axis=1
        )
        self.df['combined_string_processed'] = self.df['combined_string'].apply(self.preprocess_text)
        print("Data preprocessing completed")

    # --- Embedding Loaders ---
    def load_glove_embeddings(self, file_path):
        if not file_path:
            return None
        embeddings = {}
        with open(file_path, 'r', encoding='utf8') as f:
            for line in f:
                parts = line.strip().split()
                word = parts[0]
                vector = np.array(parts[1:], dtype=np.float32)
                embeddings[word] = vector
        print("Loaded GloVe embeddings.")
        return embeddings

    def load_fasttext_vectors(self, file_path, max_words=200000):
        if not file_path:
            return None
        embeddings = {}
        with open(file_path, 'r', encoding='utf8', newline='\n', errors='ignore') as f:
            next(f)  # skip header
            for i, line in enumerate(f):
                if i >= max_words:
                    break
                parts = line.rstrip().split(' ')
                word = parts[0]
                vector = np.array(parts[1:], dtype=np.float32)
                embeddings[word] = vector
        print("Loaded FastText embeddings.")
        return embeddings

    # --- Embedding Methods ---
    def bag_of_words_similarity(self, data, target_string):
        vectorizer = CountVectorizer(max_features=1000, ngram_range=(1, 2))
        all_texts = list(data['combined_string_processed']) + [target_string]
        bow_matrix = vectorizer.fit_transform(all_texts)
        job_titles_matrix = bow_matrix[:-1]
        target_vector = bow_matrix[-1:]
        similarities = cosine_similarity(job_titles_matrix, target_vector).flatten()
        return similarities

    def tfidf_similarity(self, data, target_string):
        vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))
        all_texts = list(data['combined_string_processed']) + [target_string]
        tfidf_matrix = vectorizer.fit_transform(all_texts)
        job_titles_matrix = tfidf_matrix[:-1]
        target_vector = tfidf_matrix[-1:]
        similarities = cosine_similarity(job_titles_matrix, target_vector).flatten()
        return similarities

    def glove_similarity(self, data, target_string):
        def get_document_embedding(text):
            if pd.isna(text) or text == "":
                return np.zeros(100)
            text_processed = self.preprocess_text(text)
            words = text_processed.split()
            word_embeddings = [self.glove_embeddings[word] for word in words if word in self.glove_embeddings]
            if len(word_embeddings) == 0:
                return np.zeros(100)
            return np.mean(word_embeddings, axis=0)
        job_embeddings = [get_document_embedding(text) for text in data['combined_string']]
        target_embedding = get_document_embedding(target_string)
        similarities = cosine_similarity(job_embeddings, [target_embedding]).flatten()
        return similarities

    def fasttext_similarity(self, data, target_string):
        def get_document_embedding(text):
            if pd.isna(text) or text == "":
                return np.zeros(300)
            text_processed = self.preprocess_text(text)
            words = text_processed.split()
            word_embeddings = [self.fasttext_embeddings[word] for word in words if word in self.fasttext_embeddings]
            if len(word_embeddings) == 0:
                return np.zeros(300)
            return np.mean(word_embeddings, axis=0)
        job_embeddings = [get_document_embedding(text) for text in data['combined_string']]
        target_embedding = get_document_embedding(target_string)
        similarities = cosine_similarity(job_embeddings, [target_embedding]).flatten()
        return similarities

    def bert_similarity(self, data, target_string):
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertModel.from_pretrained('bert-base-uncased')
        model.eval()
        doc_embeddings = []
        for text in data['combined_string']:
            if pd.isna(text):
                doc_embeddings.append(np.zeros(768))
                continue
            inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding=True)
            with torch.no_grad():
                outputs = model(**inputs)
                embedding = outputs.last_hidden_state[:, 0, :].numpy().flatten()
                doc_embeddings.append(embedding)
        target_inputs = tokenizer(target_string, return_tensors='pt', max_length=512, truncation=True, padding=True)
        with torch.no_grad():
            target_outputs = model(**target_inputs)
            target_embedding = target_outputs.last_hidden_state[:, 0, :].numpy().flatten()
        similarities = [np.dot(doc_emb, target_embedding) / (np.linalg.norm(doc_emb) * np.linalg.norm(target_embedding) + 1e-8) for doc_emb in doc_embeddings]
        return np.array(similarities)

    def sbert_similarity(self, data, target_string):
        sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
        job_titles = data['combined_string'].fillna('').tolist()
        job_embeddings = sbert_model.encode(job_titles)
        target_embedding = sbert_model.encode([target_string])
        similarities = cosine_similarity(job_embeddings, target_embedding).flatten()
        return similarities

    # --- Reranking Methods ---
    def rerank_boosting(self, ranking_df, starred_candidates):
        starred_mask = ranking_df['id'].isin(starred_candidates)
        starred_features = ranking_df[starred_mask]['similarity_score']
        if len(starred_features) == 0:
            return ranking_df
        starred_avg = starred_features.mean()
        for idx, row in ranking_df.iterrows():
            candidate_score = row['similarity_score']
            similarity_to_starred = 1 - abs(candidate_score - starred_avg)
            boost_factor = 1 + 0.3 * similarity_to_starred
            ranking_df.loc[idx, 'similarity_score'] = candidate_score * boost_factor
        ranking_df['rank'] = ranking_df['similarity_score'].rank(ascending=False, method='min').astype(int)
        return ranking_df

    def rerank_combining(self, ranking_df, target_string, starred_candidates, method):
        starred_mask = ranking_df['id'].isin(starred_candidates)
        starred_info = " ".join(ranking_df.loc[starred_mask, 'combined_string'].fillna("").tolist())
        
        # Combine with target_string
        combined_target = f"{target_string} {starred_info}"
        
        temp_df = ranking_df.copy()
        temp_df['combined_string'] = temp_df['combined_string'].fillna("")
        temp_df['combined_string_processed'] = temp_df['combined_string'].apply(self.preprocess_text)
        if method == 'bow':
            similarities = self.bag_of_words_similarity(temp_df, self.preprocess_text(target_string))
        elif method == 'tfidf':
            similarities = self.tfidf_similarity(temp_df, self.preprocess_text(target_string))
        elif method == 'glove':
            similarities = self.glove_similarity(temp_df, target_string)
        elif method == 'fasttext':
            similarities = self.fasttext_similarity(temp_df, target_string)
        elif method == 'bert':
            similarities = self.bert_similarity(temp_df, target_string)
        elif method == 'sbert':
            similarities = self.sbert_similarity(temp_df, target_string)
        else:
            raise ValueError(f"Unknown method: {method}.")
        scaler = MinMaxScaler()
        similarities_norm = scaler.fit_transform(similarities.reshape(-1, 1)).flatten()
        temp_df['similarity_score'] = similarities_norm
        temp_df['rank'] = temp_df['similarity_score'].rank(ascending=False, method='min').astype(int)
        display_cols = ['rank', 'id', 'job_title', 'location', 'similarity_score']
        return temp_df[display_cols + [col for col in temp_df.columns if col not in display_cols]]

    # --- Main Ranking Function ---
    def rank_candidates(self, target_string, method='bert', starred_candidates=None, reranking_method='boosting'):
        target_processed = self.preprocess_text(target_string)
        data = self.df.copy()
        if method == 'bow':
            similarities = self.bag_of_words_similarity(data, target_processed)
        elif method == 'tfidf':
            similarities = self.tfidf_similarity(data, target_processed)
        elif method == 'glove':
            similarities = self.glove_similarity(data, target_string)
        elif method == 'fasttext':
            similarities = self.fasttext_similarity(data, target_string)
        elif method == 'bert':
            similarities = self.bert_similarity(data, target_string)
        elif method == 'sbert':
            similarities = self.sbert_similarity(data, target_string)
        else:
            raise ValueError(f"Unknown method: {method}. Available: 'bow', 'tfidf', 'glove', 'fasttext', 'bert', 'sbert'")
        scaler = MinMaxScaler()
        similarities_norm = scaler.fit_transform(similarities.reshape(-1, 1)).flatten()
        ranking_df = data.copy()
        ranking_df['similarity_score'] = similarities_norm
        ranking_df['rank'] = ranking_df['similarity_score'].rank(ascending=False, method='min').astype(int)
        if starred_candidates:
            if reranking_method == 'boosting':
                ranking_df = self.rerank_boosting(ranking_df, starred_candidates)
            elif reranking_method == 'combining':
                ranking_df = self.rerank_combining(ranking_df, target_string, starred_candidates, method)
        ranking_df = ranking_df.sort_values('rank').reset_index(drop=True)
        return ranking_df

    # --- Comparison and Best Method ---
    def compare_methods(self, target_string, starred_candidates=None, reranking_method='boosting'):
        methods = ['bow', 'tfidf', 'glove', 'fasttext', 'bert', 'sbert']
        results = {}
        print(f"Comparing methods for target: '{target_string}'")
        print("="*60)
        for method in methods:
            print(f"\nTesting {method.upper()}...")
            try:
                ranking = self.rank_candidates(target_string, method, starred_candidates, reranking_method)
                top_10 = ranking.head(10)[['rank', 'id', 'job_title', 'location', 'similarity_score']]
                results[method] = {
                    'ranking': ranking,
                    'top_10': top_10,
                    'avg_score': ranking['similarity_score'].mean(),
                    'max_score': ranking['similarity_score'].max()
                }
                print(f"Top 5 candidates using {method.upper()}:")
                for _, row in top_10.head(5).iterrows():
                    starred_mark = " ⭐" if row['id'] in (starred_candidates or []) else ""
                    print(f"  {row['rank']:2d}. ID {row['id']:3d} - {row['job_title'][:50]:<50} | {row['location'][:30]:<30} (Score: {row['similarity_score']:.4f}){starred_mark}")
            except Exception as e:
                print(f"Error with {method}: {e}")
                continue

    def get_best_method(self, target_string, starred_candidates=None, reranking_method='boosting'):
        results = self.compare_methods(target_string, starred_candidates, reranking_method)
        if not results:
            print("No methods worked successfully")
            return None
        best_method = max(results.keys(), key=lambda x: results[x]['avg_score'])
        print(f"\n{'='*60}")
        print(f"BEST METHOD: {best_method.upper()}")
        print(f"Average similarity score: {results[best_method]['avg_score']:.4f}")
        print("="*60)
        return best_method

In [4]:
#Usage example

ranker = ComprehensiveTalentRanker(
    data_path="../data/potential_talents_data.csv",
    glove_path="../glove_data/glove.6B.100d.txt",
    fasttext_path="../fasttext_data/cc.en.300.vec"
)
target = "seeking human resources"
starred_candidates = [51, 12, 10]
ranker.compare_methods(target, starred_candidates, reranking_method='combining') 

Loaded GloVe embeddings.
Loaded FastText embeddings.
Loaded 104 candidates
Preprocessing data...
Data preprocessing completed
Comparing methods for target: 'seeking human resources'

Testing BOW...
Top 5 candidates using BOW:
   1. ID  28 - Seeking Human Resources Opportunities              | Chicago, Illinois              (Score: 1.0000)
   1. ID  30 - Seeking Human Resources Opportunities              | Chicago, Illinois              (Score: 1.0000)
   3. ID  73 - Aspiring Human Resources Manager, seeking internsh | Houston, Texas Area            (Score: 0.8936)
   4. ID  99 - Seeking Human Resources Position                   | Las Vegas, Nevada Area         (Score: 0.8563)
   5. ID  53 - Seeking Human Resources HRIS and Generalist Positi | Greater Philadelphia Area      (Score: 0.8044)

Testing TFIDF...
Top 5 candidates using TFIDF:
   1. ID  28 - Seeking Human Resources Opportunities              | Chicago, Illinois              (Score: 1.0000)
   1. ID  30 - Seeking Human Resourc