In [10]:
import pandas as pd
import numpy as np
import joblib
from sentence_transformers import SentenceTransformer

# Load the candidate table. The path is relative to the current working
# directory. Columns read later in this file: aggregated_skill, name,
# experience_years, country and person_id.
# NOTE(review): read_csv does not turn list-like cells back into Python lists,
# so aggregated_skill presumably arrives as text -- confirm how it was saved.
personnel_df = pd.read_csv("data_pro.csv")

# Create the sentence-embedding model once at module level;
# generate_candidate_embeddings() reads it as the global `model`.
model = SentenceTransformer('all-MiniLM-L6-v2')

def _parse_skills(value):
    """Return the skills cell *value* as a list of strings ([] when missing)."""
    import ast  # local import: only this helper needs it

    if isinstance(value, (list, tuple)):
        return [str(item) for item in value]
    if not isinstance(value, str):
        # NaN / None / any other scalar: no usable skills.
        return []

    text = value.strip()
    if not text:
        return []

    # A list column written to CSV comes back as its repr, e.g. "['a', 'b']".
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        parsed = None

    # Anything that is not a literal list/tuple is treated as comma-separated.
    items = parsed if isinstance(parsed, (list, tuple)) else text.split(",")
    cleaned = (str(item).strip() for item in items)
    return [skill for skill in cleaned if skill]


def _candidate_text(row):
    """Build the sentence that is embedded for one candidate row."""
    skills_list = _parse_skills(row['aggregated_skill'])

    # Fall back to neutral placeholders for missing values.
    name = row['name'] if pd.notnull(row['name']) else "Unknown Name"
    experience_years = row['experience_years'] if pd.notnull(row['experience_years']) else 0
    country = row['country'] if pd.notnull(row['country']) else "Unknown Country"

    return f"Name: {name}, Skills: {', '.join(skills_list)}, Experience: {experience_years} years, Country: {country}."


def generate_candidate_embeddings(df, batch_size=100):
    """
    Generate embeddings for each candidate based on their skills, experience, name, and country.

    Args:
        df: DataFrame with the columns 'aggregated_skill', 'name',
            'experience_years' and 'country'. 'aggregated_skill' may hold a
            real list, the text form of a list (as produced by a CSV round
            trip), or a comma-separated string.
        batch_size: Number of candidate texts passed to the model per
            ``encode`` call; also the interval of the progress messages.

    Returns:
        A NumPy array with one embedding row per row of ``df``, in row order.
        An empty array is returned for an empty ``df``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    print("Generating embeddings for candidates...")
    total = len(df)

    # Build all texts first so the model can encode them in batches instead of
    # being invoked once per row.
    texts = [_candidate_text(row) for _, row in df.iterrows()]

    chunks = []
    for start in range(0, total, batch_size):
        chunks.append(model.encode(texts[start:start + batch_size]))
        # Progress is based on row position, not on the DataFrame index label.
        print(f"Processed {min(start + batch_size, total)}/{total} candidates...")

    print(f"Generated embeddings for {total} candidates.")
    if not chunks:
        return np.array([])
    return np.vstack(chunks)

def build_knn_index(candidate_embeddings, n_neighbors=50):
    """
    Fit a cosine-distance nearest-neighbour index on the candidate embeddings.

    Args:
        candidate_embeddings: Embedding matrix the index is fitted on.
        n_neighbors: Neighbour count handed to the NearestNeighbors constructor.

    Returns:
        The fitted NearestNeighbors instance.
    """
    print("Building KNN index...")
    # Imported lazily, so scikit-learn is only required when an index is built.
    from sklearn.neighbors import NearestNeighbors

    # fit() returns the estimator itself, so construction and fitting chain.
    fitted_index = NearestNeighbors(
        n_neighbors=n_neighbors,
        metric='cosine',
    ).fit(candidate_embeddings)
    print("KNN index built.")
    return fitted_index

def save_knn_index(knn, filename="knn_index.joblib"):
    """
    Write the KNN index to disk with joblib.

    Args:
        knn: The index object to serialize.
        filename: Destination path of the dump.
    """
    start_message = "Saving KNN index..."
    print(start_message)

    joblib.dump(knn, filename)

    done_message = f"KNN index saved to {filename}."
    print(done_message)
def save_embeddings(candidate_embeddings, candidate_ids, embeddings_file="embeddings.npy", ids_file="candidate_ids.npy"):
    """
    Store the embedding matrix and the candidate IDs as two .npy files.

    Args:
        candidate_embeddings: Array of embeddings to write to ``embeddings_file``.
        candidate_ids: Array of IDs to write to ``ids_file``.
        embeddings_file: Destination path for the embeddings.
        ids_file: Destination path for the IDs.
    """
    print("Saving embeddings and IDs...")

    # Embeddings are written first, then the IDs.
    outputs = (
        (embeddings_file, candidate_embeddings),
        (ids_file, candidate_ids),
    )
    for target_path, payload in outputs:
        np.save(target_path, payload)

    print(f"Embeddings saved to {embeddings_file}.")
    print(f"Candidate IDs saved to {ids_file}.")
# Generate one embedding per row of personnel_df. The IDs are taken from the
# same frame in the same row order, so position i in candidate_ids is meant to
# identify the candidate behind row i of candidate_embeddings.
candidate_embeddings = generate_candidate_embeddings(personnel_df)
candidate_ids = personnel_df["person_id"].values

# Build the KNN index over all candidate embeddings (n_neighbors=50 is passed
# through to the NearestNeighbors constructor).
knn = build_knn_index(candidate_embeddings, n_neighbors=50)

# Persist everything under the helpers' default file names:
# embeddings.npy, candidate_ids.npy and knn_index.joblib.
save_embeddings(candidate_embeddings, candidate_ids)
save_knn_index(knn)




Generating embeddings for candidates...
Processed 1/18293 candidates...
Processed 101/18293 candidates...
Processed 201/18293 candidates...
Processed 301/18293 candidates...
Processed 401/18293 candidates...
Processed 501/18293 candidates...
Processed 601/18293 candidates...
Processed 701/18293 candidates...
Processed 801/18293 candidates...
Processed 901/18293 candidates...
Processed 1001/18293 candidates...
Processed 1101/18293 candidates...
Processed 1201/18293 candidates...
Processed 1301/18293 candidates...
Processed 1401/18293 candidates...
Processed 1501/18293 candidates...
Processed 1601/18293 candidates...
Processed 1701/18293 candidates...
Processed 1801/18293 candidates...
Processed 1901/18293 candidates...
Processed 2001/18293 candidates...
Processed 2101/18293 candidates...
Processed 2201/18293 candidates...
Processed 2301/18293 candidates...
Processed 2401/18293 candidates...
Processed 2501/18293 candidates...
Processed 2601/18293 candidates...
Processed 2701/18293 candid