In [186]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sentence_transformers import SentenceTransformer
from transformers import BertTokenizer, BertModel
from sklearn.decomposition import LatentDirichletAllocation
from scipy.optimize import linear_sum_assignment
import pandas as pd
import faiss
import torch
from deap import base, creator, tools, algorithms
import random
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD

# Ignore warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')
# Strength of the experience adjustment used by the scoring functions in this
# notebook: each similarity is multiplied by
# (1 - experience_weight + experience_weight * r), where r shrinks as the gap
# between candidate and expert years of experience grows.
experience_weight = 0.2 





In [187]:
# Sample data
# Both dicts are column-oriented (one list per field) and are converted to
# DataFrames inside each scoring function; `skills` is a single
# comma-separated string per person.
candidates_data = {
    'id': [1, 2, 3, 4],
    'name': ['Alice', 'Bob', 'Charlie', 'David'],
    'skills': [
        'Python, Machine Learning, Data Analysis, SQL, TensorFlow, Pandas, NumPy, Keras, Scikit-Learn, ETL',
        'Java, Spring, Microservices, Docker, Kubernetes, Maven, RESTful APIs, Hibernate, Git, CI/CD',
        'Python, Deep Learning, NLP, PyTorch, Data Visualization, Flask, AWS, Spark, Big Data, SQL',
        'JavaScript, React, Node.js, TypeScript, Web Development, Angular, HTML, CSS, Redux, Webpack'
    ],
    'experience': [7, 5, 4, 6]  # Years of experience
}


experts_data = {
    'id': [1, 2, 3, 4],
    'name': ['Dr. Johnson','Dr. Smith', 'Dr. Brown', 'Dr. Taylor'],
    'skills': [
        'Python, Machine Learning, Data Analysis, SQL, TensorFlow, Pandas, NumPy, Keras, Scikit-Learn, ETL',
        'Java, Spring, Microservices, Docker, Kubernetes, Maven, RESTful APIs, Keras, Scikit-Learn, ETL',
        'C++, High-Performance Computing, Algorithm Optimization, Data Structures, Machine Learning, SQL, Parallel Computing, MPI, OpenMP',
        'JavaScript, Full Stack Development, React, Node.js, TypeScript, Angular, Web Development, HTML, CSS, Redux'
    ],
    'experience': [15, 10, 12, 8]  # Years of experience
}
# Recorded output of the final ensemble cell for this sample data
# (kept here for reference; it is not used by any code):
# Candidate David is best matched with Expert Dr. Taylor (Combined Score: 0.98)
# Candidate Alice is best matched with Expert Dr. Johnson (Combined Score: 0.90)
# Candidate Bob is best matched with Expert Dr. Smith (Combined Score: 0.76)
# Candidate Charlie is best matched with Expert Dr. Johnson (Combined Score: 0.30)


In [190]:
def cosine_similarity_with_BERT_embeddings(candidates_data, experts_data):
    """Score every candidate/expert pair from BERT embeddings of their skills.

    Each ``skills`` string is embedded with ``bert-base-uncased`` (mean of the
    last hidden state over all tokens). Candidate and expert embeddings are
    compared with cosine similarity, and every score is then scaled by
    ``1 - experience_weight + experience_weight * r`` where
    ``r = 1 / (1 + |experience gap| + epsilon)``, so pairs with a large gap in
    years of experience are damped by up to ``experience_weight``.

    Args:
        candidates_data: Mapping convertible to a DataFrame with at least the
            ``name``, ``skills`` and ``experience`` columns.
        experts_data: Same layout as ``candidates_data``.

    Returns:
        dict: ``{candidate_name: {expert_name: adjusted_similarity}}``.
    """
    candidates_df = pd.DataFrame(candidates_data)
    experts_df = pd.DataFrame(experts_data)

    # Load BERT model and tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')

    def get_bert_embedding(text):
        """Return the mean-pooled last-hidden-state vector for one text."""
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
        # Mean pooling over the token embeddings
        embeddings = outputs.last_hidden_state.mean(dim=1)
        return embeddings.numpy().flatten()

    # One embedding row per candidate / expert.
    candidates_embeddings = np.array(candidates_df['skills'].apply(get_bert_embedding).tolist())
    experts_embeddings = np.array(experts_df['skills'].apply(get_bert_embedding).tolist())

    # Rows are candidates, columns are experts.
    similarity_matrix = cosine_similarity(candidates_embeddings, experts_embeddings)

    # Experience adjustment, computed for all pairs at once via broadcasting.
    candidate_experience = candidates_df['experience'].to_numpy(dtype=float)
    expert_experience = experts_df['experience'].to_numpy(dtype=float)
    experience_diff = np.abs(candidate_experience[:, None] - expert_experience[None, :])

    # epsilon keeps the inverse finite when both have the same experience.
    epsilon = 1e-6
    experience_ratio = 1 / (experience_diff + epsilon)
    # Squash into (0, 1): equivalent to 1 / (1 + diff + epsilon).
    experience_ratio /= (1 + experience_ratio)

    similarity_matrix *= (1 - experience_weight + experience_weight * experience_ratio)

    # Build the nested dict by position so that repeated names cannot turn a
    # label lookup into a non-scalar result.
    candidate_names = candidates_df['name'].tolist()
    expert_names = experts_df['name'].tolist()
    similarity_dict = {}
    for i, candidate in enumerate(candidate_names):
        similarity_dict[candidate] = {
            expert: similarity_matrix[i, j] for j, expert in enumerate(expert_names)
        }

    return similarity_dict


In [191]:
# Run the BERT-based scorer on the sample data and print the nested score dict.
print(cosine_similarity_with_BERT_embeddings(candidates_data, experts_data))

TypeError: Sequential.forward() got an unexpected keyword argument 'input_ids'

In [166]:
def cosine_similarity_with_topic_modelling(candidates_data, experts_data):
    """Score candidate/expert pairs by cosine similarity of LDA topic mixtures.

    All ``skills`` strings (candidates first, then experts) are vectorised
    with TF-IDF, reduced to a 5-topic distribution with LDA
    (``random_state=42``), and compared with cosine similarity. Each score is
    then scaled by ``1 - experience_weight + experience_weight * r`` where
    ``r = 1 / (1 + |experience gap| + epsilon)``.

    Args:
        candidates_data: Mapping convertible to a DataFrame with at least the
            ``name``, ``skills`` and ``experience`` columns.
        experts_data: Same layout as ``candidates_data``.

    Returns:
        dict: ``{candidate_name: {expert_name: adjusted_similarity}}``.
    """
    candidates_df = pd.DataFrame(candidates_data)
    experts_df = pd.DataFrame(experts_data)
    n_candidates = len(candidates_df)

    # Fit one shared vocabulary / topic model over candidates and experts.
    all_skills = pd.concat([candidates_df['skills'], experts_df['skills']])

    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(all_skills)

    # NOTE(review): LDA is usually fitted on raw term counts; TF-IDF input is
    # kept here so scores stay comparable with earlier runs. Confirm intent.
    n_topics = 5  # Number of topics to extract
    lda_model = LatentDirichletAllocation(n_components=n_topics, random_state=42)
    lda_matrix = lda_model.fit_transform(tfidf_matrix)

    # Rows come back in concatenation order: candidates first, then experts.
    candidates_lda = lda_matrix[:n_candidates]
    experts_lda = lda_matrix[n_candidates:]

    similarity_matrix = cosine_similarity(candidates_lda, experts_lda)

    # Experience adjustment, computed for all pairs at once via broadcasting.
    candidate_experience = candidates_df['experience'].to_numpy(dtype=float)
    expert_experience = experts_df['experience'].to_numpy(dtype=float)
    experience_diff = np.abs(candidate_experience[:, None] - expert_experience[None, :])

    # epsilon keeps the inverse finite when both have the same experience.
    epsilon = 1e-6
    experience_ratio = 1 / (experience_diff + epsilon)
    # Squash into (0, 1): equivalent to 1 / (1 + diff + epsilon).
    experience_ratio /= (1 + experience_ratio)

    similarity_matrix *= (1 - experience_weight + experience_weight * experience_ratio)

    # Convert similarity matrix to nested dictionary format
    expert_names = experts_df['name'].tolist()
    similarity_dict = {}
    for i, candidate in enumerate(candidates_df['name']):
        similarity_dict[candidate] = {
            expert: similarity_matrix[i][j] for j, expert in enumerate(expert_names)
        }

    return similarity_dict


In [167]:
# Run the LDA topic-model scorer on the sample data and print the nested score dict.
print(cosine_similarity_with_topic_modelling(candidates_data, experts_data))

{'Alice': {'Dr. Johnson': 0.8333333277777788, 'Dr. Smith': 0.8181818165289259, 'Dr. Brown': 0.0975932183778799, 'Dr. Taylor': 0.0991069981263536}, 'Bob': {'Dr. Johnson': 0.8499945169006395, 'Dr. Smith': 0.8222169279314182, 'Dr. Brown': 0.10026787419950227, 'Dr. Taylor': 0.10680707376988563}, 'Charlie': {'Dr. Johnson': 0.09753588352152301, 'Dr. Smith': 0.0961345061947944, 'Dr. Brown': 0.09922590928830119, 'Dr. Taylor': 0.10085605654726858}, 'David': {'Dr. Johnson': 0.0964960347564007, 'Dr. Smith': 0.09419851078671959, 'Dr. Brown': 0.0983844719840481, 'Dr. Taylor': 0.8666626523483435}}


In [169]:
def Faiss_search(candidates_data, experts_data):
    """Score candidate/expert pairs from FAISS L2 distances between embeddings.

    ``skills`` strings are embedded with the ``all-MiniLM-L6-v2``
    Sentence-Transformer, the expert vectors are put in a flat L2 FAISS index,
    and every candidate is searched against all experts. A distance ``d`` is
    turned into a similarity ``1 / (1 + d)`` and then scaled by
    ``1 - experience_weight + experience_weight * r`` where
    ``r = 1 / (1 + |experience gap| + epsilon)``.

    Args:
        candidates_data: Mapping convertible to a DataFrame with at least the
            ``name``, ``skills`` and ``experience`` columns.
        experts_data: Same layout as ``candidates_data``.

    Returns:
        dict: ``{candidate_name: {expert_name: adjusted_similarity}}``; for
        each candidate the experts appear in the order FAISS returned them
        (nearest first).
    """
    candidates_df = pd.DataFrame(candidates_data)
    experts_df = pd.DataFrame(experts_data)

    # Load pre-trained Sentence-BERT model to generate embeddings
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # FAISS operates on float32 matrices, so cast explicitly. The vectors are
    # only cast here, not re-normalised.
    candidates_embeddings = np.array(model.encode(candidates_df['skills'].tolist())).astype('float32')
    experts_embeddings = np.array(model.encode(experts_df['skills'].tolist())).astype('float32')

    # Exhaustive (flat) index over the expert vectors.
    index = faiss.IndexFlatL2(candidates_embeddings.shape[1])
    index.add(experts_embeddings)

    # Ask for every expert so each candidate/expert pair receives a score.
    k = len(experts_df)
    # FAISS documents IndexFlatL2 results as squared L2 distances.
    distances, indices = index.search(candidates_embeddings, k)

    # Convert distance to similarity; float64 keeps the later scaling precise.
    similarity = 1 / (1 + distances.astype(np.float64))

    # Experience adjustment: gather each returned expert's experience so the
    # gap lines up with the (candidate, rank) layout of `indices`.
    candidate_experience = candidates_df['experience'].to_numpy(dtype=float)
    expert_experience = experts_df['experience'].to_numpy(dtype=float)
    experience_diff = np.abs(candidate_experience[:, None] - expert_experience[indices])

    # epsilon keeps the inverse finite when both have the same experience.
    epsilon = 1e-6
    experience_ratio = 1 / (experience_diff + epsilon)
    # Squash into (0, 1): equivalent to 1 / (1 + diff + epsilon).
    experience_ratio /= (1 + experience_ratio)

    adjusted_similarity = similarity * (1 - experience_weight + experience_weight * experience_ratio)

    # Map positional results back to names.
    expert_names = experts_df['name'].tolist()
    similarity_scores = {}
    for i, candidate in enumerate(candidates_df['name']):
        similarity_scores[candidate] = {
            expert_names[expert_idx]: adjusted_similarity[i, j]
            for j, expert_idx in enumerate(indices[i])
        }

    return similarity_scores


In [170]:
# Run the FAISS-based scorer on the sample data and print the nested score dict.
print(Faiss_search(candidates_data, experts_data))

{'Alice': {'Dr. Johnson': 0.8333333277777788, 'Dr. Smith': 0.8181818165289259, 'Dr. Brown': 0.4471360019595779, 'Dr. Taylor': 0.3821540124358233}, 'Bob': {'Dr. Taylor': 0.4168642990282711, 'Dr. Johnson': 0.34337471281050685, 'Dr. Smith': 0.33215332085417537, 'Dr. Brown': 0.3288761976265938}, 'Charlie': {'Dr. Johnson': 0.5834837663294539, 'Dr. Smith': 0.5751003807371362, 'Dr. Brown': 0.4115934017350203, 'Dr. Taylor': 0.37401277770004954}, 'David': {'Dr. Taylor': 0.7838279382034288, 'Dr. Johnson': 0.3829458043553021, 'Dr. Smith': 0.3738280497572381, 'Dr. Brown': 0.3438182451372938}}


In [171]:
def Bipartite_graph_matching(candidates_data, experts_data):
    """Rank all experts for each candidate by adjusted Sentence-BERT similarity.

    ``skills`` strings are embedded with the ``all-MiniLM-L6-v2``
    Sentence-Transformer and compared with cosine similarity. Each score is
    scaled by ``1 - experience_weight + experience_weight * r`` where
    ``r = 1 / (1 + |experience gap| + epsilon)``.

    NOTE(review): despite the name, no one-to-one assignment is solved here;
    the function only returns, per candidate, every expert ordered by score.

    Args:
        candidates_data: Mapping convertible to a DataFrame with at least the
            ``name``, ``skills`` and ``experience`` columns.
        experts_data: Same layout as ``candidates_data``.

    Returns:
        dict: ``{candidate_name: {expert_name: adjusted_similarity}}`` with
        each inner dict ordered from highest to lowest score.
    """
    candidates_df = pd.DataFrame(candidates_data)
    experts_df = pd.DataFrame(experts_data)

    # Load pre-trained Sentence-BERT model to generate embeddings
    model = SentenceTransformer('all-MiniLM-L6-v2')

    candidates_embeddings = model.encode(candidates_df['skills'].tolist())
    experts_embeddings = model.encode(experts_df['skills'].tolist())

    # Rows are candidates, columns are experts.
    similarity_matrix = cosine_similarity(candidates_embeddings, experts_embeddings)

    # Experience adjustment, computed for all pairs at once via broadcasting.
    candidate_experience = candidates_df['experience'].to_numpy(dtype=float)
    expert_experience = experts_df['experience'].to_numpy(dtype=float)
    experience_diff = np.abs(candidate_experience[:, None] - expert_experience[None, :])

    # epsilon keeps the inverse finite when both have the same experience.
    epsilon = 1e-6
    experience_ratio = 1 / (experience_diff + epsilon)
    # Squash into (0, 1): equivalent to 1 / (1 + diff + epsilon).
    experience_ratio /= (1 + experience_ratio)

    similarity_matrix *= (1 - experience_weight + experience_weight * experience_ratio)

    # Insert experts in descending score order so each inner dict is a ranking.
    expert_names = experts_df['name'].tolist()
    similarity_scores = {}
    for candidate_idx, candidate_name in enumerate(candidates_df['name']):
        ranked_expert_idx = np.argsort(-similarity_matrix[candidate_idx])
        candidate_scores = similarity_scores.setdefault(candidate_name, {})
        for expert_idx in ranked_expert_idx:
            candidate_scores[expert_names[expert_idx]] = similarity_matrix[candidate_idx, expert_idx]

    return similarity_scores


In [172]:
# Run the Sentence-BERT ranking scorer on the sample data and print the nested score dict.
print(Bipartite_graph_matching(candidates_data, experts_data))

{'Alice': {'Dr. Johnson': 0.8333333, 'Dr. Smith': 0.8181818, 'Dr. Brown': 0.476406, 'Dr. Taylor': 0.32970056}, 'Bob': {'Dr. Taylor': 0.37846094, 'Dr. Johnson': 0.22294234, 'Dr. Smith': 0.21565665, 'Dr. Brown': 0.19421607}, 'Charlie': {'Dr. Johnson': 0.6545541, 'Dr. Smith': 0.6451496, 'Dr. Brown': 0.41207463, 'Dr. Taylor': 0.31671673}, 'David': {'Dr. Taylor': 0.82087, 'Dr. Johnson': 0.3387207, 'Dr. Smith': 0.33065593, 'Dr. Brown': 0.24446523}}


In [61]:

# print(Multi_Objective_Genetic_Algorithms(candidates_data, experts_data))

Similarity Scores for All Individuals:
Individual [0, 1, 0]: {'Alice': {'Dr. Smith': 0.7073623}, 'Bob': {'Dr. Johnson': 0.8378098}, 'Charlie': {'Dr. Smith': 0.5688001}}
Individual [0, 0, 0]: {'Alice': {'Dr. Smith': 0.7073623}, 'Bob': {'Dr. Smith': 0.086992055}, 'Charlie': {'Dr. Smith': 0.5688001}}
{'[0, 1, 0]': {'Alice': {'Dr. Smith': 0.7073623}, 'Bob': {'Dr. Johnson': 0.8378098}, 'Charlie': {'Dr. Smith': 0.5688001}}, '[0, 0, 0]': {'Alice': {'Dr. Smith': 0.7073623}, 'Bob': {'Dr. Smith': 0.086992055}, 'Charlie': {'Dr. Smith': 0.5688001}}}


In [67]:
# print(K_means_Clustering_Algorithm(candidates_data, experts_data))

{'Alice': {'Dr. Smith': 1.0}, 'Charlie': {'Dr. Smith': 1.0}, 'Bob': {'Dr. Johnson': 1.0}}


In [173]:
def hybrid_recommendation_algorithm(candidates_data, experts_data):
    """Score candidate/expert pairs from scaled skill-count + experience features.

    Skills are bag-of-words encoded with one shared vocabulary, ``experience``
    is appended as an extra feature, and the features are standardised with a
    scaler fitted on the candidates. The stacked matrix is reduced with
    TruncatedSVD and compared with cosine similarity. Each candidate/expert
    score is finally scaled by
    ``1 - experience_weight + experience_weight * r`` where
    ``r = 1 / (1 + |experience gap| + epsilon)``.

    Args:
        candidates_data: Mapping convertible to a DataFrame with at least the
            ``name``, ``skills`` and ``experience`` columns.
        experts_data: Same layout as ``candidates_data``.

    Returns:
        dict: ``{candidate_name: {expert_name: adjusted_score}}``. Scores can
        be negative because cosine similarity is taken on centred features.
    """
    candidates_df = pd.DataFrame(candidates_data)
    experts_df = pd.DataFrame(experts_data)
    n_candidates = len(candidates_df)

    # Combine skills to create a shared vocabulary.
    all_skills = candidates_df['skills'].tolist() + experts_df['skills'].tolist()

    vectorizer = CountVectorizer()
    skills_matrix = vectorizer.fit_transform(all_skills)
    feature_names = vectorizer.get_feature_names_out()

    # Rows come back in concatenation order: candidates first, then experts.
    candidates_skills_df = pd.DataFrame(skills_matrix[:n_candidates].toarray(), columns=feature_names)
    experts_skills_df = pd.DataFrame(skills_matrix[n_candidates:].toarray(), columns=feature_names)

    # Add experience as a feature
    candidates_features = pd.concat([candidates_skills_df, candidates_df['experience']], axis=1)
    experts_features = pd.concat([experts_skills_df, experts_df['experience']], axis=1)

    # Convert all column names to strings
    candidates_features.columns = candidates_features.columns.astype(str)
    experts_features.columns = experts_features.columns.astype(str)

    # Ensure all features are numeric. errors='ignore' is deprecated in pandas
    # and silently passed bad columns on to the scaler; fail here instead.
    candidates_features = candidates_features.apply(pd.to_numeric)
    experts_features = experts_features.apply(pd.to_numeric)

    # Standardise using statistics learned from the candidates only.
    scaler = StandardScaler()
    candidates_features_scaled = scaler.fit_transform(candidates_features)
    experts_features_scaled = scaler.transform(experts_features)

    all_features_scaled = np.vstack([candidates_features_scaled, experts_features_scaled])

    # TruncatedSVD rejects n_components above the feature count, so clamp it;
    # the fixed seed makes the randomized solver reproducible across runs.
    n_components = min(10, all_features_scaled.shape[1])
    svd = TruncatedSVD(n_components=n_components, random_state=42)
    all_features_reduced = svd.fit_transform(all_features_scaled)

    # Pairwise similarity over candidates and experts together.
    similarity_matrix = cosine_similarity(all_features_reduced)

    # Candidate-vs-expert block and the transposed expert-vs-candidate block.
    candidates_similarity_matrix = similarity_matrix[:n_candidates, n_candidates:]
    experts_similarity_matrix = similarity_matrix[n_candidates:, :n_candidates].T

    # Define weights for hybrid recommendation
    alpha = 0.5  # Weight for content-based scores
    beta = 0.5   # Weight for collaborative scores

    # NOTE(review): the cosine matrix is symmetric, so both blocks hold the
    # same values and this blend reproduces candidates_similarity_matrix while
    # alpha + beta == 1. Confirm whether a distinct second signal was intended.
    final_scores = alpha * candidates_similarity_matrix + beta * experts_similarity_matrix

    # Experience adjustment, computed for all pairs at once via broadcasting.
    candidate_experience = candidates_df['experience'].to_numpy(dtype=float)
    expert_experience = experts_df['experience'].to_numpy(dtype=float)
    experience_diff = np.abs(candidate_experience[:, None] - expert_experience[None, :])

    # epsilon keeps the inverse finite when both have the same experience.
    epsilon = 1e-6
    experience_ratio = 1 / (experience_diff + epsilon)
    # Squash into (0, 1): equivalent to 1 / (1 + diff + epsilon).
    experience_ratio /= (1 + experience_ratio)

    final_scores *= (1 - experience_weight + experience_weight * experience_ratio)

    # Construct output in desired format
    expert_names = experts_df['name'].tolist()
    output_similarity_matrix = {}
    for i, candidate_name in enumerate(candidates_df['name']):
        output_similarity_matrix[candidate_name] = {
            expert_name: final_scores[i, j] for j, expert_name in enumerate(expert_names)
        }

    return output_similarity_matrix


In [174]:
# Run the hybrid feature-based scorer on the sample data and print the nested score dict.
print(hybrid_recommendation_algorithm(candidates_data, experts_data))

{'Alice': {'Dr. Johnson': 0.672152479572753, 'Dr. Smith': 0.4445580676203108, 'Dr. Brown': 0.06636245976450182, 'Dr. Taylor': -0.30156318455931214}, 'Bob': {'Dr. Johnson': -0.1919659288904818, 'Dr. Smith': -0.06704293235792054, 'Dr. Brown': 0.04121712974441467, 'Dr. Taylor': -0.17659959187130372}, 'Charlie': {'Dr. Johnson': -0.1852625648798173, 'Dr. Smith': -0.20074282779498814, 'Dr. Brown': -0.07343747072075271, 'Dr. Taylor': -0.32181506778530855}, 'David': {'Dr. Johnson': -0.24958310976542486, 'Dr. Smith': -0.1511172216734905, 'Dr. Brown': -0.034049975615631764, 'Dr. Taylor': 0.7574836205993246}}


In [184]:
# Run every scoring method once on the sample data. Each result is a nested
# {candidate: {expert: score}} dict; the ensemble cell that follows reads them.
cosine_similarity_with_BERT_embeddings_matrix = cosine_similarity_with_BERT_embeddings(candidates_data, experts_data)
cosine_similarity_with_topic_modelling_matrix = cosine_similarity_with_topic_modelling(candidates_data, experts_data)
Faiss_search_matrix = Faiss_search(candidates_data, experts_data)
Bipartite_graph_matching_matrix = Bipartite_graph_matching(candidates_data, experts_data)
hybrid_recommendation_algorithm_matrix = hybrid_recommendation_algorithm(candidates_data, experts_data)

# without precomputation - 8.9 s



In [185]:
# Example scores from each method with desired format
# ({candidate: {expert: score}}), keyed by a short method label.
# NOTE(review): Bipartite_graph_matching_matrix is computed in the previous
# cell but is not part of this ensemble; confirm the omission is intentional.
methods_scores = {
    'cosine_similarity_with_topic_modelling': cosine_similarity_with_topic_modelling_matrix,
    'cosine_similarity_with_BERT_embeddings': cosine_similarity_with_BERT_embeddings_matrix,
    'faiss_searching': Faiss_search_matrix,
    'hybrid_recommendation': hybrid_recommendation_algorithm_matrix
}

# Normalizing function
def normalize_scores(scores):
    """Min-max scale a nested score dict onto [0, 1] across all pairs.

    The minimum and maximum are taken over every candidate/expert score in
    ``scores`` (not per candidate), so relative differences between
    candidates are preserved.

    Args:
        scores: ``{candidate: {expert: score}}``.

    Returns:
        dict: Same nesting and keys with each score mapped to
        ``(score - min) / (max - min)``. When there are no scores the inner
        dicts are returned empty; when every score is identical (zero range)
        all scores map to ``0.0`` instead of dividing by zero.
    """
    all_scores = [v for candidate_scores in scores.values() for v in candidate_scores.values()]
    if not all_scores:
        # Nothing to scale; keep the candidate keys so callers can still iterate.
        return {candidate: {} for candidate in scores}

    min_score = min(all_scores)
    score_range = max(all_scores) - min_score
    if score_range == 0:
        # Degenerate range: every pair is equally good, so report the floor.
        return {candidate: {expert: 0.0 for expert in candidate_scores}
                for candidate, candidate_scores in scores.items()}

    return {candidate: {expert: (score - min_score) / score_range
                        for expert, score in candidate_scores.items()}
            for candidate, candidate_scores in scores.items()}

# Step 1: bring every method's scores onto a common scale before mixing them.
normalized_methods_scores = {}
for method, scores in methods_scores.items():
    normalized_methods_scores[method] = normalize_scores(scores)

# Step 2: blend the methods, each one contributing the same weight.
weights = {method: 1 / len(normalized_methods_scores) for method in normalized_methods_scores}

# The FAISS result defines which candidate/expert pairs are combined.
combined_scores = {}
for candidate, reference_experts in normalized_methods_scores['faiss_searching'].items():
    combined_scores[candidate] = {}
    for expert in reference_experts:
        combined_scores[candidate][expert] = sum(
            weights[method] * method_scores[candidate][expert]
            for method, method_scores in normalized_methods_scores.items()
        )

# Step 3: flatten to (candidate, expert, score) triples, best score first.
final_matches = [
    (candidate, expert, score)
    for candidate, experts in combined_scores.items()
    for expert, score in experts.items()
]
final_matches.sort(key=lambda triple: triple[2], reverse=True)

# Keep only the highest-ranked triple per candidate. Candidates are unique in
# the result; the same expert may be picked for several candidates.
assigned_candidates = set()
unique_matches = []
for candidate, expert, score in final_matches:
    if candidate in assigned_candidates:
        continue
    assigned_candidates.add(candidate)
    unique_matches.append((candidate, expert, score))

# Report the selected match for each candidate, strongest first.
print("Final Unique Matching based on Combined Scores:")
for candidate, expert, score in unique_matches:
    print(f"Candidate {candidate} is best matched with Expert {expert} (Combined Score: {score:.2f})")


Final Unique Matching based on Combined Scores:
Candidate David is best matched with Expert Dr. Taylor (Combined Score: 0.98)
Candidate Alice is best matched with Expert Dr. Johnson (Combined Score: 0.90)
Candidate Bob is best matched with Expert Dr. Smith (Combined Score: 0.76)
Candidate Charlie is best matched with Expert Dr. Johnson (Combined Score: 0.30)


In [None]:
# # takes too much time
# def Multi_Objective_Genetic_Algorithms(candidates_data, experts_data):
#     # Convert data to DataFrames
#     candidates_df = pd.DataFrame(candidates_data)
#     experts_df = pd.DataFrame(experts_data)

#     # Load pre-trained Sentence-BERT model to generate embeddings
#     model = SentenceTransformer('all-MiniLM-L6-v2')

#     # Generate embeddings for candidate and expert skills
#     candidates_embeddings = model.encode(candidates_df['skills'].tolist())
#     experts_embeddings = model.encode(experts_df['skills'].tolist())

#     # Compute cosine similarity between each candidate and expert
#     similarity_matrix = cosine_similarity(candidates_embeddings, experts_embeddings)

#     # Create a similarity matrix DataFrame
#     similarity_matrix_df = pd.DataFrame(similarity_matrix,
#                                         index=candidates_df['name'],
#                                         columns=experts_df['name'])

#     # Genetic Algorithm Setup
#     creator.create("FitnessMulti", base.Fitness, weights=(1.0, -1.0))  # Maximize similarity, minimize experience difference
#     creator.create("Individual", list, fitness=creator.FitnessMulti)

#     toolbox = base.Toolbox()

#     # Initialize individual with random assignment of experts to candidates, allowing duplicates
#     def init_individual():
#         return random.choices(range(len(experts_df)), k=len(candidates_df))

#     toolbox.register("individual", tools.initIterate, creator.Individual, init_individual)
#     toolbox.register("population", tools.initRepeat, list, toolbox.individual)

#     def evaluate(individual):
#         total_similarity = 0.0
#         total_experience_difference = 0.0
        
#         for i, expert_idx in enumerate(individual):
#             candidate_experience = candidates_df.iloc[i]['experience']
#             expert_experience = experts_df.iloc[expert_idx]['experience']
            
#             similarity = similarity_matrix[i, expert_idx]
#             experience_difference = abs(candidate_experience - expert_experience)
            
#             total_similarity += similarity
#             total_experience_difference += experience_difference
        
#         return total_similarity / len(individual), total_experience_difference / len(individual)

#     toolbox.register("mate", tools.cxTwoPoint)
#     toolbox.register("mutate", tools.mutShuffleIndexes, indpb=0.2)
#     toolbox.register("select", tools.selNSGA2)
#     toolbox.register("evaluate", evaluate)

#     # Genetic Algorithm Execution
#     population = toolbox.population(n=50)  # Population size
#     ngen = 50  # Number of generations
#     cxpb = 0.7  # Crossover probability
#     mutpb = 0.2  # Mutation probability

#     result_population = algorithms.eaMuPlusLambda(population, toolbox, mu=50, lambda_=100, cxpb=cxpb, mutpb=mutpb, ngen=ngen, 
#                                                 stats=None, halloffame=None, verbose=False)

#     # Extract best individuals
#     all_individuals = result_population[0]
    
#     # Create a dictionary to store the similarity scores for all individuals
#     similarity_scores_all = {}

#     for ind in all_individuals:
#         individual_score = {}
#         for i, expert_idx in enumerate(ind):
#             candidate_name = candidates_df.iloc[i]['name']
#             expert_name = experts_df.iloc[expert_idx]['name']
#             similarity_score = similarity_matrix[i, expert_idx]

#             if candidate_name not in individual_score:
#                 individual_score[candidate_name] = {}
            
#             individual_score[candidate_name][expert_name] = similarity_score
        
#         similarity_scores_all[str(ind)] = individual_score

#     # Print all matches with similarity scores
#     print("Similarity Scores for All Individuals:")
#     for individual, scores in similarity_scores_all.items():
#         print(f"Individual {individual}: {scores}")

#     return similarity_scores_all



In [None]:
# def K_means_Clustering_Algorithm(candidates_data, experts_data):
#     # Create DataFrames
#     candidates_df = pd.DataFrame(candidates_data)
#     experts_df = pd.DataFrame(experts_data)

#     # Combine skills to create a vocabulary
#     all_skills = candidates_df['skills'].tolist() + experts_df['skills'].tolist()

#     # Vectorize skills
#     vectorizer = CountVectorizer()
#     skills_matrix = vectorizer.fit_transform(all_skills)

#     # Split into candidates and experts features
#     candidates_skills_matrix = skills_matrix[:len(candidates_df)]
#     experts_skills_matrix = skills_matrix[len(candidates_df):]

#     # Convert to DataFrames
#     candidates_skills_df = pd.DataFrame(candidates_skills_matrix.toarray(), columns=vectorizer.get_feature_names_out())
#     experts_skills_df = pd.DataFrame(experts_skills_matrix.toarray(), columns=vectorizer.get_feature_names_out())

#     # Add experience as a feature
#     candidates_features = pd.concat([candidates_skills_df, candidates_df['experience']], axis=1)
#     experts_features = pd.concat([experts_skills_df, experts_df['experience']], axis=1)

#     # Ensure 'experience' is numeric
#     candidates_features['experience'] = pd.to_numeric(candidates_features['experience'], errors='coerce')
#     experts_features['experience'] = pd.to_numeric(experts_features['experience'], errors='coerce')

#     # Combine all features for standardization
#     all_features = pd.concat([candidates_features, experts_features], axis=0)

#     # Handle missing values (e.g., fill with 0)
#     all_features.fillna(0, inplace=True)

#     # Standardize the combined features
#     scaler = StandardScaler()
#     all_features_scaled = scaler.fit_transform(all_features)

#     # Split back into candidates and experts
#     candidates_features_scaled = all_features_scaled[:len(candidates_df)]
#     experts_features_scaled = all_features_scaled[len(candidates_df):]

#     # Apply K-Means
#     kmeans = KMeans(n_clusters=len(experts_df), random_state=42)
#     kmeans.fit(all_features_scaled)

#     # Get cluster labels
#     candidates_clusters = kmeans.predict(candidates_features_scaled)
#     experts_clusters = kmeans.predict(experts_features_scaled)

#     # Assign cluster labels back to DataFrames
#     candidates_df['cluster'] = candidates_clusters
#     experts_df['cluster'] = experts_clusters

#     # Compute similarity matrix for each cluster
#     combined_similarity_matrix = {}
#     for cluster in np.unique(candidates_clusters):
#         cluster_candidates = candidates_df[candidates_df['cluster'] == cluster]
#         cluster_experts = experts_df[experts_df['cluster'] == cluster]
        
#         if not cluster_experts.empty and not cluster_candidates.empty:
#             # Combine skills and experience for similarity calculation
#             cluster_candidates_features = cluster_candidates.drop(columns=['id', 'name', 'cluster'])
#             cluster_experts_features = cluster_experts.drop(columns=['id', 'name', 'cluster'])

#             # Convert all columns to numeric and handle missing values
#             cluster_candidates_features = cluster_candidates_features.apply(pd.to_numeric, errors='coerce').fillna(0)
#             cluster_experts_features = cluster_experts_features.apply(pd.to_numeric, errors='coerce').fillna(0)

#             # Compute cosine similarity between candidates and experts in the same cluster
#             similarity_matrix = cosine_similarity(cluster_candidates_features, cluster_experts_features)
            
#             # Store similarity scores in the desired dictionary format
#             for i, candidate_name in enumerate(cluster_candidates['name']):
#                 if candidate_name not in combined_similarity_matrix:
#                     combined_similarity_matrix[candidate_name] = {}
#                 for j, expert_name in enumerate(cluster_experts['name']):
#                     combined_similarity_matrix[candidate_name][expert_name] = similarity_matrix[i][j]

#     return combined_similarity_matrix
