## Model Improvement Suggestions

**1:** Test different methods for each step of the model pipeline

**1.1:** Try different models
- Current Model: all-mpnet-base-v2
- [Hugging Face - Model leaderboard:](https://huggingface.co/spaces/mteb/leaderboard) (Try at least 3 best ones)

**1.2:** Try skill-by-skill vs. bag-of-skills

**1.3:** ...

**2:** Last Step: Cross-Encoder without and with special tokens (new approach)

# 1 Running the Analysis on the Baseline Model

## 1.1 Import the Synthetic Dataset (100 JDs a 6 CVs)

In [1]:
import pandas as pd

# Load the synthetic JD/CV dataset from CSV into a DataFrame.
# NOTE(review): the path is absolute and machine-specific, so this cell only
# runs as written on the author's machine.
file_path = "/Users/timg/Desktop/Workproject/my_fork/workproject_matching_algo/synthetic_data_csv/df_synth_data.csv"
df_synth_data = pd.read_csv(file_path)

# Bare expression as the last statement so the notebook renders the DataFrame
# (the recorded output shows an "Unnamed: 0" column next to the data columns)
df_synth_data

Unnamed: 0,JD_ID,JD_title,JD_text,CV_ID,CV_text,label
0,1,Senior Site Reliability Engineer,About the Team Workday is building a new SRE t...,11,"**John Doe** \n123 Tech Lane \nPleasanton, C...",1
1,1,Senior Site Reliability Engineer,About the Team Workday is building a new SRE t...,12,**John Smith** \n[LinkedIn Profile] | [Github...,1
2,1,Senior Site Reliability Engineer,About the Team Workday is building a new SRE t...,13,**John D. Anderson** \n123 Tech Lane \nSan F...,1
3,1,Senior Site Reliability Engineer,About the Team Workday is building a new SRE t...,14,**John Anderson** \n123 Tech Avenue \nPleasa...,0
4,1,Senior Site Reliability Engineer,About the Team Workday is building a new SRE t...,15,**John Smith** \n123 Tech Lane \nSan Francis...,0
...,...,...,...,...,...,...
595,100,Software Engineer,"Why APCON? At APCON, we are committed to setti...",1002,"**John Smith** \n123 Tech Drive \nSan Jose, ...",1
596,100,Software Engineer,"Why APCON? At APCON, we are committed to setti...",1003,**Resume**\n\n**John Doe** \n123 Main Street ...,1
597,100,Software Engineer,"Why APCON? At APCON, we are committed to setti...",1004,"**John Doe** \n[Your Address] \n[City, State...",0
598,100,Software Engineer,"Why APCON? At APCON, we are committed to setti...",1005,"**John Doe** \n1234 Elm Street \nCityville, ...",0


## 1.2 Importing Classes and Functions

In [2]:
import sys
import os
import pandas as pd
from spacy.lang.en import English

# Project directories; everything below is derived from PROJECT_ROOT.
# NOTE(review): PROJECT_ROOT is an absolute, machine-specific path.
PROJECT_ROOT = '/Users/timg/Desktop/Workproject/my_fork/workproject_matching_algo'
SERVICES_DIR = os.path.join(PROJECT_ROOT, 'services')
RESOURCES_DIR = os.path.join(PROJECT_ROOT, 'Resources', 'data')

# Make the project importable: append each directory to sys.path once,
# so re-running the cell does not add duplicates
for module_path in [SERVICES_DIR, PROJECT_ROOT]:
    if module_path not in sys.path:
        sys.path.append(module_path)

# Project functions used by the analysis; this import only resolves after
# the sys.path additions above
from main import calc_similarity, calc_similarity_sbs, job_info_extraction, resume_extraction  # resolved via sys.path

# Location of the skills patterns file (skills.jsonl) inside the resources directory
skills_patterns_path = os.path.join(RESOURCES_DIR, 'skills.jsonl')

  from tqdm.autonotebook import tqdm, trange


## 1.3 Setting up the Baseline Model Function

In [None]:
import time

def main_synth_df(synth_data):
    """Score every resume of the synthetic dataset against its job description.

    For each distinct ``JD_ID`` the job text of the first matching row is run
    through ``job_info_extraction``, the resumes of that job through
    ``resume_extraction``, and both results through ``calc_similarity``.
    The resulting ``similarity_score`` column is attached to the job's rows
    as ``Similarity_score``.

    Args:
        synth_data: DataFrame with the columns ``JD_ID``, ``JD_text``,
            ``CV_ID`` and ``CV_text`` (any further columns are carried along).

    Returns:
        A DataFrame containing all rows of ``synth_data`` (grouped by job, with
        a fresh 0..n-1 index) plus the ``Similarity_score`` column.

    Side effects:
        Prints the total wall-clock processing time in milliseconds.
    """
    started_at = time.time()

    # One scored frame per job description, concatenated at the end
    scored_frames = []

    for jd_id in synth_data['JD_ID'].unique():
        rows_for_job = synth_data[synth_data['JD_ID'] == jd_id]

        # All rows of one job share the same text, so the first row is enough
        df_jobs = pd.DataFrame([rows_for_job.iloc[0]['JD_text']], columns=["raw"])
        df_jobs = job_info_extraction(df_jobs)

        # The extraction helpers expect "name" and "raw" columns
        resumes = rows_for_job[['CV_ID', 'CV_text']].assign(
            name=rows_for_job['CV_ID'],
            raw=rows_for_job['CV_text'],
        )
        df_resumes = resume_extraction(resumes)

        analysis_data_df = calc_similarity(df_resumes, df_jobs, parallel=True)

        # The scores are attached by index label after resetting to 0..n-1.
        # NOTE(review): this assumes calc_similarity returns one row per resume,
        # in the same order and with a 0..n-1 index - confirm against main.py.
        scored = rows_for_job.reset_index(drop=True)
        scored['Similarity_score'] = analysis_data_df['similarity_score']
        scored_frames.append(scored)

    df_final = pd.concat(scored_frames, ignore_index=True)

    elapsed = time.time() - started_at
    print(f"Processing Time: {elapsed*1000:.2f}ms")

    return df_final

## 1.4 Running the Baseline Model

### 1.4.1 First Attempt

**Specs:**
- 100 JDs a 6 CVs
- "almost perfect" vs. "slightly bad"

In [None]:
# Run the baseline pipeline on the first synthetic dataset
df_result = main_synth_df(df_synth_data)

# Preview the first three scored rows
df_result.head(3)

In [None]:
# Order the rows by job and, within each job, by descending similarity
df_result = df_result.sort_values(['JD_ID', 'Similarity_score'], ascending=[True, False])

# Mark the three best-scored resumes of every JD_ID with 1, all others with 0.
# method='first' gives tied scores distinct ranks (in row order), so each job
# gets exactly three positives; the default 'average' method hands out
# fractional ranks (e.g. 3.5) and could select fewer or more than three.
# Rows with a missing score get no rank and are labelled 0.
df_result['label_predicted'] = (
    df_result.groupby('JD_ID')['Similarity_score']
    .rank(method='first', ascending=False)
    .le(3)
    .astype(int)
)

# Render the labelled DataFrame
df_result

In [None]:
# How many rows were predicted as 1 and as 0
df_result["label_predicted"].value_counts()

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Cross-tabulate the true labels against the top-3 predictions
cm = confusion_matrix(y_true=df_result['label'], y_pred=df_result['label_predicted'])

# Draw the matrix as a blue heat map with integer cell counts
disp = ConfusionMatrixDisplay(cm, display_labels=["Class 0", "Class 1"])
disp.plot(values_format="d", cmap="Blues")

In [None]:
# Mean / median similarity of the rows whose true label is 1 (4 decimals)
mean_label_1 = round(df_result.loc[df_result['label'] == 1, 'Similarity_score'].mean(), 4)
median_label_1 = round(df_result.loc[df_result['label'] == 1, 'Similarity_score'].median(), 4)

# Same statistics for the rows whose true label is 0
mean_label_0 = round(df_result.loc[df_result['label'] == 0, 'Similarity_score'].mean(), 4)
median_label_0 = round(df_result.loc[df_result['label'] == 0, 'Similarity_score'].median(), 4)

# Gap between the two groups (label 1 minus label 0)
mean_difference = round(mean_label_1 - mean_label_0, 4)
median_difference = round(median_label_1 - median_label_0, 4)

# Report
print(f"Label 1 - Mean Similarity Score: {mean_label_1}, Median Similarity Score: {median_label_1}")
print(f"Label 0 - Mean Similarity Score: {mean_label_0}, Median Similarity Score: {median_label_0}")
print(f"Difference - Mean: {mean_difference}, Median: {median_difference}")

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Similarity scores split by true label
scores_label_1 = df_result.loc[df_result['label'] == 1, 'Similarity_score']
scores_label_0 = df_result.loc[df_result['label'] == 0, 'Similarity_score']

# 50 evenly spaced edges on [0, 1] give 49 equal-width bins; density=True
# normalises each histogram so the two groups are comparable
bins = np.linspace(0, 1, num=50)
hist_label_1, _ = np.histogram(scores_label_1, bins=bins, density=True)
hist_label_0, _ = np.histogram(scores_label_0, bins=bins, density=True)

# Midpoint of every bin, used as the x-coordinates of the lines
bin_centers = 0.5 * (bins[1:] + bins[:-1])

# One line per label on a shared figure
plt.figure(figsize=(12, 6))
plt.plot(bin_centers, hist_label_1, label='Label 1')
plt.plot(bin_centers, hist_label_0, label='Label 0')

# Axis labels, title and legend
plt.xlabel('Similarity Score', fontsize=14)
plt.ylabel('Density', fontsize=14)
plt.title('Distribution of Similarity Scores by Label (Detailed Line Graph)', fontsize=16)
plt.legend(fontsize=12)

# Render the figure
plt.show()

In [None]:
# Keep only the rows whose prediction disagrees with the true label
df_mismatched = df_result.loc[df_result['label'].ne(df_result['label_predicted'])]

# Preview the first five misclassified rows
df_mismatched.head()

In [None]:
# Job descriptions that contain at least one misclassified resume
mismatched_jd_ids = df_result[df_result['label'].ne(df_result['label_predicted'])]['JD_ID'].unique()

# All rows (correct and incorrect) belonging to those job descriptions
df_mismatched_jd_ids = df_result.loc[df_result['JD_ID'].isin(mismatched_jd_ids)]

# Preview the first six rows
df_mismatched_jd_ids.head(6)

In [None]:
# Order by job (ascending) and, within each job, by similarity (descending)
df_mismatched_jd_ids = df_mismatched_jd_ids.sort_values(
    ['JD_ID', 'Similarity_score'],
    ascending=[True, False],
)

# Preview the first six rows of the sorted frame
df_mismatched_jd_ids.head(6)

### 1.4.2 Second Attempt

**Specs:**
- Same, second try

In [None]:
# Second run of the baseline pipeline on the same dataset as the first attempt
df_result_2 = main_synth_df(df_synth_data)

# Preview the first three scored rows
df_result_2.head(3)

In [None]:
# Order the rows by job and, within each job, by descending similarity
df_result_2 = df_result_2.sort_values(['JD_ID', 'Similarity_score'], ascending=[True, False])

# Mark the three best-scored resumes of every JD_ID with 1, all others with 0.
# method='first' gives tied scores distinct ranks (in row order), so each job
# gets exactly three positives; the default 'average' method hands out
# fractional ranks (e.g. 3.5) and could select fewer or more than three.
# Rows with a missing score get no rank and are labelled 0.
df_result_2['label_predicted'] = (
    df_result_2.groupby('JD_ID')['Similarity_score']
    .rank(method='first', ascending=False)
    .le(3)
    .astype(int)
)

# Preview the first three labelled rows
df_result_2.head(3)

In [None]:
# How many rows were predicted as 1 and as 0 in the second run
df_result_2["label_predicted"].value_counts()

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Cross-tabulate the true labels against the top-3 predictions of the second run
cm = confusion_matrix(y_true=df_result_2['label'], y_pred=df_result_2['label_predicted'])

# Draw the matrix as a blue heat map with integer cell counts
disp = ConfusionMatrixDisplay(cm, display_labels=["Class 0", "Class 1"])
disp.plot(values_format="d", cmap="Blues")

In [None]:
# Statistics of the SECOND run: every lookup reads df_result_2 (the cell
# previously read df_result and therefore repeated the first attempt's numbers).

# Mean / median similarity of the rows whose true label is 1 (4 decimals)
mean_label_1 = round(df_result_2[df_result_2['label'] == 1]['Similarity_score'].mean(), 4)
median_label_1 = round(df_result_2[df_result_2['label'] == 1]['Similarity_score'].median(), 4)

# Same statistics for the rows whose true label is 0
mean_label_0 = round(df_result_2[df_result_2['label'] == 0]['Similarity_score'].mean(), 4)
median_label_0 = round(df_result_2[df_result_2['label'] == 0]['Similarity_score'].median(), 4)

# Gap between the two groups (label 1 minus label 0)
mean_difference = round(mean_label_1 - mean_label_0, 4)
median_difference = round(median_label_1 - median_label_0, 4)

# Report
print(f"Label 1 - Mean Similarity Score: {mean_label_1}, Median Similarity Score: {median_label_1}")
print(f"Label 0 - Mean Similarity Score: {mean_label_0}, Median Similarity Score: {median_label_0}")
print(f"Difference - Mean: {mean_difference}, Median: {median_difference}")

### 1.4.3 Third Attempt

**Specs:**
- 20 JDs a 6 CVs
- "very good fit" vs. "moderate fit"

In [None]:
import pandas as pd

# Load the third-attempt dataset from CSV into a DataFrame.
# NOTE(review): the path is absolute and machine-specific.
file_path = "/Users/timg/Desktop/Workproject/my_fork/workproject_matching_algo/synthetic_data_csv/df_synth_data_2_test.csv"
df_synth_data_2_test = pd.read_csv(file_path)

# Bare expression as the last statement so the notebook renders the DataFrame
df_synth_data_2_test

In [None]:
# Run the baseline pipeline on the third-attempt dataset
df_result_2_test = main_synth_df(df_synth_data_2_test)

# Preview the first three scored rows
df_result_2_test.head(3)

In [None]:
# Order the rows by job and, within each job, by descending similarity
df_result_2_test = df_result_2_test.sort_values(['JD_ID', 'Similarity_score'], ascending=[True, False])

# Mark the three best-scored resumes of every JD_ID with 1, all others with 0.
# method='first' gives tied scores distinct ranks (in row order), so each job
# gets exactly three positives; the default 'average' method hands out
# fractional ranks (e.g. 3.5) and could select fewer or more than three.
# Rows with a missing score get no rank and are labelled 0.
df_result_2_test['label_predicted'] = (
    df_result_2_test.groupby('JD_ID')['Similarity_score']
    .rank(method='first', ascending=False)
    .le(3)
    .astype(int)
)

# Render the labelled DataFrame
df_result_2_test

In [None]:
# How many rows were predicted as 1 and as 0 in the third attempt
df_result_2_test["label_predicted"].value_counts()

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Cross-tabulate the true labels against the top-3 predictions of the third attempt
cm_2_test = confusion_matrix(y_true=df_result_2_test['label'], y_pred=df_result_2_test['label_predicted'])

# Draw the matrix as a blue heat map with integer cell counts
disp_2_test = ConfusionMatrixDisplay(cm_2_test, display_labels=["Class 0", "Class 1"])
disp_2_test.plot(values_format="d", cmap="Blues")

In [None]:
# Mean / median similarity of the rows whose true label is 1 (4 decimals)
mean_label_1_2_test = round(df_result_2_test.loc[df_result_2_test['label'] == 1, 'Similarity_score'].mean(), 4)
median_label_1_2_test = round(df_result_2_test.loc[df_result_2_test['label'] == 1, 'Similarity_score'].median(), 4)

# Same statistics for the rows whose true label is 0
mean_label_0_2_test = round(df_result_2_test.loc[df_result_2_test['label'] == 0, 'Similarity_score'].mean(), 4)
median_label_0_2_test = round(df_result_2_test.loc[df_result_2_test['label'] == 0, 'Similarity_score'].median(), 4)

# Gap between the two groups (label 1 minus label 0)
mean_difference_2_test = round(mean_label_1_2_test - mean_label_0_2_test, 4)
median_difference_2_test = round(median_label_1_2_test - median_label_0_2_test, 4)

# Report
print(f"Label 1 - Mean Similarity Score: {mean_label_1_2_test}, Median Similarity Score: {median_label_1_2_test}")
print(f"Label 0 - Mean Similarity Score: {mean_label_0_2_test}, Median Similarity Score: {median_label_0_2_test}")
print(f"Difference - Mean: {mean_difference_2_test}, Median: {median_difference_2_test}")

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Similarity scores split by true label
scores_label_1 = df_result_2_test.loc[df_result_2_test['label'] == 1, 'Similarity_score']
scores_label_0 = df_result_2_test.loc[df_result_2_test['label'] == 0, 'Similarity_score']

# 50 evenly spaced edges on [0, 1] give 49 equal-width bins; density=True
# normalises each histogram so the two groups are comparable
bins = np.linspace(0, 1, num=50)
hist_label_1, _ = np.histogram(scores_label_1, bins=bins, density=True)
hist_label_0, _ = np.histogram(scores_label_0, bins=bins, density=True)

# Midpoint of every bin, used as the x-coordinates of the lines
bin_centers = 0.5 * (bins[1:] + bins[:-1])

# One line per label on a shared figure
plt.figure(figsize=(12, 6))
plt.plot(bin_centers, hist_label_1, label='Label 1')
plt.plot(bin_centers, hist_label_0, label='Label 0')

# Axis labels, title and legend
plt.xlabel('Similarity Score', fontsize=14)
plt.ylabel('Density', fontsize=14)
plt.title('Distribution of Similarity Scores by Label (Detailed Line Graph)', fontsize=16)
plt.legend(fontsize=12)

# Render the figure
plt.show()

### 1.4.4 Fourth Attempt

**Specs:**
- 20 JDs a 6 CVs
- "almost perfect fit" vs. "slightly bad to moderate fit"

In [None]:
import pandas as pd

# Load the fourth-attempt dataset from CSV into a DataFrame.
# NOTE(review): the path is absolute and machine-specific.
file_path = "/Users/timg/Desktop/Workproject/my_fork/workproject_matching_algo/synthetic_data_csv/df_synth_data_3_test.csv"
df_synth_data_3_test = pd.read_csv(file_path)

# Bare expression as the last statement so the notebook renders the DataFrame
df_synth_data_3_test

In [None]:
# Run the baseline pipeline on the fourth-attempt dataset
df_result_3_test = main_synth_df(df_synth_data_3_test)

# Preview the first three scored rows
df_result_3_test.head(3)

In [None]:
# Order the rows by job and, within each job, by descending similarity
df_result_3_test = df_result_3_test.sort_values(['JD_ID', 'Similarity_score'], ascending=[True, False])

# Mark the three best-scored resumes of every JD_ID with 1, all others with 0.
# method='first' gives tied scores distinct ranks (in row order), so each job
# gets exactly three positives; the default 'average' method hands out
# fractional ranks (e.g. 3.5) and could select fewer or more than three.
# Rows with a missing score get no rank and are labelled 0.
df_result_3_test['label_predicted'] = (
    df_result_3_test.groupby('JD_ID')['Similarity_score']
    .rank(method='first', ascending=False)
    .le(3)
    .astype(int)
)

# Render the labelled DataFrame
df_result_3_test

In [None]:
# How many rows were predicted as 1 and as 0 in the fourth attempt
df_result_3_test["label_predicted"].value_counts()

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Cross-tabulate the true labels against the top-3 predictions of the fourth attempt
cm_3_test = confusion_matrix(y_true=df_result_3_test['label'], y_pred=df_result_3_test['label_predicted'])

# Draw the matrix as a blue heat map with integer cell counts
disp_3_test = ConfusionMatrixDisplay(cm_3_test, display_labels=["Class 0", "Class 1"])
disp_3_test.plot(values_format="d", cmap="Blues")

In [None]:
# Mean / median similarity of the rows whose true label is 1 (4 decimals)
mean_label_1_3_test = round(df_result_3_test.loc[df_result_3_test['label'] == 1, 'Similarity_score'].mean(), 4)
median_label_1_3_test = round(df_result_3_test.loc[df_result_3_test['label'] == 1, 'Similarity_score'].median(), 4)

# Same statistics for the rows whose true label is 0
mean_label_0_3_test = round(df_result_3_test.loc[df_result_3_test['label'] == 0, 'Similarity_score'].mean(), 4)
median_label_0_3_test = round(df_result_3_test.loc[df_result_3_test['label'] == 0, 'Similarity_score'].median(), 4)

# Gap between the two groups (label 1 minus label 0)
mean_difference_3_test = round(mean_label_1_3_test - mean_label_0_3_test, 4)
median_difference_3_test = round(median_label_1_3_test - median_label_0_3_test, 4)

# Report
print(f"Label 1 - Mean Similarity Score: {mean_label_1_3_test}, Median Similarity Score: {median_label_1_3_test}")
print(f"Label 0 - Mean Similarity Score: {mean_label_0_3_test}, Median Similarity Score: {median_label_0_3_test}")
print(f"Difference - Mean: {mean_difference_3_test}, Median: {median_difference_3_test}")

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Similarity scores split by true label
scores_label_1 = df_result_3_test.loc[df_result_3_test['label'] == 1, 'Similarity_score']
scores_label_0 = df_result_3_test.loc[df_result_3_test['label'] == 0, 'Similarity_score']

# 50 evenly spaced edges on [0, 1] give 49 equal-width bins; density=True
# normalises each histogram so the two groups are comparable
bins = np.linspace(0, 1, num=50)
hist_label_1, _ = np.histogram(scores_label_1, bins=bins, density=True)
hist_label_0, _ = np.histogram(scores_label_0, bins=bins, density=True)

# Midpoint of every bin, used as the x-coordinates of the lines
bin_centers = 0.5 * (bins[1:] + bins[:-1])

# One line per label on a shared figure
plt.figure(figsize=(12, 6))
plt.plot(bin_centers, hist_label_1, label='Label 1')
plt.plot(bin_centers, hist_label_0, label='Label 0')

# Axis labels, title and legend
plt.xlabel('Similarity Score', fontsize=14)
plt.ylabel('Density', fontsize=14)
plt.title('Distribution of Similarity Scores by Label (Detailed Line Graph)', fontsize=16)
plt.legend(fontsize=12)

# Render the figure
plt.show()

### 1.4.5 Fifth Attempt

**Specs:**
- **100 JDs** a 6 CVs
- "almost perfect fit" vs. "slightly bad to moderate fit"

In [None]:
import pandas as pd

# Load the fifth-attempt dataset from CSV into a DataFrame.
# NOTE(review): the path is absolute and machine-specific.
file_path = "/Users/timg/Desktop/Workproject/my_fork/workproject_matching_algo/synthetic_data_csv/df_synth_data_4_100.csv"
df_synth_data_4_100 = pd.read_csv(file_path)

# Bare expression as the last statement so the notebook renders the DataFrame
df_synth_data_4_100

In [None]:
# Run the baseline pipeline on the fifth-attempt dataset
df_result_4_100 = main_synth_df(df_synth_data_4_100)

# Preview the first three scored rows
df_result_4_100.head(3)

In [None]:
# Order the rows by job and, within each job, by descending similarity
df_result_4_100 = df_result_4_100.sort_values(['JD_ID', 'Similarity_score'], ascending=[True, False])

# Mark the three best-scored resumes of every JD_ID with 1, all others with 0.
# method='first' gives tied scores distinct ranks (in row order), so each job
# gets exactly three positives; the default 'average' method hands out
# fractional ranks (e.g. 3.5) and could select fewer or more than three.
# Rows with a missing score get no rank and are labelled 0.
df_result_4_100['label_predicted'] = (
    df_result_4_100.groupby('JD_ID')['Similarity_score']
    .rank(method='first', ascending=False)
    .le(3)
    .astype(int)
)

# Render the labelled DataFrame
df_result_4_100

In [None]:
# How many rows were predicted as 1 and as 0 in the fifth attempt
df_result_4_100["label_predicted"].value_counts()

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Cross-tabulate the true labels against the top-3 predictions of the fifth attempt
cm_4_100 = confusion_matrix(y_true=df_result_4_100['label'], y_pred=df_result_4_100['label_predicted'])

# Draw the matrix as a blue heat map with integer cell counts
disp_4_100 = ConfusionMatrixDisplay(cm_4_100, display_labels=["Class 0", "Class 1"])
disp_4_100.plot(values_format="d", cmap="Blues")

In [None]:
# Mean / median similarity of the rows whose true label is 1 (4 decimals)
mean_label_1_4_100 = round(df_result_4_100.loc[df_result_4_100['label'] == 1, 'Similarity_score'].mean(), 4)
median_label_1_4_100 = round(df_result_4_100.loc[df_result_4_100['label'] == 1, 'Similarity_score'].median(), 4)

# Same statistics for the rows whose true label is 0
mean_label_0_4_100 = round(df_result_4_100.loc[df_result_4_100['label'] == 0, 'Similarity_score'].mean(), 4)
median_label_0_4_100 = round(df_result_4_100.loc[df_result_4_100['label'] == 0, 'Similarity_score'].median(), 4)

# Gap between the two groups (label 1 minus label 0)
mean_difference_4_100 = round(mean_label_1_4_100 - mean_label_0_4_100, 4)
median_difference_4_100 = round(median_label_1_4_100 - median_label_0_4_100, 4)

# Report
print(f"Label 1 - Mean Similarity Score: {mean_label_1_4_100}, Median Similarity Score: {median_label_1_4_100}")
print(f"Label 0 - Mean Similarity Score: {mean_label_0_4_100}, Median Similarity Score: {median_label_0_4_100}")
print(f"Difference - Mean: {mean_difference_4_100}, Median: {median_difference_4_100}")

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Similarity scores split by true label
scores_label_1 = df_result_4_100.loc[df_result_4_100['label'] == 1, 'Similarity_score']
scores_label_0 = df_result_4_100.loc[df_result_4_100['label'] == 0, 'Similarity_score']

# 50 evenly spaced edges on [0, 1] give 49 equal-width bins; density=True
# normalises each histogram so the two groups are comparable
bins = np.linspace(0, 1, num=50)
hist_label_1, _ = np.histogram(scores_label_1, bins=bins, density=True)
hist_label_0, _ = np.histogram(scores_label_0, bins=bins, density=True)

# Midpoint of every bin, used as the x-coordinates of the lines
bin_centers = 0.5 * (bins[1:] + bins[:-1])

# One line per label on a shared figure
plt.figure(figsize=(12, 6))
plt.plot(bin_centers, hist_label_1, label='Label 1')
plt.plot(bin_centers, hist_label_0, label='Label 0')

# Axis labels, title and legend
plt.xlabel('Similarity Score', fontsize=14)
plt.ylabel('Density', fontsize=14)
plt.title('Distribution of Similarity Scores by Label (Detailed Line Graph)', fontsize=16)
plt.legend(fontsize=12)

# Render the figure
plt.show()

# 2 Improving the Model Components

## Model Pipeline
1. **Skills Extraction:** Skills Dictionary (baseline) vs. Taxonomy
2. **Input for Embeddings:** Bag-of-skills (baseline) vs. Skill-by-skill
3. **Embedding Models:** all-mpnet-base-v2 (baseline) vs. Huggingface leaderboard Models
4. **Modelling Approach:** Bi-Encoder (baseline) vs. Cross-Encoder

In [None]:
# Performance Analysis Function for 1 df
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import (
    confusion_matrix,
    ConfusionMatrixDisplay,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    precision_recall_curve,
)

def performance_analysis(df, df_name=None):
    """Print and plot a performance report for one scored result DataFrame.

    The report consists of, in order: predicted-label counts, mean/median
    similarity per true label, the per-label score distribution, the
    confusion matrix, accuracy/precision/recall/F1 and the precision-recall
    curve.

    Args:
        df: DataFrame with the columns ``label`` (true class, 0/1),
            ``label_predicted`` (predicted class, 0/1) and
            ``Similarity_score``.
        df_name: Label used in headings and plot titles. When omitted, the
            name of the first public global variable bound to ``df`` is used;
            if there is none, the generic label "DataFrame" is used.

    Returns:
        None. All results are printed or shown as matplotlib figures.
    """
    if df_name is None:
        # Identity lookup in the module globals. Underscore-prefixed names are
        # skipped so IPython's output-cache variables (`_`, `_12`, ...) cannot
        # be picked up, and a default replaces the IndexError the lookup used
        # to raise for frames not bound to any global name.
        df_name = next(
            (key for key, value in globals().items()
             if value is df and not key.startswith('_')),
            "DataFrame",
        )

    # 1. Count "label_predicted" values
    print(f"\n{df_name} - Label Predicted Counts:")
    print(df["label_predicted"].value_counts())

    # 2. Mean and median similarity scores per true label
    scores_label_1 = df[df['label'] == 1]['Similarity_score']
    scores_label_0 = df[df['label'] == 0]['Similarity_score']

    mean_label_1 = round(scores_label_1.mean(), 4)
    median_label_1 = round(scores_label_1.median(), 4)
    mean_label_0 = round(scores_label_0.mean(), 4)
    median_label_0 = round(scores_label_0.median(), 4)
    mean_difference = round(mean_label_1 - mean_label_0, 4)
    median_difference = round(median_label_1 - median_label_0, 4)

    print(f"\n{df_name} - Similarity Scores:")
    print(f"Label 1 - Mean: {mean_label_1}, Median: {median_label_1}")
    print(f"Label 0 - Mean: {mean_label_0}, Median: {median_label_0}")
    print(f"Difference - Mean: {mean_difference}, Median: {median_difference}")

    # 3. Distribution plot for similarity scores
    # 50 edges on [0, 1] -> 49 equal-width bins, plotted at their midpoints
    bins = np.linspace(0, 1, 50)
    hist_label_1, _ = np.histogram(scores_label_1, bins=bins, density=True)
    hist_label_0, _ = np.histogram(scores_label_0, bins=bins, density=True)
    bin_centers = (bins[:-1] + bins[1:]) / 2

    plt.figure(figsize=(8, 4))
    plt.plot(bin_centers, hist_label_1, label="Label 1")
    plt.plot(bin_centers, hist_label_0, label="Label 0")
    plt.title(f'Distribution of Similarity Scores by Label ({df_name})', fontsize=16)
    plt.xlabel('Similarity Score', fontsize=14)
    plt.ylabel('Density', fontsize=14)
    plt.legend(fontsize=12)
    plt.show()

    # 4. Confusion matrix
    cm = confusion_matrix(df['label'], df['label_predicted'])
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Class 0", "Class 1"])
    disp.plot(cmap="Blues", values_format="d")
    plt.title(f"Confusion Matrix ({df_name})", fontsize=16)
    plt.show()

    # 5. Accuracy, Precision, Recall, and F1-Score of the hard predictions
    accuracy = accuracy_score(df['label'], df['label_predicted'])
    precision = precision_score(df['label'], df['label_predicted'])
    recall = recall_score(df['label'], df['label_predicted'])
    f1 = f1_score(df['label'], df['label_predicted'])

    metrics_df = pd.DataFrame(
        {
            df_name: [accuracy, precision, recall, f1],
        },
        index=["Accuracy", "Precision", "Recall", "F1-Score"]
    )
    print("\nPerformance Metrics:")
    print(metrics_df)

    # 6. Precision-Recall Curve, computed from the raw similarity scores
    # (separate names so the scalar metrics above are not overwritten)
    pr_precision, pr_recall, _ = precision_recall_curve(df['label'], df['Similarity_score'])
    plt.figure(figsize=(8, 4))
    plt.plot(pr_recall, pr_precision, label=df_name)
    plt.title("Precision-Recall Curve", fontsize=16)
    plt.xlabel("Recall", fontsize=14)
    plt.ylabel("Precision", fontsize=14)
    plt.legend(fontsize=12)
    plt.show()

In [None]:
# Performance Analysis Function for 2 dfs
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import (
    confusion_matrix,
    ConfusionMatrixDisplay,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    precision_recall_curve,
)

def performance_comparison(df1, df2, df1_name="DF1", df2_name="DF2"):
    """Print and plot a side-by-side performance report for two result frames.

    Every report section is produced first for ``df1`` and then for ``df2``:
    predicted-label counts, mean/median similarity per true label, score
    distribution plots and confusion matrices. The classification metrics
    are printed as one table with a column per frame, and both
    precision-recall curves are drawn on one shared figure.

    Args:
        df1: First DataFrame; needs the columns ``label``,
            ``label_predicted`` and ``Similarity_score``.
        df2: Second DataFrame with the same columns.
        df1_name: Label for ``df1`` in headings, titles and legends.
        df2_name: Label for ``df2`` in headings, titles and legends.

    Returns:
        None. All results are printed or shown as matplotlib figures.
    """
    named_frames = ((df1, df1_name), (df2, df2_name))

    # 1. Count "label_predicted" values
    for frame, title in named_frames:
        print(f"\n{title} - Label Predicted Counts:")
        print(frame["label_predicted"].value_counts())

    # 2. Mean and median similarity scores per true label
    for frame, title in named_frames:
        positives = frame[frame['label'] == 1]['Similarity_score']
        negatives = frame[frame['label'] == 0]['Similarity_score']

        mean_pos = round(positives.mean(), 4)
        median_pos = round(positives.median(), 4)
        mean_neg = round(negatives.mean(), 4)
        median_neg = round(negatives.median(), 4)
        mean_gap = round(mean_pos - mean_neg, 4)
        median_gap = round(median_pos - median_neg, 4)

        print(f"\n{title} - Similarity Scores:")
        print(f"Label 1 - Mean: {mean_pos}, Median: {median_pos}")
        print(f"Label 0 - Mean: {mean_neg}, Median: {median_neg}")
        print(f"Difference - Mean: {mean_gap}, Median: {median_gap}")

    # 3. Distribution plot for similarity scores (one figure per frame)
    edges = np.linspace(0, 1, 50)  # 50 edges -> 49 equal-width bins
    midpoints = (edges[:-1] + edges[1:]) / 2
    for frame, title in named_frames:
        density_pos, _ = np.histogram(
            frame[frame['label'] == 1]['Similarity_score'], bins=edges, density=True
        )
        density_neg, _ = np.histogram(
            frame[frame['label'] == 0]['Similarity_score'], bins=edges, density=True
        )

        plt.figure(figsize=(12, 6))
        plt.plot(midpoints, density_pos, label="Label 1")
        plt.plot(midpoints, density_neg, label="Label 0")
        plt.title(f'Distribution of Similarity Scores by Label ({title})', fontsize=16)
        plt.xlabel('Similarity Score', fontsize=14)
        plt.ylabel('Density', fontsize=14)
        plt.legend(fontsize=12)
        plt.show()

    # 4. Confusion matrix (one figure per frame)
    for frame, title in named_frames:
        matrix = confusion_matrix(frame['label'], frame['label_predicted'])
        display = ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=["Class 0", "Class 1"])
        display.plot(cmap="Blues", values_format="d")
        plt.title(f"Confusion Matrix ({title})", fontsize=16)
        plt.show()

    # 5. Accuracy, Precision, Recall, and F1-Score, one column per frame
    metrics_df = pd.DataFrame(
        {
            title: [
                accuracy_score(frame['label'], frame['label_predicted']),
                precision_score(frame['label'], frame['label_predicted']),
                recall_score(frame['label'], frame['label_predicted']),
                f1_score(frame['label'], frame['label_predicted']),
            ]
            for frame, title in named_frames
        },
        index=["Accuracy", "Precision", "Recall", "F1-Score"]
    )
    print("\nPerformance Metrics:")
    print(metrics_df)

    # 6. Precision-Recall Curve, both frames on one figure
    plt.figure(figsize=(10, 6))
    for frame, title in named_frames:
        curve_precision, curve_recall, _ = precision_recall_curve(frame['label'], frame['Similarity_score'])
        plt.plot(curve_recall, curve_precision, label=title)
    plt.title("Precision-Recall Curve", fontsize=16)
    plt.xlabel("Recall", fontsize=14)
    plt.ylabel("Precision", fontsize=14)
    plt.legend(fontsize=12)
    plt.show()

## 2.1 Skills Extraction: Skills Dictionary (baseline) vs. Taxonomy

## 2.2 Input for Embeddings: Bag-of-skills (baseline) vs. Skill-by-skill

### 2.2.1 Bag-of-skills

In [None]:
# Full performance report for the bag-of-skills baseline (fifth-attempt results)
performance_analysis(df_result_4_100)

### 2.2.2 Skill-by-skill

## 2.3 Embedding Models: all-mpnet-base-v2 (baseline) vs. Huggingface leaderboard Models

### 2.3.1 all-mpnet-base-v2

**Model Characteristics:**
- Model size: 
- Embedding dimensions: 
- Tokenizer:
- Memory Usage (GB, fp32): 
- Max Tokens: 

In [None]:
# In main.py and imported into the Notebook
def calc_similarity(applicant_df, job_df, N=3, parallel=False):
    """Rank applicants per job by cosine similarity of MPNET skill embeddings.

    Each row's ``Skills`` list is turned into one bag-of-skills string
    (every skill suffixed with the tokenizer's end-of-sequence token, then
    de-duplicated, sorted and space-joined), embedded with
    ``all-mpnet-base-v2`` and compared with cosine similarity.

    Args:
        applicant_df: DataFrame with a ``name`` column and a ``Skills`` column
            holding a list of skill strings per applicant. A ``Skills_Text``
            column is added to it in place.
        job_df: DataFrame with a ``Skills`` column holding a list of skill
            strings per job; its index is used as ``job_id``. A
            ``Skills_Text`` column is added to it in place.
        N: Number of top-ranked applicants per job marked as 'Selected'.
        parallel: If True, half of the available CPU cores are passed to
            ``model.encode`` as ``num_workers`` for the applicant embeddings.

    Returns:
        Long-format DataFrame with one row per (applicant, job) pair and the
        columns ``name``, ``job_id``, ``similarity_score``, ``rank`` and
        ``interview_status``.
    """
    # NOTE: the model is loaded on every call of this function; callers that
    # invoke it in a loop pay the loading cost each time.
    model = SentenceTransformer('all-mpnet-base-v2')
    model.max_seq_length = 75
    model.tokenizer.padding_side = "right"
    model.eval()

    # Not every tokenizer defines an EOS token; fall back to the separator
    # token (or nothing) instead of failing on `str + None`.
    skill_suffix = model.tokenizer.eos_token or model.tokenizer.sep_token or ''

    def add_eos(input_examples):
        """Append the separator token to every skill of one row."""
        # Non-list cells (e.g. NaN for a row without extracted skills) are
        # passed through unchanged so the isinstance guard below can turn
        # them into an empty string instead of this helper raising.
        if not isinstance(input_examples, list):
            return input_examples
        return [input_example + skill_suffix for input_example in input_examples]

    def join_skills(skills):
        """Collapse one row's skills into a de-duplicated, sorted string."""
        return ' '.join(sorted(set(skills))) if isinstance(skills, list) else ''

    # Precompute job embeddings
    job_df['Skills_Text'] = job_df['Skills'].apply(add_eos)
    job_df['Skills_Text'] = job_df['Skills_Text'].apply(join_skills)
    job_embeddings = model.encode(job_df['Skills_Text'].tolist())

    # Precompute applicant embeddings
    applicant_df['Skills_Text'] = applicant_df['Skills'].apply(add_eos)
    applicant_df['Skills_Text'] = applicant_df['Skills_Text'].apply(join_skills)
    applicant_embeddings = model.encode(
        applicant_df['Skills_Text'].tolist(),
        batch_size=32,
        # os.cpu_count() may return None when the count cannot be determined
        num_workers=(os.cpu_count() or 1) // 2 if parallel else 0,
        show_progress_bar=False
    )

    # Compute cosine similarity matrix (rows: jobs, columns: applicants)
    similarity_matrix = cosine_similarity(job_embeddings, applicant_embeddings)

    # Reshape to long format: one row per (applicant, job) pair
    similarity_df = pd.DataFrame(similarity_matrix.T, index=applicant_df['name'], columns=job_df.index)
    similarity_df = similarity_df.reset_index().melt(id_vars='name', var_name='job_id', value_name='similarity_score')
    similarity_df['rank'] = similarity_df.groupby('job_id')['similarity_score'].rank(ascending=False)
    similarity_df['interview_status'] = similarity_df['rank'].apply(lambda x: 'Selected' if x <= N else 'Not Selected')

    return similarity_df

### 2.3.2 all-MiniLM-L6-v2

**Model Characteristics:**
- Model size: 22.7M params
- Embedding dimensions: 
- Tokenizer:
- Memory Usage (GB, fp32): 
- Max Tokens: 

In [None]:
# In main.py and imported into the Notebook
def calc_similarity(applicant_df, job_df, N=3, parallel=False):
    """Rank applicants per job by cosine similarity of all-MiniLM-L6-v2 skill embeddings.

    For every row the ``Skills`` list is de-duplicated, each skill is suffixed
    with a separator token, and the sorted skills are joined into one string.
    These strings are embedded with the ``all-MiniLM-L6-v2`` sentence
    transformer and every job is compared with every applicant.

    Args:
        applicant_df: DataFrame with a ``name`` column and a ``Skills`` column
            holding a list of skill strings per row.
        job_df: DataFrame with a ``Skills`` column holding a list of skill
            strings per row; its index supplies the ``job_id`` values.
        N: Applicants whose per-job rank is ``<= N`` are marked ``'Selected'``.
        parallel: When true, half of the available CPUs are requested as
            ``num_workers`` for the applicant encoding call; otherwise 0.

    Returns:
        Long-format DataFrame with the columns ``name``, ``job_id``,
        ``similarity_score``, ``rank`` and ``interview_status``.

    Side effects:
        Adds (or overwrites) a ``Skills_Text`` column on both input frames.
    """
    # Load the model once per call; it is reused for jobs and applicants.
    model = SentenceTransformer('all-MiniLM-L6-v2')
    model.max_seq_length = 75
    model.tokenizer.padding_side = "right"
    model.eval()

    # all-MiniLM-L6-v2 ships a BERT-style tokenizer that defines no EOS token,
    # so `eos_token` is None there and `skill + None` raised a TypeError.
    # Fall back to the separator token, and finally to no marker at all.
    tokenizer = model.tokenizer
    separator = (
        getattr(tokenizer, 'eos_token', None)
        or getattr(tokenizer, 'sep_token', None)
        or ''
    )

    def skills_to_text(skills):
        """Return the sorted, de-duplicated skills joined into one string.

        Anything that is not a list (e.g. a missing value) maps to ``''``.
        The check happens *before* iterating so that such rows no longer
        crash and plain strings are not split into single characters.
        """
        if not isinstance(skills, list):
            return ''
        return ' '.join(sorted({skill + separator for skill in skills}))

    # os.cpu_count() may return None when the CPU count cannot be determined.
    num_workers = (os.cpu_count() or 1) // 2 if parallel else 0

    # Embed the job postings.
    job_df['Skills_Text'] = job_df['Skills'].apply(skills_to_text)
    job_embeddings = model.encode(job_df['Skills_Text'].tolist())

    # Embed the applicants.
    applicant_df['Skills_Text'] = applicant_df['Skills'].apply(skills_to_text)
    applicant_embeddings = model.encode(
        applicant_df['Skills_Text'].tolist(),
        batch_size=32,
        num_workers=num_workers,
        show_progress_bar=False
    )

    # Pairwise similarities, computed as (jobs x applicants).
    similarity_matrix = cosine_similarity(job_embeddings, applicant_embeddings)

    # Transpose so that rows are applicants and columns are jobs, then
    # reshape into one row per (applicant, job) pair.
    similarity_df = pd.DataFrame(similarity_matrix.T, index=applicant_df['name'], columns=job_df.index)
    similarity_df = similarity_df.reset_index().melt(id_vars='name', var_name='job_id', value_name='similarity_score')

    # Rank 1 is the most similar applicant within each job.
    similarity_df['rank'] = similarity_df.groupby('job_id')['similarity_score'].rank(ascending=False)
    similarity_df['interview_status'] = similarity_df['rank'].apply(lambda x: 'Selected' if x <= N else 'Not Selected')

    return similarity_df

### 2.3.2 voyage-large-2-instruct

**Model Characteristics:**
- Model size: 
- Embedding dimensions: 
- Tokenizer:
- Memory Usage (GB, fp32): 
- Max Tokens: 

In [None]:
import torch
from sentence_transformers import SentenceTransformer

# In main.py and imported into the Notebook
def calc_similarity(applicant_df, job_df, N=3, parallel=False):
    """Rank applicants per job by cosine similarity of their combined-skill embeddings.

    Both frames get a ``Skills_Text`` column (written in place) that holds the
    sorted, de-duplicated skills of each row, every skill suffixed with the
    tokenizer's EOS token. The texts are embedded with ``model.encode`` and
    every job is compared with every applicant.

    NOTE(review): the enclosing notebook section is titled
    "voyage-large-2-instruct", while the disabled loader below names
    ``nvidia/NV-Embed-v2`` -- confirm which model this cell is meant to use.

    Args:
        applicant_df: DataFrame with a ``name`` and a ``Skills`` column
            (presumably a list of skill strings per row -- TODO confirm).
        job_df: DataFrame with a ``Skills`` column; its index becomes ``job_id``.
        N: Applicants whose per-job rank is ``<= N`` are marked ``'Selected'``.
        parallel: When true, ``os.cpu_count() // 2`` is passed as
            ``num_workers`` to the applicant encoding call; otherwise 0.

    Returns:
        Long-format DataFrame with the columns ``name``, ``job_id``,
        ``similarity_score``, ``rank`` and ``interview_status``.
    """

    # NOTE(review): the initialization below is wrapped in a string literal and
    # therefore never executes. `model` is not assigned anywhere in this
    # function, so it has to resolve from an outer/global scope; otherwise the
    # first use of `model` below raises NameError.
    '''# Initialize the model once outside the loop for efficiency
    model = SentenceTransformer('nvidia/NV-Embed-v2', trust_remote_code=True)
    model.max_seq_length = 75
    model.tokenizer.padding_side="right"'''

    def add_eos(input_examples):
        """Return a new list with the tokenizer's EOS token appended to each item."""
        input_examples = [input_example + model.tokenizer.eos_token for input_example in input_examples]
        return input_examples

    # Build one text per job: EOS-suffixed skills, de-duplicated and sorted.
    # NOTE(review): add_eos iterates the value first, so the isinstance guard
    # on the following line only ever sees lists.
    job_df['Skills_Text'] = job_df['Skills'].apply(add_eos)
    job_df['Skills_Text'] = job_df['Skills_Text'].apply(lambda x: ' '.join(sorted(set(x))) if isinstance(x, list) else '')
    job_embeddings = model.encode(
    job_df['Skills_Text'].tolist())
    # Same preparation for the applicants, encoded with explicit batch settings.
    applicant_df['Skills_Text'] = applicant_df['Skills'].apply(add_eos)
    applicant_df['Skills_Text'] = applicant_df['Skills_Text'].apply(lambda x: ' '.join(sorted(set(x))) if isinstance(x, list) else '')
    applicant_embeddings = model.encode(
    applicant_df['Skills_Text'].tolist(),
        batch_size=32,
        num_workers=os.cpu_count() // 2 if parallel else 0,
        show_progress_bar=False
    )

    # Pairwise similarities between job and applicant embeddings.
    similarity_matrix = cosine_similarity(job_embeddings, applicant_embeddings)

    # Transposed so that rows are applicants (by name) and columns are jobs,
    # then melted into one row per (applicant, job) pair.
    similarity_df = pd.DataFrame(similarity_matrix.T, index=applicant_df['name'], columns=job_df.index)
    similarity_df = similarity_df.reset_index().melt(id_vars='name', var_name='job_id', value_name='similarity_score')
    # Descending rank within each job: the highest score gets rank 1.
    similarity_df['rank'] = similarity_df.groupby('job_id')['similarity_score'].rank(ascending=False)
    similarity_df['interview_status'] = similarity_df['rank'].apply(lambda x: 'Selected' if x <= N else 'Not Selected')

    return similarity_df

### 2.3.3 BinGSE-Meta-Llama-3-8B-Instruct

**Model Characteristics:**
- Model size: 
- Embedding dimensions: 
- Tokenizer:
- Memory Usage (GB, fp32): 
- Max Tokens: 

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel, AutoConfig
from peft import PeftModel

# In main.py and imported into the Notebook
def calc_similarity(applicant_df, job_df, N=3, parallel=False):
    """Rank applicants per job by cosine similarity of their combined-skill embeddings.

    Loads the ``McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp`` base model,
    merges its MNTP adapter and then loads a further adapter from
    ``model_path``. Both frames get a ``Skills_Text`` column (written in
    place) holding the sorted, de-duplicated, EOS-suffixed skills of each row;
    every job is then compared with every applicant.

    Args:
        applicant_df: DataFrame with a ``name`` and a ``Skills`` column
            (presumably a list of skill strings per row -- TODO confirm).
        job_df: DataFrame with a ``Skills`` column; its index becomes ``job_id``.
        N: Applicants whose per-job rank is ``<= N`` are marked ``'Selected'``.
        parallel: When true, ``os.cpu_count() // 2`` is passed as
            ``num_workers`` to the applicant encoding call; otherwise 0.

    Returns:
        Long-format DataFrame with the columns ``name``, ``job_id``,
        ``similarity_score``, ``rank`` and ``interview_status``.
    """

    # Loading base Meta-Llama-3 model, along with custom code that enables bidirectional connections in decoder-only LLMs.
    tokenizer = AutoTokenizer.from_pretrained(
        "McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp"
    )
    config = AutoConfig.from_pretrained(
        "McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp", trust_remote_code=True
    )
    model = AutoModel.from_pretrained(
        "McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp",
        trust_remote_code=True,
        config=config,
        torch_dtype=torch.bfloat16,
        device_map="cuda" if torch.cuda.is_available() else "cpu",
    )

    # Loading MNTP (Masked Next Token Prediction) model.
    model = PeftModel.from_pretrained(
        model,
        "McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp",
    )

    # Fold the MNTP adapter weights into the base model before stacking the next adapter.
    model = model.merge_and_unload()  # This can take several minutes on cpu

    # Loading BinGSE model. This loads the trained LoRA weights on top of MNTP model. Hence the final weights are -- Base model + MNTP (LoRA) + BinGSE (LoRA).
    # NOTE(review): `model_path` is not defined in this function; it must exist
    # in an outer/global scope, otherwise this call raises NameError.
    model = PeftModel.from_pretrained(
        model, model_path 
    )

    def add_eos(input_examples):
        """Return a new list with the tokenizer's EOS token appended to each item."""
        # NOTE(review): the tokenizer is held in the local `tokenizer`; nothing
        # in this function attaches it to `model` -- confirm `model.tokenizer` resolves.
        input_examples = [input_example + model.tokenizer.eos_token for input_example in input_examples]
        return input_examples

    # Build one text per job: EOS-suffixed skills, de-duplicated and sorted.
    # NOTE(review): add_eos iterates the value first, so the isinstance guard
    # on the following line only ever sees lists.
    job_df['Skills_Text'] = job_df['Skills'].apply(add_eos)
    job_df['Skills_Text'] = job_df['Skills_Text'].apply(lambda x: ' '.join(sorted(set(x))) if isinstance(x, list) else '')
    # NOTE(review): presumably needs an embedding wrapper that provides
    # `encode`; confirm the PEFT-wrapped model exposes this method.
    job_embeddings = model.encode(
    job_df['Skills_Text'].tolist())
    # Same preparation for the applicants, encoded with explicit batch settings.
    applicant_df['Skills_Text'] = applicant_df['Skills'].apply(add_eos)
    applicant_df['Skills_Text'] = applicant_df['Skills_Text'].apply(lambda x: ' '.join(sorted(set(x))) if isinstance(x, list) else '')
    applicant_embeddings = model.encode(
    applicant_df['Skills_Text'].tolist(),
        batch_size=32,
        num_workers=os.cpu_count() // 2 if parallel else 0,
        show_progress_bar=False
    )

    # Pairwise similarities between job and applicant embeddings.
    similarity_matrix = cosine_similarity(job_embeddings, applicant_embeddings)

    # Transposed so that rows are applicants (by name) and columns are jobs,
    # then melted into one row per (applicant, job) pair.
    similarity_df = pd.DataFrame(similarity_matrix.T, index=applicant_df['name'], columns=job_df.index)
    similarity_df = similarity_df.reset_index().melt(id_vars='name', var_name='job_id', value_name='similarity_score')
    # Descending rank within each job: the highest score gets rank 1.
    similarity_df['rank'] = similarity_df.groupby('job_id')['similarity_score'].rank(ascending=False)
    similarity_df['interview_status'] = similarity_df['rank'].apply(lambda x: 'Selected' if x <= N else 'Not Selected')

    return similarity_df

### 2.3.4 NV-Embed-v2

**Model Characteristics:**
- Model size: 
- Embedding dimensions: 
- Tokenizer:
- Memory Usage (GB, fp32): 
- Max Tokens: 

In [None]:
import torch
from sentence_transformers import SentenceTransformer

# In main.py and imported into the Notebook
def calc_similarity(applicant_df, job_df, N=3, parallel=False):
    """Rank applicants per job by cosine similarity of NV-Embed-v2 skill embeddings.

    For every row the ``Skills`` list is de-duplicated, each skill is suffixed
    with the tokenizer's EOS token, and the sorted skills are joined into one
    string. These strings are embedded with ``nvidia/NV-Embed-v2`` and every
    job is compared with every applicant.

    Args:
        applicant_df: DataFrame with a ``name`` column and a ``Skills`` column
            holding a list of skill strings per row.
        job_df: DataFrame with a ``Skills`` column holding a list of skill
            strings per row; its index supplies the ``job_id`` values.
        N: Applicants whose per-job rank is ``<= N`` are marked ``'Selected'``.
        parallel: When true, half of the available CPUs are requested as
            ``num_workers`` for the applicant encoding call; otherwise 0.

    Returns:
        Long-format DataFrame with the columns ``name``, ``job_id``,
        ``similarity_score``, ``rank`` and ``interview_status``.

    Side effects:
        Adds (or overwrites) a ``Skills_Text`` column on both input frames.
    """
    # Load the model once per call; it is reused for jobs and applicants.
    model = SentenceTransformer('nvidia/NV-Embed-v2', trust_remote_code=True)
    model.max_seq_length = 75
    model.tokenizer.padding_side = "right"

    # Resolve the marker once. The fallback keeps the function usable should
    # the tokenizer define no EOS token (`str + None` would raise TypeError).
    eos_token = getattr(model.tokenizer, 'eos_token', None) or ''

    def skills_to_text(skills):
        """Return the sorted, de-duplicated skills joined into one string.

        Anything that is not a list (e.g. a missing value) maps to ``''``.
        The check happens *before* iterating so that such rows no longer
        crash and plain strings are not split into single characters.
        """
        if not isinstance(skills, list):
            return ''
        return ' '.join(sorted({skill + eos_token for skill in skills}))

    # os.cpu_count() may return None when the CPU count cannot be determined.
    num_workers = (os.cpu_count() or 1) // 2 if parallel else 0

    # Embed the job postings.
    job_df['Skills_Text'] = job_df['Skills'].apply(skills_to_text)
    job_embeddings = model.encode(job_df['Skills_Text'].tolist())

    # Embed the applicants.
    applicant_df['Skills_Text'] = applicant_df['Skills'].apply(skills_to_text)
    applicant_embeddings = model.encode(
        applicant_df['Skills_Text'].tolist(),
        batch_size=32,
        num_workers=num_workers,
        show_progress_bar=False
    )

    # Pairwise similarities, computed as (jobs x applicants).
    similarity_matrix = cosine_similarity(job_embeddings, applicant_embeddings)

    # Transpose so that rows are applicants and columns are jobs, then
    # reshape into one row per (applicant, job) pair.
    similarity_df = pd.DataFrame(similarity_matrix.T, index=applicant_df['name'], columns=job_df.index)
    similarity_df = similarity_df.reset_index().melt(id_vars='name', var_name='job_id', value_name='similarity_score')

    # Rank 1 is the most similar applicant within each job.
    similarity_df['rank'] = similarity_df.groupby('job_id')['similarity_score'].rank(ascending=False)
    similarity_df['interview_status'] = similarity_df['rank'].apply(lambda x: 'Selected' if x <= N else 'Not Selected')

    return similarity_df

### 2.3.5 SFR-Embedding-Mistral

**Model Characteristics:**
- Model size: 
- Embedding dimensions: 
- Tokenizer:
- Memory Usage (GB, fp32): 
- Max Tokens: 

## 2.4 Modelling Approach: Bi-Encoder (baseline) vs. Cross-Encoder

In [None]:
# Victor's code for the Cross-Encoder

# Necessary imports
from sentence_transformers import SentenceTransformer, CrossEncoder
import torch
import torch.nn.functional as F

# Similarity Calculation using Cross-Encoder
def calc_cross(applicant_df, job_df, N=3, parallel=False):
    """Score all applicants against the first job posting with a cross-encoder.

    The skills of each row are de-duplicated, sorted and joined into one
    string. The text of the *first* row of ``job_df`` is used as the query and
    every applicant text is scored against it with
    ``cross-encoder/ms-marco-MiniLM-L-6-v2``.

    Args:
        applicant_df: DataFrame with a ``name`` column and a ``Skills`` column
            holding a list of skill strings per row.
        job_df: DataFrame with a ``Skills`` column; only its first row is used.
        N: Number of best-scored applicants marked ``'Selected'``.
        parallel: When true, half of the available CPUs are requested as
            ``num_workers`` for the ranking call; otherwise 0.

    Returns:
        DataFrame ordered as returned by ``CrossEncoder.rank`` (best match
        first) with the columns ``corpus_id``, ``score``, ``softmaxed``,
        ``name`` and ``interview_status``.

    Side effects:
        Adds (or overwrites) a ``Skills_Text`` column on both input frames.
    """
    # Load the cross-encoder once per call.
    model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

    def skills_to_text(skills):
        """Join the sorted, de-duplicated skills; non-list values map to ''."""
        return ' '.join(sorted(set(skills))) if isinstance(skills, list) else ''

    # Query text: the first job posting. Positional access is used so this
    # also works when the index of job_df does not contain the label 0.
    job_df['Skills_Text'] = job_df['Skills'].apply(skills_to_text)
    query = job_df['Skills_Text'].iloc[0]

    # Candidate texts: one per applicant, in row order.
    applicant_df['Skills_Text'] = applicant_df['Skills'].apply(skills_to_text)
    applicants = applicant_df['Skills_Text'].tolist()

    # os.cpu_count() may return None when the CPU count cannot be determined.
    num_workers = (os.cpu_count() or 1) // 2 if parallel else 0

    ranks = model.rank(
        query,
        applicants,
        batch_size=32,
        num_workers=num_workers,
        show_progress_bar=False
    )

    similarity_df = pd.DataFrame(ranks)

    # Normalise the raw scores over all applicants; `dim` is given explicitly
    # because relying on the implicit dimension is deprecated.
    scores = torch.tensor(similarity_df['score'].to_numpy())
    similarity_df['softmaxed'] = F.softmax(scores, dim=0).numpy()

    # `corpus_id` is the position of the applicant in `applicants`, so names
    # are looked up by position instead of joining on index labels (which
    # only coincide for a default RangeIndex).
    applicant_names = applicant_df['name'].to_numpy()
    similarity_df['name'] = applicant_names[similarity_df['corpus_id'].to_numpy()]

    # Rows arrive sorted best-first, so the first N rows are the selection.
    similarity_df['interview_status'] = [
        'Selected' if position < N else 'Not Selected'
        for position in range(len(similarity_df))
    ]

    return similarity_df