In [None]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.mixture import GaussianMixture
import matplotlib.pyplot as plt

In [None]:
# Load the job dataset from JobsFE.csv into a DataFrame.
# Later cells add the "job_text" and "cluster" columns to this frame.
Jobs = pd.read_csv("JobsFE.csv")

In [None]:
# Combine relevant columns into a single text column.
# Missing values are replaced with "" before the string cast: astype(str) on its
# own turns NaN into the literal word "nan", which would then be embedded as if
# it were part of the job description.
Jobs["job_text"] = (
    Jobs["position"].fillna("").astype(str) + " " +
    Jobs["job_role_and_duties"].fillna("").astype(str) + " " +
    Jobs["requisite_skill"].fillna("").astype(str)
)

In [None]:
# Initialize the SBERT model (CPU inference) and embed every job text
MODEL = SentenceTransformer("paraphrase-MiniLM-L6-v2", device="cpu")
job_texts = Jobs["job_text"].tolist()
job_embeddings = MODEL.encode(job_texts, convert_to_numpy=True)
# Scale each embedding row to unit L2 norm
row_norms = np.linalg.norm(job_embeddings, axis=1, keepdims=True)
job_embeddings = job_embeddings / row_norms


#### Function to determine the optimal number of clusters using BIC


In [None]:
def bic_method(data, max_clusters):
    """Plot the BIC of Gaussian mixtures fitted with 1..max_clusters components.

    Args:
        data: Samples passed unchanged to ``GaussianMixture.fit`` and ``.bic``.
        max_clusters: Largest number of mixture components to try (inclusive).

    Returns:
        None. The BIC curve is rendered with ``plt.show()``.
    """
    component_counts = range(1, max_clusters + 1)

    # One mixture per candidate k; every fit uses the same seed so the curve
    # is repeatable between runs.
    bic_scores = [
        GaussianMixture(n_components=n_components, random_state=42).fit(data).bic(data)
        for n_components in component_counts
    ]

    # Plot BIC scores against the number of components
    plt.figure(figsize=(8, 5))
    plt.plot(component_counts, bic_scores, "bo-", markersize=8)
    plt.title("BIC Method for Optimal k")
    plt.xlabel("Number of Clusters (k)")
    plt.ylabel("BIC Score")
    plt.show()

# Apply the BIC method: fits one Gaussian mixture for every k from 1 to 50
# on the job embeddings and plots the resulting BIC scores.
bic_method(job_embeddings, max_clusters=50)


In [None]:
# Set optimal number of clusters
# NOTE(review): hard-coded value; presumably read off the BIC plot — confirm,
# and re-check it whenever the dataset or the embedding model changes.
optimal_clusters = 4

#### Perform GMM clustering

In [None]:
# Fit a Gaussian mixture with `optimal_clusters` components on the job
# embeddings (fixed random_state) and keep the predicted cluster label of
# every embedding row.
gmm = GaussianMixture(n_components=optimal_clusters, random_state=42)
cluster_labels = gmm.fit_predict(job_embeddings)


In [None]:
# Inspect the column names of the Jobs frame
Jobs.columns

In [None]:
# Add the cluster labels as a new column to the dataset.
# The labels were predicted from embeddings built from Jobs["job_text"] in row
# order, so they line up with the rows of Jobs.
Jobs["cluster"] = cluster_labels

In [None]:
# Save the updated dataset with cluster labels (without the index column).
# Every recommender further down is constructed from this file.
Jobs.to_csv("Jobs_with_clusters_GMM.csv", index=False)

### Example resume text

In [None]:
# Sample resume used as the query text for the cluster prediction and for every
# recommender evaluated in this notebook.
# NOTE(review): this literal appears to contain real personal data (name, e-mail,
# phone number, date of birth) — confirm consent or replace it with synthetic
# data before sharing or committing the notebook.
resume_text = """Yomna Waleed Elsayed Ahmed Hassan
Junior machine learning engineering
yomnawaleed2002@gmail.com
EGYPT, AlGharbia goverment, tanta city.
Yomna Waleed
yomna_waleed
+201118064546
19/05/2002
YomnaWaleed
yomna_waleed
EDUCATION
Faculty of Engineering, Department of Computers and Control, Tanta university 2020 – 2025
Tanta, Egypt
TRAINING
Manara Tech 04/2023 – present
PROFILE
I am a dedicated engineering student specializing in Computer and Automatic Control, with a strong academic background and practical experience in
programming and software development. I excelled in my coursework, achieving an excellent degree in my second year and a very good grade in my
first year. I have a solid foundation in Python programming and libraries essential for data science and machine learning, including NumPy, pandas,
and Matplotlib. My participation in the International Collegiate Programming Contest (ICPC) as a team leader has further honed my problem-solving
skills.
I am actively expanding my expertise in machine learning (ML), particularly in Natural Language Processing (NLP) and Generative AI, having
completed supervised machine learning courses and gained practical experience with various deep learning architectures, including DNNs, RNNs, and
CNNs. My background also includes full-stack development using React.js and Node.js, enabling me to effectively integrate machine learning models
into web applications.
SKILLS
Programming Languages & Technologies:
•Python: Proficient
•C++: Proficient
•React.js: Competent
•Node.js: Amateur
•HTML/CSS: Competent
Data Structures & Algorithms
•Proficient in implementing algorithms and data structures in Python
and C++
•Strong foundation in problem-solving techniques, demonstrated
through participation in competitive programming
Machine Learning & Data Science:
•Libraries: NumPy, pandas, Matplotlib (Competent)
•Machine Learning: Supervised learning (regression, classification)
using Scikit-learn (Competent)
•Deep Learning: Familiar with TensorFlow and PyTorch (Amateur)
•Architectures: DNN, RNN, CNN, Object Detection, Word Detection,
Transfer Learning (Amateur)
•Generative AI: Currently learning (Amateur)
SOFT SKILLS
Project management and leadership
While I was working on the maze game project, I managed to divide the
tasks among the group, although this was challenging because we chose
a three-dimensional game that was beyond what we had learned.
Thinking outside the box
"While choosing a project, I always try to challenge myself and attempt
something that is difficult for others to implement, such as selecting a
three-dimensional game instead of a two-dimensional one, despite the
difficulty of it. I successfully completed the game."
PROJECTS
Email-SMS-Spam-Classifier, Developed a supervised machine learning model to classify emails and SMS messages as
spam or not using Python and Scikit-learn.
08/2024 – 08/2024
Egypt-House-Price-Prediction--Regression-Project, Built a regression model to predict house prices based on
various features, implemented with an HTML interface for user interaction.
08/2024 – 08/2024
Nonogram Puzzle, Created a Nonogram puzzle generator that utilizes backtracking methods and a CSP approach to
solve generated puzzles. My contribution was in developing the solver algorithm.
2024
3d maze game, Developed a 3D maze game using OpenGL and Python, where players navigate to collect coins and solve
puzzles. My role included building the player's movement mechanics and 3D interactions.
2023
Railway Ticket booking, Designed and implemented a train ticket booking system using Node.js, React.js, and MS SQL
for database management, allowing users to book and cancel tickets.
2023
simple memory gain, Created a memory game using React.js that challenges users to find pairs of matching images. 2023
CERTIFICATES
ICPC Qualification 2022
Problem Solving
Back-end enineering using Nodejs and Express
Ideal Student Recognition
ICPC qualification 2023
Zero to Hero in Front-end Development with React
Fullstack Engineering with React and Node.js
Volunteer Work at College and University
LANGUAGES
English
EF SET English Certificate
French"""

In [None]:
# Encode the resume text (passed as a one-element list).
# NOTE(review): the next cell repeats this exact encode call and overwrites
# resume_embedding, so this cell is redundant.
resume_embedding = MODEL.encode([resume_text], convert_to_numpy=True)

In [None]:
# Encode the resume text and assign it to one of the GMM clusters.
resume_embedding = MODEL.encode([resume_text], convert_to_numpy=True)
# The GMM was fitted on job embeddings scaled to unit L2 norm, so the resume
# vector must be scaled the same way; otherwise it is scored on a different
# scale from the data the mixture components were estimated from.
resume_embedding = resume_embedding / np.linalg.norm(resume_embedding, axis=1, keepdims=True)
resume_cluster = gmm.predict(resume_embedding)[0]
print(f"The resume belongs to cluster: {resume_cluster}")

#### Function to evaluate model results using GMM

In [None]:
def evaluate_model_results(recommended_jobs, resume_cluster, Jobs):
    """
    Measure how many recommended jobs share the resume's cluster.

    Args:
        recommended_jobs: Sequence of mappings, each expected to carry a
            "Job Id" entry.
        resume_cluster: Cluster label predicted for the resume.
        Jobs: DataFrame with "Job Id" and "cluster" columns.

    Returns:
        The fraction of resolvable recommendations whose cluster equals
        ``resume_cluster``; 0.0 when there are no recommendations or none of
        them can be resolved to a cluster.
    """
    # Nothing to evaluate
    if not recommended_jobs:
        print("No recommended jobs found.")
        return 0.0

    # Resolve each recommendation to the cluster stored in the Jobs frame;
    # entries that cannot be resolved are reported and left out.
    recommended_clusters = []
    for entry in recommended_jobs:
        try:
            job_id = entry["Job Id"]
            same_id_rows = Jobs["Job Id"] == job_id
            found_clusters = Jobs.loc[same_id_rows, "cluster"].values
            recommended_clusters.append(found_clusters[0])
        except KeyError:
            print(f"Warning: 'Job Id' key not found in job: {entry}")
        except IndexError:
            print(f"Warning: Job ID {job_id} not found in Jobs DataFrame.")

    # Every recommendation was skipped
    if len(recommended_clusters) == 0:
        print("No valid clusters found for recommended jobs.")
        return 0.0

    # Cluster purity = share of resolved recommendations in the resume's cluster
    in_resume_cluster = np.array(recommended_clusters) == resume_cluster
    cluster_purity = np.sum(in_resume_cluster) / len(recommended_clusters)

    print(f"Clusters of recommended Jobs: {recommended_clusters}")
    print(f"Cluster of resume is {resume_cluster}")
    print(f"Percentage of recommended jobs in the same cluster: {cluster_purity:.2%}")
    return cluster_purity


### Calculate the accuracy (cluster purity) of the SBERT model

In [None]:
from SBERTmodel import JobRecommendationSystem

In [None]:
# Load the system with job data and time construction + recommendation together
import time
import json
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time.time() can jump if the system clock is adjusted mid-run.
start = time.perf_counter()
recommender_SBERT = JobRecommendationSystem("Jobs_with_clusters_GMM.csv")
recommended_jobs_SBERT = recommender_SBERT.recommend_jobs(resume_text, top_n=20)
end = time.perf_counter()
print(f" the Execution of SBERT model is {end - start}")


In [None]:
# Reload the clustered dataset from the CSV written earlier, i.e. the same file
# the recommenders are constructed from; this rebinds the Jobs name.
Jobs = pd.read_csv("Jobs_with_clusters_GMM.csv")

In [None]:
# Call the evaluation metric: share of the SBERT recommendations that fall in
# the same GMM cluster as the resume.
SBERT_cluster_Purity = evaluate_model_results(recommended_jobs_SBERT["recommended_jobs"], resume_cluster,Jobs)
print(f"Cluster Purity: {SBERT_cluster_Purity:.2%}")

In [None]:
# Display the full SBERT recommendation result (its "recommended_jobs" entry is
# what the evaluation uses).
recommended_jobs_SBERT

### Calculate the accuracy (cluster purity) of the FastText model

In [None]:
from FastText.FastTextmodel import JobRecommendationSystem

In [None]:
# Load the system with job data and time construction + recommendation together
import time
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time.time() can jump if the system clock is adjusted mid-run.
start = time.perf_counter()
recommender_fasttext = JobRecommendationSystem("Jobs_with_clusters_GMM.csv")
recommended_jobs_fasttext = recommender_fasttext.recommend_jobs(resume_text, top_n=20)
end = time.perf_counter()
print(f" the Excution time is {end - start}")

In [None]:
# Display the full FastText recommendation result (its "recommended_jobs" entry
# is what the evaluation uses).
recommended_jobs_fasttext

In [None]:
# Call the evaluation metric: share of the FastText recommendations that fall
# in the same GMM cluster as the resume.
# `Jobs` (the frame holding the "Job Id" and "cluster" columns) is a required
# third argument of evaluate_model_results; omitting it raises TypeError.
FastText_cluster_Purity = evaluate_model_results(recommended_jobs_fasttext["recommended_jobs"], resume_cluster, Jobs)
print(f"Cluster Purity: {FastText_cluster_Purity:.2%}")

### Calculate the accuracy (cluster purity) of the BM25 model

In [None]:
from BM25model import JobRecommendationSystem

In [None]:
# Load the system with job data and time construction + recommendation together
import time
import json
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time.time() can jump if the system clock is adjusted mid-run.
start = time.perf_counter()
recommender_BM25 = JobRecommendationSystem("Jobs_with_clusters_GMM.csv")
recommended_jobs_BM25 = recommender_BM25.recommend_jobs(resume_text, top_n=20)
end = time.perf_counter()
print(f"the time of excution is {end - start}")

In [None]:
# Display the full BM25 recommendation result (its "recommended_jobs" entry is
# what the evaluation uses).
recommended_jobs_BM25

In [None]:
# Call the evaluation metric: share of the BM25 recommendations that fall in
# the same GMM cluster as the resume.
# `Jobs` (the frame holding the "Job Id" and "cluster" columns) is a required
# third argument of evaluate_model_results; omitting it raises TypeError.
BM25_cluster_Purity = evaluate_model_results(recommended_jobs_BM25["recommended_jobs"], resume_cluster, Jobs)
print(f"Cluster Purity: {BM25_cluster_Purity:.2%}")

### Calculate the accuracy (cluster purity) of the TF-IDF model

In [None]:
from TFIDFmodel import JobRecommendationSystem

In [None]:
# Load the system with job data and time construction + recommendation together
import time
import json
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time.time() can jump if the system clock is adjusted mid-run.
start = time.perf_counter()
recommender_TFIDF = JobRecommendationSystem("Jobs_with_clusters_GMM.csv")
recommended_jobs_TFIDF = recommender_TFIDF.recommend_jobs(resume_text, top_n=20)
end = time.perf_counter()
print(f"the time of excution is {end - start}")

In [None]:
# Display the full TF-IDF recommendation result (its "recommended_jobs" entry
# is what the evaluation uses).
recommended_jobs_TFIDF

In [None]:
# Call the evaluation metric: share of the TF-IDF recommendations that fall in
# the same GMM cluster as the resume.
# `Jobs` (the frame holding the "Job Id" and "cluster" columns) is a required
# third argument of evaluate_model_results; omitting it raises TypeError.
TFIDF_cluster_Purity = evaluate_model_results(recommended_jobs_TFIDF["recommended_jobs"], resume_cluster, Jobs)
print(f"Cluster Purity: {TFIDF_cluster_Purity:.2%}")

### Calculate the accuracy (cluster purity) of the KNN model

In [None]:
from KNNmodel import JobRecommendationSystem

In [None]:
# Load the system with job data and time construction + recommendation together
import time
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time.time() can jump if the system clock is adjusted mid-run.
start = time.perf_counter()
recommender_KNN = JobRecommendationSystem("Jobs_with_clusters_GMM.csv")
recommended_jobs_KNN = recommender_KNN.recommend_jobs(resume_text, top_n=20)
end = time.perf_counter()
print(f"the time of excution is {end - start}")

In [None]:
# Display the full KNN recommendation result (its "recommended_jobs" entry is
# what the evaluation uses).
recommended_jobs_KNN

In [None]:
# Call the evaluation metric: share of the KNN recommendations that fall in
# the same GMM cluster as the resume.
# `Jobs` (the frame holding the "Job Id" and "cluster" columns) is a required
# third argument of evaluate_model_results; omitting it raises TypeError.
KNN_cluster_Purity = evaluate_model_results(recommended_jobs_KNN["recommended_jobs"], resume_cluster, Jobs)
print(f"Cluster Purity: {KNN_cluster_Purity:.2%}")

### Calculate the accuracy (cluster purity) of the LDA model

In [None]:
from LDAmodel import JobRecommendationSystem

In [None]:
# Load the system with job data and time construction + recommendation together
import time
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time.time() can jump if the system clock is adjusted mid-run.
start = time.perf_counter()
recommender_LDA = JobRecommendationSystem("Jobs_with_clusters_GMM.csv")
recommended_jobs_LDA = recommender_LDA.recommend_jobs(resume_text, top_n=20)
end = time.perf_counter()
print(f"the time of excution is {end - start}")

In [None]:
# Display the full LDA recommendation result (its "recommended_jobs" entry is
# what the evaluation uses).
recommended_jobs_LDA

In [None]:
# Call the evaluation metric: share of the LDA recommendations that fall in
# the same GMM cluster as the resume.
# `Jobs` (the frame holding the "Job Id" and "cluster" columns) is a required
# third argument of evaluate_model_results; omitting it raises TypeError.
LDA_cluster_Purity = evaluate_model_results(recommended_jobs_LDA["recommended_jobs"], resume_cluster, Jobs)
print(f"Cluster Purity: {LDA_cluster_Purity:.2%}")