In [2]:
import pandas as pd

In [3]:
# Load the raw job listings from the CSV file in the working directory.
jobs_csv_path = 'jobs.csv'
df = pd.read_csv(jobs_csv_path)

In [4]:
# Inspect the column labels exactly as they were read from the CSV.
df.columns

Index(['Job Title', 'Job Description', 'Skills', 'Certifications'], dtype='str')

In [5]:
# Map the CSV's human-readable headers to snake_case names used by later cells.
snake_case_names = {
    'Job Title': 'job_title',
    'Job Description': 'job_description',
    'Skills': 'skills',
    'Certifications': 'certifications',
}
df = df.rename(columns=snake_case_names)

In [6]:
# Preview the frame after the column rename (the rendered output abbreviates the middle rows).
df

Unnamed: 0,job_title,job_description,skills,certifications
0,Admin Big Data,Responsible for managing and overseeing big da...,"Hadoop, Spark, MapReduce, Data Lakes, Data War...","Cloudera Certified Professional (CCP), Hortonw..."
1,Ansible Operations Engineer,Focuses on automating IT processes using Ansib...,"Ansible, Linux, Automation, Cloud Platforms, C...",Red Hat Certified Specialist in Ansible Automa...
2,Artifactory Administrator,Manages the Artifactory repository for build a...,"Artifactory, CI/CD, Jenkins, Docker, Maven, Gr...","JFrog Artifactory Certification, DevOps Instit..."
3,Artificial Intelligence / Machine Learning Leader,"Leads AI/ML projects and teams, defining strat...","AI Strategy, Machine Learning, Team Management...","AI-900: Microsoft Azure AI Fundamentals, Certi..."
4,Artificial Intelligence / Machine Learning Sr....,Senior role overseeing multiple AI/ML initiati...,"AI Strategy, Machine Learning, Team Management...",Certified Artificial Intelligence Practitioner...
...,...,...,...,...
201,Web Designer (UI/UX Designer),"Create front end web application using HTML, C...","UI/UX Design, Wireframing, Prototyping, Adobe ...",
202,Web Developer,Develops and maintains websites and web applic...,"HTML5, CSS3, JavaScript, React, Node.js, Angul...",
203,WordPress Developer,Creates and customizes WordPress websites. Res...,"WordPress, PHP, MySQL, HTML5, CSS3, JavaScript",
204,XL Deploy Engineer,Manages deployment processes using XL Deploy. ...,"XL Deploy, Deployment Automation, CI/CD, Versi...","XL Deploy Certification, DevOps Institute Cert..."


DATA CLEANING

In [7]:
# Missing cells per 100 rows. Every row that dropna() removes holds at least one
# missing cell, so this figure is an upper bound on the share of rows that would
# be lost. Rule of thumb used here: below 30% of the data, drop the incomplete rows.
# len(df) replaces a hard-coded row count so the figure stays correct if the
# input file changes.
(df.isna().sum().sum() / len(df)) * 100

np.float64(24.271844660194176)

In [8]:
# Keep only rows with no missing values. The result is rebound to `df` instead of
# using inplace=True, so the frame object that earlier cells displayed is not
# mutated behind the reader's back.
df = df.dropna()

In [9]:
# Count the remaining missing values per column.
df.isna().sum()

job_title          0
job_description    0
skills             0
certifications     0
dtype: int64

In [10]:
# Summarise the frame: number of entries, index range, per-column non-null counts and dtypes.
df.info()

<class 'pandas.DataFrame'>
Index: 157 entries, 0 to 205
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   job_title        157 non-null    str  
 1   job_description  157 non-null    str  
 2   skills           157 non-null    str  
 3   certifications   157 non-null    str  
dtypes: str(4)
memory usage: 6.1 KB


In [11]:
# Trim surrounding whitespace from every column label, then show the labels.
trimmed_labels = df.columns.str.strip()
df.columns = trimmed_labels
print(df.columns)


Index(['job_title', 'job_description', 'skills', 'certifications'], dtype='str')


In [12]:

# Remove rows whose job_description repeats an earlier one and renumber the index from 0.
df = (
    df
    .drop_duplicates(subset=["job_description"])
    .reset_index(drop=True)
)

# The two numbers printed below should be equal once duplicates are gone.
n_rows = len(df)
n_unique_descriptions = df["job_description"].nunique()
print("Rows:", n_rows)
print("Unique job_description:", n_unique_descriptions)


Rows: 150
Unique job_description: 150


In [14]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


  from .autonotebook import tqdm as notebook_tqdm


In [15]:
# Load the sentence-embedding model identified by its checkpoint name.
embedding_model_name = "all-MiniLM-L6-v2"
model = SentenceTransformer(embedding_model_name)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Loading weights: 100%|██████████| 103/103 [00:00<00:00, 755.72it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [16]:
def _clean_field(value):
    """Return *value* as stripped text, mapping missing values to an empty string."""
    # str(None) and str(NaN) would otherwise put the literal words "None" / "nan"
    # into the text that is later embedded.
    if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
        return ""
    return str(value).strip()


def build_job_text(row):
    """Combine a job's title, description and skills into one labelled text block.

    Parameters
    ----------
    row : mapping-like
        Any object exposing ``.get(key, default)`` (for example a DataFrame row
        passed as a pandas Series, or a plain dict). The keys read are
        ``job_title``, ``job_description`` and ``skills``.

    Returns
    -------
    str
        Three newline-separated lines labelled "Job Title:", "Job Description:"
        and "Skills:". A key that is absent, ``None`` or NaN contributes an
        empty string after its label.
    """
    title = _clean_field(row.get("job_title", ""))
    desc = _clean_field(row.get("job_description", ""))
    skills = _clean_field(row.get("skills", ""))
    return f"Job Title: {title}\nJob Description: {desc}\nSkills: {skills}"

# Build one text per job; axis="columns" applies the builder to each row in turn.
job_texts = df.apply(build_job_text, axis="columns").tolist()

In [17]:
# Encode every job text with the loaded model; the progress bar reports batch progress.
job_embeddings = model.encode(job_texts, show_progress_bar=True)


Batches: 100%|██████████| 5/5 [00:01<00:00,  4.21it/s]


In [18]:
# Example skill list standing in for input from the UI.
user_skills = ["neurology"]

# Prefix the comma-separated skills with the same "Skills: " label the job texts use.
joined_skills = ", ".join(user_skills)
query_text = f"Skills: {joined_skills}"

# The query text is wrapped in a one-element list before encoding.
query_embedding = model.encode([query_text])

In [20]:
# 5) Similarity scores
# Row 0 of the similarity matrix is kept and stored as one score per job row.
similarity_matrix = cosine_similarity(query_embedding, job_embeddings)
scores = similarity_matrix[0]
df["match_score"] = scores

# 6) Top matches
# Sort best-first, keep the ten highest-scoring jobs, and show the key columns.
shown_columns = ["job_title", "match_score", "job_description"]
top = df.sort_values(by="match_score", ascending=False).head(10)
top[shown_columns]
 

Unnamed: 0,job_title,match_score,job_description
83,Entry Level Developer,0.369277,Assists in developing software applications un...
25,Principle Engineer in Artificial Intelligence,0.353813,Leads AI initiatives within an organization. R...
84,Entry Level Network Engineer,0.352681,Supports network infrastructure by assisting w...
28,Principle Engineer in Machine Learning,0.348702,Senior role overseeing machine learning projec...
86,Entry Level Software Developer,0.345622,Assists in developing software applications wh...
23,Machine Learning Engineer,0.344489,Builds machine learning models by implementing...
6,Artificial Intelligence Researcher,0.341524,Conducts research in AI to develop new algorit...
85,Entry Level Programmer,0.339389,Writes basic code under supervision while lear...
59,CNC Programmer,0.33587,Creates CNC programs for machining operations....
119,Junior Developer,0.333768,Supports development teams by writing code und...
