In [15]:
import pandas as pd

In [16]:
# Load the job listings from jobs.csv (path is relative to the working directory).
df = pd.read_csv('jobs.csv')

In [18]:
# Inspect the column labels exactly as they were read from the CSV.
df.columns

Index(['Job Title', 'Job Description', 'Skills', 'Certifications'], dtype='object')

In [19]:
# Map the CSV's human-readable headers onto snake_case names used from here on.
snake_case_names = {
    'Job Title': 'job_title',
    'Job Description': 'job_description',
    'Skills': 'skills',
    'Certifications': 'certifications',
}
df = df.rename(columns=snake_case_names)

In [20]:
# Display the frame after the rename to confirm the new column names.
df

Unnamed: 0,job_title,job_description,skills,certifications
0,Admin Big Data,Responsible for managing and overseeing big da...,"Hadoop, Spark, MapReduce, Data Lakes, Data War...","Cloudera Certified Professional (CCP), Hortonw..."
1,Ansible Operations Engineer,Focuses on automating IT processes using Ansib...,"Ansible, Linux, Automation, Cloud Platforms, C...",Red Hat Certified Specialist in Ansible Automa...
2,Artifactory Administrator,Manages the Artifactory repository for build a...,"Artifactory, CI/CD, Jenkins, Docker, Maven, Gr...","JFrog Artifactory Certification, DevOps Instit..."
3,Artificial Intelligence / Machine Learning Leader,"Leads AI/ML projects and teams, defining strat...","AI Strategy, Machine Learning, Team Management...","AI-900: Microsoft Azure AI Fundamentals, Certi..."
4,Artificial Intelligence / Machine Learning Sr....,Senior role overseeing multiple AI/ML initiati...,"AI Strategy, Machine Learning, Team Management...",Certified Artificial Intelligence Practitioner...
...,...,...,...,...
201,Web Designer (UI/UX Designer),"Create front end web application using HTML, C...","UI/UX Design, Wireframing, Prototyping, Adobe ...",
202,Web Developer,Develops and maintains websites and web applic...,"HTML5, CSS3, JavaScript, React, Node.js, Angul...",
203,WordPress Developer,Creates and customizes WordPress websites. Res...,"WordPress, PHP, MySQL, HTML5, CSS3, JavaScript",
204,XL Deploy Engineer,Manages deployment processes using XL Deploy. ...,"XL Deploy, Deployment Automation, CI/CD, Versi...","XL Deploy Certification, DevOps Institute Cert..."


DATA CLEANING

In [21]:
# Missing cells relative to the number of rows, as a percentage.
# Every NaN cell is counted, so this is an upper bound on the share of rows
# that a row-wise dropna() would remove. Rule of thumb used here: if it is
# below ~30% of the data, dropping the affected rows is acceptable.
# len(df) is used instead of a hard-coded row count so the figure always
# reflects the data that was actually loaded.
(df.isna().sum().sum() / len(df)) * 100

np.float64(24.271844660194176)

In [22]:
# Drop every row that has at least one missing value.
# Rebinding the result (rather than inplace=True) leaves any previously
# displayed/aliased frame untouched and keeps the cell chain-friendly.
df = df.dropna()

In [23]:
# Per-column count of missing values; used to verify the result of the dropna step.
df.isna().sum()

job_title          0
job_description    0
skills             0
certifications     0
dtype: int64

In [24]:
# Summarise row count, per-column non-null counts and dtypes of the current frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 157 entries, 0 to 205
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   job_title        157 non-null    object
 1   job_description  157 non-null    object
 2   skills           157 non-null    object
 3   certifications   157 non-null    object
dtypes: object(4)
memory usage: 6.1+ KB


In [29]:
# Remove leading/trailing whitespace from every column label, then show the labels.
stripped_labels = df.columns.str.strip()
df.columns = stripped_labels
print(df.columns)


Index(['job_title', 'job_description', 'skills', 'certifications'], dtype='object')


In [None]:

# Normalise the description text to lower case, then keep one row per distinct
# description and renumber the index 0..n-1.
# NOTE(review): the saved outputs of this notebook show lower-cased
# job_description values, but no remaining cell performs that step, so a fresh
# Restart & Run All could not reproduce them. The step is restored here, before
# de-duplication so that descriptions differing only in case collapse too —
# confirm this matches the original cleaning order.
df = (
    df.assign(job_description=df["job_description"].str.lower())
      .drop_duplicates(subset=["job_description"])
      .reset_index(drop=True)
)
print("Rows:", len(df))
print("Unique job_description:", df["job_description"].nunique())
# Preview the cleaned descriptions.
df["job_description"].head()


Rows: 150
Unique job_description: 150


0    responsible for managing and overseeing big da...
1    focuses on automating it processes using ansib...
2    manages the artifactory repository for build a...
3    leads ai/ml projects and teams, defining strat...
4    senior role overseeing multiple ai/ml initiati...
Name: job_description, dtype: object

In [40]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


  from .autonotebook import tqdm as notebook_tqdm


In [41]:
# Instantiate the sentence-embedding model; the same instance encodes both the
# job texts and the user query so their vectors are comparable.
model = SentenceTransformer("all-MiniLM-L6-v2")

In [42]:
def _field_text(value):
    """Return ``value`` as stripped text; missing values (None/NaN/NA) become ''."""
    # str(float('nan')) is "nan" and str(None) is "None" — neither should leak
    # into the text that gets embedded.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ""
    return str(value).strip()


def build_job_text(row):
    """Build a single text field for one job (title + description + skills).

    Parameters
    ----------
    row : object with a dict-style ``.get(key, default)``
        For example a DataFrame row (``pd.Series``) or a plain dict. The keys
        read are ``job_title``, ``job_description`` and ``skills``; a key that
        is absent, or whose value is missing (None/NaN/NA), contributes an
        empty string.

    Returns
    -------
    str
        ``"Job Title: ...\\nJob Description: ...\\nSkills: ..."`` with each
        field stripped of surrounding whitespace.
    """
    title = _field_text(row.get("job_title", ""))
    desc = _field_text(row.get("job_description", ""))
    skills = _field_text(row.get("skills", ""))
    return f"Job Title: {title}\nJob Description: {desc}\nSkills: {skills}"

# Apply build_job_text to every row (axis=1) and collect the resulting strings
# in a plain Python list, one entry per row of df in row order.
job_texts = df.apply(build_job_text, axis=1).tolist()

In [43]:
# Embed every job text with the model, showing a progress bar while encoding.
# NOTE(review): later cells assign the similarity scores back to df row-for-row,
# which assumes the encoder returns one vector per input in input order — confirm.
job_embeddings = model.encode(job_texts, show_progress_bar=True)


Batches: 100%|██████████| 5/5 [00:02<00:00,  2.16it/s]


In [45]:
# 4) Turn the user's skills into a query string and embed it
user_skills = ["python", "sql", "excel"]   # example input from UI
skills_csv = ", ".join(user_skills)
query_text = f"Skills: {skills_csv}"

# encode() is given a one-element list, matching how the job texts were encoded
query_embedding = model.encode([query_text])

In [None]:
# 5) Cosine similarity between the single query vector and every job vector;
#    row 0 of the result holds the scores for that one query.
similarity_matrix = cosine_similarity(query_embedding, job_embeddings)
scores = similarity_matrix[0]
df["match_score"] = scores

# 6) Rank jobs from best to worst match and keep the ten highest-scoring rows
ranked = df.sort_values(by="match_score", ascending=False)
top = ranked.head(10)
top[["job_title", "match_score", "job_description"]]
 

Unnamed: 0,job_title,match_score,job_description
13,Data Analysts,0.546577,analyzes datasets to provide actionable insigh...
70,DATA ANALYST,0.511734,analyzes data sets to extract insights that in...
73,DATA SCIENTIST,0.500314,utilizes statistical analysis and machine lear...
85,Entry Level Programmer,0.479041,writes basic code under supervision while lear...
83,Entry Level Developer,0.467204,assists in developing software applications un...
119,Junior Developer,0.460494,supports development teams by writing code und...
86,Entry Level Software Developer,0.454145,assists in developing software applications wh...
27,Principle Engineer in Data Analysis,0.451737,leads data analysis projects to derive insight...
71,Data Engineer,0.444553,builds data pipelines to support analytics ini...
122,Junior Software Developer,0.443734,participates in software development projects ...
