In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('jobs_extended.csv')

In [3]:
df.columns

Index(['Job Title', 'Job Description', 'Skills', 'Certifications'], dtype='object')

In [4]:
df = df.rename(columns={
    'Job Description': 'job_description',
    'Job Title': 'job_title',
    'Skills': 'skills',
    'Certifications': 'certifications',
})

In [5]:
df

Unnamed: 0,job_title,job_description,skills,certifications
0,Admin Big Data,Responsible for managing and overseeing big da...,"Hadoop, Spark, MapReduce, Data Lakes, Data War...","Cloudera Certified Professional (CCP), Hortonw..."
1,Ansible Operations Engineer,Focuses on automating IT processes using Ansib...,"Ansible, Linux, Automation, Cloud Platforms, C...",Red Hat Certified Specialist in Ansible Automa...
2,Artifactory Administrator,Manages the Artifactory repository for build a...,"Artifactory, CI/CD, Jenkins, Docker, Maven, Gr...","JFrog Artifactory Certification, DevOps Instit..."
3,Artificial Intelligence / Machine Learning Leader,"Leads AI/ML projects and teams, defining strat...","AI Strategy, Machine Learning, Team Management...","AI-900: Microsoft Azure AI Fundamentals, Certi..."
4,Artificial Intelligence / Machine Learning Sr....,Senior role overseeing multiple AI/ML initiati...,"AI Strategy, Machine Learning, Team Management...",Certified Artificial Intelligence Practitioner...
...,...,...,...,...
2201,Embedded Systems Engineer,Provide technical support and troubleshoot iss...,"TensorFlow, Pandas, NumPy",Cisco Certified Network Associate (CCNA)
2202,IT Support Specialist,Ensure system security and monitor vulnerabili...,"React, Node.js, MongoDB",CompTIA Security+
2203,UI/UX Designer,Ensure system security and monitor vulnerabili...,"HTML, CSS, JavaScript",Microsoft Certified Azure Fundamentals
2204,Network Engineer,Analyze data to extract insights and support b...,"C++, Embedded C, RTOS",CompTIA Security+


DATA CLEANING

In [6]:
(df.isna().sum().sum() / 206) * 100 #less than 30% of whole data, then drop it.

np.float64(24.271844660194176)

In [7]:
df.dropna(inplace=True)

In [8]:
df.isna().sum()

job_title          0
job_description    0
skills             0
certifications     0
dtype: int64

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2157 entries, 0 to 2205
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   job_title        2157 non-null   object
 1   job_description  2157 non-null   object
 2   skills           2157 non-null   object
 3   certifications   2157 non-null   object
dtypes: object(4)
memory usage: 84.3+ KB


In [10]:
df.columns = df.columns.str.strip()
print(df.columns)


Index(['job_title', 'job_description', 'skills', 'certifications'], dtype='object')


In [11]:

df = df.drop_duplicates(subset=["job_description"]).reset_index(drop=True)
print("Rows:", len(df))
print("Unique job_description:", df["job_description"].nunique())


Rows: 158
Unique job_description: 158


In [12]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


  from .autonotebook import tqdm as notebook_tqdm


In [13]:
model = SentenceTransformer("all-MiniLM-L6-v2")

In [14]:
def build_job_text(row):
    title = str(row.get("job_title", "")).strip()
    desc  = str(row.get("job_description", "")).strip()
    skills = str(row.get("skills", "")).strip()
    return f"Job Title: {title}\nJob Description: {desc}\nSkills: {skills}"

job_texts = df.apply(build_job_text, axis=1).tolist()

In [15]:
job_embeddings = model.encode(job_texts, show_progress_bar=True)


Batches: 100%|██████████| 5/5 [00:01<00:00,  3.87it/s]


In [16]:
user_skills = ["painting"]   # example input from UI
query_text = "Skills: " + ", ".join(user_skills)

query_embedding = model.encode([query_text])

In [17]:
# 5) Similarity scores
scores = cosine_similarity(query_embedding, job_embeddings)[0]
df["match_score"] = scores

# 6) Top matches
top = df.sort_values("match_score", ascending=False).head(10)
top[["job_title", "match_score", "job_description"]]
 

Unnamed: 0,job_title,match_score,job_description
83,Entry Level Developer,0.397235,Assists in developing software applications un...
85,Entry Level Programmer,0.396601,Writes basic code under supervision while lear...
86,Entry Level Software Developer,0.366072,Assists in developing software applications wh...
118,Jr Developer,0.365635,Assists in software development tasks under su...
120,Junior Front End Developer,0.359949,Assists in developing front-end components of ...
122,Junior Software Developer,0.358208,Participates in software development projects ...
94,Front End Developer,0.357482,Designs and develops user-facing features for ...
119,Junior Developer,0.350907,Supports development teams by writing code und...
76,Developer,0.337386,Writes software applications based on specific...
155,AI Engineer,0.33355,Build and maintain scalable web applications.
