In [None]:
import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
import re
import pickle

# Register tqdm's pandas integration so the `progress_apply` calls used in the
# cells below are available on Series/DataFrame objects.
tqdm.pandas()

In [None]:
# Input CSVs. Raw strings keep the Windows backslashes literal: in a normal
# string, sequences such as "\C" or "\j" are invalid escapes (a SyntaxWarning on
# recent Python versions), and a folder name starting with "n" or "t" would
# silently turn into a newline/tab.
job_skills = pd.read_csv(r'D:\Coding Playground\WebDev\projects\job matcher\job_skills.csv')
linkedin_job_posting = pd.read_csv(r'D:\Coding Playground\WebDev\projects\job matcher\linkedin_job_postings.csv')

# Inner join: keep only postings that have a matching row in the skills table.
linkedin_job_posting = pd.merge(linkedin_job_posting, job_skills, on='job_link', how='inner')

# Columns that are not used anywhere further down in this notebook.
columns_to_drop = [
    'last_processed_time', 'got_summary', 'got_ner', 'is_being_worked',
    'job_location', 'first_seen', 'search_city', 'search_country'
]
linkedin_job_posting = linkedin_job_posting.drop(columns=columns_to_drop)

# Replace 'job_link' with a unique numeric ID (the row position after the merge)
linkedin_job_posting = linkedin_job_posting.reset_index(drop=True)
linkedin_job_posting.insert(0, 'job_id', linkedin_job_posting.index)

# 'job_link' was only needed as the join key
linkedin_job_posting = linkedin_job_posting.drop(columns=['job_link'])

# Remove rows where job_title or job_skills is NaN or empty string.
# job_id was assigned before this filter, so the surviving IDs may have gaps.
linkedin_job_posting = linkedin_job_posting[
    linkedin_job_posting['job_title'].notna() &
    linkedin_job_posting['job_title'].astype(str).str.strip().ne('') &
    linkedin_job_posting['job_skills'].notna() &
    linkedin_job_posting['job_skills'].astype(str).str.strip().ne('')
]

linkedin_job_posting.head()

In [None]:
# STEP 1: Keep only the title and the raw skill string, dropping rows missing either
skill_columns = ["job_title", "job_skills"]
df = linkedin_job_posting.loc[:, skill_columns].dropna()

# STEP 2: Clean each skill
def clean_skill(skill):
    """Normalize one raw skill string.

    The text is lowercased, every whitespace run becomes a single space, all
    characters other than a-z, 0-9, "+", ".", "#", "-" and space are removed,
    and the result is trimmed.

    Args:
        skill: Raw skill text (str).

    Returns:
        The cleaned skill. May be an empty string when nothing usable remains.
    """
    skill = skill.lower()
    # Turn tabs/newlines/non-breaking spaces into plain spaces first; otherwise
    # the symbol filter below deletes them and glues adjacent words together.
    skill = re.sub(r"\s+", " ", skill)
    skill = re.sub(r"[^a-z0-9+.#\- ]", "", skill)  # retain common technical symbols
    # Collapse and trim AFTER removing symbols: dropping a leading bullet or a
    # "/" between words leaves stray spaces behind.
    skill = re.sub(r" +", " ", skill).strip()
    return skill

def clean_and_split_skills(skill_string):
    """Split a comma-separated skill string into a list of cleaned skills.

    Args:
        skill_string: Comma-separated skills; converted with ``str()`` first.

    Returns:
        List of skills as returned by ``clean_skill``, in their original order.
        Parts that are empty once cleaned (blank, or made up only of removed
        symbols) are left out.
    """
    cleaned_parts = (clean_skill(part) for part in str(skill_string).split(","))
    # Filter on the cleaned value: a part such as "***" is non-blank before
    # cleaning but empty afterwards.
    return [skill for skill in cleaned_parts if skill]

# One list of cleaned skills per posting
df["job_skill_list"] = df["job_skills"].progress_apply(clean_and_split_skills)

# STEP 3: Gather the distinct cleaned skills across all postings
print("Extracting unique skills...")
all_skills = set()
for row_skills in df["job_skill_list"]:
    all_skills.update(row_skills)

# Optional: discard entries that are too short/long or look like sentence fragments
print("Filtering noisy skills...")
noise_markers = ["looking for", "please", "experience in"]
all_skills = {
    skill
    for skill in all_skills
    if 2 <= len(skill) <= 50
    and all(marker not in skill for marker in noise_markers)
}

print(f"Cleaned unique skills count: {len(all_skills):,}")

In [None]:
# Sentence-embedding model used for both the skill vocabulary and the postings
model = SentenceTransformer('all-MiniLM-L6-v2')

print("Encoding skill vocabulary...")
# Sorted so that row i of the embedding matrix always belongs to skill_list[i]
skill_list = sorted(all_skills)
skill_embeddings = model.encode(skill_list, batch_size=128, show_progress_bar=True)

# Persist the vocabulary and its embeddings. A later cell reloads
# 'skill_embeddings.npy' and 'skill_list.npy'; without these writes a fresh
# kernel would fail there, or silently pick up stale files from an older run.
np.save('skill_embeddings.npy', skill_embeddings)
np.save('skill_list.npy', np.array(skill_list, dtype=str))

In [None]:
# Load the skill vocabulary and its embedding matrix from disk
skill_embeddings = np.load('skill_embeddings.npy')
skill_list = np.load('skill_list.npy')

# Lookup table skill -> embedding; entries are paired by position, so both
# arrays are expected to be in the same order.
skill_to_embedding = {
    skill: vector for skill, vector in zip(skill_list, skill_embeddings)
}

# Store the lookup table for reuse outside this notebook
with open("skill_to_embedding.pkl", "wb") as pickle_file:
    pickle.dump(skill_to_embedding, pickle_file)

In [None]:
def normalize_skills_fast(skill_list_raw, threshold=0.8):
    """Collapse near-duplicate skills of one posting into representative names.

    Every raw skill is cleaned with ``clean_skill``; skills that have no entry
    in ``skill_to_embedding`` are dropped. The remaining skills are grouped
    greedily: a skill that is not yet grouped starts a new group and absorbs
    every later ungrouped skill whose cosine similarity to it is at least
    ``threshold``. Each group is represented by its shortest name.

    Args:
        skill_list_raw: Iterable of raw skill strings.
        threshold: Minimum cosine similarity for two skills to share a group.

    Returns:
        List with one representative skill per group. The order is
        deterministic (it follows the alphabetical order of the cleaned skills).
    """
    # Sort the de-duplicated skills. Iteration order of a set of strings changes
    # between interpreter runs, and both the greedy grouping and the tie-break
    # in min() depend on order, so unsorted input gives run-to-run differences.
    candidates = sorted({clean_skill(s) for s in skill_list_raw if s.strip()})
    known = [skill for skill in candidates if skill in skill_to_embedding]
    if not known:
        return []

    # One batched similarity computation instead of one cos_sim call per pair.
    # assumes the stored embeddings are equal-length 1-D vectors — TODO confirm
    vectors = np.stack([skill_to_embedding[skill] for skill in known])
    similarities = util.cos_sim(vectors, vectors).cpu().numpy()

    representatives = []
    used = set()  # positions in `known` that already belong to a group
    for i, skill_i in enumerate(known):
        if i in used:
            continue
        used.add(i)
        group = [skill_i]
        for j in range(i + 1, len(known)):
            if j not in used and similarities[i, j] >= threshold:
                group.append(known[j])
                used.add(j)
        representatives.append(min(group, key=len))
    # Groups are disjoint and `known` has no duplicates, so the representatives
    # are already unique.
    return representatives

# Normalize the skill list of every posting (default similarity threshold)
print("Normalizing skills for each job...")
df["normalized_skills"] = df["job_skill_list"].progress_apply(normalize_skills_fast)

# Space-separated text form of the normalized skills
df["skill_text"] = df["normalized_skills"].apply(" ".join)

# Checkpoint the normalized postings
df.to_csv("jobs_normalized.csv", index=False)

In [None]:
# Text that gets embedded for each posting: "<title> - <skills>"
title_part = df["job_title"].fillna("")
skill_part = df["skill_text"].fillna("")
df["job_text"] = title_part + " - " + skill_part

print("Encoding job postings...")
job_texts = df["job_text"].tolist()
job_embeddings = model.encode(job_texts, batch_size=128, show_progress_bar=True)

# Save the embedding matrix and the frame it was computed from
np.save("job_embeddings.npy", job_embeddings)
df.to_csv("jobs_with_embeddings.csv", index=False)

In [None]:
# Flat L2 index over the posting embeddings. The dimension is read from the
# array instead of being hard-coded to 384, so the cell keeps working if the
# embedding model changes (and matches how the cleaned index is built below).
embedding_dim = job_embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(job_embeddings)

faiss.write_index(index, "faiss_jobs.index")

In [None]:
def safe_parse(skill_str):
    """Return a list of skill names from a list or from its stringified form.

    Args:
        skill_str: Either a list/tuple of skills, or a string such as
            "['python', 'sql']" (the form a list column takes after a CSV
            round trip). Any other value (e.g. NaN, None) yields ``[]``.

    Returns:
        List of skill strings with surrounding whitespace removed and empty
        entries dropped.
    """
    # Already a sequence: keep its content. Without this branch a column that
    # still holds real lists would be wiped to [] for every row.
    if isinstance(skill_str, (list, tuple)):
        stripped = (str(item).strip() for item in skill_str)
        return [item for item in stripped if item]
    if isinstance(skill_str, str):
        # Drop the surrounding brackets, split on commas, then peel the quotes
        # that the list repr put around each element.
        parts = (part.strip().strip("'\"") for part in skill_str.strip("[]").split(","))
        return [part for part in parts if part]
    return []

# Run every row's skills through safe_parse so the loop below can build sets from them
df["normalized_skills"] = df["normalized_skills"].progress_apply(safe_parse)

# Create a job key to group similar job titles together
df["job_key"] = df["job_title"].str.lower().str.strip()

# Sort by job key so rows with the same title end up next to each other
df = df.sort_values("job_key").reset_index(drop=True)

# Read the two columns into plain Python lists once; per-cell df.loc lookups
# inside the nested loop are very slow on a large frame.
job_keys = df["job_key"].tolist()
skill_sets = [set(skills) for skills in df["normalized_skills"]]

LOOKAHEAD_ROWS = 14       # how many following rows each kept row is compared with
JACCARD_THRESHOLD = 0.85  # skill overlap above which two same-title rows are duplicates

# Positions of the rows to keep
keep_indices = []

# Positions already flagged as duplicates of an earlier row
seen = set()

n_rows = len(df)
for i in tqdm(range(n_rows)):
    if i in seen:
        continue

    keep_indices.append(i)
    current_title = job_keys[i]
    current_skills = skill_sets[i]

    # Only a short window of following rows is compared (relies on the sort above)
    for j in range(i + 1, min(i + 1 + LOOKAHEAD_ROWS, n_rows)):
        # Equal keys are contiguous after sorting, so the first different
        # title means no later row in the window can match either.
        if job_keys[j] != current_title:
            break

        next_skills = skill_sets[j]
        union = current_skills | next_skills
        if not union:
            # Both rows have no skills: there is nothing to compare, so keep
            # both rather than dividing by zero.
            continue

        jaccard_sim = len(current_skills & next_skills) / len(union)
        if jaccard_sim > JACCARD_THRESHOLD:
            seen.add(j)  # Mark as duplicate

# Final cleaned DataFrame
cleaned_df = df.loc[keep_indices].reset_index(drop=True)

print(f"Reduced from {len(df)} rows to {len(cleaned_df)} rows.")

# Save cleaned data
cleaned_df.to_csv("final_jobs.csv", index=False)

In [None]:
# Continue from the de-duplicated postings stored in final_jobs.csv
df = pd.read_csv("final_jobs.csv")

# Rebuild the "<title> - <skills>" text; missing values become empty strings
title_part = df["job_title"].fillna("")
skill_part = df["skill_text"].fillna("")
df["job_text"] = title_part + " - " + skill_part

print("Encoding job postings...")
job_texts = df["job_text"].tolist()
job_embeddings = model.encode(job_texts, batch_size=128, show_progress_bar=True)

# Flat L2 index sized from the embedding matrix
embedding_dim = job_embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(job_embeddings)

faiss.write_index(index, "faiss_jobs_cleaned.index")

In [None]:
# Second index built from the job titles alone
job_titles = df["job_title"].astype(str).tolist()
title_embeddings = model.encode(job_titles, batch_size=128, show_progress_bar=True)

title_dim = title_embeddings.shape[1]
title_index = faiss.IndexFlatL2(title_dim)
title_index.add(title_embeddings)

faiss.write_index(title_index, "faiss_job_titles.index")