In [34]:
import sys
from pathlib import Path

# Resolve the parent of the current working directory to an absolute path and
# put it at the front of sys.path (only once, so re-running this cell does not
# add duplicates). Presumably this is what lets the later `from src...` imports
# resolve — the result depends on where the kernel was started.
PROJECT_ROOT = Path("..").resolve()
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

print("Project root added:", PROJECT_ROOT)


Project root added: C:\Users\VigneshMurugesan\Desktop\Beta\resume-ai


In [35]:
import kagglehub
from pathlib import Path

# Fetch the skills taxonomy dataset and wrap the returned directory in a Path.
taxonomy_path = Path(
    kagglehub.dataset_download("arbazkhan971/allskillandnonskill")
)

# Path.glob yields entries in filesystem order, so take the lexicographically
# first CSV to make the pick deterministic, and fail with a clear message
# (instead of a bare StopIteration) when the download contains no CSV at all.
taxonomy_csv = min(taxonomy_path.glob("*.csv"), default=None)
if taxonomy_csv is None:
    raise FileNotFoundError(f"No CSV file found in {taxonomy_path}")

taxonomy_csv


WindowsPath('C:/Users/VigneshMurugesan/.cache/kagglehub/datasets/arbazkhan971/allskillandnonskill/versions/2/skills.csv')

In [36]:
import pandas as pd

# Load the taxonomy CSV and preview it. Note from the preview below that the
# Skill values carry a trailing newline, so consumers need to strip them.
df_tax = pd.read_csv(taxonomy_csv)
print("Taxonomy columns:", df_tax.columns.tolist())
df_tax.head()


Taxonomy columns: ['Skill']


Unnamed: 0,Skill
0,supply chain engineering\n
1,bullet\n
2,commutations\n
3,pay equity\n
4,student retention\n


In [37]:
# Fetch the data-science job postings dataset and wrap the returned directory
# in a Path.
ml_ds_path = Path(
    kagglehub.dataset_download("asaniczka/data-science-job-postings-and-skills")
)

# Select the postings file by name rather than "whichever CSV the glob yields
# first": glob order is filesystem-dependent, and the cells below rely on the
# columns of job_postings.csv (e.g. job_title).
ml_ds_csv = ml_ds_path / "job_postings.csv"
if not ml_ds_csv.is_file():
    raise FileNotFoundError(f"Expected job_postings.csv in {ml_ds_path}")

ml_ds_csv


WindowsPath('C:/Users/VigneshMurugesan/.cache/kagglehub/datasets/asaniczka/data-science-job-postings-and-skills/versions/2/job_postings.csv')

In [38]:
# Load the job postings CSV and preview it. The column list printed below has
# no dedicated skills column; job_title is the closest free-text field.
df_ml = pd.read_csv(ml_ds_csv)
print("ML/DS columns:", df_ml.columns.tolist())
df_ml.head()


ML/DS columns: ['job_link', 'last_processed_time', 'last_status', 'got_summary', 'got_ner', 'is_being_worked', 'job_title', 'company', 'job_location', 'first_seen', 'search_city', 'search_country', 'search_position', 'job_level', 'job_type']


Unnamed: 0,job_link,last_processed_time,last_status,got_summary,got_ner,is_being_worked,job_title,company,job_location,first_seen,search_city,search_country,search_position,job_level,job_type
0,https://www.linkedin.com/jobs/view/senior-mach...,2024-01-21 08:08:48.031964+00,Finished NER,t,t,f,Senior Machine Learning Engineer,Jobs for Humanity,"New Haven, CT",2024-01-14,East Haven,United States,Agricultural-Research Engineer,Mid senior,Onsite
1,https://www.linkedin.com/jobs/view/principal-s...,2024-01-20 04:02:12.331406+00,Finished NER,t,t,f,"Principal Software Engineer, ML Accelerators",Aurora,"San Francisco, CA",2024-01-14,El Cerrito,United States,Set-Key Driver,Mid senior,Onsite
2,https://www.linkedin.com/jobs/view/senior-etl-...,2024-01-21 08:08:31.941595+00,Finished NER,t,t,f,Senior ETL Data Warehouse Specialist,Adame Services LLC,"New York, NY",2024-01-14,Middletown,United States,Technical Support Specialist,Associate,Onsite
3,https://www.linkedin.com/jobs/view/senior-data...,2024-01-20 15:30:55.796572+00,Finished NER,t,t,f,Senior Data Warehouse Developer / Architect,Morph Enterprise,"Harrisburg, PA",2024-01-12,Lebanon,United States,Architect,Mid senior,Onsite
4,https://www.linkedin.com/jobs/view/lead-data-e...,2024-01-21 08:08:58.312124+00,Finished NER,t,t,f,Lead Data Engineer,Dice,"Plano, TX",2024-01-14,McKinney,United States,Maintenance Data Analyst,Mid senior,Onsite


In [39]:
from src.skills_ingestion import (
    load_resume_skills,
    load_taxonomy_skills,
    load_ml_ds_skills,
    build_unified_skills_catalog
)


In [42]:
import pandas as pd
from src.skills_normalizer import normalize_skill
from pathlib import Path

def load_taxonomy_skills(csv_path: Path) -> set[str]:
    """Load the skills taxonomy CSV and return its skills as a normalized set.

    The skill column is looked up as ``"Skill"``, then ``"skill"``, and finally
    as any header equal to "skill" once case and surrounding whitespace are
    ignored. Values are lower-cased and stripped, blank entries are dropped,
    and every remaining value is passed through ``normalize_skill``.

    Args:
        csv_path: Path of the taxonomy CSV file.

    Returns:
        The distinct, non-empty normalized skill strings.

    Raises:
        ValueError: If no skill column can be found in the CSV.
    """
    df = pd.read_csv(csv_path)

    print("DEBUG: taxonomy columns =", df.columns.tolist())

    # Keep the original precedence ("Skill", then "skill") and only then fall
    # back to a case/whitespace-insensitive match such as "SKILL" or " Skill ".
    columns = df.columns.tolist()
    if "Skill" in columns:
        skill_col = "Skill"
    elif "skill" in columns:
        skill_col = "skill"
    else:
        skill_col = next(
            (col for col in columns if str(col).strip().lower() == "skill"),
            None,
        )
    if skill_col is None:
        raise ValueError(
            f"❌ Cannot find Skill column. Columns found: {df.columns.tolist()}"
        )

    cleaned = df[skill_col].dropna().astype(str).str.lower().str.strip()
    # Whitespace-only cells become "" after stripping; they are not skills.
    cleaned = cleaned[cleaned != ""]

    normalized = cleaned.map(normalize_skill)

    # Guard the declared set[str] contract in case the normalizer returns
    # None or an empty string for some input.
    return {skill for skill in normalized if isinstance(skill, str) and skill}


In [45]:
import pandas as pd

# Re-read the same postings CSV (ml_ds_csv) that an earlier cell already loaded
# into df_ml, and display its column names as a list.
df_ml = pd.read_csv(ml_ds_csv)
df_ml.columns.tolist()


['job_link',
 'last_processed_time',
 'last_status',
 'got_summary',
 'got_ner',
 'is_being_worked',
 'job_title',
 'company',
 'job_location',
 'first_seen',
 'search_city',
 'search_country',
 'search_position',
 'job_level',
 'job_type']

In [None]:
from pathlib import Path
import pandas as pd
from src.skills_normalizer import normalize_skill

# Default vocabulary of job-title words that are kept as ML/DS skill tokens.
_ML_DS_TITLE_TOKENS = frozenset({
    "data", "scientist", "science", "ml", "ai", "engineer",
    "analytics", "analyst", "machine", "learning", "deep",
    "research", "nlp", "cv",
})


def load_ml_ds_skills(csv_path: Path, allowed_tokens=_ML_DS_TITLE_TOKENS) -> set[str]:
    """Derive a set of ML/DS skill tokens from the job titles in a postings CSV.

    The file has no dedicated skills column, so job titles are used as a
    proxy: each title is lower-cased, every character outside ``[a-z0-9 ]`` is
    replaced by a space, the title is split on whitespace, and only words
    found in ``allowed_tokens`` are kept and passed through ``normalize_skill``.

    Args:
        csv_path: Path of the job postings CSV; it must have a ``job_title``
            column.
        allowed_tokens: Iterable of words to keep. Compared case-insensitively
            (titles are lower-cased before matching). Defaults to the built-in
            ML/DS vocabulary.

    Returns:
        The distinct, non-empty normalized tokens.

    Raises:
        ValueError: If the CSV has no ``job_title`` column.
    """
    df = pd.read_csv(csv_path)

    print("DEBUG: ML/DS columns =", df.columns.tolist())

    if "job_title" not in df.columns:
        raise ValueError(
            f"job_title column not found. Available columns: {df.columns.tolist()}"
        )

    raw_titles = (
        df["job_title"]
        .dropna()
        .astype(str)
        .str.lower()
        .str.replace(r"[^a-z0-9 ]", " ", regex=True)
    )

    # One row per word. str.split() already discards surrounding whitespace,
    # and titles with no words explode to NaN, which isin() rejects below.
    tokens = raw_titles.str.split().explode()

    vocabulary = {str(token).lower() for token in allowed_tokens}
    normalized = tokens[tokens.isin(vocabulary)].map(normalize_skill)

    # The set comprehension de-duplicates; also guard the declared set[str]
    # contract in case the normalizer returns None or an empty string.
    return {skill for skill in normalized if isinstance(skill, str) and skill}


In [48]:
# Build both skill sets and display their sizes.
# NOTE(review): the ValueError recorded below ("Could not infer skills column
# in ML/DS dataset") is not raised by either loader defined in this notebook,
# and the load_ml_ds_skills cell above shows no execution count. The call
# likely hit the version imported from src.skills_ingestion — re-run the
# notebook top to bottom to confirm.
taxonomy_skills = load_taxonomy_skills(taxonomy_csv)
ml_ds_skills = load_ml_ds_skills(ml_ds_csv)

len(taxonomy_skills), len(ml_ds_skills)


DEBUG: taxonomy columns = ['Skill']
DEBUG: ML/DS columns = ['job_link', 'last_processed_time', 'last_status', 'got_summary', 'got_ner', 'is_being_worked', 'job_title', 'company', 'job_location', 'first_seen', 'search_city', 'search_country', 'search_position', 'job_level', 'job_type']


ValueError: 
❌ Could not infer skills column in ML/DS dataset.

Available columns:
['job_link', 'last_processed_time', 'last_status', 'got_summary', 'got_ner', 'is_being_worked', 'job_title', 'company', 'job_location', 'first_seen', 'search_city', 'search_country', 'search_position', 'job_level', 'job_type']
