# JobTitle2SOC — Zero‑GitHub‑Interaction Notebook
This notebook fetches the code and reference data from a **public GitHub ZIP** (no sign‑in),
lets the user upload their Excel file, runs the mapper, and triggers a direct download of the result.

**Owner (you) must set the ZIP URL and repo folder name (`REPO_ZIP_URL` and `REPO_DIR_HINT` in the owner-only cell) once before sharing** so HR never touches GitHub.


In [None]:
# Step 1 — Install dependencies (run once per session)
# Runs pip as a notebook shell command (leading "!"); -q keeps pip's output short.
# NOTE(review): the package list is presumably what code/map_titles_to_soc.py
# imports — confirm against the script if it changes.
!pip -q install pandas openpyxl rapidfuzz scikit-learn xlsxwriter tqdm


In [None]:
# OWNER-ONLY: Set these two values, then share the notebook with HR.
# Example ZIP URL form: https://github.com/<you>/<repo>/archive/refs/heads/main.zip
REPO_ZIP_URL = 'https://github.com/<you>/<repo>/archive/refs/heads/main.zip'  # <-- CHANGE THIS
REPO_DIR_HINT = '<repo>-main'  # folder name after unzip (usually '<repo>-<branch>')  # <-- CHANGE THIS

# Fail fast while the template placeholders are still present.
# Explicit raises are used instead of `assert` because assert statements are
# removed when Python runs with optimizations enabled (-O / PYTHONOPTIMIZE),
# which would let an unconfigured notebook continue to the download step.
if '<you>' in REPO_ZIP_URL or '<repo>' in REPO_ZIP_URL:
    raise ValueError('Set REPO_ZIP_URL to your public repo ZIP before sharing with HR.')
if '<repo>' in REPO_DIR_HINT:
    raise ValueError('Set REPO_DIR_HINT (unzipped folder name) before sharing.')


In [None]:
# Step 2 — Fetch and extract the public GitHub ZIP (no GitHub login required)
import os, zipfile, io, requests
r = requests.get(REPO_ZIP_URL)
r.raise_for_status()
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall('/content')

import pathlib
base_candidates = [p for p in pathlib.Path('/content').iterdir() if p.is_dir() and p.name == REPO_DIR_HINT]
if not base_candidates:
    # Fallback: try to auto-detect the first directory created by ZIP
    base_candidates = [p for p in pathlib.Path('/content').iterdir() if p.is_dir() and '-' in p.name]
BASE = base_candidates[0] if base_candidates else pathlib.Path('/content')
print('Using repo folder:', BASE)
os.chdir(BASE)
!pwd && ls -la


In [None]:
# Step 3 — Verify expected project layout
# Paths are relative to the current working directory; each one is reported as
# OK or MISSING, and the cell stops if any of them is absent.
from pathlib import Path
CODE = Path('code', 'map_titles_to_soc.py')
OCC  = Path('data', 'Occupation Data.xlsx')
ALT  = Path('data', 'Alternate Titles.xlsx')

required = {'CODE': CODE, 'OCC': OCC, 'ALT': ALT}
for label, p in required.items():
    status = 'OK' if p.exists() else 'MISSING'
    print(f"{label}: {p.resolve()}  ->  {status}")

if not all(path.exists() for path in required.values()):
    raise FileNotFoundError('Expected files not found. Ensure the repo has code/map_titles_to_soc.py and data/*.xlsx')


In [None]:
# Step 4 — Upload the *input* Excel from your computer (HR does NOT need GitHub)
from google.colab import files
from pathlib import Path
up = files.upload()  # choose your Company Job Titles.xlsx (any name is fine)

# Nothing uploaded (e.g. the dialog was dismissed): stop with a clear message
# instead of the bare StopIteration that next(iter(...)) would raise.
if not up:
    raise RuntimeError('No file was uploaded. Re-run this cell and choose your Excel file.')

uploaded_names = list(up)
INP = Path(uploaded_names[0])  # only the first uploaded file is used
if len(uploaded_names) > 1:
    print('Note: multiple files uploaded; ignoring:', ', '.join(uploaded_names[1:]))
print('Using input:', INP.resolve())


In [None]:
# Step 5 — Run the mapping script and save output to /content, named after the input
import shlex, subprocess, sys
OUT = Path('/content') / f"{INP.stem} - Mapped.xlsx"
# Arguments are passed as a list (no shell involved), so file names containing
# spaces reach the script intact.
cmd = [sys.executable, str(CODE),
       '--input', str(INP),
       '--occ',   str(OCC),
       '--alt',   str(ALT),
       '--out',   str(OUT)]
# Quote each argument for display only, so paths with spaces (e.g.
# "data/Occupation Data.xlsx") are shown unambiguously and can be copy-pasted.
print('Running:\n', ' '.join(shlex.quote(arg) for arg in cmd))
subprocess.run(cmd, check=True)  # check=True raises CalledProcessError on a non-zero exit
print('\nSaved:', OUT)


In [None]:
# Step 6 — Download the result to your computer
# Hands the output path from the mapping step to Colab's file-download helper.
from google.colab import files
download_target = str(OUT)
files.download(download_target)
