In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Tests: Automated Resume Screener (NLP)\n",
    "\n",
    "Run the cells top to bottom. The notebook generates seed data (if missing), parses the sample PDFs, ranks the resumes against the job description (JD), and displays and saves the results."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: install packages if your environment is fresh\n",
    "# %pip install -q pandas numpy sentence-transformers pymupdf pdfminer.six rapidfuzz\n",
    "import os, sys, pandas as pd\n",
    "from pathlib import Path\n",
    "\n",
    "# Put the working directory at the front of sys.path so the project's own\n",
    "# `data`, `parser` and `ranker` modules are preferred over any same-named\n",
    "# installed or stdlib module (Python < 3.10 ships a stdlib `parser`).\n",
    "# The membership check keeps re-runs of this cell from adding duplicates.\n",
    "project_root = str(Path('.').resolve())\n",
    "if project_root not in sys.path:\n",
    "    sys.path.insert(0, project_root)\n",
    "\n",
    "from data.seed_data import make_samples\n",
    "from parser import extract_text_from_pdf\n",
    "from ranker import rank_resumes, ResumeRecord\n",
    "\n",
    "# Create seed sample PDFs + JD if not present\n",
    "make_samples()  # writes data/samples/\n",
    "print('✅ Seed data ready (data/samples)')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sample directory shared with the cells below\n",
    "BASE = Path('data') / 'samples'\n",
    "jd_path = BASE.joinpath('jd.txt')\n",
    "assert jd_path.exists(), 'JD not found. Did seed_data.py run?'\n",
    "\n",
    "# Read the job description and preview everything before the first newline\n",
    "jd_text = jd_path.read_text(encoding='utf-8')\n",
    "jd_title = jd_text.partition('\\n')[0]\n",
    "print('JD title:', jd_title)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Gather the sample resume PDFs in sorted order\n",
    "pdfs = sorted(BASE.glob('*.pdf'))\n",
    "assert pdfs, 'No sample resumes found. Did seed_data.py run?'\n",
    "\n",
    "# Extract the text of each PDF and wrap it in a ResumeRecord\n",
    "records = [\n",
    "    ResumeRecord(file_name=pdf.name, text=extract_text_from_pdf(pdf.as_posix()))\n",
    "    for pdf in pdfs\n",
    "]\n",
    "print('Loaded resumes:', len(records))\n",
    "[pdf.name for pdf in pdfs]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rank the loaded resumes against the JD text, requesting duplicate removal\n",
    "df = rank_resumes(jd_text, records, drop_duplicates=True)\n",
    "print(f'Rows ranked: {len(df)}')\n",
    "\n",
    "# Show the first ten rows of the ranking\n",
    "df.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Columns shown in the summary view, in display order\n",
    "display_cols = [\n",
    "    'file', 'name', 'email', 'final_score', 'similarity',\n",
    "    'skill_overlap', 'years_match', 'num_skills_matched', 'top_skills',\n",
    "]\n",
    "\n",
    "# Fail with a readable message (rather than a bare KeyError) when the\n",
    "# ranking output lacks a column this notebook expects.\n",
    "missing_cols = [col for col in display_cols if col not in df.columns]\n",
    "assert not missing_cols, f'Ranking output is missing columns: {missing_cols}'\n",
    "\n",
    "df[display_cols].head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the ranking to a CSV (relative to the working directory) without the index\n",
    "out_path = Path('test_ranking_output.csv')\n",
    "df.to_csv(out_path, index=False)\n",
    "\n",
    "# Report the absolute location of the written file\n",
    "saved_to = out_path.resolve().as_posix()\n",
    "print(f'Saved CSV: {saved_to}')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
