In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# AI Resume Screening System - Training Demo\n",
    "\n",
    "This notebook demonstrates:\n",
    "1. Loading synthetic data\n",
    "2. Computing features and rankings\n",
    "3. Training optional ML models\n",
    "4. Evaluating model performance\n",
    "5. Analyzing feature importance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Setup\n",
    "# Put the parent directory first on the import path\n",
    "# (presumably where the `src` package imported below lives - confirm)\n",
    "import sys\n",
    "sys.path.insert(0, '..')\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from pathlib import Path\n",
    "import json\n",
    "\n",
    "# Import our modules\n",
    "from src.text_extraction import extract_text_from_pdf\n",
    "from src.preprocessing import preprocess_text, extract_entities\n",
    "from src.embedding import EmbeddingManager\n",
    "from src.ranking import ResumeRanker\n",
    "from src.trainer import RankingModelTrainer\n",
    "\n",
    "# Notebook-wide plotting defaults: seaborn 'whitegrid' style, 12x6 figures\n",
    "sns.set_style('whitegrid')\n",
    "plt.rc('figure', figsize=(12, 6))\n",
    "\n",
    "print(\"✅ Setup complete\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Load Demo Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Locate the demo data\n",
    "data_dir = Path('../data/demo')\n",
    "resumes_dir = data_dir / 'resumes'\n",
    "jobs_dir = data_dir / 'jobs'\n",
    "\n",
    "\n",
    "def _read_text_files(directory):\n",
    "    \"\"\"Return (path, text) pairs for every *.txt file in `directory`, sorted by path.\n",
    "\n",
    "    Files are decoded as UTF-8 explicitly, so the result does not depend on the\n",
    "    platform's default encoding (assumes the demo files are UTF-8 - TODO confirm).\n",
    "\n",
    "    Args:\n",
    "        directory: pathlib.Path of the folder to scan (non-recursive).\n",
    "\n",
    "    Returns:\n",
    "        list of (Path, str) tuples, one per file.\n",
    "\n",
    "    Raises:\n",
    "        FileNotFoundError: if `directory` contains no *.txt files, so a wrong\n",
    "            path fails here instead of in a later cell.\n",
    "    \"\"\"\n",
    "    paths = sorted(directory.glob('*.txt'))\n",
    "    if not paths:\n",
    "        raise FileNotFoundError(f\"No .txt files found in {directory}\")\n",
    "    return [(path, path.read_text(encoding='utf-8')) for path in paths]\n",
    "\n",
    "\n",
    "# Resumes: keep the raw text next to its preprocessed form and extracted entities\n",
    "resumes = []\n",
    "for resume_file, text in _read_text_files(resumes_dir):\n",
    "    cleaned = preprocess_text(text)\n",
    "    entities = extract_entities(cleaned)\n",
    "\n",
    "    resumes.append({\n",
    "        'filename': resume_file.name,\n",
    "        'raw_text': text,\n",
    "        'cleaned_text': cleaned,\n",
    "        'entities': entities\n",
    "    })\n",
    "\n",
    "print(f\"Loaded {len(resumes)} resumes\")\n",
    "\n",
    "# Job descriptions are stored as read, without preprocessing\n",
    "jobs = [\n",
    "    {'filename': job_file.name, 'text': text}\n",
    "    for job_file, text in _read_text_files(jobs_dir)\n",
    "]\n",
    "\n",
    "print(f\"Loaded {len(jobs)} job descriptions\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Initialize Components"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the embedding manager (with its cache directory) and the ranker\n",
    "embedding_manager = EmbeddingManager(cache_dir='../cache/embeddings')\n",
    "\n",
    "# Per-component weights handed to the ranker as keyword arguments\n",
    "ranker_weights = {\n",
    "    'weight_keyword': 0.3,\n",
    "    'weight_semantic': 0.4,\n",
    "    'weight_experience': 0.2,\n",
    "    'weight_skills': 0.1,\n",
    "}\n",
    "ranker = ResumeRanker(**ranker_weights)\n",
    "\n",
    "print(\"✅ Components initialized\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Rank Resumes Against Each Job"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rank all resumes against every job and pool the results in one list\n",
    "all_results = []\n",
    "\n",
    "for job in jobs:\n",
    "    job_name = job['filename']\n",
    "    print(f\"\\nRanking for: {job_name}\")\n",
    "\n",
    "    results = ranker.rank_candidates(\n",
    "        resumes=resumes,\n",
    "        job_description=job['text'],\n",
    "        embedding_manager=embedding_manager\n",
    "    )\n",
    "\n",
    "    # Tag every result with the job it was ranked for\n",
    "    for entry in results:\n",
    "        entry['job'] = job_name\n",
    "\n",
    "    all_results += results\n",
    "\n",
    "    # Preview the first three entries of the returned ranking\n",
    "    print(\"Top 3 candidates:\")\n",
    "    for position, top in enumerate(results[:3], start=1):\n",
    "        print(f\"  {position}. {top['filename']}: {top['total_score']:.3f}\")\n",
    "\n",
    "print(f\"\\n✅ Total rankings: {len(all_results)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Analyze Score Distribution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert to DataFrame for analysis\n",
    "df = pd.DataFrame(all_results)\n",
    "\n",
    "# One histogram per panel: (grid position, column, title, bar colour)\n",
    "panels = [\n",
    "    ((0, 0), 'keyword_score', 'Keyword Score Distribution', 'blue'),\n",
    "    ((0, 1), 'semantic_score', 'Semantic Score Distribution', 'green'),\n",
    "    ((1, 0), 'experience_score', 'Experience Score Distribution', 'orange'),\n",
    "    ((1, 1), 'total_score', 'Total Score Distribution', 'red'),\n",
    "]\n",
    "\n",
    "fig, axes = plt.subplots(2, 2, figsize=(14, 10))\n",
    "\n",
    "for (row, col), column, title, color in panels:\n",
    "    panel = axes[row, col]\n",
    "    panel.hist(df[column], bins=20, alpha=0.7, color=color)\n",
    "    panel.set_title(title)\n",
    "    panel.set_xlabel('Score')\n",
    "    # Only the left-hand column gets a y-axis label\n",
    "    if col == 0:\n",
    "        panel.set_ylabel('Frequency')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "# Summary statistics for all five score columns (skills_score included)\n",
    "stat_columns = ['keyword_score', 'semantic_score', 'experience_score', 'skills_score', 'total_score']\n",
    "print(\"\\nScore Statistics:\")\n",
    "print(df[stat_columns].describe())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Score Correlations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pairwise correlation between the score columns\n",
    "score_cols = ['keyword_score', 'semantic_score', 'experience_score', 'skills_score', 'total_score']\n",
    "corr_matrix = df[score_cols].corr()\n",
    "\n",
    "# Heatmap appearance: annotated square cells, diverging palette centred on 0\n",
    "heatmap_options = dict(\n",
    "    annot=True,\n",
    "    cmap='coolwarm',\n",
    "    center=0,\n",
    "    square=True,\n",
    "    linewidths=1,\n",
    "    cbar_kws={\"shrink\": 0.8},\n",
    ")\n",
    "\n",
    "plt.figure(figsize=(10, 8))\n",
    "sns.heatmap(corr_matrix, **heatmap_options)\n",
    "plt.title('Score Correlation Matrix')\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Train Classification Models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Trainer used by all model-training cells below\n",
    "trainer = RankingModelTrainer(model_dir='../models')\n",
    "\n",
    "# Threshold passed to prepare_training_data to create binary labels\n",
    "threshold = 0.6\n",
    "X, y = trainer.prepare_training_data(all_results, threshold=threshold)\n",
    "\n",
    "# Class balance of the resulting labels\n",
    "n_positive = np.sum(y)\n",
    "positive_rate = np.mean(y)\n",
    "\n",
    "print(f\"Training data shape: X={X.shape}, y={y.shape}\")\n",
    "print(f\"Positive class: {n_positive} ({positive_rate:.1%})\")\n",
    "print(f\"Negative class: {len(y) - n_positive} ({1 - positive_rate:.1%})\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train logistic regression through the trainer and report its metrics\n",
    "lr_results = trainer.train_logistic_regression(X, y)\n",
    "\n",
    "print(\"\\n=== Logistic Regression Results ===\")\n",
    "for label, key in (('Train', 'train_accuracy'), ('Test', 'test_accuracy')):\n",
    "    print(f\"{label} Accuracy: {lr_results[key]:.3f}\")\n",
    "\n",
    "cv_mean, cv_std = lr_results['cv_mean'], lr_results['cv_std']\n",
    "print(f\"CV Mean: {cv_mean:.3f} (+/- {cv_std:.3f})\")\n",
    "\n",
    "# Save the model via the trainer under this file name\n",
    "trainer.save_model('logistic_regression_model.pkl')\n",
    "print(\"\\n✅ Model saved\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train the random forest through the trainer and report its metrics\n",
    "rf_results = trainer.train_random_forest(X, y)\n",
    "\n",
    "print(\"\\n=== Random Forest Results ===\")\n",
    "for label, key in (('Train', 'train_accuracy'), ('Test', 'test_accuracy')):\n",
    "    print(f\"{label} Accuracy: {rf_results[key]:.3f}\")\n",
    "\n",
    "print(\"\\nFeature Importance:\")\n",
    "rf_importance = rf_results['feature_importance']\n",
    "for feature, importance in rf_importance.items():\n",
    "    print(f\"  {feature}: {importance:.3f}\")\n",
    "\n",
    "# Bar chart of the same importances, one bar per feature\n",
    "features = list(rf_importance.keys())\n",
    "importances = list(rf_importance.values())\n",
    "\n",
    "plt.figure(figsize=(10, 6))\n",
    "plt.bar(features, importances, color='skyblue')\n",
    "plt.xlabel('Feature')\n",
    "plt.ylabel('Importance')\n",
    "plt.title('Random Forest Feature Importance')\n",
    "plt.xticks(rotation=45)\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train XGBoost through the trainer and report its metrics\n",
    "xgb_results = trainer.train_xgboost(X, y)\n",
    "\n",
    "print(\"\\n=== XGBoost Results ===\")\n",
    "for label, key in (('Train', 'train_accuracy'), ('Test', 'test_accuracy')):\n",
    "    print(f\"{label} Accuracy: {xgb_results[key]:.3f}\")\n",
    "\n",
    "print(\"\\nFeature Importance:\")\n",
    "xgb_importance = xgb_results['feature_importance']\n",
    "for feature, importance in xgb_importance.items():\n",
    "    print(f\"  {feature}: {importance:.3f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7. Model Comparison"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Gather train/test accuracy for each trained model, in display order\n",
    "results_by_model = {\n",
    "    'Logistic Regression': lr_results,\n",
    "    'Random Forest': rf_results,\n",
    "    'XGBoost': xgb_results,\n",
    "}\n",
    "models = list(results_by_model)\n",
    "train_scores = [res['train_accuracy'] for res in results_by_model.values()]\n",
    "test_scores = [res['test_accuracy'] for res in results_by_model.values()]\n",
    "\n",
    "# Grouped bars: train on the left of each tick, test on the right\n",
    "x = np.arange(len(models))\n",
    "width = 0.35\n",
    "half = width / 2\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(10, 6))\n",
    "ax.bar(x - half, train_scores, width, label='Train', color='lightblue')\n",
    "ax.bar(x + half, test_scores, width, label='Test', color='lightcoral')\n",
    "\n",
    "ax.set_title('Model Comparison')\n",
    "ax.set_ylabel('Accuracy')\n",
    "ax.set_xticks(x)\n",
    "ax.set_xticklabels(models)\n",
    "ax.set_ylim([0, 1.1])\n",
    "ax.legend()\n",
    "\n",
    "# Write each accuracy just above the top of its bar\n",
    "for offset, scores in ((-half, train_scores), (half, test_scores)):\n",
    "    for i, score in enumerate(scores):\n",
    "        ax.text(i + offset, score + 0.02, f'{score:.3f}', ha='center', va='bottom')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 8. Top Candidates Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print the five highest total_score candidates for every job\n",
    "\n",
    "# (line prefix, result key) for each component score in the breakdown\n",
    "score_lines = [\n",
    "    ('    - Keyword:   ', 'keyword_score'),\n",
    "    ('    - Semantic:  ', 'semantic_score'),\n",
    "    ('    - Experience: ', 'experience_score'),\n",
    "    ('    - Skills:    ', 'skills_score'),\n",
    "]\n",
    "divider = '=' * 60\n",
    "\n",
    "for job in jobs:\n",
    "    job_results = [r for r in all_results if r['job'] == job['filename']]\n",
    "    ranked = sorted(job_results, key=lambda result: result['total_score'], reverse=True)\n",
    "\n",
    "    print(f\"\\n{divider}\")\n",
    "    print(f\"Job: {job['filename']}\")\n",
    "    print(divider)\n",
    "\n",
    "    for rank, candidate in enumerate(ranked[:5], start=1):\n",
    "        print(f\"\\n#{rank} - {candidate['filename']}\")\n",
    "        print(f\"  Overall Score: {candidate['total_score']:.3f}\")\n",
    "        print(\"  Breakdown:\")\n",
    "        for prefix, key in score_lines:\n",
    "            print(f\"{prefix}{candidate[key]:.3f}\")\n",
    "        print(f\"  Summary: {candidate['explanation']['summary']}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 9. Save Results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the per-candidate scores to CSV (identifier columns first, then scores)\n",
    "output_file = '../data/demo/ranking_results.csv'\n",
    "export_columns = [\n",
    "    'filename', 'job', 'total_score', 'keyword_score',\n",
    "    'semantic_score', 'experience_score', 'skills_score',\n",
    "]\n",
    "df_export = df[export_columns].copy()\n",
    "df_export.to_csv(output_file, index=False)\n",
    "\n",
    "print(f\"✅ Results saved to {output_file}\")\n",
    "\n",
    "# Write the model accuracy table built in the comparison cell\n",
    "comparison_columns = {\n",
    "    'Model': models,\n",
    "    'Train_Accuracy': train_scores,\n",
    "    'Test_Accuracy': test_scores,\n",
    "}\n",
    "comparison_df = pd.DataFrame(comparison_columns)\n",
    "comparison_df.to_csv('../data/demo/model_comparison.csv', index=False)\n",
    "\n",
    "print(\"✅ Model comparison saved\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 10. Conclusion\n",
    "\n",
    "This notebook demonstrated:\n",
    "- Loading and preprocessing resume data\n",
    "- Computing multi-component relevance scores\n",
    "- Training and comparing ML models\n",
    "- Analyzing feature importance\n",
    "- Identifying top candidates\n",
    "\n",
    "### Next Steps:\n",
    "1. **Fine-tune weights**: Adjust scoring weights based on your priorities\n",
    "2. **Use larger models**: Upgrade to better embedding models for improved accuracy\n",
    "3. **Add more features**: Include education level, certifications, etc.\n",
    "4. **Deploy API**: Use the FastAPI wrapper for programmatic access\n",
    "5. **Integrate with ATS**: Connect to your applicant tracking system"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}