# Resume Extraction & Embedding Pipeline

## How to Use:

### Option 1: Quick Run (Recommended)
1. Run **Cell 1** - Install packages
2. Run **Cell 2** - Setup and configure APIs
3. Run **Cell 11 (Last Cell)** - This will execute the entire pipeline

### Option 2: Step by Step
Run cells in order (1-11) to see detailed output at each step.

## What the Pipeline Does:
1. ✓ Uploads your resume PDF
2. ✓ Parses PDF and extracts text
3. ✓ Generates metadata (file hash, timestamps)
4. ✓ Sends to Gemini API for structured parsing
5. ✓ Builds embedding text (skills, experience, projects)
6. ✓ Generates vector embeddings using BAAI/bge-base-en-v1.5
7. ✓ Exports results to JSON file

## Output:
- Extracted resume data (JSON)
- Vector embeddings
- File saved to: `./resumes/{file_hash}.json`

In [None]:
!pip install pypdf google-generativeai python-dotenv requests -q
!pip install -U sentence-transformers
print("Packages installed successfully")

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/328.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m328.2/328.2 kB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
Collecting sentence-transformers
  Downloading sentence_transformers-5.2.0-py3-none-any.whl.metadata (16 kB)
Downloading sentence_transformers-5.2.0-py3-none-any.whl (493 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
  Attempting uninstall: sentence-transformers
    Found existing installation: sentence-transformers 5.1.2
    Uninstalling sentence-transformers-5.1.2:
      Successfully uninstalled sentence-transformers-5.1.2
Successfully installed sentence-transformers-5.2.0
Packages installed successfully


In [None]:
from typing import List, Dict, Any

In [None]:
import json
import os
from datetime import datetime
from pathlib import Path
from typing import Optional
from hashlib import sha256
import io
from pypdf import PdfReader
import requests
import google.generativeai as genai
from dotenv import load_dotenv
import re
from google.colab import userdata
from sentence_transformers import SentenceTransformer

# Load environment variables
load_dotenv()

# Configure APIs
GEMINI_API_KEY1 = userdata.get('GEMINI_API_KEY_COLLEGE_ID')
HF_TOKEN =  userdata.get('HF_TOKEN')


if GEMINI_API_KEY1:
    genai.configure(api_key=GEMINI_API_KEY)
    print("✓ Gemini API configured")
else:
    print("⚠ GEMINI_API_KEY not found in environment")

if HF_TOKEN:
    print("✓ Hugging Face token found")
else:
    print("⚠ HF_TOKEN not found in environment")

# Global variables for storing results
resume_text = None
file_name = None
metadata = None
parsed_resume = None
embedding_text = None
embedding = None

✓ Gemini API configured
✓ Hugging Face token found


In [None]:
def upload_resume():
    """Upload multiple resume files from user."""
    from google.colab import files
    import time

    print("Please upload one or more resume PDF files...")
    uploaded = files.upload()

    # Give the system time to write all files
    time.sleep(2)

    resume_file_paths = []

    # Iterate through all uploaded files
    for file_name in uploaded.keys():
        # Handle Colab's file renaming (adds "(1)", "(2)", etc.)
        actual_file_name = file_name

        # Try the original name first
        if os.path.exists(actual_file_name):
            file_size = os.path.getsize(actual_file_name)
            resume_file_paths.append((actual_file_name, file_name))
            print(f"✓ Resume file found: {file_name}")
            print(f"  - File size: {file_size} bytes")
        else:
            print(f"⚠ File not found: {actual_file_name}")

    if not resume_file_paths:
        raise Exception("No files uploaded successfully")

    print(f"\n✓ Total files uploaded: {len(resume_file_paths)}")
    return resume_file_paths


In [None]:
def parse_pdf(resume_file_path):
    """Parse PDF and extract text."""
    print("\n" + "="*60)
    print("STEP 1: PARSE PDF")
    print("="*60 + "\n")

    try:
        with open(resume_file_path, 'rb') as pdf_file:
            reader = PdfReader(pdf_file)
            print(f"Total pages: {len(reader.pages)}")

            resume_text = ""
            for page_num, page in enumerate(reader.pages):
                page_text = page.extract_text()
                resume_text += page_text
                print(f"  - Page {page_num + 1}: {len(page_text)} characters")

        print(f"\n✓ Successfully extracted {len(resume_text)} characters")
        print(f"\nPreview (first 300 characters):")
        print("-" * 40)
        print(resume_text[:300])
        print("-" * 40)

        return resume_text

    except Exception as e:
        print(f"✗ Error parsing PDF: {e}")
        raise

In [None]:
def extract_metadata(resume_text, file_name):
    """Extract and generate metadata from resume."""
    print("\n" + "="*60)
    print("STEP 2: EXTRACT METADATA")
    print("="*60 + "\n")

    def generate_hash_value(text: str) -> str:
        """Generate SHA256 hash of normalized resume text."""
        normalized = text.strip().lower()
        normalized = normalized.replace('\r\n', '\n')
        normalized = re.sub(r'\b(page|pages)\s+\d+\b', '', normalized)
        normalized = re.sub(r'[^\w\s]', ' ', normalized)
        normalized = re.sub(r'\s+', ' ', normalized).strip()
        return sha256(normalized.encode()).hexdigest()

    file_hash = generate_hash_value(resume_text)

    metadata = {
        "fileName": file_name,
        "fileHash": file_hash,
        "parsedAt": datetime.utcnow().isoformat() + "Z",
        "parserVersion": "1.0.0",
        "Language": "en"
    }

    print(f"Metadata extracted:")
    print(json.dumps(metadata, indent=2))

    return metadata

In [None]:
def extract_resume_with_llm(resume_text):
    """Send resume text to LLM for parsing."""
    print("\n" + "="*60)
    print("STEP 3: LLM EXTRACTION - PARSING RESUME")
    print("="*60 + "\n")

    resume_parsing_prompt = """Return only valid JSON that exactly matches the ParsedResume schema described below. Do NOT add or remove fields, explanations, comments, code fences, markdown, or any text outside the JSON. The output must be a pure JSON object that can be parsed by JSON.parse() with no trailing characters. If a value cannot be determined with reasonable confidence, use null. Dates must follow ISO 8601 (full datetime with Z when possible, or "YYYY-MM-DD", "YYYY-MM", or "YYYY"). For fuzzy dates, use the FlexibleDate structure. Trim all strings and deduplicate lists. Do not fabricate information.

The response MUST be a raw JSON object.
Do NOT use markdown.
Do NOT use triple backticks.
Do NOT wrap the output in ```json or ```.
If any non-JSON character is produced, the output is invalid.

TOP-LEVEL RULES

Always return a JSON object containing exactly the keys defined in the schema.

id: required string; you may generate a deterministic ID such as "resume_main" if nothing else is known.

If a sub-object is optional and no data exists, use null or empty arrays as required by the schema.

Avoid guessing; when uncertain, use null.



No metadata is required or allowed in output.

SCHEMA (NO METADATA FIELD)

The top-level object must contain exactly these keys:

{
  "id": string,
  "analysis": ResumeAnalysis,
  "verification": VerificationFlags,
  "basics": Basics,
  "skills": SkillProfile[],
  "workExperience": WorkExperience[],
  "education": Education[],
  "projects": Project[],
  "certifications": Certification[],
  "languages": Language[]
}

1. analysis (ResumeAnalysis)
{
  "quality": {
    "score": number,                   // 0–100
    "level": "low"|"average"|"high"|"exceptional",
    "hints": ["string"]
  },
  "suspicion": {
    "score": number,                   // 0–100
    "level": "safe"|"concern"|"suspicious"|"high_risk",
    "flags": [
      {
        "type": "string",
        "severity": "low"|"medium"|"critical",
        "description": "string"
      }
    ]
  },
  "writingStyle": {
    "actionVerbsRate": number,         // 0.0–1.0
    "quantificationRate": number,      // 0.0–1.0
    "clicheCount": number
  }
}


Quality level mapping:
0–40 low, >40–70 average, >70–90 high, >90 exceptional.

2. verification (VerificationFlags)
{
  "timeline": {
    "hasGaps": boolean,
    "gaps": [
      {
        "startDate": IsoDate,
        "endDate": IsoDate,
        "durationDays": number
      }
    ]
  },
  "identity": {
    "geoConsistency": "match"|"mismatch"|"unknown",
    "socialFootprintFound": boolean
  }
}


Detect gaps between jobs/education > 60 days.

Social footprint refers to existence of LinkedIn/GitHub/etc. signals in the résumé.

3. basics (Basics)
{
  "name": Traceable<string>,
  "email": Traceable<string>[],
  "phone": Traceable<string>[],
  "location": Location,
  "urls": [
    {
      "type": "linkedin"|"github"|"portfolio"|"personal",
      "url": "string"
    }
  ],
  "summary": "string|null"
}

Traceable<T>
{
  "value": T,
  "rawText": string,
  "confidence": number,     // 0–1
  "pageIndex": number|null
}

Location
{
  "rawInput": string,
  "city": string|null,
  "state": string|null,
  "country": string|null,
  "zipCode": string|null,
  "countryCode": string|null
}


Rules:

Emails must be lowercase, valid format only.

Phones must be normalized to E.164 when possible; digits-only otherwise.

Summary ≤ 800 chars.

4. skills: SkillProfile[]
{
  "name": string,
  "normalizedName": string,
  "category": string,
  "computedLevel": "novice"|"intermediate"|"advanced"|"expert",
  "validityScore": number, // 0–10
  "metadata": {
    "firstSeen": IsoDate,
    "lastUsed": IsoDate,
    "totalMonthsExperience": number,
    "occurrenceCount": number,
    "sources": [
      {
        "sectionId": string,
        "sectionType": "experience"|"education"|"project"
      }
    ]
  }
}


Canonicalize skill names.

Deduplicate strongly.

Max 200 skills.

5. workExperience: WorkExperience[]
{
  "id": string,
  "title": Traceable<string>,
  "normalizedTitle": string|null,
  "company": Traceable<string>,
  "companyDomain": string|null,
  "location": Location|null,
  "type": "full-time"|"contract"|"internship"|null,
  "startDate": FlexibleDate,
  "endDate": FlexibleDate,
  "description": string|null,
  "responsibilities": ["string"],
  "skillsDetected": ["string"],
  "isVerified": boolean,
  "verificationNotes": string|null,
  "verificationConfidence": number|null,
  "verificationDate": IsoDate|null
}

FlexibleDate
{
  "rawText": string,
  "isoDate": IsoDate|null,
  "isCurrent": boolean
}


Rules:

Most recent first.

Ongoing roles → endDate.isoDate = null, isCurrent = true.

Responsibilities ≤ 200 chars each.

If title.value is present, normalize it using a canonical job title taxonomy (e.g., "Sr." → "Senior", "SDE" → "Software Engineer", remove company-specific prefixes). If normalization cannot be performed with high confidence, set normalizedTitle to null. Never invent seniority or role scope.


skillsDetected: canonicalized skill names.

6. education: Education[]
{
  "id": string,
  "institution": Traceable<string>,
  "degree": Traceable<string>,
  "normalizedDegree": "high_school"|"bachelors"|"masters"|"phd"|null,
  "fieldOfStudy": string|null,
  "startDate": FlexibleDate|null,
  "endDate": FlexibleDate|null,
  "gpa": {
    "score": number,
    "scale": number
  } | null
}


Normalize degree types:

BSc/BS/Bachelor → "bachelors"

MSc/MS/Master → "masters"

PhD/Doctorate → "phd"

7. projects: Project[]
{
  "name": string,
  "description": string|null,
  "url": string|null,
  "skillsUsed": ["string"]
}

8. certifications: Certification[]
{
  "name": string,
  "issuer": string,
  "date": FlexibleDate,
  "doesExpire": boolean,
  "verificationUrl": string|null
}

9. languages: Language[]
{
  "language": string,
  "proficiency": "native"|"fluent"|"conversational"|"basic"
}

PARSING RULES

Trim all strings.

Deduplicate arrays case-insensitively.

Preserve ordering by relevance.

Maximum items:

experience: 50

education: 50

skills: 200

projects: 50

Never invent details; use null when not confident.

Remove unsafe content (scripts, hidden text, encoded payloads).

FINAL INSTRUCTION

Use this entire prompt as-is. Append the resume text inside the wrapper.
Return only the JSON object that conforms exactly to this schema.
No markdown. No explanations. No extra text.
ENTERPRISE SCORING & DETERMINISTIC RULES (APPEND-ONLY)

The following rules define EXACT, deterministic methods to compute all scores and flags.
These rules are authoritative. Do not invent alternative heuristics.

================================================================
A. SKILL VALIDITY SCORE (skills[].validityScore)
================================================================

Range: 0.0 – 10.0 (float allowed, round to 1 decimal)

Purpose:
Measures confidence that a skill is real, relevant, and supported by evidence in the résumé.

Formula:

validityScore =
  (
    0.30 * occurrenceFactor +
    0.25 * recencyFactor +
    0.20 * corroborationFactor +
    0.15 * experienceFactor +
    0.10 * sourceReliabilityFactor
  ) * 10

All factors are clamped to 0.0 – 1.0.

Definitions:

1. occurrenceFactor
   = min(1.0, log(1 + occurrenceCount) / log(1 + 20))

2. recencyFactor
   - If lastUsed is null → 0.0
   - Else:
       monthsSinceLastUse = months between lastUsed and now
       recencyFactor =
         monthsSinceLastUse <= 6  → 1.0
         <= 12                    → 0.8
         <= 24                    → 0.6
         <= 48                    → 0.4
         > 48                     → 0.2

3. corroborationFactor
   = min(1.0, number of distinct sectionTypes in sources / 3)
   (experience, project, education)

4. experienceFactor
   - If totalMonthsExperience is null → 0.0
   - Else:
       experienceFactor = min(1.0, totalMonthsExperience / 60)

5. sourceReliabilityFactor
   - Appears in workExperience → 1.0
   - Appears only in projects → 0.7
   - Appears only in education → 0.6
   - Appears only in summary/skills list → 0.4

ComputedLevel Mapping (skills[].computedLevel):
- validityScore < 3.0        → novice
- 3.0 – 5.9                  → intermediate
- 6.0 – 8.4                  → advanced
- ≥ 8.5                      → expert

================================================================
B. RESUME QUALITY SCORE (analysis.quality)
================================================================

Range: 0 – 100 (integer)

quality.score =
  0.30 * structureScore +
  0.30 * contentDepthScore +
  0.20 * clarityScore +
  0.20 * consistencyScore

Each component normalized to 0–100.

1. structureScore
   - Presence of basics, experience, skills, education
   - +25 per major section present (max 100)

2. contentDepthScore
   - Average responsibilities per role ≥ 3 → 100
   - ≥ 2 → 75
   - ≥ 1 → 50
   - else → 25

3. clarityScore
   - Based on writingStyle:
     clarityScore =
       (actionVerbsRate * 50) +
       (quantificationRate * 40) -
       (min(clicheCount, 10) * 2)

   Clamp 0–100.

4. consistencyScore
   - No overlapping dates → 100
   - Minor overlaps or fuzzy dates → 70
   - Multiple conflicts → 40

quality.level mapping:
0–40 low
>40–70 average
>70–90 high
>90 exceptional

================================================================
C. SUSPICION SCORE (analysis.suspicion)
================================================================

Range: 0 – 100

Start at 0, add penalties:

+20  unexplained timeline gap > 12 months
+15  multiple overlapping full-time roles
+15  excessive buzzwords without evidence
+10  skills listed but never used
+10  inconsistent locations across roles
+30  fabricated-looking company names or dates

Clamp to 100.

suspicion.level:
0–20     → safe
21–40    → concern
41–70    → suspicious
>70      → high_risk

================================================================
D. WRITING STYLE METRICS (analysis.writingStyle)
================================================================

actionVerbsRate
= (# bullet points starting with action verb) / (total bullet points)

quantificationRate
= (# bullet points containing numbers, %, $, metrics) / (total bullet points)

clicheCount
= count of overused phrases (e.g., "hard-working", "team player", "go-getter")

================================================================
E. WORK EXPERIENCE VERIFICATION RULES
================================================================

isVerified = true ONLY IF:
- Company domain exists AND
- Role dates are consistent AND
- Skill usage aligns with role title

Else isVerified = false.

verificationConfidence (0.0 – 1.0):
- 1.0 → all signals match
- 0.7 → partial corroboration
- 0.4 → weak evidence
- null → no verification attempted

================================================================
F. TIMELINE GAP DETECTION
================================================================

A gap exists if:
(endDate of previous role) → (startDate of next role) > 60 days

durationDays must be exact.

================================================================
G. TRACEABLE CONFIDENCE RULES
================================================================

Traceable.confidence:
- Exact match (email, phone, URL) → 1.0
- Minor normalization → 0.9
- Heuristic extraction → 0.7
- Inferred / ambiguous → 0.4

================================================================
H. GLOBAL SAFETY & DETERMINISM
================================================================

- Never infer skills, companies, or dates not explicitly present.
- Scores must be explainable using the rules above.
- If required inputs are missing, degrade score deterministically.
- No randomness. Same input must produce same output.

END OF ENTERPRISE EXTENSIONS

"""

    resume_parsing_prompt = resume_parsing_prompt + "\n===START===\n" + resume_text + "\n===END==="

    print("Sending resume text to Gemini API for parsing...")
    print(f"Prompt size: {len(resume_parsing_prompt)} characters\n")

    try:
        print("hello world")
        model = genai.GenerativeModel("gemini-flash-latest")
        response = model.generate_content(resume_parsing_prompt)

        print(f"✓ Received response from Gemini API")
        print(f"Response size: {len(response.text)} characters\n")

        parsed_resume = json.loads(response.text)

        print("✓ Successfully parsed JSON response")
        print(f"\nResume ID: {parsed_resume.get('id')}")
        print(f"Quality Score: {parsed_resume.get('analysis', {}).get('quality', {}).get('score')}")
        print(f"Quality Level: {parsed_resume.get('analysis', {}).get('quality', {}).get('level')}")

        return parsed_resume

    except json.JSONDecodeError as e:
        print(f"✗ Failed to parse LLM response as JSON: {e}")
        print(f"Response preview: {response.text[:500]}")
        raise
    except Exception as e:
        print(f"✗ Error during LLM extraction: {e}")
        raise

In [None]:
def add_metadata_to_resume(parsed_resume, metadata):
    """Add metadata to parsed resume."""
    parsed_resume['metaData'] = metadata

    print("✓ Metadata added to parsed resume")
    print(f"\nMetadata in resume:")
    print(json.dumps(parsed_resume['metaData'], indent=2))

    return parsed_resume

In [None]:
def build_embedding_text(parsed_resume):
    """Build embedding text from parsed resume."""
    print("\n" + "="*60)
    print("STEP 4: BUILD EMBEDDING TEXT")
    print("="*60 + "\n")

    skills = parsed_resume.get('skills', [])
    top_skills = sorted(
        skills,
        key=lambda s: s.get('validityScore', 0),
        reverse=True
    )[:20]

    skills_text = ", ".join([s['name'] for s in top_skills])
    print(f"Top 20 skills extracted: {len(top_skills)} skills")
    print(f"Skills: {skills_text}\n")

    experiences = parsed_resume.get('workExperience', [])[:3]
    experiences_text = ""
    for i, exp in enumerate(experiences, 1):
        title = exp.get('normalizedTitle') or exp.get('title', {}).get('value', 'Unknown')
        description = exp.get('description') or '; '.join(exp.get('responsibilities', [])[:2])
        line = f"{title},{description}"
        experiences_text += line + "\n"
        print(f"{i}. {title}")
        print(f"   {description[:80]}...\n")

    experiences_text = experiences_text.strip()

    projects = parsed_resume.get('projects', [])[:3]
    projects_text = ""
    for i, proj in enumerate(projects, 1):
        line = f"{proj.get('name')}: {proj.get('description', 'No description')}"
        projects_text += line + "\n"
        print(f"Project {i}: {proj.get('name')}")
        print(f"  {(proj.get('description') or 'No description')[:80]}...\n")

    projects_text = projects_text.strip()

    education = parsed_resume.get('education', [])
    education_text = ""
    for i, edu in enumerate(education, 1):
        degree = edu.get('normalizedDegree', 'Degree').replace('_', ' ').title()
        field = edu.get('fieldOfStudy', 'Unspecified')
        line = f"{degree} in {field}"
        education_text += line + "\n"
        print(f"Education {i}: {line}\n")

    education_text = education_text.strip()

    embedding_text = f"""Experience:
{experiences_text}

Projects:
{projects_text}

Skills:
{skills_text}

Education:
{education_text}""".strip()

    print("\n" + "="*60)
    print("EMBEDDING TEXT BUILT")
    print("="*60)
    print(f"\nTotal characters: {len(embedding_text)}")
    print(f"\nPreview:")
    print("-" * 60)
    print(embedding_text)
    print("-" * 60)

    return embedding_text

In [None]:
def generate_embeddings(embedding_text):
    """Generate vector embeddings."""
    print("\n" + "="*60)
    print("STEP 5: GENERATE VECTOR EMBEDDINGS")
    print("="*60 + "\n")

    print(f"Generating embeddings for {len(embedding_text)} characters...\n")

    try:
        model = SentenceTransformer("BAAI/bge-base-en-v1.5")

        embedding = model.encode(
            embedding_text,
            normalize_embeddings=True
        )

        print("✓ Embeddings generated successfully")
        print(f"  - Embedding dimensions: {len(embedding)}")
        print(f"  - Number of vectors: 1")
        print("\nFirst 10 embedding values:")
        print(embedding[:10])

        return embedding
    except Exception as e:
        print(f"✗ Error generating embeddings: {e}")
        raise

In [None]:
def display_and_export(parsed_resume, metadata, embedding_text, embedding):
    """Display results and export to JSON file."""
    print("\n" + "="*60)
    print("EXTRACTED RESUME DATA (JSON)")
    print("="*60 + "\n")

    print(json.dumps(parsed_resume, indent=2))

    print("\n" + "="*60)
    print("EXPORTING RESULTS")
    print("="*60 + "\n")

    EXPORT_DIR = "./resumes"
    os.makedirs(EXPORT_DIR, exist_ok=True)

    file_hash = metadata["fileHash"]
    output_file = os.path.join(EXPORT_DIR, f"{file_hash}.json")

    if os.path.exists(output_file):
        print(f"✗ Duplicate detected: resume with hash {file_hash} already exists")
        print(f"  - Path: {output_file}")
    else:
        output_data = {
            "metadata": metadata,
            "extractedResume": parsed_resume,
            "embeddingText": embedding_text,
            "embeddingInfo": {
                "model": "BAAI/bge-base-en-v1.5",
                "dimensions": len(embedding),
                "vectorCount": 1,
                "embedding": embedding.tolist()
            }
        }

        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(output_data, f, indent=2)

        print(f"✓ Resume data exported successfully")
        print(f"  - Path: {output_file}")
        print(f"  - File size: {os.path.getsize(output_file)} bytes")
        print("  - Contains: metadata, extracted resume, embedding text, embedding info")

    return output_file

In [None]:
async def process_resume_pipeline():
    """Main pipeline - run this cell to process multiple resumes."""
    try:
        # Step 0: Upload resumes
        print("\n" + "="*60)
        print("STEP 0: UPLOAD RESUMES")
        print("="*60 + "\n")
        resume_file_paths = upload_resume()

        total_files = len(resume_file_paths)
        successful_files = 0
        failed_files = 0
        processed_files = []

        # Process each resume
        for file_index, (resume_file_path, file_name) in enumerate(resume_file_paths, 1):
            print("\n" + "█"*60)
            print(f"PROCESSING FILE {file_index}/{total_files}: {file_name}")
            print("█"*60 + "\n")

            try:
                # Step 1: Parse PDF
                resume_text = parse_pdf(resume_file_path)

                # Step 2: Extract metadata
                metadata = extract_metadata(resume_text, file_name)

                # Step 3: LLM extraction
                parsed_resume = extract_resume_with_llm(resume_text)

                # Step 4: Add metadata
                parsed_resume = add_metadata_to_resume(parsed_resume, metadata)

                # Step 5: Build embedding text
                embedding_text = build_embedding_text(parsed_resume)

                # Step 6: Generate embeddings
                embedding = generate_embeddings(embedding_text)

                # Step 7: Display and export
                output_file = display_and_export(parsed_resume, metadata, embedding_text, embedding)

                processed_files.append({
                    "fileName": file_name,
                    "fileHash": metadata["fileHash"],
                    "outputFile": output_file,
                    "qualityScore": parsed_resume.get('analysis', {}).get('quality', {}).get('score'),
                    "status": "✓ SUCCESS"
                })
                successful_files += 1

                print(f"\n✓ File {file_index}/{total_files} processed successfully")

            except Exception as e:
                print(f"\n✗ Error processing {file_name}: {e}")
                processed_files.append({
                    "fileName": file_name,
                    "status": f"✗ FAILED: {str(e)[:100]}"
                })
                failed_files += 1

        # Final summary
        print("\n" + "="*60)
        print("BATCH PROCESSING COMPLETE")
        print("="*60)
        print(f"\nTotal files: {total_files}")
        print(f"✓ Successful: {successful_files}")
        print(f"✗ Failed: {failed_files}")

        print("\n" + "-"*60)
        print("PROCESSING SUMMARY:")
        print("-"*60)
        for i, result in enumerate(processed_files, 1):
            print(f"\n{i}. {result['fileName']}")
            print(f"   Status: {result['status']}")
            if result['status'].startswith('✓'):
                print(f"   Hash: {result['fileHash']}")
                print(f"   Quality Score: {result['qualityScore']}/100")
                print(f"   Output: {result['outputFile']}")

        print("\n" + "="*60)
        print("✓ PIPELINE COMPLETED SUCCESSFULLY")
        print("="*60)

    except Exception as e:
        print(f"\n✗ Pipeline failed: {e}")
        raise

# RUN THIS CELL TO START THE ENTIRE PIPELINE
import asyncio
await process_resume_pipeline()


STEP 0: UPLOAD RESUMES

Please upload one or more resume PDF files...


# Resume Filtering Part

In [17]:
from datetime import date, datetime
import os
import json

In [18]:
RESUME_DIR = "./resumes/parsed"

def load_all_resumes(resume_dir: str) -> List[Dict[str,Any]]:
  resumes = []
  for file in os.listdir(RESUME_DIR):
    if file.endswith(".json"):
      with open(os.path.join(resume_dir,file),"r",encoding="utf-8") as f:
        resumes.append(json.load(f))
  return resumes
print(os.listdir(RESUME_DIR))
allResume = load_all_resumes(RESUME_DIR)
print(allResume)

['326e891505900873d99554bcfc54a599dd81b8a65d0cfbe24398cc1bfe52d499.json', '5de19d5713bffdb8eea75ba42eb9f7f1fb77109bbacbfd952da1a2a1495ed6a9.json', '8647c7180442e944180d4681273c247b1f2876474de361e2cdfb88f1111f64fc.json', 'a172fdd98d8dc172f165e67742a88b6da6737da06a6f9286dba1380305d49bb8.json', 'a1a490916a30e5e00123c8229538cef99a8047b6682dc4ce0d0b25f3c4449b37.json']
[{'metadata': {'fileName': 'SHIVA PRASAD CHAPAGAIN-1 (1).pdf', 'fileHash': '326e891505900873d99554bcfc54a599dd81b8a65d0cfbe24398cc1bfe52d499', 'parsedAt': '2025-12-15T21:29:10.540292Z', 'parserVersion': '1.0.0', 'Language': 'en'}, 'extractedResume': {'id': 'resume_main', 'analysis': {'quality': {'score': 63.9, 'level': 'average', 'hints': ['Low quantification in responsibilities', 'Few action verbs in responsibilities']}, 'suspicion': {'score': 10, 'level': 'safe', 'flags': [{'type': 'skills_without_evidence', 'severity': 'low', 'description': 'Several skills listed in dedicated section or summary do not appear to be utilized 

In [9]:
def passes_quality_gate(resume:Dict[str,Any],min_quality:int=60,max_suspicion:int=40):
  quality_score = resume.get("extractedResume",{}).get("analysis",{}).get("quality",{}).get("score",0)
  suspicion_score = resume.get("extractedResume",{}).get("analysis",{}).get("suspicion",{}).get("score",0)
  return quality_score >= min_quality and suspicion_score <= max_suspicion

In [3]:
from typing import List, Dict, Any

In [None]:
def getTotalMonths(startDate:Dict[str,Any],endDate:Dict[str,Any]):
  if not startDate or not endDate or not startDate.get("isoDate"):
    return 0
  try:
    startDate = datetime.fromisoformat(startDate.get("isoDate")).date()
  except:
    return 0
  print(startDate,endDate)
  if endDate and endDate.get("isCurrent") is True:
    endDate = date.today()
  elif endDate and endDate.get("isoDate"):
    try:
      endDate = datetime.fromisoformat(endDate.get("isoDate")).date()
    except:
      endDate = date.today()
  else:
    endDate = date.today()
  year_diff = endDate.year - startDate.year
  month_diff = endDate.month - startDate.month
  total_months = year_diff * 12 + month_diff
  return max(total_months,0)
def industry_experience_gate(resume:Dict[str,Any],min_industry_experience:int=3):
  total_months = 0
  workExperiences = resume.get("extractedResume",{}).get("workExperience",{})
  for workExperience in workExperiences:
    total_months += getTotalMonths(workExperience.get("startDate"),workExperience.get("endDate"))
  print(total_months)
  return total_months >= min_industry_experience*12

industry_experience_gate(allResume[0])

0


False

In [None]:
resumes = load_all_resumes(RESUME_DIR)
print(resumes)

NameError: name 'load_all_resumes' is not defined