In [2]:
# Imports
import os
import json
from typing import Dict, Any
from getpass import getpass

# LangChain + Google GenAI
from langchain_google_genai import ChatGoogleGenerativeAI

# Document processing
import fitz  # PyMuPDF
from docx import Document

# ML models
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Embedder model from huggingface
from sentence_transformers import SentenceTransformer




In [3]:
# Detect whether the notebook is running inside Google Colab.
# `files` (used later for the upload widget) is only bound when this import succeeds,
# so every later use of `files` must be guarded by IN_COLAB.
try:
    from google.colab import files
    IN_COLAB = True
except Exception:
    # Any failure to import (for example the package is not installed) is treated
    # as "not running in Colab".
    IN_COLAB = False

In [15]:
# Global config
# Model name passed to every ChatGoogleGenerativeAI instance created in this notebook.
GEMINI_MODEL = "gemini-2.5-flash"
# Ask for the key interactively; getpass does not echo what is typed, so the key
# does not end up in the saved cell output.
GOOGLE_API_KEY = getpass('ENTER GEMINI API KEY: ')

In [5]:
# Embedder
# Name of the Sentence-Transformers model used for resume/JD similarity.
EMBED_MODEL_NAME = "all-MiniLM-L6-v2"
# Loaded once at module level; compute_resume_jd_similarity and the debugger
# checks read this global rather than loading the model themselves.
embedder = SentenceTransformer(EMBED_MODEL_NAME)

In [6]:
def scrape_pdf_with_gemini(file_path: str, api_key: str) -> str:
    """Extract the text of a PDF resume and have Gemini clean and structure it.

    The raw text is read page by page with PyMuPDF and then sent to the
    Gemini model named by ``GEMINI_MODEL`` together with formatting instructions.

    Args:
        file_path: Path of the PDF file to read.
        api_key: Google API key; it is written to the ``GOOGLE_API_KEY``
            environment variable as a side effect.

    Returns:
        The ``content`` of the model's reply (the cleaned resume text).

    Raises:
        Whatever ``fitz.open``, ``page.get_text`` or the LLM call raise; the
        document handle is closed in every case.
    """
    # The client below is constructed without an explicit key, so it relies on
    # this environment variable being set.
    os.environ['GOOGLE_API_KEY'] = api_key

    # First extract text using PyMuPDF. try/finally guarantees the file handle
    # is released even when a page fails to parse.
    doc = fitz.open(file_path)
    try:
        extracted_text = "".join(page.get_text() for page in doc)
    finally:
        doc.close()

    # Then use Gemini to clean and structure the text
    llm = ChatGoogleGenerativeAI(model=GEMINI_MODEL, temperature=0.0)

    prompt = f"""
    The following is raw text extracted from a PDF resume/CV document.
    Please clean, organize, and structure this text properly.
    Preserve all information but make it well-formatted and readable.
    Include all sections like contact info, summary, experience, education, skills, etc.

    Raw text:
    {extracted_text}
    """

    response = llm.invoke(prompt)
    return response.content

def scrape_docx_with_gemini(file_path: str, api_key: str) -> str:
    """Read a DOCX resume with python-docx and ask Gemini to tidy the text.

    Args:
        file_path: Path of the .docx file to read.
        api_key: Google API key; stored in the ``GOOGLE_API_KEY`` environment
            variable as a side effect.

    Returns:
        The ``content`` of the model's reply.
    """
    # Make the key available to the Gemini client created below.
    os.environ['GOOGLE_API_KEY'] = api_key

    # Collect every paragraph that contains something other than whitespace.
    kept_paragraphs = []
    for para in Document(file_path).paragraphs:
        if para.text.strip():
            kept_paragraphs.append(para.text)
    raw_text = "\n\n".join(kept_paragraphs)

    # Hand the raw text to Gemini with formatting instructions.
    gemini = ChatGoogleGenerativeAI(model=GEMINI_MODEL, temperature=0.0)

    cleanup_prompt = f"""
    The following is raw text extracted from a DOCX resume/CV document.
    Please clean, organize, and structure this text properly.
    Preserve all information but make it well-formatted and readable.
    Include all sections like contact info, summary, experience, education, skills, etc.

    Raw text:
    {raw_text}
    """

    reply = gemini.invoke(cleanup_prompt)
    return reply.content

In [19]:
# TODO: REPLACE WITH THE ACTUAL TRAINED MODELS

class ResumeModels:
    """Container for the role and acceptance models used by the pipeline.

    Both pipelines are currently placeholders: TF-IDF features feeding a
    scikit-learn ``DummyClassifier`` fitted on three hard-coded sample resumes.
    Replace ``_init_role_model`` / ``_init_acceptance_model`` with code that
    loads the real trained models.

    Attributes:
        role_pipeline: Pipeline whose ``predict`` returns a role label.
        acceptance_pipeline: Pipeline whose ``predict_proba`` returns class
            probabilities for the accept/reject decision.
    """

    # Label in the acceptance training data that means "accepted".
    _POSITIVE_LABEL = 1

    # Toy resumes shared by both placeholder pipelines.
    _SAMPLE_RESUMES = [
        "Experienced backend engineer with python, django, rest apis",
        "Junior frontend developer skilled in react, javascript, css",
        "Data scientist with experience in sklearn, pandas, ml models",
    ]

    def __init__(self):
        # Initialize placeholder models
        self.role_pipeline = self._init_role_model()
        self.acceptance_pipeline = self._init_acceptance_model()

    def _init_role_model(self):
        """Initialize role prediction model (replace with your trained model)."""
        role_vectorizer = TfidfVectorizer(max_features=2000)
        role_clf = DummyClassifier(strategy="most_frequent")
        pipeline = Pipeline([('tfidf', role_vectorizer), ('clf', role_clf)])

        # Train on sample data (replace with your real model loading)
        y_sample = ["Backend Engineer", "Frontend Engineer", "Data Scientist"]
        pipeline.fit(self._SAMPLE_RESUMES, y_sample)

        return pipeline

    def _init_acceptance_model(self):
        """Initialize acceptance prediction model (replace with your trained model)."""
        accept_vectorizer = TfidfVectorizer(max_features=2000)
        # NOTE: the "stratified" strategy draws its output at random and no
        # random_state is set, so the placeholder's result varies between calls.
        accept_clf = DummyClassifier(strategy="stratified")
        pipeline = Pipeline([('tfidf', accept_vectorizer), ('clf', accept_clf)])

        # Train on sample data (replace with your real model loading)
        y_sample = [1, 0, 1]
        pipeline.fit(self._SAMPLE_RESUMES, y_sample)

        return pipeline

    def predict_role(self, resume_text: str) -> str:
        """Predict the job role for a resume.

        Args:
            resume_text: Plain text of the resume.

        Returns:
            The label predicted by ``role_pipeline``; if the pipeline raises,
            a keyword-based guess ("Data Scientist", "Frontend Engineer" or
            "Backend Engineer").
        """
        try:
            pred = self.role_pipeline.predict([resume_text])[0]
            return str(pred)
        except Exception:
            # Best-effort fallback so the pipeline keeps running without a model.
            text_lower = resume_text.lower()
            if "data" in text_lower or "machine learning" in text_lower:
                return "Data Scientist"
            elif "react" in text_lower or "frontend" in text_lower:
                return "Frontend Engineer"
            else:
                return "Backend Engineer"

    def predict_acceptance(self, resume_text: str) -> float:
        """Predict the acceptance probability (0-1) for a resume.

        The probability is read from the column of ``predict_proba`` that
        belongs to the positive label (1), located through the pipeline's
        ``classes_``. This matters when the model saw only one class during
        training: a model fitted only on rejections yields 0.0 rather than the
        probability of the single (negative) class.

        Args:
            resume_text: Plain text of the resume.

        Returns:
            Probability of acceptance; if the pipeline raises, the fraction of
            a fixed keyword list that appears in the resume.
        """
        try:
            prob = self.acceptance_pipeline.predict_proba([resume_text])[0]

            classes = getattr(self.acceptance_pipeline, "classes_", None)
            if classes is not None:
                labels = list(classes)
                if self._POSITIVE_LABEL in labels:
                    return float(prob[labels.index(self._POSITIVE_LABEL)])
                if labels == [0]:
                    # Only the negative class was seen during training.
                    return 0.0

            # Unknown label scheme: keep the positional convention.
            if len(prob) == 2:
                return float(prob[1])
            return float(prob.max())
        except Exception:
            # Best-effort fallback so the pipeline keeps running without a model.
            keywords = ["experienced", "senior", "lead", "machine learning", "phd", "master"]
            text_lower = resume_text.lower()
            score = sum(1 for k in keywords if k in text_lower) / len(keywords)
            return float(score)

In [8]:
def _keyword_tokens(text: str) -> list:
    """Return the lower-cased whitespace tokens of ``text`` in order, trimmed of surrounding punctuation."""
    tokens = []
    for raw_token in text.lower().split():
        # Trim brackets/quotes in front and sentence punctuation behind, but keep
        # inner and meaningful characters such as "c++", "ci/cd" or ".net".
        token = raw_token.lstrip('([{"\'').rstrip('.,;:!?)]}"\'')
        # Drop tokens with no letter or digit (list bullets such as "-").
        if any(ch.isalnum() for ch in token):
            tokens.append(token)
    return tokens


def compute_resume_jd_similarity(resume_text: str, jd_text: str) -> Dict[str, Any]:
    """Compare a resume with a job description.

    Both texts are encoded with the module-level ``embedder`` and compared by
    cosine similarity. Separately, the job description's tokens that do not
    occur in the resume are listed as missing keywords.

    Args:
        resume_text: Plain text of the resume.
        jd_text: Plain text of the job description.

    Returns:
        A dict with:
        - ``'similarity_score'``: cosine similarity of the two embeddings (float).
        - ``'missing_keywords'``: up to 20 job-description tokens absent from
          the resume, de-duplicated and in the order they first appear in the
          job description (so the result is the same on every run).
    """
    embeddings = embedder.encode([resume_text, jd_text], show_progress_bar=False)
    resume_vec = embeddings[0]
    jd_vec = embeddings[1]

    # Cosine similarity; the small epsilon avoids division by zero for a zero vector.
    cosine_sim = np.dot(resume_vec, jd_vec) / (
        np.linalg.norm(resume_vec) * np.linalg.norm(jd_vec) + 1e-9
    )

    # Find missing keywords, walking the JD in order instead of slicing a set
    # (set iteration order for strings is not stable between runs).
    max_missing = 20
    resume_tokens = set(_keyword_tokens(resume_text))
    already_listed = set()
    missing_keywords = []
    for token in _keyword_tokens(jd_text):
        if token in resume_tokens or token in already_listed:
            continue
        already_listed.add(token)
        missing_keywords.append(token)
        if len(missing_keywords) == max_missing:
            break

    return {
        'similarity_score': float(cosine_sim),
        'missing_keywords': missing_keywords
    }

In [9]:
def _extract_json_object(text: str) -> Dict[str, Any]:
    """Parse the JSON object spanning the first '{' to the last '}' of ``text``; raise ValueError if none parses."""
    start = text.find('{')
    end = text.rfind('}')
    if start == -1 or end < start:
        raise ValueError("no JSON object found in response")
    # json.JSONDecodeError is a subclass of ValueError, so callers catch one type.
    return json.loads(text[start:end + 1])


def get_gemini_assessment(
    resume_text: str,
    jd_text: str,
    predicted_role: str,
    acceptance_prob: float,
    similarity_score: float,
    missing_keywords: list,
    api_key: str
) -> Dict[str, Any]:
    """
    Send all model outputs to Gemini for final scoring and revision recommendations.

    Args:
        resume_text: Resume text; only the first 3000 characters are put in the prompt.
        jd_text: Job description text (sent in full).
        predicted_role: Role label from the role model.
        acceptance_prob: Acceptance probability (0-1) from the acceptance model.
        similarity_score: Resume/JD embedding similarity.
        missing_keywords: JD keywords not found in the resume; the first 15 are sent.
        api_key: Google API key; stored in the ``GOOGLE_API_KEY`` environment
            variable as a side effect.

    Returns:
        The JSON object returned by the model, parsed into a dict (keys are
        requested in the prompt: ``score``, ``explanation``, ``strengths``,
        ``weaknesses``, ``revision_suggestions``, ``keyword_recommendations``,
        ``summary_rewrite``, ``ats_tips``). If no JSON object can be parsed from
        the reply, a fallback dict with ``score`` 0, an ``explanation`` and the
        unparsed ``raw_response``.
    """
    # Configure API key
    os.environ['GOOGLE_API_KEY'] = api_key

    llm = ChatGoogleGenerativeAI(model=GEMINI_MODEL, temperature=0.0)

    prompt = f"""
You are an expert resume/CV reviewer and career consultant.

I have analyzed a resume using ML models and need your expert assessment and recommendations.

**Resume Text:**
{resume_text[:3000]}

**Job Description:**
{jd_text}

**ML Model Analysis Results:**
- Predicted Role: {predicted_role}
- Acceptance Probability: {acceptance_prob:.2%}
- Resume-JD Similarity Score: {similarity_score:.3f} (0-1 scale, based on semantic embeddings)
- Missing Keywords from JD: {', '.join(missing_keywords[:15])}

**Your Task:**
Based on the resume, job description, and ML model outputs, provide:

1. **Overall Score (0-100)**: A numerical score indicating how well this resume matches the job
2. **Score Explanation**: 2-3 sentences explaining the score
3. **Strengths**: 2-3 key strengths of this resume for this role
4. **Weaknesses**: 2-3 key weaknesses or gaps
5. **Revision Suggestions**: 5-7 specific, actionable suggestions to improve the resume (be concrete)
6. **Keyword Recommendations**: Specific keywords/skills to add based on missing keywords
7. **Summary Rewrite**: A suggested 2-3 sentence professional summary optimized for this role
8. **ATS Optimization Tips**: 2-3 tips to improve ATS (Applicant Tracking System) compatibility

Return your response in valid JSON format with these exact keys:
{{
  "score": <number 0-100>,
  "explanation": "<string>",
  "strengths": ["<string>", "<string>", ...],
  "weaknesses": ["<string>", "<string>", ...],
  "revision_suggestions": ["<string>", "<string>", ...],
  "keyword_recommendations": ["<string>", "<string>", ...],
  "summary_rewrite": "<string>",
  "ats_tips": ["<string>", "<string>", ...]
}}
"""

    response = llm.invoke(prompt)

    raw_content = response.content
    # NOTE(review): content is expected to be a string; anything else is
    # stringified so that parsing falls back gracefully instead of crashing.
    response_text = raw_content if isinstance(raw_content, str) else str(raw_content)

    try:
        # Taking the outermost {...} span copes with markdown code fences and
        # with explanatory text before or after the JSON.
        return _extract_json_object(response_text)
    except ValueError:
        # Fallback if JSON parsing fails
        return {
            'score': 0,
            'explanation': 'Error parsing Gemini response',
            'raw_response': response.content
        }

In [10]:
def analyze_resume(
    resume_file_path: str,
    job_description: str,
    api_key: str
) -> Dict[str, Any]:
    """
    Run the whole resume analysis pipeline for one file.

    Stages, in order:
    1. Extract and clean the resume text with Gemini (PDF or DOCX).
    2. Predict role and acceptance probability with ResumeModels.
    3. Compute embedding similarity and missing keywords against the JD.
    4. Ask Gemini for the final assessment and recommendations.

    Args:
        resume_file_path: Path to a .pdf or .docx resume (extension is
            matched case-insensitively).
        job_description: Text of the job description to compare against.
        api_key: Google API key forwarded to the Gemini calls.

    Returns:
        A dict with 'resume_text', 'model_outputs' and 'gemini_assessment'.

    Raises:
        ValueError: If the file is neither PDF nor DOCX.
    """

    print("Step 1: Extracting text from resume using Gemini...")
    # Pick the scraper from the file extension.
    lowered_path = resume_file_path.lower()
    if lowered_path.endswith('.pdf'):
        scrape = scrape_pdf_with_gemini
    elif lowered_path.endswith('.docx'):
        scrape = scrape_docx_with_gemini
    else:
        raise ValueError("File must be PDF or DOCX")
    resume_text = scrape(resume_file_path, api_key)

    print("Step 2: Running your ML models...")
    resume_models = ResumeModels()
    model_outputs = {
        'predicted_role': resume_models.predict_role(resume_text),
        'acceptance_probability': resume_models.predict_acceptance(resume_text),
    }

    print("Step 3: Computing embedding similarity...")
    similarity = compute_resume_jd_similarity(resume_text, job_description)
    model_outputs['similarity_score'] = similarity['similarity_score']
    model_outputs['missing_keywords'] = similarity['missing_keywords']

    print("Step 4: Getting Gemini assessment and recommendations...")
    final_assessment = get_gemini_assessment(
        resume_text=resume_text,
        jd_text=job_description,
        predicted_role=model_outputs['predicted_role'],
        acceptance_prob=model_outputs['acceptance_probability'],
        similarity_score=model_outputs['similarity_score'],
        missing_keywords=model_outputs['missing_keywords'],
        api_key=api_key
    )

    # Bundle everything for the caller.
    return {
        'resume_text': resume_text,
        'model_outputs': model_outputs,
        'gemini_assessment': final_assessment
    }

In [11]:
# Example Job Description
# Sample posting used as the `job_description` argument in the demo cell below;
# replace it with the real job description when analyzing a resume.
EXAMPLE_JD = """
We are seeking a Backend Engineer with 3+ years of experience in Python, Django, REST APIs, and
cloud deployment (GCP or AWS). Experience with PostgreSQL and Docker is required. The candidate
will build scalable microservices and own features end-to-end.

Required Skills:
- Python, Django, Flask
- REST API design
- PostgreSQL, Redis
- Docker, Kubernetes
- GCP or AWS
- Git, CI/CD

Nice to have:
- Experience with microservices architecture
- Knowledge of GraphQL
- Frontend experience (React/Vue)
"""

In [None]:
# Upload and analyze
# In Colab: upload a resume, run the full pipeline against EXAMPLE_JD and print a
# summary. Outside Colab: only print usage instructions.
if IN_COLAB:
    print("Upload your resume (PDF or DOCX):")
    uploaded = files.upload()

    if uploaded:
        # Only the first uploaded file is analyzed.
        resume_file = list(uploaded.keys())[0]
        print(f"\nAnalyzing: {resume_file}\n")

        # Read the key with getpass rather than input() so it is not echoed
        # into the cell output that gets saved with the notebook.
        API_KEY = getpass("Enter your Google API Key: ")

        # Run analysis
        results = analyze_resume(resume_file, EXAMPLE_JD, API_KEY)

        # Display results
        print("\n" + "="*60)
        print("RESUME ANALYSIS RESULTS")
        print("="*60)

        print("\n--- MODEL OUTPUTS ---")
        print(f"Predicted Role: {results['model_outputs']['predicted_role']}")
        print(f"Acceptance Probability: {results['model_outputs']['acceptance_probability']:.2%}")
        print(f"Similarity Score: {results['model_outputs']['similarity_score']:.3f}")
        print(f"Missing Keywords: {', '.join(results['model_outputs']['missing_keywords'][:10])}")

        print("\n--- GEMINI ASSESSMENT ---")
        assessment = results['gemini_assessment']
        # .get() keeps the summary printable when Gemini's reply could not be
        # parsed and only the fallback keys are present.
        print(f"Overall Score: {assessment.get('score', 'N/A')}/100")
        print(f"\nExplanation: {assessment.get('explanation', 'N/A')}")

        if 'revision_suggestions' in assessment:
            print("\nRevision Suggestions:")
            for i, suggestion in enumerate(assessment['revision_suggestions'], 1):
                print(f"  {i}. {suggestion}")

        print("\n" + "="*60)
        print("\nFull results saved in 'results' variable")

else:
    print("Not in Colab. Example usage:")
    # The example must define the same name it passes to analyze_resume.
    print("""
    # Set your API key
    API_KEY = 'your-google-api-key'

    # Analyze resume
    results = analyze_resume('path/to/resume.pdf', EXAMPLE_JD, API_KEY)

    # Print results
    print(json.dumps(results['gemini_assessment'], indent=2))
    """)

Not in Colab. Example usage:

    # Set your API key
    API_KEY = 'your-google-api-key'

    # Analyze resume
    results = analyze_resume('path/to/resume.pdf', EXAMPLE_JD, API_KEY)

    # Print results
    print(json.dumps(results['gemini_assessment'], indent=2))
    


In [None]:
class ResumeAnalysisDebugger:
    """Diagnostic helpers that exercise each stage of the resume analysis pipeline.

    Every ``test_*`` method logs what it is doing, catches failures and reports
    them through ``log`` instead of raising. The methods rely on names defined
    elsewhere in the notebook (``embedder``, ``GEMINI_MODEL``, ``ResumeModels``,
    the scraping functions, ``compute_resume_jd_similarity`` and
    ``get_gemini_assessment``).

    Attributes:
        verbose: When true, every log entry is also printed.
        logs: All log entries recorded so far, in order.
    """

    def __init__(self, verbose=True):
        self.verbose = verbose
        self.logs = []

    def log(self, message, level="INFO"):
        """Record ``message`` with a timestamp and level; print it when verbose."""
        import datetime
        timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        log_entry = f"[{timestamp}] [{level}] {message}"
        self.logs.append(log_entry)
        if self.verbose:
            print(log_entry)

    def test_gemini_connection(self, api_key: str) -> bool:
        """Send a trivial prompt to Gemini.

        Sets the ``GOOGLE_API_KEY`` environment variable as a side effect.

        Returns:
            True if the model returned non-empty content; False if the call
            raised or the content was empty.
        """
        self.log("Testing Gemini API connection...")
        try:
            os.environ['GOOGLE_API_KEY'] = api_key
            llm = ChatGoogleGenerativeAI(model=GEMINI_MODEL, temperature=0.0)
            response = llm.invoke("Say 'connection successful'")
            if response.content:
                self.log("Gemini API connection successful", "SUCCESS")
                return True
            # An empty reply is a failure too; report it instead of returning None.
            self.log("Gemini API returned an empty response", "ERROR")
            return False
        except Exception as e:
            self.log(f"Gemini API connection failed: {str(e)}", "ERROR")
            return False

    def test_embedder_module(self) -> bool:
        """Encode two short sentences with the module-level ``embedder``.

        Returns:
            True if exactly two embeddings came back; False if encoding raised
            or returned a different number of rows.
        """
        self.log("Testing embedder module...")
        try:
            test_text = ["Hello world", "Test sentence"]
            embeddings = embedder.encode(test_text, show_progress_bar=False)
            if embeddings.shape[0] == 2:
                self.log(f"Embedder loaded successfully. Embedding dimension: {embeddings.shape[1]}", "SUCCESS")
                return True
            self.log(
                f"Embedder returned {embeddings.shape[0]} embeddings for {len(test_text)} inputs",
                "ERROR",
            )
            return False
        except Exception as e:
            # Covers every failure, including `embedder` not being defined yet.
            self.log(f"Embedder error: {str(e)}", "ERROR")
            return False

    def test_file_scraping(self, file_path: str, api_key: str) -> Dict[str, Any]:
        """Check that a resume file exists, has a supported type and can be scraped.

        Returns:
            A dict with 'file_exists', 'file_type' ('PDF', 'DOCX' or None),
            'scraping_successful', 'text_length' and 'error' (None on success).
        """
        self.log(f"Testing file scraping for: {file_path}")
        results = {
            'file_exists': False,
            'file_type': None,
            'scraping_successful': False,
            'text_length': 0,
            'error': None
        }

        # Check file exists
        if not os.path.exists(file_path):
            results['error'] = "File does not exist"
            self.log(f"File not found: {file_path}", "ERROR")
            return results

        results['file_exists'] = True

        # Determine file type
        if file_path.lower().endswith('.pdf'):
            results['file_type'] = 'PDF'
        elif file_path.lower().endswith('.docx'):
            results['file_type'] = 'DOCX'
        else:
            results['error'] = "Unsupported file type"
            self.log("File must be PDF or DOCX", "ERROR")
            return results

        # Test scraping
        try:
            if results['file_type'] == 'PDF':
                text = scrape_pdf_with_gemini(file_path, api_key)
            else:
                text = scrape_docx_with_gemini(file_path, api_key)

            results['scraping_successful'] = True
            results['text_length'] = len(text)
            self.log(f"Successfully scraped {results['text_length']} characters", "SUCCESS")
            self.log(f"Preview: {text[:200]}...", "INFO")

        except Exception as e:
            results['error'] = str(e)
            self.log(f"✗ Scraping failed: {str(e)}", "ERROR")

        return results

    def test_models(self, sample_text: str = None) -> Dict[str, Any]:
        """Instantiate ResumeModels and run both predictions on ``sample_text``.

        Args:
            sample_text: Resume text to classify; a built-in sample is used
                when None.

        Returns:
            A dict with 'role_prediction', 'acceptance_prediction' and
            'errors' (list of error messages, empty when everything worked).
        """
        self.log("Testing ML models...")

        if sample_text is None:
            sample_text = """
            Experienced Software Engineer with 5 years in backend development.
            Proficient in Python, Django, PostgreSQL, and AWS.
            Led team of 3 developers in building scalable microservices.
            """

        results = {
            'role_prediction': None,
            'acceptance_prediction': None,
            'errors': []
        }

        try:
            models = ResumeModels()

            # Test role prediction
            try:
                role = models.predict_role(sample_text)
                results['role_prediction'] = role
                self.log(f"Role prediction: {role}", "SUCCESS")
            except Exception as e:
                results['errors'].append(f"Role prediction error: {str(e)}")
                self.log(f"✗ Role prediction failed: {str(e)}", "ERROR")

            # Test acceptance prediction
            try:
                acceptance = models.predict_acceptance(sample_text)
                results['acceptance_prediction'] = acceptance
                self.log(f"Acceptance prediction: {acceptance:.2%}", "SUCCESS")
            except Exception as e:
                results['errors'].append(f"Acceptance prediction error: {str(e)}")
                self.log(f"Acceptance prediction failed: {str(e)}", "ERROR")

        except Exception as e:
            results['errors'].append(f"Model initialization error: {str(e)}")
            self.log(f"Model initialization failed: {str(e)}", "ERROR")

        return results

    def test_similarity_computation(self) -> bool:
        """Run compute_resume_jd_similarity on a tiny sample; True if it did not raise."""
        self.log("Testing similarity computation...")

        try:
            resume = "Python developer with Django experience"
            jd = "Looking for Python Django developer"

            result = compute_resume_jd_similarity(resume, jd)

            self.log(f"Similarity score: {result['similarity_score']:.3f}", "SUCCESS")
            self.log(f"Missing keywords: {result['missing_keywords'][:5]}", "INFO")
            return True

        except Exception as e:
            self.log(f"Similarity computation failed: {str(e)}", "ERROR")
            return False

    def test_gemini_assessment(self, api_key: str) -> bool:
        """Run get_gemini_assessment on sample data.

        Returns:
            True if the returned dict contains a 'score' key; False if it does
            not or if the call raised.
        """
        self.log("Testing Gemini assessment...")

        try:
            # resume
            sample_resume = "Backend Engineer with Python, Django, PostgreSQL experience"

            # job description
            sample_jd = "Looking for Backend Engineer with Python skills"

            assessment = get_gemini_assessment(
                resume_text=sample_resume,
                jd_text=sample_jd,
                predicted_role="Backend Engineer",
                acceptance_prob=0.75,
                similarity_score=0.85,
                missing_keywords=["docker", "kubernetes"],
                api_key=api_key
            )

            if 'score' in assessment:
                self.log(f"Gemini assessment successful. Score: {assessment['score']}/100", "SUCCESS")
                return True
            else:
                self.log("Gemini assessment returned incomplete response", "ERROR")
                return False

        except Exception as e:
            self.log(f"Gemini assessment failed: {str(e)}", "ERROR")
            return False

    def run_full_diagnostics(self, api_key: str, test_file_path: str = None) -> Dict[str, Any]:
        """Run every diagnostic in sequence and summarize the outcome.

        Tests that depend on an earlier one are skipped when that one failed:
        similarity needs the embedder, the Gemini assessment and file scraping
        need a working Gemini connection. File scraping additionally needs
        ``test_file_path``.

        Returns:
            A dict with one entry per test plus 'overall_status', which is
            'PASSED' only when the Gemini connection, the embedder and the ML
            models all passed, and 'FAILED' otherwise.
        """
        self.log("="*60, "INFO")
        self.log("STARTING FULL DIAGNOSTICS", "INFO")
        self.log("="*60, "INFO")

        results = {
            'gemini_connection': False,
            'embedder_module': False,
            'models': False,
            'similarity': False,
            'gemini_assessment': False,
            'file_scraping': None,
            'overall_status': 'FAILED'
        }

        # Test 1: Gemini connection
        results['gemini_connection'] = self.test_gemini_connection(api_key)

        # Test 2: Embedder module
        results['embedder_module'] = self.test_embedder_module()

        # Test 3: ML models
        model_results = self.test_models()
        results['models'] = len(model_results['errors']) == 0

        # Test 4: Similarity computation
        if results['embedder_module']:
            results['similarity'] = self.test_similarity_computation()

        # Test 5: Gemini assessment
        if results['gemini_connection']:
            results['gemini_assessment'] = self.test_gemini_assessment(api_key)

        # Test 6: File scraping (optional)
        if test_file_path and results['gemini_connection']:
            results['file_scraping'] = self.test_file_scraping(test_file_path, api_key)

        # Overall status
        critical_tests = [
            results['gemini_connection'],
            results['embedder_module'],
            results['models']
        ]

        if all(critical_tests):
            results['overall_status'] = 'PASSED'
            self.log("="*60, "SUCCESS")
            self.log("ALL CRITICAL TESTS PASSED", "SUCCESS")
            self.log("="*60, "SUCCESS")
        else:
            self.log("="*60, "ERROR")
            self.log("SOME TESTS FAILED - CHECK LOGS ABOVE", "ERROR")
            self.log("="*60, "ERROR")

        return results

    def save_logs(self, filename="debug_logs.txt"):
        """Write all recorded log entries to ``filename``, one per line."""
        # Explicit UTF-8: log entries can contain non-ASCII characters such as
        # "✗", which a platform-default encoding may be unable to write.
        with open(filename, 'w', encoding='utf-8') as f:
            f.write('\n'.join(self.logs))
        self.log(f"Logs saved to {filename}", "INFO")

In [16]:
# Create one debugger for the diagnostic cells below; verbose=True prints each log entry.
debugger = ResumeAnalysisDebugger(verbose=True)

# Quick check that the key entered in the config cell can reach Gemini.
debugger.test_gemini_connection(GOOGLE_API_KEY)


[2025-12-01 23:00:47] [INFO] Testing Gemini API connection...
[2025-12-01 23:00:48] [SUCCESS] Gemini API connection successful


True

In [None]:
# Replace '' with the path of a PDF or DOCX resume before running; with an empty
# path the check stops at "File does not exist".
debugger.test_file_scraping('', GOOGLE_API_KEY)

[2025-12-01 23:00:50] [INFO] Testing file scraping for: /content/example.pdf
[2025-12-01 23:00:50] [ERROR] File not found: /content/example.pdf


{'file_exists': False,
 'file_type': None,
 'scraping_successful': False,
 'text_length': 0,
 'error': 'File does not exist'}

In [18]:
# Verify that the module-level `embedder` can encode text.
debugger.test_embedder_module()

[2025-12-01 23:00:56] [INFO] Testing embedder module...
[2025-12-01 23:00:56] [SUCCESS] Embedder loaded successfully. Embedding dimension: 384


True

In [None]:
# Run every diagnostic in sequence; the second argument is the resume used for the
# optional file-scraping test (adjust the path to a file that exists in your environment).
debugger.run_full_diagnostics(GOOGLE_API_KEY, '/content/example.pdf')

[2025-12-01 15:07:13] [INFO] STARTING FULL DIAGNOSTICS
[2025-12-01 15:07:13] [INFO] Testing Gemini API connection...
[2025-12-01 15:07:14] [SUCCESS] Gemini API connection successful
[2025-12-01 15:07:14] [INFO] Testing embedder module...
[2025-12-01 15:07:14] [SUCCESS] Embedder loaded successfully. Embedding dimension: 384
[2025-12-01 15:07:14] [INFO] Testing ML models...
[2025-12-01 15:07:14] [SUCCESS] Role prediction: Backend Engineer
[2025-12-01 15:07:14] [SUCCESS] Acceptance prediction: 100.00%
[2025-12-01 15:07:14] [INFO] Testing similarity computation...
[2025-12-01 15:07:14] [SUCCESS] Similarity score: 0.886
[2025-12-01 15:07:14] [INFO] Missing keywords: ['looking', 'for']
[2025-12-01 15:07:14] [INFO] Testing Gemini assessment...
[2025-12-01 15:07:22] [SUCCESS] Gemini assessment successful. Score: 82/100
[2025-12-01 15:07:22] [INFO] Testing file scraping for: /content/example.pdf
[2025-12-01 15:07:36] [SUCCESS] Successfully scraped 5819 characters
[2025-12-01 15:07:36] [INFO] Pr

{'gemini_connection': True,
 'embedder_module': True,
 'models': True,
 'similarity': True,
 'gemini_assessment': True,
 'file_scraping': {'file_exists': True,
  'file_type': 'PDF',
  'scraping_successful': True,
  'text_length': 5819,
  'error': None},
 'overall_status': 'PASSED'}