# AI Resume Ranker - Dataset Evaluation Notebook

This notebook reads a folder of PDF/DOC files and calls the web app's models to analyze them and evaluate performance.

## Features:
- Parse PDF/DOC files into JSON format
- Extract entities using NER models  
- Calculate semantic similarity
- Generate performance metrics
- Create visualizations
- Export results


In [None]:
# Import required libraries
import sys
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Dict, Tuple
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Add path to import modules
sys.path.append('.')

print("✅ Import libraries completed!")


In [None]:
# Import modules from web app
# The text-extraction, entity-extraction and similarity helpers used by the
# cells below all come from the project-local `services` module.
# NOTE: an ImportError is only printed here, not re-raised, so after a failed
# import these names remain undefined for the cells that use them.
try:
    from services import (
        extract_text, extract_entities_ner, extract_basic_entities, 
        extract_skills_spacy, calculate_semantic_similarity
    )
    print("✅ Web app modules imported successfully!")
except ImportError as e:
    print(f"❌ Error importing web app modules: {e}")
    print("Make sure you're running this notebook from the src directory")


In [None]:
# Resolve the dataset folder: take what the user types, or fall back to the
# default location when the answer is blank.
dataset_path = input("Enter path to dataset folder: ").strip() or "notebook/dataset"

print(f"📁 Dataset path: {dataset_path}")

# Report whether the chosen location is present on disk
if os.path.exists(dataset_path):
    print("✅ Dataset path exists")
else:
    print(f"❌ Dataset path does not exist: {dataset_path}")
    print("Please create the dataset folder and add your PDF/DOC files")


In [None]:
# List files in dataset
def list_dataset_files(dataset_path):
    """Return the sorted names of the PDF/DOC/DOCX files directly inside a folder.

    Args:
        dataset_path: Path of the folder to scan. Sub-folders are not searched.

    Returns:
        Sorted list of file names (not full paths) whose extension is
        ``.pdf``, ``.docx`` or ``.doc``, matched case-insensitively. An empty
        list is returned when ``dataset_path`` does not exist or is not a
        directory.
    """
    # isdir (rather than exists) avoids an error from scanning a path that
    # points at a regular file.
    if not os.path.isdir(dataset_path):
        return []

    with os.scandir(dataset_path) as entries:
        return sorted(
            entry.name
            for entry in entries
            # is_file() skips sub-directories that merely carry a document-like name.
            if entry.is_file() and entry.name.lower().endswith(('.pdf', '.docx', '.doc'))
        )

# Get list of files
dataset_files = list_dataset_files(dataset_path)
print(f"📊 Found {len(dataset_files)} files in dataset")

if dataset_files:
    # Leading newline puts a blank line between the count and the listing.
    print("\nFiles found:")
    for i, file_name in enumerate(dataset_files[:10], 1):  # Show first 10 files
        print(f"  {i}. {file_name}")

    if len(dataset_files) > 10:
        print(f"  ... and {len(dataset_files) - 10} more files")
else:
    print("❌ No PDF/DOC files found in dataset folder")


In [None]:
# Process all files in dataset
def process_dataset_files(dataset_path, dataset_files):
    """Extract text, entities and skills from every listed file.

    Processing is best-effort: a file whose text cannot be extracted, or whose
    processing raises, is reported on stdout and skipped.

    Args:
        dataset_path: Folder that contains the files.
        dataset_files: File names (relative to ``dataset_path``) to process.

    Returns:
        A list with one dict per successfully processed file, holding the keys
        'filename', 'text', 'entities', 'name', 'email', 'phone', 'years_exp',
        'skills' and 'processed_at' (ISO-formatted local timestamp).
    """
    processed_cvs = []
    total = len(dataset_files)

    print(f"🔄 Processing {total} files...")

    for position, filename in enumerate(dataset_files, 1):
        # Leading newline separates the per-file progress groups with a blank line.
        print(f"\nProcessing {position}/{total}: {filename}")

        file_path = os.path.join(dataset_path, filename)

        try:
            # Extract text from file
            text = extract_text(file_path)
            if not text:
                print(f"❌ Failed to extract text from {filename}")
                continue

            # Extract entities
            entities = extract_entities_ner(text)
            basic = extract_basic_entities(text)
            skills = extract_skills_spacy(text)
            years_exp = basic.get('years_exp', 0)

            # Create CV data structure
            processed_cvs.append({
                'filename': filename,
                'text': text,
                'entities': entities,
                'name': basic.get('name', ''),
                'email': basic.get('email', ''),
                'phone': basic.get('phone', ''),
                'years_exp': years_exp,
                'skills': skills,
                'processed_at': datetime.now().isoformat(),
            })

            print(f"✅ Processed: {filename}")
            print(f"   - Text length: {len(text)} characters")
            print(f"   - Skills found: {len(skills)}")
            print(f"   - Years experience: {years_exp}")

        except Exception as e:
            # Deliberately broad: one unreadable file must not abort the whole run.
            print(f"❌ Error processing {filename}: {e}")
            continue

    return processed_cvs

# Process dataset
if dataset_files:
    processed_cvs = process_dataset_files(dataset_path, dataset_files)
    # Leading newline separates the summary from the per-file log above it.
    print(f"\n📋 Processing completed: {len(processed_cvs)} CVs processed successfully")
else:
    print("❌ No files to process")
    processed_cvs = []


In [None]:
# Job Description input and parsing
print("📝 Job Description Input")
print("=" * 50)

# Get job description from user
job_description = input("Enter job description (or press Enter to use sample): ").strip()

if not job_description:
    # Sample job description used when the user enters nothing
    job_description = """
    We are looking for a Senior Software Engineer with the following requirements:
    
    Required Skills:
    - Python programming (5+ years)
    - JavaScript/React (3+ years)
    - SQL and database design
    - AWS cloud services
    - Docker and Kubernetes
    - Git version control
    
    Experience:
    - 5+ years of software development experience
    - Experience with microservices architecture
    - Experience with CI/CD pipelines
    - Experience with Agile/Scrum methodologies
    
    Education:
    - Bachelor's degree in Computer Science or related field
    
    Responsibilities:
    - Design and develop scalable web applications
    - Collaborate with cross-functional teams
    - Mentor junior developers
    - Participate in code reviews
    - Contribute to technical documentation
    """
    print("Using sample job description")

# Leading newline puts a blank line before the length/preview summary.
print(f"\n📄 Job Description length: {len(job_description)} characters")
print(f"📄 Job Description preview: {job_description[:200]}...")


In [None]:
# Run web app analysis and ranking
def run_web_app_analysis(processed_cvs, job_description):
    """Rank processed CVs against a job description by semantic similarity.

    Args:
        processed_cvs: List of CV dicts; each must provide the keys 'text',
            'filename', 'name', 'email', 'phone', 'years_exp' and 'skills'.
        job_description: Text the CV texts are compared against.

    Returns:
        A list of result dicts sorted by 'semantic_score' (highest first) with
        'rank' numbered from 1, or None when either input is empty or the
        analysis raises (the error is printed, not propagated).
    """
    if not processed_cvs or not job_description:
        print("❌ No data available for analysis")
        return None

    print("🔄 Running web app analysis...")

    try:
        # Get resume texts
        resume_texts = [cv['text'] for cv in processed_cvs]

        # Calculate semantic similarity using web app function
        similarities = calculate_semantic_similarity(resume_texts, job_description)

        # Array-like results (anything exposing .tolist()) become plain lists first.
        if hasattr(similarities, 'tolist'):
            similarities = similarities.tolist()
        # Flatten and coerce to built-in floats so a nested (n, 1) result or
        # NumPy scalar elements still sort, format and serialize cleanly.
        scores = np.asarray(similarities, dtype=float).ravel().tolist()

        # Create ranking results; a CV without a matching score gets 0.0.
        ranking_results = [
            {
                'filename': cv['filename'],
                'name': cv['name'],
                'email': cv['email'],
                'phone': cv['phone'],
                'years_exp': cv['years_exp'],
                'skills': cv['skills'],
                'semantic_score': scores[index] if index < len(scores) else 0.0,
                'rank': 0,  # placeholder, assigned after sorting
            }
            for index, cv in enumerate(processed_cvs)
        ]

        # Sort by semantic score (stable, so ties keep their input order)
        ranking_results.sort(key=lambda item: item['semantic_score'], reverse=True)

        # Assign 1-based ranks in sorted order
        for rank, result in enumerate(ranking_results, 1):
            result['rank'] = rank

        print(f"✅ Analysis completed: {len(ranking_results)} candidates ranked")
        return ranking_results

    except Exception as e:
        # Deliberately broad: report the failure and let the notebook continue.
        print(f"❌ Error in analysis: {e}")
        return None

# Run analysis
if processed_cvs and job_description:
    # run_web_app_analysis returns None on failure; fall back to an empty list
    # so ranking_results is a list on every path.
    ranking_results = run_web_app_analysis(processed_cvs, job_description) or []
else:
    print("❌ No data available for analysis")
    ranking_results = []


In [None]:
# Display ranking results
if ranking_results:
    print("🏆 RANKING RESULTS:")
    print("=" * 80)

    # Show top 10 candidates
    for position, result in enumerate(ranking_results[:10], 1):
        candidate_skills = result['skills']
        # At most five skills are listed; an ellipsis marks that more exist.
        skills_preview = ', '.join(candidate_skills[:5]) + ('...' if len(candidate_skills) > 5 else '')

        # Leading newline separates consecutive candidates with a blank line.
        print(f"\n{position}. {result['name'] or result['filename']}")
        print(f"   Email: {result['email']}")
        print(f"   Phone: {result['phone']}")
        print(f"   Experience: {result['years_exp']} years")
        print(f"   Skills: {skills_preview}")
        print(f"   Semantic Score: {result['semantic_score']:.3f}")
        print(f"   Rank: {result['rank']}")

    if len(ranking_results) > 10:
        print(f"\n... and {len(ranking_results) - 10} more candidates")

    # Summary statistics (score list built once and reused)
    all_scores = [r['semantic_score'] for r in ranking_results]
    print("\n📊 SUMMARY STATISTICS:")
    print(f"Total candidates: {len(ranking_results)}")
    print(f"Average semantic score: {np.mean(all_scores):.3f}")
    print(f"Highest score: {max(all_scores):.3f}")
    print(f"Lowest score: {min(all_scores):.3f}")

else:
    print("❌ No ranking results available")


In [None]:
# Create visualizations
def create_visualizations(ranking_results):
    """Draw a 2x2 figure summarizing the ranked candidates and show it.

    The panels are: semantic scores of the first ten entries (bar chart),
    years-of-experience histogram, skills-count histogram, and a scatter of
    skills count against semantic score.

    Args:
        ranking_results: List of result dicts providing 'semantic_score',
            'name', 'filename', 'years_exp' and 'skills'. Presumably already
            sorted best-first by the caller, since the bar chart simply takes
            the first ten entries.

    Returns:
        None. Prints a message and returns early when there is nothing to plot.
    """
    if not ranking_results:
        print("❌ No data available for visualization")
        return

    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle('AI Resume Ranker - Dataset Analysis', fontsize=16)

    all_scores = [r['semantic_score'] for r in ranking_results]
    skills_counts = [len(r['skills']) for r in ranking_results]

    # 1. Semantic Scores
    top_entries = ranking_results[:10]
    top_scores = all_scores[:10]
    # Parenthesized so the 20-character cap applies to whichever label is
    # used (name or fallback filename), keeping tick labels short.
    labels = [(r['name'] or r['filename'])[:20] for r in top_entries]

    score_ax = axes[0, 0]
    score_ax.bar(range(len(top_scores)), top_scores, color='skyblue')
    score_ax.set_title('Top 10 Candidates - Semantic Scores')
    score_ax.set_xlabel('Candidates')
    score_ax.set_ylabel('Semantic Score')
    score_ax.set_xticks(range(len(labels)))
    score_ax.set_xticklabels(labels, rotation=45, ha='right')

    # Add values on bars
    for x_pos, score in enumerate(top_scores):
        score_ax.text(x_pos, score + 0.01, f'{score:.3f}', ha='center', va='bottom')

    # 2. Years of Experience Distribution
    exp_ax = axes[0, 1]
    exp_ax.hist([r['years_exp'] for r in ranking_results], bins=10, color='orange', alpha=0.7)
    exp_ax.set_title('Years of Experience Distribution')
    exp_ax.set_xlabel('Years of Experience')
    exp_ax.set_ylabel('Number of Candidates')

    # 3. Skills Count Distribution
    skills_ax = axes[1, 0]
    skills_ax.hist(skills_counts, bins=10, color='green', alpha=0.7)
    skills_ax.set_title('Skills Count Distribution')
    skills_ax.set_xlabel('Number of Skills')
    skills_ax.set_ylabel('Number of Candidates')

    # 4. Skills Count vs Semantic Score
    scatter_ax = axes[1, 1]
    scatter_ax.scatter(skills_counts, all_scores, alpha=0.6, color='purple')
    scatter_ax.set_title('Skills Count vs Semantic Score')
    scatter_ax.set_xlabel('Number of Skills')
    scatter_ax.set_ylabel('Semantic Score')

    plt.tight_layout()
    plt.show()

# Render the charts only when there is something to plot
if not ranking_results:
    print("❌ No data available for visualization")
else:
    create_visualizations(ranking_results)


In [None]:
# Export results
def _json_default(value):
    """Convert values the json module rejects into plain Python equivalents."""
    if isinstance(value, (set, frozenset)):
        return sorted(value, key=str)
    # NumPy scalars and arrays (and other array-likes) expose tolist().
    to_list = getattr(value, 'tolist', None)
    if callable(to_list):
        return to_list()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


def _write_json(path, payload):
    """Write ``payload`` to ``path`` as indented UTF-8 JSON."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(payload, f, indent=2, ensure_ascii=False, default=_json_default)


def export_analysis_results(processed_cvs, ranking_results, job_description, output_dir='.'):
    """Export analysis results to timestamped files.

    Writes ``processed_cvs_<ts>.json``, ``job_description_<ts>.json``,
    ``summary_report_<ts>.json`` and, when there are ranking results,
    ``ranking_results_<ts>.csv``, where ``<ts>`` is the local time formatted
    as ``YYYYmmdd_HHMMSS``.

    Args:
        processed_cvs: List of processed CV dicts to dump as JSON.
        ranking_results: List of ranking dicts (each with 'semantic_score');
            None or an empty list skips the CSV and yields an empty summary.
        job_description: Job description text.
        output_dir: Folder the files are written to; created if missing.
            Defaults to the current working directory.

    Returns:
        None.
    """
    # The analysis step returns None on failure; treat that as "no results".
    ranking_results = ranking_results or []
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    os.makedirs(output_dir, exist_ok=True)

    # Export processed CVs
    cvs_name = f'processed_cvs_{timestamp}.json'
    _write_json(os.path.join(output_dir, cvs_name), processed_cvs)
    print(f"✅ Exported processed CVs: {cvs_name}")

    # Export job description
    jd_name = f'job_description_{timestamp}.json'
    _write_json(
        os.path.join(output_dir, jd_name),
        {'job_description': job_description, 'timestamp': timestamp},
    )
    print(f"✅ Exported job description: {jd_name}")

    # Export ranking results as CSV
    if ranking_results:
        csv_name = f'ranking_results_{timestamp}.csv'
        pd.DataFrame(ranking_results).to_csv(os.path.join(output_dir, csv_name), index=False)
        print(f"✅ Exported ranking results: {csv_name}")

    # Create summary report
    scores = [r['semantic_score'] for r in ranking_results]
    summary = {
        'timestamp': timestamp,
        'total_cvs_processed': len(processed_cvs),
        'total_candidates_ranked': len(ranking_results),
        'job_description_length': len(job_description),
        'top_candidate': ranking_results[0] if ranking_results else None,
        # float() because the mean of float32 scores is itself a float32.
        'average_semantic_score': float(np.mean(scores)) if scores else 0,
    }

    summary_name = f'summary_report_{timestamp}.json'
    _write_json(os.path.join(output_dir, summary_name), summary)
    print(f"✅ Exported summary report: {summary_name}")

# Export results
if processed_cvs and ranking_results:
    export_analysis_results(processed_cvs, ranking_results, job_description)
    # Leading newline separates the final message from the per-file export log.
    print("\n🎉 All results exported successfully!")
else:
    print("❌ No data available for export")
