# AI Resume Ranker - Dataset Evaluation Notebook

Notebook này đọc folder chứa các file PDF/DOC/DOCX, gọi model của web app để phân tích và đánh giá hiệu suất.

In [None]:
# Import the required libraries
import sys
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Dict, Tuple
from datetime import datetime
import warnings
# Ignore every warning from here on (presumably to keep the cell output uncluttered).
warnings.filterwarnings('ignore')

# Add the current working directory to the module search path so that
# modules located there can be imported
sys.path.append('.')

print("✅ Import libraries completed!")

In [None]:
# Import the modules from the web app.
# NOTE: an import failure is only printed, not re-raised, so the names below
# stay undefined in that case and any later code that uses them will raise
# NameError.
try:
    from services import (
        extract_text,
        extract_entities_ner,
        extract_basic_entities,
        extract_skills_spacy,
        calculate_semantic_similarity,
        nlp,
        sbert_model
    )
    print("✅ Services imported successfully!")
except Exception as e:
    print(f"❌ Error importing services: {e}")

# Same best-effort handling for the uploads controller import.
try:
    from control.uploads_controller import extract_cv_data
    print("✅ Uploads controller imported successfully!")
except Exception as e:
    print(f"❌ Error importing uploads controller: {e}")

## 1. Setup Dataset Path

In [None]:
# Set up the dataset path: ask the user, falling back to the default folder
# when the answer is empty.
dataset_path = input("Enter path to dataset folder: ").strip()
if not dataset_path:
    dataset_path = "./dataset"  # Default path

# Check for a directory rather than mere existence: a path that points at a
# regular file is not a usable dataset folder.
if not os.path.isdir(dataset_path):
    print(f"❌ Dataset folder not found: {dataset_path}")
    print("Please create the folder and add your PDF/DOC files")
else:
    print(f"✅ Dataset folder found: {dataset_path}")

# List the files in the dataset
def list_dataset_files(folder_path: str) -> List[str]:
    """Return the paths of all PDF/DOC/DOCX files directly inside a folder.

    Args:
        folder_path: Directory to scan. Sub-directories are not descended
            into.

    Returns:
        The paths (``folder_path`` joined with each file name) of the regular
        files whose extension, compared case-insensitively, is ``.pdf``,
        ``.docx`` or ``.doc``. The list is sorted so the order is
        reproducible across runs. An empty list is returned when
        ``folder_path`` is not an existing directory.
    """
    supported_extensions = {'.pdf', '.docx', '.doc'}

    # A missing or non-directory path means "no files", not a crash.
    if not os.path.isdir(folder_path):
        return []

    files = []
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        if not os.path.isfile(file_path):
            continue
        _, ext = os.path.splitext(filename.lower())
        if ext in supported_extensions:
            files.append(file_path)

    # os.listdir returns entries in arbitrary order; sort for determinism.
    files.sort()
    return files

# Collect the dataset files and print a numbered overview with sizes in KB
dataset_files = list_dataset_files(dataset_path)
print(f"\n   Found {len(dataset_files)} files in dataset:")
for index, path in enumerate(dataset_files, start=1):
    size_kb = os.path.getsize(path) / 1024
    print(f"  {index}. {os.path.basename(path)} ({size_kb:.1f} KB)")

## 2. Load and Process Dataset

In [None]:
def process_dataset_files(file_paths: List[str]) -> List[Dict]:
    """Extract text and entities from every file in the dataset.

    Each file is passed through the web app's extraction functions. A file
    whose extracted text is empty or whitespace-only is skipped, and a file
    that raises any exception is reported and skipped, so one bad file does
    not stop the run. Progress is printed along the way.

    Args:
        file_paths: Paths of the files to process.

    Returns:
        One dict per successfully processed file with the keys ``filename``,
        ``file_path``, ``text``, ``extracted_data`` (``name``, ``email``,
        ``phone``, ``years_exp``, ``skills``, ``entities``), ``file_size``
        (bytes) and ``text_length`` (characters).
    """
    total = len(file_paths)
    results = []

    print(f"\n🔄 Processing {total} files...")

    for position, path in enumerate(file_paths, start=1):
        filename = os.path.basename(path)
        print(f"\n📄 Processing {position}/{total}: {filename}")

        try:
            # Pull the raw text out of the document
            text = extract_text(path)

            if not text.strip():
                print(f"  ⚠️ No text extracted from {filename}")
                continue

            print(f"  ✅ Extracted {len(text)} characters")

            # Run the web app's extraction functions on the text
            entities = extract_entities_ner(text)
            emails, phones, years_exp = extract_basic_entities(text)
            skills = extract_skills_spacy(text)

            # The first hit of each kind is taken as the candidate's value
            persons = entities['PERSON']
            extracted = {
                'name': persons[0] if persons else "",
                'email': emails[0] if emails else "",
                'phone': phones[0] if phones else "",
                'years_exp': years_exp,
                'skills': skills,
                'entities': entities,
            }

            results.append({
                'filename': filename,
                'file_path': path,
                'text': text,
                'extracted_data': extracted,
                'file_size': os.path.getsize(path),
                'text_length': len(text),
            })

            # Report what was found for this file
            print(f"     Name: {extracted['name'] or 'Not found'}")
            print(f"     Email: {extracted['email'] or 'Not found'}")
            print(f"     Phone: {extracted['phone'] or 'Not found'}")
            print(f"  💼 Years Exp: {extracted['years_exp']}")
            print(f"  🛠️ Skills: {len(skills)} skills found")

        except Exception as e:
            # Best effort: report the failure and move on to the next file
            print(f"  ❌ Error processing {filename}: {e}")

    print(f"\n✅ Successfully processed {len(results)}/{total} files")
    return results

# Run the processing step only when the dataset contains files
if not dataset_files:
    print("❌ No files to process")
    processed_cvs = []
else:
    processed_cvs = process_dataset_files(dataset_files)

## 3. Analyze Dataset Statistics

In [None]:
def analyze_dataset_statistics(processed_cvs: List[Dict]) -> Dict:
    """Compute summary statistics over the processed CV records.

    Args:
        processed_cvs: Records carrying ``filename``, ``text_length``,
            ``file_size`` (bytes) and an ``extracted_data`` dict with
            ``name``, ``email``, ``phone``, ``years_exp`` and ``skills``.

    Returns:
        An empty dict when there are no records. Otherwise a dict with:
        ``total_files``; ``file_types`` (lower-cased extension -> count);
        ``text_lengths``; ``file_sizes`` (KB); ``extraction_success``
        (field -> number of CVs where the field was found, with
        ``years_exp`` counting only values greater than zero);
        ``skills_distribution`` (skill -> number of occurrences) and
        ``years_exp_distribution`` (the positive ``years_exp`` values).
    """
    if not processed_cvs:
        return {}

    type_counts = {}
    lengths = []
    sizes_kb = []
    found_counts = dict.fromkeys(('name', 'email', 'phone', 'years_exp', 'skills'), 0)
    skill_counts = {}
    experience_years = []

    for record in processed_cvs:
        # Tally by lower-cased file extension
        suffix = os.path.splitext(record['filename'])[1].lower()
        type_counts[suffix] = type_counts.get(suffix, 0) + 1

        lengths.append(record['text_length'])
        sizes_kb.append(record['file_size'] / 1024)  # bytes -> KB

        data = record['extracted_data']

        # A field counts as extracted when it is non-empty; experience must be > 0
        was_found = {
            'name': bool(data['name']),
            'email': bool(data['email']),
            'phone': bool(data['phone']),
            'years_exp': data['years_exp'] > 0,
            'skills': bool(data['skills']),
        }
        for field, hit in was_found.items():
            if hit:
                found_counts[field] += 1

        for skill in data['skills']:
            skill_counts[skill] = skill_counts.get(skill, 0) + 1

        if was_found['years_exp']:
            experience_years.append(data['years_exp'])

    return {
        'total_files': len(processed_cvs),
        'file_types': type_counts,
        'text_lengths': lengths,
        'file_sizes': sizes_kb,
        'extraction_success': found_counts,
        'skills_distribution': skill_counts,
        'years_exp_distribution': experience_years,
    }

# Compute the dataset statistics and print a textual report
dataset_stats = analyze_dataset_statistics(processed_cvs)

if not dataset_stats:
    print("❌ No data to analyze")
else:
    total = dataset_stats['total_files']

    print("\n📊 Dataset Statistics:")
    print("=" * 50)
    print(f"Total files: {total}")

    print("\nFile types:")
    for extension, n_files in dataset_stats['file_types'].items():
        print(f"  {extension}: {n_files} files")

    print("\nText extraction:")
    mean_length = np.mean(dataset_stats['text_lengths'])
    mean_size = np.mean(dataset_stats['file_sizes'])
    print(f"  Average text length: {mean_length:.0f} characters")
    print(f"  Average file size: {mean_size:.1f} KB")

    print("\nEntity extraction success rate:")
    for field, hits in dataset_stats['extraction_success'].items():
        percent = (hits / total) * 100 if total > 0 else 0
        print(f"  {field}: {hits}/{total} ({percent:.1f}%)")

    # Most frequent skills first; only the ten most common are shown
    print("\nTop 10 skills:")
    ranked_skills = sorted(
        dataset_stats['skills_distribution'].items(),
        key=lambda pair: pair[1],
        reverse=True,
    )
    for skill_name, n_cvs in ranked_skills[:10]:
        print(f"  {skill_name}: {n_cvs} CVs")

    experience = dataset_stats['years_exp_distribution']
    if experience:
        print("\nYears experience:")
        print(f"  Average: {np.mean(experience):.1f} years")
        print(f"  Range: {min(experience)} - {max(experience)} years")

## 4. Test Ranking with Job Description

In [None]:
def test_ranking_with_job_description(processed_cvs: List[Dict], job_description: str) -> List[Dict]:
    """Rank the processed CVs by semantic similarity to a job description.

    The CV texts are scored against ``job_description`` with
    ``calculate_semantic_similarity``; the scores are indexed by CV position,
    so that function is expected to return one score per CV in input order.
    The ranking is printed as well as returned.

    Args:
        processed_cvs: Records carrying ``filename``, ``text`` and an
            ``extracted_data`` dict with ``name``, ``years_exp``, ``skills``
            and ``email``.
        job_description: Text the CVs are compared against. Only the first
            100 characters are echoed in the printed output.

    Returns:
        A list of dicts (``filename``, ``name``, ``similarity_score``,
        ``years_exp``, ``skills``, ``email``) sorted by descending
        ``similarity_score``, or an empty list when there are no CVs.
    """
    if not processed_cvs:
        return []

    print(f"\n🔍 Testing ranking with job description...")
    print(f"Job Description: {job_description[:100]}...")

    # Score every CV text against the job description
    cv_texts = [cv['text'] for cv in processed_cvs]
    similarities = calculate_semantic_similarity(cv_texts, job_description)

    ranking_results = [
        {
            'filename': cv['filename'],
            'name': cv['extracted_data']['name'],
            # Store a built-in float: scores that come back as NumPy/tensor
            # scalars cannot be written by the json module when the results
            # are exported.
            'similarity_score': float(similarities[i]),
            'years_exp': cv['extracted_data']['years_exp'],
            'skills': cv['extracted_data']['skills'],
            'email': cv['extracted_data']['email'],
        }
        for i, cv in enumerate(processed_cvs)
    ]

    # Best match first
    ranking_results.sort(key=lambda x: x['similarity_score'], reverse=True)

    print(f"\n   Ranking Results:")
    for i, result in enumerate(ranking_results, 1):
        print(f"  {i}. {result['filename']} - {result['name']} (Score: {result['similarity_score']:.3f})")

    return ranking_results

# Try the ranking with a sample job description
sample_job_description = """
We are looking for a Senior Software Engineer with the following requirements:
- 5+ years of software development experience
- Strong programming skills in Python, Java, or JavaScript
- Experience with web frameworks (Django, Flask, Spring, React)
- Knowledge of databases (SQL, PostgreSQL, MongoDB)
- Experience with cloud platforms (AWS, Azure, GCP)
- Strong problem-solving and communication skills
- Bachelor's degree in Computer Science or related field
- Experience with version control (Git)
- Knowledge of software development best practices
"""

# Nothing to rank when no CV was processed
if not processed_cvs:
    print("❌ No CVs to rank")
    ranking_results = []
else:
    ranking_results = test_ranking_with_job_description(processed_cvs, sample_job_description)

## 5. Create Visualizations

In [None]:
def create_dataset_visualizations(dataset_stats: Dict, ranking_results: List[Dict]):
    """Draw a 2x3 grid of charts summarizing the dataset analysis.

    Panels: file type pie chart, extraction success rates, text length
    histogram, the ten most frequent skills, years-of-experience histogram
    and the similarity scores of the first ten ranking entries. The skills,
    experience and ranking panels are only drawn when their data is
    non-empty.

    Args:
        dataset_stats: Statistics dict with ``file_types``,
            ``extraction_success``, ``total_files``, ``text_lengths``,
            ``skills_distribution`` and ``years_exp_distribution``. When it
            is empty a message is printed and nothing is drawn.
        ranking_results: Ranking entries with ``filename`` and
            ``similarity_score``; may be empty.
    """
    if not dataset_stats:
        print("❌ No data to visualize")
        return

    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    (ax_types, ax_rates, ax_lengths), (ax_skills, ax_years, ax_ranking) = axes

    # Panel 1: share of each file type
    type_counts = dataset_stats['file_types']
    type_labels = list(type_counts.keys())
    type_sizes = list(type_counts.values())
    ax_types.pie(type_sizes, labels=type_labels, autopct='%1.1f%%', startangle=90)
    ax_types.set_title('File Types Distribution')

    # Panel 2: extraction success rate per field, in percent of all files
    success = dataset_stats['extraction_success']
    field_names = list(success.keys())
    file_total = dataset_stats['total_files']
    rates = [(hits / file_total) * 100 for hits in success.values()]

    rate_bars = ax_rates.bar(field_names, rates, color='skyblue', alpha=0.8)
    ax_rates.set_title('Entity Extraction Success Rate (%)')
    ax_rates.set_ylabel('Success Rate (%)')
    ax_rates.tick_params(axis='x', rotation=45)
    ax_rates.set_ylim(0, 100)

    # Write each value just above its bar
    for rect, rate in zip(rate_bars, rates):
        label_x = rect.get_x() + rect.get_width()/2
        label_y = rect.get_height() + 1
        ax_rates.text(label_x, label_y, f'{rate:.1f}%', ha='center', va='bottom')

    # Panel 3: distribution of extracted text lengths
    ax_lengths.hist(dataset_stats['text_lengths'], bins=20, color='lightgreen', alpha=0.7)
    ax_lengths.set_title('Text Length Distribution')
    ax_lengths.set_xlabel('Text Length (characters)')
    ax_lengths.set_ylabel('Frequency')

    # Panel 4: the ten most frequent skills
    ranked_skills = sorted(
        dataset_stats['skills_distribution'].items(),
        key=lambda pair: pair[1],
        reverse=True,
    )[:10]
    if ranked_skills:
        skill_names, skill_totals = zip(*ranked_skills)
        ax_skills.barh(skill_names, skill_totals, color='salmon', alpha=0.8)
        ax_skills.set_title('Top 10 Skills Distribution')
        ax_skills.set_xlabel('Number of CVs')

    # Panel 5: distribution of years of experience
    experience = dataset_stats['years_exp_distribution']
    if experience:
        ax_years.hist(experience, bins=15, color='gold', alpha=0.7)
        ax_years.set_title('Years Experience Distribution')
        ax_years.set_xlabel('Years of Experience')
        ax_years.set_ylabel('Frequency')

    # Panel 6: similarity scores of the first ten ranking entries
    if ranking_results:
        leaders = ranking_results[:10]
        # Shorten long file names so the tick labels stay readable
        tick_labels = [
            entry['filename'][:15] + '...' if len(entry['filename']) > 15 else entry['filename']
            for entry in leaders
        ]
        scores = [entry['similarity_score'] for entry in leaders]
        positions = range(len(tick_labels))

        score_bars = ax_ranking.bar(positions, scores, color='lightcoral', alpha=0.8)
        ax_ranking.set_title('Top 10 CVs by Similarity Score')
        ax_ranking.set_ylabel('Similarity Score')
        ax_ranking.set_xticks(positions)
        ax_ranking.set_xticklabels(tick_labels, rotation=45, ha='right')

        # Write each score just above its bar
        for rect, score in zip(score_bars, scores):
            label_x = rect.get_x() + rect.get_width()/2
            label_y = rect.get_height() + 0.001
            ax_ranking.text(label_x, label_y, f'{score:.3f}', ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

    print("\n📊 Visualizations created!")

In [None]:
# Render the charts for the statistics and ranking computed above
create_dataset_visualizations(dataset_stats=dataset_stats, ranking_results=ranking_results)

## 6. Export Results

In [None]:
def export_results(processed_cvs: List[Dict], dataset_stats: Dict, ranking_results: List[Dict]):
    """Export the analysis results to JSON and CSV files.

    Files are written to the current working directory:

    * ``dataset_analysis_summary.json`` - always written.
    * ``dataset_analysis_detailed.csv`` - one row per CV, only when
      ``processed_cvs`` is non-empty.
    * ``dataset_ranking_results.csv`` - only when ``ranking_results`` is
      non-empty.

    The summary also records the module-level ``dataset_path``.

    Args:
        processed_cvs: Processed CV records (``filename``, ``text_length``,
            ``file_size`` and an ``extracted_data`` dict).
        dataset_stats: Statistics dict; its ``extraction_success`` counts are
            read when ``processed_cvs`` is non-empty.
        ranking_results: Ranking entries, embedded in the summary and written
            to their own CSV.

    Returns:
        The summary dict that was serialized to the JSON file.

    Raises:
        TypeError: If the summary contains a value that is neither
            JSON-serializable nor a NumPy scalar/array.
    """

    def _json_default(value):
        """Convert NumPy scalars and arrays to built-in types for ``json``."""
        if isinstance(value, np.generic):
            return value.item()
        if isinstance(value, np.ndarray):
            return value.tolist()
        raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")

    def _rate(field):
        """Percentage of processed CVs in which ``field`` was extracted."""
        if not processed_cvs:
            return 0
        return (dataset_stats['extraction_success'][field] / len(processed_cvs)) * 100

    # Build the summary report
    summary = {
        'timestamp': datetime.now().isoformat(),
        'dataset_path': dataset_path,
        'total_files_processed': len(processed_cvs),
        'dataset_statistics': dataset_stats,
        'ranking_results': ranking_results,
        'extraction_summary': {
            'successful_extractions': len(processed_cvs),
            'name_extraction_rate': _rate('name'),
            'email_extraction_rate': _rate('email'),
            'phone_extraction_rate': _rate('phone'),
            'skills_extraction_rate': _rate('skills'),
        },
    }

    # Serialize before opening the file so that a serialization error cannot
    # leave a truncated summary behind. NumPy values (e.g. similarity scores
    # produced by numeric libraries) are converted instead of failing.
    summary_json = json.dumps(summary, indent=2, ensure_ascii=False, default=_json_default)
    with open('dataset_analysis_summary.json', 'w', encoding='utf-8') as f:
        f.write(summary_json)

    # Detailed CSV: one row per processed CV
    if processed_cvs:
        csv_data = [
            {
                'filename': cv['filename'],
                'name': cv['extracted_data']['name'],
                'email': cv['extracted_data']['email'],
                'phone': cv['extracted_data']['phone'],
                'years_exp': cv['extracted_data']['years_exp'],
                'skills_count': len(cv['extracted_data']['skills']),
                'skills': ', '.join(cv['extracted_data']['skills']),
                'text_length': cv['text_length'],
                'file_size_kb': cv['file_size'] / 1024,
            }
            for cv in processed_cvs
        ]
        pd.DataFrame(csv_data).to_csv('dataset_analysis_detailed.csv', index=False, encoding='utf-8')

    # Ranking CSV
    if ranking_results:
        pd.DataFrame(ranking_results).to_csv('dataset_ranking_results.csv', index=False, encoding='utf-8')

    print("\n✅ Results exported:")
    print("  📄 dataset_analysis_summary.json - Summary report")
    if processed_cvs:
        print("  📊 dataset_analysis_detailed.csv - Detailed CV data")
    if ranking_results:
        print("  📈 dataset_ranking_results.csv - Ranking results")

    return summary

# Export the results and print a closing summary
if not processed_cvs:
    print("❌ No data to export")
else:
    export_summary = export_results(processed_cvs, dataset_stats, ranking_results)
    extraction_rates = export_summary['extraction_summary']
    divider = "=" * 60

    # Final summary banner
    print("\n" + divider)
    print("🎯 DATASET ANALYSIS SUMMARY")
    print(divider)
    print(f"📁 Dataset: {dataset_path}")
    print(f"📄 Files processed: {len(processed_cvs)}")
    print("\n   Extraction Success Rates:")
    print(f"  Name: {extraction_rates['name_extraction_rate']:.1f}%")
    print(f"  Email: {extraction_rates['email_extraction_rate']:.1f}%")
    print(f"  Phone: {extraction_rates['phone_extraction_rate']:.1f}%")
    print(f"  Skills: {extraction_rates['skills_extraction_rate']:.1f}%")

    if ranking_results:
        # The ranking is sorted best-first, so the first entry is the top CV
        best_match = ranking_results[0]
        mean_score = np.mean([entry['similarity_score'] for entry in ranking_results])
        print("\n📈 Ranking Results:")
        print(f"  Top CV: {best_match['filename']} (Score: {best_match['similarity_score']:.3f})")
        print(f"  Average similarity: {mean_score:.3f}")

    print("\n✅ Analysis completed successfully!")

## 7. Interactive Analysis

In [None]:
# Interactive analysis - lets the user try the ranking with other job descriptions
def interactive_ranking_test(processed_cvs: List[Dict]):
    """Repeatedly prompt for a job description and print the best matches.

    Each non-empty input is ranked against ``processed_cvs`` and the first
    five results are printed with their score, experience and skill count.
    An empty input is rejected with a hint, and typing ``quit`` (in any
    letter case) ends the session. When there are no CVs a message is
    printed and the function returns immediately.

    Args:
        processed_cvs: Processed CV records to rank.
    """
    if not processed_cvs:
        print("❌ No CVs to test")
        return

    print("\n   Interactive Ranking Test")
    print("Enter a job description to test ranking (or 'quit' to exit):")

    while True:
        query = input("\nJob Description: ").strip()

        if query.lower() == 'quit':
            return

        if query:
            # Rank the CVs against the entered description
            matches = test_ranking_with_job_description(processed_cvs, query)

            # Only the five best matches are listed
            print("\n🏆 Top 5 Matches:")
            for rank, match in enumerate(matches[:5], start=1):
                print(f"  {rank}. {match['name']} - {match['filename']} (Score: {match['similarity_score']:.3f})")
                print(f"     Experience: {match['years_exp']} years, Skills: {len(match['skills'])} skills")
        else:
            print("Please enter a job description")

# Uncomment to run the interactive test
# interactive_ranking_test(processed_cvs)

# Closing message followed by the suggested follow-up steps
print("\n   Dataset evaluation completed!")
print(
    "\n📋 Next steps:",
    "  1. Review the exported files",
    "  2. Analyze the visualizations",
    "  3. Use the ranking results for candidate selection",
    "  4. Run interactive ranking test if needed",
    sep="\n",
)