In [2]:
import os
import docx
import PyPDF2

# 🔹 Path to resumes folder
FOLDER_PATH = r"C:\Users\SHREEL\OneDrive\Documents\Shreel\New_Resume"

# 🔹 Keywords to match (all lowercase for case-insensitive search)
keywords = [
    
  "Data collection",
    "Data cleaning",
    "Data organization",
    "Data analysis",
    "Trend identification",
    "Pattern recognition",
    "Statistical analysis",
    "Analytical tools",
    "Data visualization",
    "Dashboards",
    "Reports",
    "Tableau",
    "Power BI",
    "Microsoft Excel",
    "Data-driven decision making",
    "Stakeholder collaboration",
    "Data validation",
    "Data quality checks",
    "Industry trends",
    "Best practices",
    "Data tools",
    
    # Required Qualifications
    "Master’s degree",
    "Analytical thinking",
    "Critical thinking",
    "Excel proficiency",
    "Tableau proficiency",
    "Power BI proficiency",
    "Communication skills",
    "Attention to detail",
    "Time management",
    "US work authorization",
    
    # Preferred Qualifications
    "Internship experience",
    "Project experience",
    "SQL",
    "Database querying",
    "Database management",
    "Statistical concepts",
    "Python",
    "R",
]
keywords=[keyword.lower() for keyword in keywords]

def read_docx(file_path):
    doc = docx.Document(file_path)
    return "\n".join([para.text for para in doc.paragraphs])

def read_pdf(file_path):
    text = ""
    with open(file_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        for page in reader.pages:
            text += page.extract_text() or ""
    return text

def read_txt(file_path):
    with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
        return f.read()

def search_folder(folder):
    results = {}
    for file in os.listdir(folder):
        path = os.path.join(folder, file)
        if not os.path.isfile(path):
            continue
        text = ""
        try:
            if file.endswith(".docx"):
                text = read_docx(path)
            elif file.endswith(".pdf"):
                text = read_pdf(path)
            elif file.endswith(".txt"):
                text = read_txt(path)
        except Exception as e:
            print(f"❌ Error reading {file}: {e}")
            continue

        text_lower = text.lower()
        found = [kw for kw in keywords if kw in text_lower]
        results[file] = {
            "matches": len(found),
            "keywords": found
        }
    return results

if __name__ == "__main__":
    matches = search_folder(FOLDER_PATH)
    # Sort resumes by number of matched keywords (descending)
    ranked = sorted(matches.items(), key=lambda x: x[1]["matches"], reverse=True)

    print("\n🔝 Top 5 Resumes by Keyword Matches:\n")
    for i, (file, data) in enumerate(ranked[:10], start=1):
        print(f"{i}. {file} → {data['matches']} matches")
        print(f"   Keywords: {data['keywords']}\n")



🔝 Top 5 Resumes by Keyword Matches:

1. Shreel_resume_Quant_VOYA.docx → 8 matches
   Keywords: ['statistical analysis', 'data visualization', 'dashboards', 'reports', 'power bi', 'sql', 'python', 'r']

2. Shreel_resume_Broadbridge.docx → 7 matches
   Keywords: ['dashboards', 'tableau', 'stakeholder collaboration', 'best practices', 'sql', 'python', 'r']

3. Shreel_resume_JPMC_JQ.docx → 7 matches
   Keywords: ['dashboards', 'tableau', 'stakeholder collaboration', 'best practices', 'sql', 'python', 'r']

4. Shreel_resume_JPMC_JQ.pdf → 7 matches
   Keywords: ['dashboards', 'tableau', 'stakeholder collaboration', 'best practices', 'sql', 'python', 'r']

5. Shreel Patel_Jr. AI Developer_20250908.pdf → 6 matches
   Keywords: ['data analysis', 'power bi', 'data-driven decision making', 'sql', 'python', 'r']

6. Shreel_resume_IA.docx → 6 matches
   Keywords: ['dashboards', 'reports', 'power bi', 'sql', 'python', 'r']

7. Shreel_resume_JPMG.docx → 6 matches
   Keywords: ['statistical analysis'