In [2]:
import os
import spacy
from spacy.matcher import PhraseMatcher
from pdf2image import convert_from_path
import pytesseract
from docx import Document

# Machine-specific locations of the external OCR tooling (Windows paths).
# Adjust both to match the local Tesseract and Poppler installations.
tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
# Passed to pdf2image's convert_from_path as its poppler_path argument.
poppler_path = r"C:\Release-24.08.0-0\poppler-24.08.0\Library\bin"
# Point pytesseract at the Tesseract executable configured above.
pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

# Load the spaCy pipeline once at import time; every function below shares it
# for tokenization, phrase matching and NER.
nlp = spacy.load("en_core_web_sm")  # Replace with your fine-tuned model if available

def extract_text_from_pdf(pdf_path):
    """Return the OCR text of every page in a PDF.

    The PDF is converted to one image per page with ``convert_from_path``
    (using the module-level ``poppler_path``), and each image is passed to
    ``pytesseract.image_to_string``. Progress is printed for each step.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The per-page OCR strings concatenated in page order, with no extra
        separator inserted between pages. An empty string if the PDF yields
        no page images.
    """
    print("[INFO] Converting PDF pages to images...")
    images = convert_from_path(pdf_path, poppler_path=poppler_path)
    # Collect the page strings and join once instead of growing a string
    # with += on every page.
    page_texts = []
    for page_number, img in enumerate(images, start=1):
        print(f"[INFO] Running OCR on page {page_number}")
        page_texts.append(pytesseract.image_to_string(img))
    return "".join(page_texts)

def extract_text_from_docx(docx_path):
    """Return the text of a DOCX file, one paragraph per line.

    Args:
        docx_path: Path of the ``.docx`` file to open with ``Document``.

    Returns:
        The ``text`` of each entry in the document's ``paragraphs``,
        joined with newline characters.
    """
    print("[INFO] Reading DOCX file...")
    paragraphs = Document(docx_path).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)

def load_keywords_from_folder(folder_path):
    """Collect the unique, lower-cased keyword lines from a folder of lists.

    Every regular file in ``folder_path`` whose name ends in ``.txt`` or
    ``.csv`` (compared case-insensitively) is read line by line. Each
    non-blank line is stripped, lower-cased and treated as one keyword.
    Lines are deliberately not split on commas so that multi-word phrases
    stay intact.

    Args:
        folder_path: Directory containing the keyword files.

    Returns:
        A sorted list of the distinct keywords found; empty if the folder
        holds no matching files.

    Raises:
        OSError: If the folder cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a keyword file is not valid UTF-8.
    """
    keywords = set()
    for filename in os.listdir(folder_path):
        if not filename.lower().endswith((".txt", ".csv")):
            continue
        file_path = os.path.join(folder_path, filename)
        # A sub-directory may carry a matching suffix; open() would fail on it.
        if not os.path.isfile(file_path):
            continue
        # "utf-8-sig" drops a leading byte-order mark (common in files saved
        # by Windows editors); plain "utf-8" would glue it onto the first
        # keyword so that it never matches. BOM-less files decode the same.
        with open(file_path, encoding="utf-8-sig") as f:
            for line in f:
                kw_clean = line.strip().lower()
                if kw_clean:
                    keywords.add(kw_clean)
    # Sorted so the result is deterministic across runs.
    return sorted(keywords)

def create_phrase_matcher(keywords, label):
    """Build a ``PhraseMatcher`` that matches ``keywords`` on the LOWER attribute.

    Args:
        keywords: Iterable of phrase strings to register as patterns.
        label: Match key under which all the patterns are added.

    Returns:
        A ``PhraseMatcher`` bound to the shared ``nlp`` vocabulary with one
        pattern per keyword.
    """
    phrase_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
    # Patterns only need tokenization, so make_doc is used rather than
    # running the full pipeline on each keyword.
    phrase_docs = []
    for phrase in keywords:
        phrase_docs.append(nlp.make_doc(phrase))
    phrase_matcher.add(label, phrase_docs)
    return phrase_matcher

def extract_keywords_with_spacy(text, skills_keywords, titles_keywords):
    """Find skills and job titles in ``text`` via phrase matching and NER.

    Two sources feed the result:

    * Phrase matches of ``skills_keywords`` and ``titles_keywords`` against
      the parsed text; the matched span text (original casing) is kept.
    * Named entities whose lower-cased label is one of ``skill``,
      ``job_title``, ``work_of_art``, ``org`` or ``product``. Such an entity
      counts as a title when its text contains "developer" or "engineer"
      (case-insensitively) and as a skill otherwise.

    Args:
        text: Raw resume text to analyze.
        skills_keywords: Phrases to match as skills.
        titles_keywords: Phrases to match as job titles.

    Returns:
        A ``(skills, titles)`` tuple of lists holding the distinct strings
        found, in no particular order.
    """
    parsed = nlp(text)
    skill_matcher = create_phrase_matcher(skills_keywords, "SKILL")
    title_matcher = create_phrase_matcher(titles_keywords, "TITLE")

    # Phrase-matcher hits: keep the surface text of each matched span.
    skills_found = {
        parsed[begin:stop].text for _, begin, stop in skill_matcher(parsed)
    }
    titles_found = {
        parsed[begin:stop].text for _, begin, stop in title_matcher(parsed)
    }

    # NER hits: only a fixed set of entity labels is considered.
    accepted_labels = {"skill", "job_title", "work_of_art", "org", "product"}
    title_markers = ("developer", "engineer")
    for entity in parsed.ents:
        if entity.label_.lower() not in accepted_labels:
            continue
        lowered = entity.text.lower()
        if any(marker in lowered for marker in title_markers):
            titles_found.add(entity.text)
        else:
            skills_found.add(entity.text)

    return list(skills_found), list(titles_found)

def process_resume(file_path, skills_folder, titles_folder):
    """Extract text from a resume and report the skills and titles it mentions.

    The file is read with ``extract_text_from_pdf`` or
    ``extract_text_from_docx`` depending on its extension, the keyword lists
    are loaded from the two folders, and the matches are printed together
    with the first 500 characters of the extracted text.

    Args:
        file_path: Path of the resume; must end in ``.pdf`` or ``.docx``
            (any letter case).
        skills_folder: Folder of skill keyword files.
        titles_folder: Folder of job-title keyword files.

    Returns:
        A dict with the keys ``"text"`` (full extracted text), ``"skills"``
        and ``"titles"`` (lists of matched strings).

    Raises:
        ValueError: If the file extension is neither ``.pdf`` nor ``.docx``.
    """
    # Compare the extension case-insensitively so "resume.PDF" or
    # "resume.Docx" is not rejected as unsupported.
    lowered_path = file_path.lower()
    if lowered_path.endswith(".pdf"):
        text = extract_text_from_pdf(file_path)
    elif lowered_path.endswith(".docx"):
        text = extract_text_from_docx(file_path)
    else:
        raise ValueError("Unsupported file type. Please use PDF or DOCX.")

    print("[INFO] Loading skills and job titles...")
    skills = load_keywords_from_folder(skills_folder)
    titles = load_keywords_from_folder(titles_folder)

    print("[INFO] Extracting skills and titles using spaCy...")
    found_skills, found_titles = extract_keywords_with_spacy(text, skills, titles)

    print("\n===== RESUME PARSE RESULT =====")
    print("Top 500 Characters of Extracted Text:\n", text[:500], "\n")
    print("Skills Found:", found_skills)
    print("Job Titles Found:", found_titles)

    return {
        "text": text,
        "skills": found_skills,
        "titles": found_titles,
    }

# Script entry point: parse one DOCX resume and one PDF resume against the
# same skill/title keyword folders. All paths are hard-coded for the author's
# machine and must be edited before running elsewhere.
if __name__ == "__main__":
    file_path = r"C:\Users\velam\OneDrive\Documents\pavan's resume.docx"
    skills_folder = r"C:\amrita_uni\Projects\asmacs internship\recognition of skills and title\skills"
    titles_folder = r"C:\amrita_uni\Projects\asmacs internship\recognition of skills and title\titles"
    file_path2 = r"C:\amrita_uni\Projects\asmacs internship\recognition of skills and title\Imgdownload2.pdf"
    
    # Each call prints its own report and returns a dict with the keys
    # "text", "skills" and "titles".
    result1 = process_resume(file_path, skills_folder, titles_folder)
    result2 = process_resume(file_path2, skills_folder, titles_folder)


[INFO] Reading DOCX file...
[INFO] Loading skills and job titles...
[INFO] Extracting skills and titles using spaCy...

===== RESUME PARSE RESULT =====
Top 500 Characters of Extracted Text:
 Velamala Pavan Krishna

SUMMARY
Passionate computer science and AI student with a strong foundation in machine learning, data analysis, and model building . Eager to explore emerging technologies and apply creative solutions to real-world problems . Committed to continuous learning, collaboration, and making a meaningful impact through AI.

Phone:
+91 7569637875
Email:
velamalapavankrishna@gmail.com
Address: Visakhapatnam , 
Andhra Pradesh , India
    530046

Profile links

Linkedin: linkedin.co 

Skills Found: ['Computer Science', 'CS', 'Using Python to Interact', 'Developers', 'javascript', 'Collaboration', 'AI Club', 'assembler', 'Python', 'computer science', 'Adaptive learning', 'MySQL', 'building', 'Email', 'HealthCare', 'ML', 'Mental health', 'AI Fundamentals', 'data analysis', 'CERTIFICATION