In [None]:
!pip install spacy

In [None]:
!pip install PyMuPDF

In [None]:
!python -m spacy download en_core_web_sm

In [None]:
!pip install gradio

In [None]:
!pip install scikit-learn

In [6]:
import fitz # PyMuPDF
import spacy
import gradio as gr
import re
# import nltk
# from nltk.corpus import stopwords
from spacy.matcher import PhraseMatcher
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

print("Environment Ready!")

Environment Ready!


In [None]:
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('punkt_tab')

In [7]:
# Load the English spaCy model; this module-level pipeline is used by extract_entities
nlp = spacy.load("en_core_web_sm")

In [8]:
def extract_text_from_pdf(pdf_file):
  """
  Read all text from a PDF and return it as one whitespace-normalised string.

  Args:
    pdf_file: Either an upload object that exposes the file path through a
      ``name`` attribute, or a plain filesystem path.

  Returns:
    The text of every page concatenated, with each run of whitespace collapsed
    to a single space. On any failure a message starting with "Error" is
    returned instead of raising; callers in this file look for that word to
    detect a failed extraction.
  """

  # No upload at all: report it explicitly rather than via an AttributeError.
  if pdf_file is None:
    return "Error: No file provided"

  try:
    # Upload wrappers carry the path in ``.name``; plain paths are used as-is.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    doc = fitz.open(pdf_path)
    try:
      text = "".join(page.get_text() for page in doc)
    finally:
      # Always release the file handle, even when a page fails to parse.
      doc.close()

    cleaned_text = " ".join(text.split())

    if not cleaned_text:
      return "Error: No text found, either it's not in pdf form or it's scanned image"

    return cleaned_text

  except Exception as e:
    return f"Error occurred: {str(e)}"

In [9]:
def extract_contact_info(text):
    """
    Find the first e-mail address, phone number, LinkedIn URL and GitHub URL in text.

    Args:
        text: Plain text to search.

    Returns:
        A dict with the keys "Emails", "Phones", "LinkedIn" and "Github".
        Each value is the first match found, or a "... Not Found" placeholder
        string when the pattern does not occur.
    """

    email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    emails = re.findall(email_pattern, text)

    # A 10-digit number starting with 6-9, optionally preceded by +91 / 91 / 0.
    # The lookarounds keep the match from starting or ending in the middle of a
    # longer digit run, and the optional prefix keeps "+919876543210" from
    # being reported as "9198765432". Only the 10-digit part is captured.
    phone_pattern = r'(?<!\d)(?:\+?91[\s-]?|0)?([6-9]\d{9})(?!\d)'
    phones = re.findall(phone_pattern, text)

    linkedin = re.findall(r'linkedin\.com/in/[\w.-]+', text)

    github = re.findall(r'github\.com/[\w.-]+', text)

    return {
        "Emails": emails[0] if emails else "Email Not Found",
        "Phones": phones[0] if phones else "Phone Not Found",
        "LinkedIn": linkedin[0] if linkedin else "Link Not Found",
        "Github": github[0] if github else "Link Not Found"
    }

In [10]:
# def clean_resume_text(text):
#     text = text.lower()

#     # Remove URLs, Emails and Phone numbers
#     text = re.sub(r'\S+@\S+','', text)
#     text = re.sub(r'http\S+', '', text)

#     # Remove special characters and numbers (alphanumeric and basic punctuation)
#     text = re.sub(r'[^a-zA-Z\s]', ' ', str(text))

#     # Tokenize and remove stopwords
#     stop_words = set(stopwords.words('english'))
#     words = nltk.word_tokenize(text)
#     filtered_text = [w for w in words if w not in stop_words]

#     return " ".join(filtered_text)

In [11]:
def extract_entities(text):
    """
    Identify person names and organizations in text using the spaCy pipeline.

    Args:
        text: Plain text to run through the module-level ``nlp`` pipeline.

    Returns:
        A dict with:
          - "Candidate Name": the first PERSON entity found, or "Not Identified".
          - "All Names Found": unique PERSON entities in order of first appearance.
          - "Comapanies/Institutions": unique ORG entities in order of first
            appearance.
    """

    doc = nlp(text)

    names = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
    organizations = [ent.text for ent in doc.ents if ent.label_ == "ORG"]

    primary_name = names[0] if names else "Not Identified"

    # dict.fromkeys de-duplicates while keeping first-seen order, so the output
    # is the same on every run (a set would yield an arbitrary order).
    return {
        "Candidate Name": primary_name,
        "All Names Found": list(dict.fromkeys(names)),
        # Key spelling is kept as-is because it is part of the returned structure.
        "Comapanies/Institutions": list(dict.fromkeys(organizations))
    }

In [12]:
def extract_skills(text):
    """
    Find known skills in text and group them by category.

    Matching is case-insensitive (the PhraseMatcher compares the LOWER token
    attribute). Each hit is reported under the spelling used in the skill
    list below, so "python" and "PYTHON" in the text both yield "Python".

    Args:
        text: Plain text to search.

    Returns:
        A dict mapping each category that had at least one hit to a sorted
        list of the skills found in that category.
    """

    # Skill categories
    skills_db = {
        "Programming": ["Python", "Java", "C++", "JavaScript", "SQL", "GO", "Rust"],
        "Machine Learning": ["PyTorch", "TensorFlow", "Scikit-learn", "NLP", "Computer Vision"],
        "Cloud": ["AWS", "Azure", "Docker", "Kubernetes", "GCP"],
        "Tools": ["Git", "Jira", "Excel", "Tableau"]
    }

    # Use the module-level ``nlp`` pipeline instead of reloading the model from
    # disk on every call.
    matcher = PhraseMatcher(nlp.vocab, attr = "LOWER")

    # Add patterns to the matcher and remember, for each pattern's lower-cased
    # token sequence, the spelling used in skills_db.
    canonical_names = {}
    for category, skill_list in skills_db.items():
        patterns = [nlp.make_doc(skill) for skill in skill_list]
        matcher.add(category, patterns)
        for skill, pattern in zip(skill_list, patterns):
            canonical_names[tuple(token.lower_ for token in pattern)] = skill

    doc = nlp(text)

    # Collect the matched skills per category; sets drop repeated hits.
    found_skills = {}
    for match_id, start, end in matcher(doc):
        category = nlp.vocab.strings[match_id]
        span = doc[start:end]
        lookup_key = tuple(token.lower_ for token in span)
        skill = canonical_names.get(lookup_key, span.text)
        found_skills.setdefault(category, set()).add(skill)

    # Sets are not JSON-serialisable; sorted lists also give a stable order.
    return {k: sorted(v) for k, v in found_skills.items()}

In [13]:
def segment_resume(text):
    """
    Split resume text into sections keyed by the section header found.

    Headers are matched case-insensitively as whole words. Each section runs
    from the end of its header to the start of the next header (or the end of
    the text). Text before the first header is not included in any section.

    Args:
        text: Plain resume text.

    Returns:
        A dict mapping the upper-cased header to the stripped section content.
        When a header occurs more than once, the contents are joined with a
        space instead of the later one replacing the earlier one. When no
        header is found at all, ``{"Full text": text}`` is returned.
    """
    # Standardized headers
    sections = [
        'EDUCATION',
        'EXPERIENCE',
        'WORK EXPERIENCE',
        'PROJECTS',
        'SKILLS',
        'CERTIFICATIONS',
        'SUMMARY'
    ]

    header_pattern = r'(?i)\b(?:'+'|'.join(sections) + r')\b'

    # Find all matches and their positions
    matches = list(re.finditer(header_pattern, text))

    if not matches:
        return {"Full text":text}

    # Each section ends where the next header starts; the last one ends at EOF.
    section_ends = [m.start() for m in matches[1:]] + [len(text)]

    segmented_data = {}
    for match, end_idx in zip(matches, section_ends):
        header_name = match.group().upper()

        # Slice from the END of the header so it is excluded whatever its
        # casing, and so the body text itself is never altered.
        content = text[match.end():end_idx].strip()

        if header_name in segmented_data:
            # Repeated header: keep the earlier content and append the new one.
            content = f"{segmented_data[header_name]} {content}".strip()
        segmented_data[header_name] = content

    return segmented_data

In [14]:
def safe_resume_parser(pdf_file):
    """
    Run the full resume-parsing pipeline and always return a status dict.

    Args:
        pdf_file: The uploaded PDF, passed straight to extract_text_from_pdf.

    Returns:
        On success: {"Status": "Success", "Metadata": {...}, "Extracted_Data": {...}}.
        When text extraction fails: {"Status": "Failed", "Reason": ..., "Details": ...},
        where "Details" carries the extractor's own message.
        On any unexpected exception: {"Status": "Critical Error", "Details": ...}.
    """
    import os  # local import: only needed for the portable filename split below

    try:
        # Extraction with validation
        raw_text = extract_text_from_pdf(pdf_file)
        # The extractor's failure messages start with "Error:" / "Error occurred:".
        # Checking the prefix (not "Error" anywhere in the text) keeps a resume
        # that merely mentions the word from being rejected.
        if raw_text.startswith(("Error:", "Error occurred:")):
            return {
                "Status":"Failed",
                "Reason":"Invalid PDF or Image_based PDF",
                "Details": raw_text
            }

        # Sequential Extraction
        contacts = extract_contact_info(raw_text)
        entities = extract_entities(raw_text)
        segments = segment_resume(raw_text)
        skills = extract_skills(raw_text)

        # Handle missing data: fall back to the e-mail's local part as the name.
        # Testing for "@" works regardless of the wording of the "not found"
        # placeholder returned by extract_contact_info.
        candidate_name = entities.get("Candidate Name", "Not Identified")
        email = contacts.get("Emails", "")
        if candidate_name == "Not Identified" and "@" in email:
            candidate_name = email.split('@')[0].capitalize()

        # Upload objects expose the path via ``.name``; plain paths are used as-is.
        file_path = str(getattr(pdf_file, "name", pdf_file))

        return {
            "Status": "Success",
            "Metadata": {
                "Filename": os.path.basename(file_path),
                "Text_Length": len(raw_text)
            },
            "Extracted_Data": {
                "Name": candidate_name,
                "Contact": contacts,
                "Sections_Detected": list(segments.keys()),
                "Skills": skills
            }
        }

    except Exception as e:
        return {"Status": "Critical Error", "Details": str(e)}

In [15]:
def calculate_match_score(resume_text, job_description):
    """
    Similarity score between a resume and a JD.

    Both texts are turned into TF-IDF vectors (English stop words removed) and
    compared with cosine similarity.

    Args:
        resume_text: Plain text of the resume.
        job_description: Plain text of the job description.

    Returns:
        The similarity as a percentage, a plain float rounded to two decimals.
        0.0 is returned when either text is missing or blank, or when the two
        texts contain no usable terms.
    """

    # Nothing to compare: a missing or blank text cannot match anything.
    if not (resume_text and resume_text.strip()
            and job_description and job_description.strip()):
        return 0.0

    text_list = [resume_text, job_description]

    # Initialize Vectorizer
    vectorizer = TfidfVectorizer(stop_words='english')

    # Transform text into a matrix of numbers (vectors)
    try:
        tfidf_matrix = vectorizer.fit_transform(text_list)
    except ValueError:
        # fit_transform raises ValueError when no terms are left to build a
        # vocabulary from (e.g. both texts consist only of stop words).
        return 0.0

    # Calculate similarity between row 0 (resume) and row 1 (job description)
    similarity_matrix = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])

    # Score as a percentage; float() converts the array scalar to a built-in float
    match_score = round(float(similarity_matrix[0][0]) * 100, 2)

    return match_score

In [16]:
def resume_parser(pdf_file, job_desc):
    """
    Parse an uploaded resume and score it against a job description.

    Args:
        pdf_file: The uploaded PDF, forwarded to safe_resume_parser.
        job_desc: Job description text to compare the resume with.

    Returns:
        A pair ``(details, score)``. When parsing succeeds, ``details`` is the
        "Extracted_Data" dict and ``score`` is a string such as "12.5 % Match".
        Otherwise ``details`` is the whole status dict and ``score`` is 0.
    """
    outcome = safe_resume_parser(pdf_file)

    if outcome["Status"] == "Success":
        # Re-read the resume text and compare it with the job description
        resume_text = extract_text_from_pdf(pdf_file)
        match_pct = calculate_match_score(resume_text, job_desc)
        return outcome["Extracted_Data"], f"{match_pct} % Match"

    # Parsing failed: hand back the status dict with a zero score
    return outcome, 0

In [17]:
# Gradio UI: resume file + job description go in, parsed JSON + score label come out
interface = gr.Interface(
    title="AI Resume Parser",
    fn=resume_parser,
    inputs=[
        gr.File(label="Upload Resume (PDF only)"),
        gr.Textbox(label="Paste Job Description (JD) here", lines=10),
    ],
    outputs=[
        gr.JSON(label="Parsed Details"),
        gr.Label(label="ATS Match Score"),
    ],
)

In [None]:
# Start the Gradio app with the debug flag enabled
interface.launch(debug = True)