In [None]:
!pip install spacy

In [None]:
!pip install PyMuPDF

In [None]:
!python -m spacy download en_core_web_sm

In [None]:
!pip install gradio

In [None]:
!pip install scikit-learn

In [None]:
import fitz # PyMuPDF
import spacy
import gradio as gr
import re
# import nltk
# from nltk.corpus import stopwords
from spacy.matcher import PhraseMatcher
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

print("Environment Ready!")

In [None]:
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('punkt_tab')

In [9]:
# Load the English spaCy model once; extract_entities and extract_skills
# both use this module-level `nlp` object.
nlp = spacy.load("en_core_web_sm")

In [10]:
def extract_text_from_pdf(pdf_file):
  """
  Read the text of every page of a PDF and collapse it to single-spaced text.

  Args:
    pdf_file: Either a filesystem path (str or os.PathLike) or an object
      whose ``.name`` attribute holds the path (e.g. an uploaded-file wrapper).

  Returns:
    str: The whitespace-normalised text of all pages. On failure a message
    beginning with "Error" is returned instead; this function reports
    problems through its return value and does not raise.
  """
  import os

  if pdf_file is None:
    return "Error: No file provided"

  # Path-like inputs are used directly; anything else is expected to expose
  # the path via `.name` (a pathlib.Path's own `.name` is only the basename,
  # so it must not go through that branch).
  if isinstance(pdf_file, (str, os.PathLike)):
    pdf_path = os.fspath(pdf_file)
  else:
    pdf_path = getattr(pdf_file, "name", None)
    if not pdf_path:
      return "Error: Uploaded object does not expose a file path"

  try:
    # The context manager closes the document even if a page fails to parse.
    with fitz.open(pdf_path) as doc:
      text = "".join(page.get_text() for page in doc)

    # split()/join collapses newlines, tabs and repeated spaces.
    cleaned_text = " ".join(text.split())

    if not cleaned_text:
      return "Error: No text found, either it's not in pdf form or it's scanned image"

    return cleaned_text

  except Exception as e:
    return f"Error occurred: {str(e)}"

In [11]:
def extract_contact_info(text):
    """
    Find the first email, phone number, LinkedIn and GitHub link in a text.

    Args:
        text: Plain text to scan.

    Returns:
        dict with the keys "Emails", "Phones", "LinkedIn" and "Github". Each
        value is the first match found, or a "... Not Found" placeholder
        string when there is no match.
    """

    email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    emails = re.findall(email_pattern, text)

    # Ten-digit number starting with 6-9. An optional "+91" / "91" / "0"
    # prefix is consumed but not captured, so only the ten digits are
    # returned. The lookarounds keep the pattern from matching a slice of a
    # longer digit run (IDs, account numbers, prefixed numbers).
    phone_pattern = r'(?<!\d)(?:\+?91[\s-]?|0)?([6-9]\d{9})(?!\d)'
    phones = re.findall(phone_pattern, text)

    linkedin = re.findall(r'linkedin\.com/in/[\w.-]+', text)

    github = re.findall(r'github\.com/[\w.-]+', text)

    return {
        "Emails": emails[0] if emails else "Email Not Found",
        "Phones": phones[0] if phones else "Phone Not Found",
        "LinkedIn": linkedin[0] if linkedin else "Link Not Found",
        "Github": github[0] if github else "Link Not Found"
    }

In [12]:
# def clean_resume_text(text):
#     text = text.lower()

#     # Remove URLs, Emails and Phone numbers
#     text = re.sub(r'\S+@\S+','', text)
#     text = re.sub(r'http\S+', '', text)

#     # Remove special characters and numbers (alphanumeric and basic punctuation)
#     text = re.sub(r'[^a-zA-Z\s]', ' ', str(text))

#     # Tokenize and remove stopwords
#     stop_words = set(stopwords.words('english'))
#     words = nltk.word_tokenize(text)
#     filtered_text = [w for w in words if w not in stop_words]

#     return " ".join(filtered_text)

In [13]:
def extract_entities(text):
    """
    Identify person and organisation names in a text using spaCy NER.

    Args:
        text: Plain text to run through the module-level ``nlp`` pipeline.

    Returns:
        dict with:
          - "Candidate Name": text of the first PERSON entity, or
            "Not Identified" when none was found.
          - "All Names Found": unique PERSON entity texts, in order of first
            appearance.
          - "Comapanies/Institutions": unique ORG entity texts, in order of
            first appearance.
    """

    doc = nlp(text)

    names = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
    organizations = [ent.text for ent in doc.ents if ent.label_ == "ORG"]

    primary_name = names[0] if names else "Not Identified"

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # output is stable between runs (a set would reorder it arbitrarily).
    return {
        "Candidate Name": primary_name,
        "All Names Found": list(dict.fromkeys(names)),
        # Key spelling is kept unchanged so existing consumers keep working.
        "Comapanies/Institutions": list(dict.fromkeys(organizations))
    }

In [15]:
def extract_skills(text):
    """
    Find known skills in a text, grouped by category.

    Matching is case-insensitive (the matcher compares lower-cased tokens).

    Args:
        text: Plain text to scan. ``None`` is treated as empty text.

    Returns:
        dict mapping a category name ("Programming", "Machine Learning",
        "Cloud", "Tools") to a list of the matched skills as they were
        written in the text. Only categories with at least one match are
        present. Each skill appears once per category regardless of casing,
        in order of first appearance.
    """
    matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

    # Skill categories
    # NOTE(review): because matching is case-insensitive, "GO" also matches
    # the ordinary English word "go" - consider a stricter rule for it.
    skills_db = {
        "Programming": ["Python", "Java", "C++", "JavaScript", "SQL", "GO", "Rust"],
        "Machine Learning": ["PyTorch", "TensorFlow", "Scikit-learn", "NLP", "Computer Vision"],
        "Cloud": ["AWS", "Azure", "Docker", "Kubernetes", "GCP"],
        "Tools": ["Git", "Jira", "Excel", "Tableau"]
    }

    # Register one pattern list per category; the category name is the match key.
    for category, skill_list in skills_db.items():
        matcher.add(category, [nlp.make_doc(skill) for skill in skill_list])

    # LOWER matching only needs tokens, so tokenize without running the
    # tagger/parser/NER components.
    doc = nlp.make_doc(text or "")

    # category -> {lower-cased skill: first surface form seen}
    found_skills = {}
    for match_id, start, end in matcher(doc):
        category = nlp.vocab.strings[match_id]
        skill = doc[start:end].text
        found_skills.setdefault(category, {}).setdefault(skill.lower(), skill)

    # Plain lists so the result is JSON-serialisable
    return {category: list(by_lower.values()) for category, by_lower in found_skills.items()}

In [16]:
# def segment_resume(text):
#     # Standardized headers
#     sections = [
#         'EDUCATION',
#         'EXPERIENCE',
#         'WORK EXPERIENCE',
#         'PROJECTS',
#         'SKILLS',
#         'CERTIFICATIONS',
#         'SUMMARY'
#     ]

#     header_pattern = r'(?i)\b(?:'+'|'.join(sections) + r')\b'

#     # Find all matches and their positions
#     matches = list(re.finditer(header_pattern, text))

#     segmented_data = {}

#     if not matches:
#         return {"Full text":text}

#     # Iterate through matches to slice the text
#     for i in range(len(matches)):
#         start_idx = matches[i].start()
#         header_name = matches[i].group().upper()

#         # The end of this section is start of the next one
#         if i + 1 < len(matches):
#             end_idx = matches[i+1].start()
#         else:
#             end_idx = len(text)

#         content = text[start_idx:end_idx].replace(header_name, "").strip()
#         segmented_data[header_name] = content

#     return segmented_data

In [17]:
# def safe_resume_parser(pdf_file):
#     try:
#         # Extraction with validation
#         raw_text = extract_text_from_pdf(pdf_file)
#         if "Error" in raw_text:
#             return {"Status":"Failed", "Reason":"Invalid PDF or Image_based PDF"}

#         # Sequential Extraction
#         contacts = extract_contact_info(raw_text)
#         entities = extract_entities(raw_text)
#         segments = segment_resume(raw_text)
#         skills = extract_skills(raw_text)

#         # Handle missing data
#         candidate_name = entities.get("Candidate Name", "Not Identified")
#         if candidate_name == "Not Identified" and contacts["Emails"] != "Not Found":
#             candidate_name = contacts["Emails"].split('@')[0].capitalize()

#         return {
#             "Status": "Success",
#             "Metadata": {
#                 "Filename": pdf_file.name.split('/')[-1],
#                 "Text_Length": len(raw_text)
#             },
#             "Extracted_Data": {
#                 "Name": candidate_name,
#                 "Contact": contacts,
#                 "Sections_Detected": list(segments.keys()),
#                 "Skills": skills
#             }
#         }

#     except Exception as e:
#         return {"Status": "Critical Error", "Details": str(e)}

In [18]:
def calculate_match_score(resume_text, job_description):
    """
    Similarity score between a resume and a job description.

    Both texts are vectorised with TF-IDF (English stop words removed) and
    compared with cosine similarity.

    Args:
        resume_text: Plain text of the resume.
        job_description: Plain text of the job description.

    Returns:
        float: Similarity as a percentage rounded to two decimals. Returns
        0.0 when either text is empty/blank or when no usable terms remain
        after stop-word removal.
    """

    # Nothing to compare: avoid handing empty documents to the vectorizer.
    if not (resume_text and resume_text.strip()
            and job_description and job_description.strip()):
        return 0.0

    text_list = [resume_text, job_description]

    # Initialize Vectorizer
    vectorizer = TfidfVectorizer(stop_words='english')

    # Transform text into a matrix of numbers (vectors)
    try:
        tfidf_matrix = vectorizer.fit_transform(text_list)
    except ValueError:
        # Raised when the vocabulary is empty (e.g. only stop words/punctuation).
        return 0.0

    # Row 0 is the resume, row 1 the job description.
    similarity_matrix = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])

    # Convert to a built-in float so the value serialises cleanly as JSON.
    return round(float(similarity_matrix[0][0]) * 100, 2)

In [19]:
def analyze_skill_gap(resume_skills_dict, jd_text):
    """
    Compare the skills found in a resume with those a job description mentions.

    Args:
        resume_skills_dict: Mapping of category -> list of skills, in the
            shape returned by extract_skills.
        jd_text: Job description text; its skills are extracted here.

    Returns:
        dict with "Matched_Skills" and "Missing_Skills" (lists of lower-cased
        skill names) and "skill_coverage" ("<matched> / <required>", or
        "N/A" when the job description contains no known skills).
    """
    jd_skills_dict = extract_skills(jd_text)

    # Collapse the per-category lists into flat, lower-cased sets.
    resume_skills_set = {
        name.lower()
        for names in resume_skills_dict.values()
        for name in names
    }
    jd_skills_set = {
        name.lower()
        for names in jd_skills_dict.values()
        for name in names
    }

    matched = jd_skills_set & resume_skills_set
    missing = jd_skills_set - resume_skills_set

    if jd_skills_set:
        coverage = f"{len(matched)} / {len(jd_skills_set)}"
    else:
        coverage = "N/A"

    return {
        "Matched_Skills": list(matched),
        "Missing_Skills": list(missing),
        "skill_coverage": coverage,
    }

In [20]:
def final_resume_analyzer(pdf_file, job_description):
    """
    Run the full resume pipeline and score the resume against a job description.

    Args:
        pdf_file: The resume PDF, passed straight to extract_text_from_pdf.
        job_description: Job description text. ``None`` or blank text skips
            the scoring step.

    Returns:
        dict: On success, "Candidate Profile" (name, contact details, skills)
        and "ATS Analysis" (match score, recommendation, missing keywords).
        If the PDF text could not be extracted, returns
        {"Status": "Failed", "Reason": <error message>} instead.
    """
    # Pipeline Execution
    raw_text = extract_text_from_pdf(pdf_file)

    # extract_text_from_pdf reports failures as "Error..." strings; stop here
    # rather than analysing the error message as if it were the resume.
    if raw_text.startswith(("Error:", "Error occurred:")):
        return {"Status": "Failed", "Reason": raw_text}

    contacts = extract_contact_info(raw_text)
    entities = extract_entities(raw_text)
    resume_skills = extract_skills(raw_text)

    job_description = (job_description or "").strip()

    if job_description:
        # Logic layer
        score = calculate_match_score(raw_text, job_description)
        gap_analysis = analyze_skill_gap(resume_skills, job_description)

        # Decision logic: >70 shortlist, >40 manual review, otherwise reject
        status = "Shortlist" if score > 70 else "Review" if score > 40 else "Reject"

        ats_analysis = {
            "Match Score": f"{score}%",
            "Recommendation": status,
            "Missing Keywords": gap_analysis["Missing_Skills"]
        }
    else:
        # Without a job description there is nothing to score against.
        ats_analysis = {
            "Match Score": "N/A",
            "Recommendation": "No job description provided",
            "Missing Keywords": []
        }

    return {
        "Candidate Profile": {
            "Name": entities["Candidate Name"],
            "Contact" : contacts,
            "Top Skills": resume_skills
        },
        "ATS Analysis": ats_analysis
    }

In [21]:
# Gradio UI: a resume upload plus a job-description box in, one JSON panel out
interface = gr.Interface(
    fn=final_resume_analyzer,
    inputs=[
        gr.File(label="Upload Resume"),
        gr.Textbox(label="Job Description", lines=8),
    ],
    outputs=[gr.JSON(label="Analysis")],
    title="AI Resume Parser",
)

In [None]:
# Start the Gradio app defined above
interface.launch(debug = True)

### Sample Job Description

**Job Title:** Senior Machine Learning Engineer

**Company:** Tech Innovations Inc.

**Location:** Remote

**About Us:**
Tech Innovations Inc. is a leading technology company at the forefront of developing cutting-edge AI solutions. We are looking for a highly skilled and motivated Senior Machine Learning Engineer to join our dynamic team and contribute to the design, development, and deployment of advanced machine learning models.

**Responsibilities:**
*   Design, develop, and deploy scalable machine learning models into production.
*   Collaborate with data scientists and product managers to understand business requirements and translate them into technical specifications.
*   Optimize existing machine learning algorithms for performance and efficiency.
*   Implement and maintain robust MLOps practices, including CI/CD pipelines, model monitoring, and version control.
*   Conduct rigorous testing and validation of models to ensure accuracy and reliability.
*   Stay up-to-date with the latest advancements in machine learning and artificial intelligence.
*   Mentor junior engineers and contribute to a culture of continuous learning.

**Required Qualifications:**
*   Master's or Ph.D. in Computer Science, Machine Learning, Statistics, or a related field.
*   5+ years of experience in machine learning engineering.
*   Strong proficiency in Python and experience with libraries such as TensorFlow, PyTorch, or scikit-learn.
*   Extensive experience with cloud platforms (AWS, GCP, Azure) and containerization technologies (Docker, Kubernetes).
*   Solid understanding of data structures, algorithms, and software design principles.
*   Experience with big data technologies (e.g., Spark, Hadoop).
*   Excellent communication and teamwork skills.

**Preferred Qualifications:**
*   Experience with Natural Language Processing (NLP) or Computer Vision (CV).
*   Familiarity with distributed machine learning frameworks.
*   Publications in top-tier conferences or journals.



```markdown
# YOUR NAME

[Email: your.email@example.com](mailto:your.email@example.com) | [Phone: (123) 456-7890](tel:1234567890) | [LinkedIn: linkedin.com/in/yourprofile](https://www.linkedin.com/in/yourprofile) | [GitHub: github.com/yourprofile](https://github.com/yourprofile)

---

## Summary

A highly motivated and results-oriented professional with X years of experience in [Your Field]. Seeking to leverage expertise in [Key Skill 1], [Key Skill 2], and [Key Skill 3] to contribute to [Company Name]'s success as a [Job Title].

---

## Skills

**Programming Languages:** Python, Java, C++, JavaScript, SQL

**Machine Learning:** PyTorch, TensorFlow, Scikit-learn, NLP, Computer Vision, Data Analysis, Predictive Modeling

**Cloud Platforms:** AWS, Azure, GCP, Docker, Kubernetes

**Tools & Technologies:** Git, Jira, Tableau, Excel, Linux

**Other:** Problem Solving, Communication, Teamwork, Project Management

---

## Experience

### Senior Machine Learning Engineer | Tech Solutions Inc. | City, State

**Month, Year – Present**

*   Designed and implemented scalable machine learning models for [Specific Project/Product].
*   Optimized existing algorithms, resulting in a 20% improvement in model performance and a 15% reduction in computational cost.
*   Collaborated with cross-functional teams to define project requirements and deploy solutions.
*   Developed and maintained CI/CD pipelines for automated model deployment and monitoring.

### Machine Learning Engineer | Data Insights Co. | City, State

**Month, Year – Month, Year**

*   Built and evaluated machine learning models for [Specific Application].
*   Conducted extensive data preprocessing and feature engineering.
*   Contributed to the research and development of new AI technologies.

---

## Education

### Master of Science in Computer Science | University Name | City, State

**Month, Year – Month, Year**

*   Specialization: Artificial Intelligence, Machine Learning
*   Relevant Coursework: Advanced Machine Learning, Deep Learning, Natural Language Processing

### Bachelor of Science in Electrical Engineering | University Name | City, State

**Month, Year – Month, Year**

---

## Projects

### Project Title 1

*   Developed a [brief description of project] using [technologies used].
*   Achieved [quantifiable result or impact].
*   [Link to project/GitHub]

### Project Title 2

*   [Description]

---

## Certifications

*   AWS Certified Machine Learning – Specialty
*   Google Cloud Professional Machine Learning Engineer

```

Use the resume template above to create a resume that covers the skills and experience the sample job description asks for (Python, TensorFlow, PyTorch, AWS, GCP, Azure, Docker, Kubernetes, NLP, Computer Vision, etc.). Export it as a PDF, then upload that PDF and paste the sample job description into the Gradio interface to test the resume parser and analyzer.