<a href="https://colab.research.google.com/github/Stdioch1/Machine-Learning/blob/main/ATS%20Analyzer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
!pip install spacy
!python -m spacy download en_core_web_md
from IPython import get_ipython
from IPython.display import display
# %%
!pip install python-docx # Install the python-docx package to read DOCX files.
import re
import os
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
!pip install PyPDF2
import docx
import json
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import spacy
!pip install textract
!pip install requests
import requests # Import the requests library to make HTTP requests
try:
    from IPython.display import display, FileUpload # Import FileUpload for resume upload
except ImportError:
    # If still not found, try importing from ipywidgets
    from ipywidgets import FileUpload
    print("FileUpload imported from ipywidgets.")

class ATSAnalyzer:
    def __init__(self):
        # Initialize NLP tools
        nltk.download('punkt')
        nltk.download('stopwords')
        self.nlp = spacy.load('en_core_web_md')

        # Define ATS systems and their specific algorithms/configurations
        self.ats_systems = {
            'workday': {
                'parser_type': 'keyword_focused',
                'section_importance': {'skills': 0.3, 'experience': 0.5, 'education': 0.2},
                'format_preference': 'chronological',
                'keyword_match_weight': 0.7
            },
            'icims': {
                'parser_type': 'context_aware',
                'section_importance': {'skills': 0.25, 'experience': 0.55, 'education': 0.2},
                'format_preference': 'chronological',
                'keyword_match_weight': 0.6
            },
            'bamboohr': {
                'parser_type': 'semantic',
                'section_importance': {'skills': 0.35, 'experience': 0.45, 'education': 0.2},
                'format_preference': 'flexible',
                'keyword_match_weight': 0.5
            },
            'greenhouse': {
                'parser_type': 'hybrid',
                'section_importance': {'skills': 0.4, 'experience': 0.4, 'education': 0.2},
                'format_preference': 'flexible',
                'keyword_match_weight': 0.55
            },
            'taleo': {
                'parser_type': 'keyword_focused',
                'section_importance': {'skills': 0.3, 'experience': 0.5, 'education': 0.2},
                'format_preference': 'strict_chronological',
                'keyword_match_weight': 0.75
            },
            'lever': {
                'parser_type': 'semantic',
                'section_importance': {'skills': 0.4, 'experience': 0.4, 'education': 0.2},
                'format_preference': 'flexible',
                'keyword_match_weight': 0.5
            },
            'successfactors': {
                'parser_type': 'hybrid',
                'section_importance': {'skills': 0.3, 'experience': 0.5, 'education': 0.2},
                'format_preference': 'chronological',
                'keyword_match_weight': 0.65
            }
        }

    def parse_resume(self, file_path):
        """Extract text from resume file (PDF, DOCX, or TXT)"""
        _, file_extension = os.path.splitext(file_path)

        if file_extension.lower() == '.pdf':
            text = self._parse_pdf(file_path)
        elif file_extension.lower() == '.docx':
            text = self._parse_docx(file_path)
        elif file_extension.lower() == '.txt':
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
        else:
            # Attempt to use textract for other file types
            try:
                text = textract.process(file_path).decode('utf-8')
            except:
                raise ValueError(f"Unsupported file format: {file_extension}")

        return text

    def _parse_pdf(self, file_path):
        """Extract text from PDF file"""
        text = ""
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                text += page.extract_text() + "\n"
        return text

    def _parse_docx(self, file_path):
        """Extract text from DOCX file"""
        doc = docx.Document(file_path)
        text = ""
        for para in doc.paragraphs:
            text += para.text + "\n"
        return text

    def parse_job_description(self, jd_text):
        """Parse and extract key information from job description"""
        doc = self.nlp(jd_text)

        # Extract requirements, skills, and qualifications
        skills = []
        requirements = []
        qualifications = []

        # Simple keyword-based extraction (in a real implementation,
        # this would use more sophisticated NLP techniques)
        skill_indicators = ["proficient in", "experience with", "knowledge of", "skills", "ability to"]
        req_indicators = ["requirements", "required", "must have", "essential"]
        qual_indicators = ["qualifications", "education", "degree", "certification"]

        for sentence in doc.sents:
            sent_text = sentence.text.lower()

            # Check for skills
            if any(indicator in sent_text for indicator in skill_indicators):
                skills.append(sentence.text)

            # Check for requirements
            if any(indicator in sent_text for indicator in req_indicators):
                requirements.append(sentence.text)

            # Check for qualifications
            if any(indicator in sent_text for indicator in qual_indicators):
                qualifications.append(sentence.text)

        # Extract important keywords using NLP
        keywords = []
        for token in doc:
            if (token.is_alpha and not token.is_stop and
                (token.pos_ in ["NOUN", "PROPN", "ADJ"] or token.ent_type_)):
                keywords.append(token.text.lower())

        # Count frequency of keywords
        keyword_freq = {}
        for word in keywords:
            if word in keyword_freq:
                keyword_freq[word] += 1
            else:
                keyword_freq[word] = 1

        # Sort by frequency
        sorted_keywords = sorted(keyword_freq.items(), key=lambda x: x[1], reverse=True)
        top_keywords = [k for k, v in sorted_keywords[:30]]  # Top 30 keywords

        return {
            "skills": skills,
            "requirements": requirements,
            "qualifications": qualifications,
            "keywords": top_keywords
        }

    def segment_resume(self, resume_text):
        """Split resume into sections like experience, education, skills"""
        # This is a simplified approach - a production system would use more robust section detection
        sections = {}

        # Common section headers
        section_headers = {
            "experience": ["experience", "work experience", "professional experience", "employment"],
            "education": ["education", "academic background", "academic", "educational background"],
            "skills": ["skills", "technical skills", "core competencies", "key skills", "capabilities"],
            "summary": ["summary", "professional summary", "profile", "about me"],
            "certifications": ["certifications", "certificates", "credentials"],
            "projects": ["projects", "personal projects", "professional projects"]
        }

        # Find potential section boundaries
        lines = resume_text.split('\n')
        current_section = "unknown"

        for i, line in enumerate(lines):
            clean_line = line.strip().lower()

            # Check if this line is a section header
            for section, headers in section_headers.items():
                if any(header == clean_line or header in clean_line for header in headers):
                    current_section = section
                    sections[current_section] = []
                    break

            # Add content to current section
            if current_section in sections:
                sections[current_section].append(line)

        # Join the content within each section
        for section in sections:
            sections[section] = '\n'.join(sections[section])

        return sections

    def extract_entities(self, text):
        """Extract named entities (companies, titles, dates, etc.)"""
        doc = self.nlp(text)
        entities = {}

        for ent in doc.ents:
            if ent.label_ not in entities:
                entities[ent.label_] = []
            entities[ent.label_].append(ent.text)

        return entities

    def analyze_format(self, resume_text):
        """Analyze the format and structure of the resume"""
        lines = resume_text.split('\n')
        structure = {
            "total_lines": len(lines),
            "blank_lines": sum(1 for line in lines if not line.strip()),
            "bullet_points": sum(1 for line in lines if line.strip().startswith(('•', '-', '*', '→'))),
            "avg_line_length": sum(len(line) for line in lines if line.strip()) / max(1, sum(1 for line in lines if line.strip())),
            "sections": len(self.segment_resume(resume_text))
        }

        # Determine format type
        if structure["bullet_points"] > structure["total_lines"] * 0.3:
            format_type = "bullet-heavy"
        else:
            format_type = "paragraph-style"

        # Check for chronological vs functional
        resume_sections = self.segment_resume(resume_text)
        if "experience" in resume_sections:
            exp_text = resume_sections["experience"]
            date_pattern = r'(19|20)\d{2}\s*(-|–|to|—)\s*(present|current|now|[0-9]{4})'
            dates = re.findall(date_pattern, exp_text, re.IGNORECASE)

            if len(dates) >= 2:
                # Check if dates are in descending order (newest first)
                is_chronological = True
                for i in range(len(dates) - 1):
                    # This is simplified logic - would need more robust date parsing
                    if "present" in dates[i][2].lower() or "current" in dates[i][2].lower() or "now" in dates[i][2].lower():
                        continue

                    if dates[i][0] < dates[i+1][0]:
                        is_chronological = False

                format_type += ", chronological" if is_chronological else ", non-chronological"
            else:
                format_type += ", possibly functional"

        return {
            "format_type": format_type,
            "structure": structure
        }

    def calculate_keyword_match(self, resume_text, job_keywords):
        """Calculate the keyword match percentage"""
        resume_text = resume_text.lower()
        matched_keywords = [k for k in job_keywords if k.lower() in resume_text]

        match_percentage = len(matched_keywords) / len(job_keywords) * 100
        return {
            "match_percentage": match_percentage,
            "matched_keywords": matched_keywords,
            "missing_keywords": [k for k in job_keywords if k.lower() not in resume_text]
        }

    def analyze_ats_compatibility(self, resume_text, job_description, ats_name="all"):
        """Analyze resume compatibility with specific or all ATS systems"""
        # Parse job description
        jd_data = self.parse_job_description(job_description)

        # Segment resume
        resume_sections = self.segment_resume(resume_text)

        # Format analysis
        format_analysis = self.analyze_format(resume_text)

        # Keyword match
        keyword_match = self.calculate_keyword_match(resume_text, jd_data["keywords"])

        # Specific ATS systems to analyze
        systems_to_analyze = []
        if ats_name.lower() == "all":
            systems_to_analyze = list(self.ats_systems.keys())
        elif ats_name.lower() in self.ats_systems:
            systems_to_analyze = [ats_name.lower()]
        else:
            return {"error": f"ATS system '{ats_name}' not recognized"}

        # Analyze for each ATS
        results = {}
        for system in systems_to_analyze:
            ats_config = self.ats_systems[system]

            # Calculate base score from keyword match
            keyword_score = keyword_match["match_percentage"] / 100 * ats_config["keyword_match_weight"]

            # Format compatibility score
            format_score = 0
            if ats_config["format_preference"] == "chronological" and "chronological" in format_analysis["format_type"]:
                format_score = 0.2
            elif ats_config["format_preference"] == "flexible":
                format_score = 0.15
            elif ats_config["format_preference"] == "strict_chronological" and "chronological" in format_analysis["format_type"]:
                format_score = 0.25

            # Section weighting
            section_score = 0
            for section, weight in ats_config["section_importance"].items():
                if section in resume_sections:
                    # Simplified scoring - in a real implementation, would analyze section content
                    section_keywords = [k for k in jd_data["keywords"] if k.lower() in resume_sections[section].lower()]
                    section_match = len(section_keywords) / max(1, len(jd_data["keywords"]))
                    section_score += section_match * weight

            # Calculate final score for this ATS
            final_score = (keyword_score + format_score + section_score) * 100

            # Cap at 100%
            final_score = min(100, final_score)

            # Generate recommendations
            recommendations = []

            # Keyword recommendations
            if keyword_match["match_percentage"] < 70:
                recommendations.append(f"Add more of these missing keywords: {', '.join(keyword_match['missing_keywords'][:5])}")

            # Format recommendations
            if ats_config["format_preference"] == "strict_chronological" and "chronological" not in format_analysis["format_type"]:
                recommendations.append("Use a strict chronological format with clear date ranges")

            # Section recommendations
            for section, weight in ats_config["section_importance"].items():
                if section not in resume_sections and weight > 0.1:
                    recommendations.append(f"Add a dedicated '{section}' section")

            results[system] = {
                "score": round(final_score, 1),
                "strengths": {
                    "keyword_match": keyword_match["match_percentage"],
                    "format_compatibility": "High" if format_score > 0.15 else "Medium" if format_score > 0 else "Low",
                    "section_coverage": round(section_score * 100 / sum(ats_config["section_importance"].values()), 1)
                },
                "recommendations": recommendations
            }

        return {
            "overall_compatibility": round(sum(r["score"] for r in results.values()) / len(results), 1),
            "keyword_match": keyword_match,
            "format_analysis": format_analysis,
            "ats_specific_results": results
        }

    def generate_report(self, analysis_results, output_format="json"):
        """Generate a comprehensive report of the analysis"""
        if output_format == "json":
            return json.dumps(analysis_results, indent=2)
        elif output_format == "text":
            # Create plain text report
            report = []
            report.append("RESUME ATS COMPATIBILITY ANALYSIS")
            report.append("=" * 50)
            report.append(f"Overall Compatibility Score: {analysis_results['overall_compatibility']}%")
            report.append("-" * 50)

            report.append("KEYWORD ANALYSIS")
            report.append(f"Match Percentage: {analysis_results['keyword_match']['match_percentage']:.1f}%")
            report.append("Top Matched Keywords:")
            for kw in analysis_results['keyword_match']['matched_keywords'][:10]:
                report.append(f"  - {kw}")
            report.append("Missing Keywords:")
            for kw in analysis_results['keyword_match']['missing_keywords'][:10]:
                report.append(f"  - {kw}")

            report.append("-" * 50)
            report.append("FORMAT ANALYSIS")
            report.append(f"Format Type: {analysis_results['format_analysis']['format_type']}")

            report.append("-" * 50)
            report.append("ATS-SPECIFIC RESULTS")

            for ats, results in analysis_results['ats_specific_results'].items():
                report.append(f"\n{ats.upper()} - Score: {results['score']}%")
                report.append("Strengths:")
                for key, value in results['strengths'].items():
                    report.append(f"  - {key}: {value}")

                report.append("Recommendations:")
                for rec in results['recommendations']:
                    report.append(f"  - {rec}")

            return "\n".join(report)
        else:
            return {"error": f"Output format '{output_format}' not supported"}


# Example usage:
def main():
    analyzer = ATSAnalyzer()

    # Fetch job description from URL
    job_description_url = "https://careers-vizio.icims.com/jobs/3432/senior%2c-data-scientist/job?mode=view&mobile=false&width=1170&height=500&bga=true&needsRedirect=false&jan1offset=-360&jun1offset=-300"
    response = requests.get(job_description_url)
    job_description = response.text  # Assuming the response is HTML, you might need to further extract the relevant job description text using an HTML parser.

    # Provide the path to your resume file directly
    resume_path = "SATHISH RESUME.docx"  # Update with the correct path if necessary

    # Parse the resume content
    resume_text = analyzer.parse_resume(resume_path)

    # Analyze against all ATS systems
    analysis_results = analyzer.analyze_ats_compatibility(resume_text, job_description, "all")

    # Generate and print report
    report = analyzer.generate_report(analysis_results, "text")
    print(report)

    # Save JSON report
    with open("ats_analysis_report.json", "w") as f:
        json.dump(analysis_results, f, indent=2)

if __name__ == "__main__":
    main()

Collecting en-core-web-md==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting textract
  Using cached textract-1.6.5-py3-none-any.whl.metadata (2.5 kB)
Requested textract from https://files.pythonhosted.org/packages/6b/3e/ac16b6bf28edf78296aea7d0cb416b49ed30282ac8c711662541015ee6f3/textract-1.6.5-py3-none-any.whl has invalid metadata: .* suffix can only be used with `==` or `!=` operators
    extract-msg (<=0.29.*)
                 ~~~~~~~^
Please use pip<24.1 if you need to use this version.[0m[33m
[0m  Using cached te

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


PackageNotFoundError: Package not found at 'SATHISH RESUME.docx'