In [4]:
"""
Simple Resume Parser for Google Colab
Extracts: First Name, Last Name, Email, Phone Number, Country
"""

# Install required libraries (notebook shell commands):
#   spacy        - NER pipeline used for name and country extraction
#   pymupdf      - PDF text extraction (imported below as `fitz`)
#   phonenumbers - phone number detection and formatting
!pip install -q spacy pymupdf phonenumbers
# Download the English spaCy model that is loaded below with spacy.load()
!python -m spacy download en_core_web_sm

import re
import fitz  # PyMuPDF
import spacy
import phonenumbers
from google.colab import files
from IPython.display import display, HTML, clear_output
import ipywidgets as widgets

# Load the spaCy model once at module level; ResumeParser.extract_name and
# ResumeParser.extract_country both run their text through this shared pipeline.
nlp = spacy.load("en_core_web_sm")

class ResumeParser:
    """Extract basic contact details from a PDF resume.

    Results are stored in ``self.data`` under the keys ``first_name``,
    ``last_name``, ``email``, ``phone`` and ``country``. A field that could
    not be found is left as an empty string.

    ``extract_name`` and ``extract_country`` rely on the module-level spaCy
    pipeline ``nlp``.
    """

    # The TLD part is letters only. (A ``[A-Z|a-z]`` class would also accept a
    # literal '|', so "a@b.com|555" would be captured as "a@b.com|".)
    _EMAIL_RE = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')

    # Loose fallback pattern for phone numbers written without a '+' prefix.
    _PHONE_RE = re.compile(
        r'[\+]?[(]?[0-9]{1,4}[)]?[-\s\.]?[(]?[0-9]{1,4}[)]?[-\s\.]?[0-9]{1,4}[-\s\.]?[0-9]{1,9}'
    )
    # The fallback pattern alone accepts any 4-digit run (e.g. a year), so a
    # candidate must contain at least this many digits to count as a phone.
    _MIN_PHONE_DIGITS = 7

    # Well-known country names that are preferred over other place entities.
    _COMMON_COUNTRIES = (
        'USA', 'United States', 'India', 'UK', 'United Kingdom',
        'Canada', 'Australia', 'Germany', 'France', 'China',
        'Japan', 'Singapore', 'Malaysia', 'Nepal', 'Pakistan',
    )
    # Whole-word, case-insensitive match so that "India" does not match
    # "Indiana" and "UK" does not match "Ukraine".
    _COUNTRY_RE = re.compile(
        r'\b(?:' + '|'.join(map(re.escape, _COMMON_COUNTRIES)) + r')\b',
        re.IGNORECASE,
    )

    def __init__(self):
        self.data = {
            'first_name': '',
            'last_name': '',
            'email': '',
            'phone': '',
            'country': ''
        }

    def extract_text_from_pdf(self, file_path):
        """Return the concatenated text of every page of the PDF at *file_path*.

        The document is opened with PyMuPDF and closed again before returning.
        """
        with fitz.open(file_path) as doc:
            return "".join(page.get_text() for page in doc)

    def extract_email(self, text):
        """Return the first e-mail address found in *text*, or '' if none."""
        match = self._EMAIL_RE.search(text)
        return match.group() if match else ''

    def extract_phone(self, text):
        """Return the first phone number found in *text*, or '' if none.

        Numbers recognised by ``phonenumbers`` are returned in INTERNATIONAL
        format; numbers found only by the fallback regex are returned as
        written in the text.
        """
        # No default region is given, so the matcher can only resolve numbers
        # that carry their own country code.
        for match in phonenumbers.PhoneNumberMatcher(text, None):
            return phonenumbers.format_number(
                match.number, phonenumbers.PhoneNumberFormat.INTERNATIONAL
            )

        # Fallback regex for common local formats.
        for candidate in self._PHONE_RE.finditer(text):
            number = candidate.group()
            if sum(ch.isdigit() for ch in number) >= self._MIN_PHONE_DIGITS:
                return number
        return ''

    def extract_name(self, text):
        """Return ``(first_name, last_name)`` guessed from *text*.

        The first PERSON entity in the first 1000 characters is used; its
        first token is the first name and the remaining tokens form the last
        name. If no PERSON entity is found, the first short line (at most 4
        words, under 50 characters, at least 2 words) among the first five
        lines is used instead. Returns ``('', '')`` when nothing fits.
        """
        doc = nlp(text[:1000])  # Only the head of the resume, for efficiency

        for ent in doc.ents:
            if ent.label_ != "PERSON":
                continue
            name_parts = ent.text.split()
            if name_parts:
                return name_parts[0], ' '.join(name_parts[1:])

        # Fallback: assume one of the first lines holds the name.
        for line in text.split('\n')[:5]:
            line = line.strip()
            name_parts = line.split()
            if 2 <= len(name_parts) <= 4 and len(line) < 50:
                return name_parts[0], ' '.join(name_parts[1:])

        return '', ''

    def extract_country(self, text):
        """Return the most likely country mentioned in *text*, or ''.

        Collects GPE/LOC entities from spaCy. The first entity that contains
        a well-known country name (as a whole word, case-insensitive) wins;
        otherwise the first place entity is returned.
        """
        doc = nlp(text)
        places = [ent.text for ent in doc.ents if ent.label_ in ("GPE", "LOC")]

        for place in places:
            if self._COUNTRY_RE.search(place):
                return place

        return places[0] if places else ''

    def parse_resume(self, file_path, progress_callback=None):
        """Parse the PDF at *file_path* and return the extracted fields.

        Args:
            file_path: Path of the PDF resume to read.
            progress_callback: Optional callable ``(percent, message)`` that is
                invoked before each extraction step and once on completion.

        Returns:
            ``self.data`` (the same dict object on every call), updated with
            the newly extracted values.
        """
        if progress_callback is None:
            def progress_callback(percent, message):
                return None

        progress_callback(20, "Extracting text from PDF...")
        text = self.extract_text_from_pdf(file_path)

        progress_callback(40, "Extracting email...")
        self.data['email'] = self.extract_email(text)

        progress_callback(60, "Extracting phone number...")
        self.data['phone'] = self.extract_phone(text)

        progress_callback(70, "Extracting name...")
        first, last = self.extract_name(text)
        self.data['first_name'] = first
        self.data['last_name'] = last

        progress_callback(90, "Extracting country...")
        self.data['country'] = self.extract_country(text)

        progress_callback(100, "Parsing complete!")

        return self.data

# Create UI
def create_form():
    """Build and display the resume upload form.

    Shows a PDF upload button, a progress area and five text fields. When a
    file is uploaded it is written to a temporary file, parsed with
    ``ResumeParser`` and the text fields are filled with the result. Errors
    raised while parsing are shown in the progress area.
    """
    # Standard-library helpers used only by this form.
    import html
    import os
    import tempfile

    # Style
    style = """
    <style>
        .form-container {
            max-width: 600px;
            margin: 20px auto;
            padding: 20px;
            border: 1px solid #ddd;
            border-radius: 8px;
            background: #f9f9f9;
        }
        .form-title {
            text-align: center;
            color: #333;
            margin-bottom: 20px;
        }
        .progress-box {
            background: white;
            border: 2px solid #4CAF50;
            border-radius: 5px;
            padding: 15px;
            margin: 20px 0;
            text-align: center;
        }
        .progress-bar {
            width: 100%;
            height: 25px;
            background: #f0f0f0;
            border-radius: 5px;
            overflow: hidden;
            margin-top: 10px;
        }
        .progress-fill {
            height: 100%;
            background: #4CAF50;
            transition: width 0.3s;
            text-align: center;
            color: white;
            line-height: 25px;
        }
    </style>
    """

    # Widgets
    upload_btn = widgets.FileUpload(accept='.pdf', multiple=False, description='Upload Resume')
    first_name = widgets.Text(description='First Name:', style={'description_width': '120px'})
    last_name = widgets.Text(description='Last Name:', style={'description_width': '120px'})
    email = widgets.Text(description='Email:', style={'description_width': '120px'})
    phone = widgets.Text(description='Phone:', style={'description_width': '120px'})
    country = widgets.Text(description='Country:', style={'description_width': '120px'})

    output = widgets.Output()
    progress_output = widgets.Output()

    parser = ResumeParser()

    def update_progress(percent, message):
        """Redraw the progress box with the given percentage and message."""
        safe_message = html.escape(str(message))
        with progress_output:
            clear_output(wait=True)
            display(HTML(f"""
                <div class="progress-box">
                    <div><strong>🔄 {safe_message}</strong></div>
                    <div class="progress-bar">
                        <div class="progress-fill" style="width: {percent}%">{percent}%</div>
                    </div>
                </div>
            """))

    def on_upload(change):
        """Parse the newly uploaded file and fill in the form fields."""
        uploads = upload_btn.value
        if not uploads:
            return

        with output:
            clear_output()
            file_path = None
            try:
                # ipywidgets 7 exposes a dict {filename: {'metadata', 'content'}};
                # ipywidgets 8 exposes a tuple of dicts. Both carry 'content'.
                if isinstance(uploads, dict):
                    uploaded_file = next(iter(uploads.values()))
                else:
                    uploaded_file = uploads[0]

                # Save to a private temp file instead of a path built from the
                # client-supplied file name (avoids path traversal/clobbering).
                with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as f:
                    f.write(uploaded_file['content'])
                    file_path = f.name

                # Parse with progress
                data = parser.parse_resume(file_path, update_progress)

                # Update form fields
                first_name.value = data['first_name']
                last_name.value = data['last_name']
                email.value = data['email']
                phone.value = data['phone']
                country.value = data['country']

                with progress_output:
                    clear_output(wait=True)
                    display(HTML('<div class="progress-box" style="border-color: #4CAF50;"><strong>✅ Resume parsed successfully!</strong></div>'))

            except Exception as e:
                # Top-level UI boundary: report any failure to the user.
                # The message is escaped because it may echo file content.
                with progress_output:
                    clear_output()
                    display(HTML(f'<div class="progress-box" style="border-color: #f44336;"><strong>❌ Error: {html.escape(str(e))}</strong></div>'))
            finally:
                if file_path is not None:
                    try:
                        os.remove(file_path)
                    except OSError:
                        # Best-effort cleanup; a leftover temp file is harmless.
                        pass

    upload_btn.observe(on_upload, names='value')

    # Display form
    display(HTML(style))
    display(HTML('<div class="form-container"><h2 class="form-title">📄 Resume Parser Form</h2></div>'))
    display(upload_btn)
    display(progress_output)
    display(output)
    display(first_name)
    display(last_name)
    display(email)
    display(phone)
    display(country)

# Build and display the form; parsing starts when a PDF is uploaded.
create_form()

Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


FileUpload(value={}, accept='.pdf', description='Upload Resume')

Output()

Output()

Text(value='', description='First Name:', style=DescriptionStyle(description_width='120px'))

Text(value='', description='Last Name:', style=DescriptionStyle(description_width='120px'))

Text(value='', description='Email:', style=DescriptionStyle(description_width='120px'))

Text(value='', description='Phone:', style=DescriptionStyle(description_width='120px'))

Text(value='', description='Country:', style=DescriptionStyle(description_width='120px'))