In [1]:
# Mount Google Drive at /content/drive so files stored there (e.g. the resume
# PDF read in the example cell) are accessible from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m40.9 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [18]:
import spacy
import re

# Load spaCy's small pre-trained English pipeline once at module level;
# extract_name() uses it to look for PERSON entities.
nlp = spacy.load("en_core_web_sm")

def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF as a single string.

    Args:
        pdf_file: Anything ``PyPDF2.PdfReader`` accepts (the notebook passes a
            file path).

    Returns:
        str: The page texts concatenated in page order with no separator.
        A page that yields no text contributes an empty string.
    """
    # Local import: PyPDF2 is only needed when this function is actually called.
    from PyPDF2 import PdfReader

    pdf_reader = PdfReader(pdf_file)
    # Defensive: treat a falsy extraction result (None or '') as empty text so
    # a page without extractable text cannot raise TypeError. A single join
    # also avoids repeated string concatenation.
    return ''.join(page.extract_text() or '' for page in pdf_reader.pages)

In [34]:
def extract_email(text):
    """Return the first email address found in ``text``.

    Newlines are replaced by single spaces before searching. No prefix
    clean-up is performed: the pattern itself only matches strings with
    exactly one '@' and a non-empty local part, so the first regex match is
    returned as-is.

    Args:
        text: Raw text to search.

    Returns:
        str: The first matching address, or the literal "Email not found"
        when the text contains no match.
    """
    email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'

    # Same effect as splitting on '\n' and re-joining with a single space.
    flattened = text.replace('\n', ' ')

    first_hit = re.search(email_pattern, flattened)
    if first_hit is None:
        return "Email not found"
    return first_hit.group(0)

def extract_phone(text):
    """Return the first phone-like number in ``text``, or None.

    A match is an optional '+', a digit, 8 to 12 digits/spaces/hyphens, and a
    closing digit.

    Args:
        text: Raw text to search.

    Returns:
        str | None: The first matching substring, or None when nothing matches.
    """
    phone_pattern = r'\+?\d[\d -]{8,12}\d'
    first_hit = re.search(phone_pattern, text)
    if first_hit is None:
        return None
    return first_hit.group(0)

def extract_name(text):
    """Guess the candidate's name from the top of the document.

    Only the first 15 lines are inspected. Lines that are blank, contain '@',
    or contain any digit are skipped. The first remaining line decides the
    result: the text of its first PERSON entity (per the module-level spaCy
    pipeline ``nlp``) if there is one, otherwise the stripped line itself.
    Later lines are never examined once such a line has been found.

    Args:
        text: Raw resume text.

    Returns:
        str: The detected name, or "Name not found" when none of the first 15
        lines qualifies.
    """
    for raw_line in text.split('\n')[:15]:
        candidate = raw_line.strip()

        # Skip empty lines and lines that look like contact details.
        if not candidate:
            continue
        if "@" in candidate or any(ch.isdigit() for ch in candidate):
            continue

        # Prefer the first PERSON entity; fall back to the whole line.
        person = next(
            (ent.text for ent in nlp(candidate).ents if ent.label_ == "PERSON"),
            None,
        )
        return person or candidate

    return "Name not found"

def extract_professional_summary(text):
    """Collect the lines between a 'profile' line and an 'experience' line.

    Matching is a case-insensitive substring test on each stripped line:
    collection starts after the first line containing "profile" and stops at
    the first later line containing "experience" (so a summary sentence that
    mentions "experience" also ends the collection). Blank lines are dropped.

    Args:
        text: Raw resume text.

    Returns:
        str: The collected lines joined by single spaces, or
        "Professional summary not found" when nothing was collected.
    """
    stop_markers = ("professional experience", "experience")
    collected = []
    in_summary = False

    for raw_line in text.split('\n'):
        current = raw_line.strip()
        lowered = current.lower()

        if not in_summary:
            # The line that opens the section is itself never collected.
            if "profile" in lowered:
                in_summary = True
            continue

        if any(marker in lowered for marker in stop_markers):
            break

        if current:
            collected.append(current)

    if not collected:
        return "Professional summary not found"
    return ' '.join(collected).strip()

def extract_work_experience(text):
    """Collect the lines of the work-experience section.

    Recording starts after the first line containing one of the start
    keywords (case-insensitive substring match); any such line is skipped
    rather than recorded. Recording stops at a line containing "education" or
    "skills", or at the first blank line that follows recorded content.
    Blank lines between the heading and the first entry are skipped, so a
    heading followed by an empty line no longer produces an empty result.

    Args:
        text: Raw resume text.

    Returns:
        list[str] | None: The stripped section lines, or None when nothing
        was recorded.
    """
    start_keywords = ("experience", "employment", "work history", "professional experience")
    stop_keywords = ("education", "skills")
    experience = []
    recording = False

    for line in text.split('\n'):
        stripped = line.strip()
        lowered = stripped.lower()

        if any(keyword in lowered for keyword in start_keywords):
            recording = True
            continue
        if not recording:
            continue
        if any(keyword in lowered for keyword in stop_keywords):
            break
        if not stripped:
            if experience:
                # A blank line after real content marks the end of the section.
                break
            # Blank line(s) right after the heading: keep looking for content.
            continue
        experience.append(stripped)

    return experience if experience else None

def extract_education(text):
    """List the known degree names mentioned in ``text``.

    Each degree is matched case-insensitively as a whole word, optionally
    followed by a plural or possessive "s" ("Masters", "Bachelor's"). Whole
    word matching prevents false positives such as "Master" being reported
    for "mastered".

    Args:
        text: Raw resume text.

    Returns:
        list[str]: The matching entries of the built-in degree list, in the
        order of that list (empty when none is found).
    """
    degrees = ["B.Sc", "M.Sc", "B.Tech", "M.Tech", "PhD", "Bachelor", "Master", "Doctorate"]
    education = []
    for degree in degrees:
        # (?<!\w)/(?!\w) act as word boundaries that also work next to '.'.
        pattern = r"(?<!\w)" + re.escape(degree) + r"(?:['\u2019]?s)?(?!\w)"
        if re.search(pattern, text, flags=re.IGNORECASE):
            education.append(degree)
    return education

def extract_skills(text):
    """List the known skill keywords mentioned in ``text``.

    Each skill is matched case-insensitively as a whole word or phrase.
    Plain substring matching would report "AI" for any text containing
    "gmail" and "Java" for "JavaScript". Runs of whitespace inside a
    multi-word skill are tolerated, so "Machine\\nLearning" split across a
    line break still counts.

    Args:
        text: Raw resume text.

    Returns:
        list[str]: The matching entries of the built-in skill list, in the
        order of that list (empty when none is found).
    """
    skills = ["Python", "Java", "C++", "Machine Learning", "Data Science", "AI", "Deep Learning", "NLP"]
    extracted_skills = []
    for skill in skills:
        # Escape each word (needed for "C++") and allow any whitespace between words.
        phrase = r"\s+".join(re.escape(word) for word in skill.split())
        # Lookarounds instead of \b so the boundary also works after '+'.
        pattern = r"(?<!\w)" + phrase + r"(?!\w)"
        if re.search(pattern, text, flags=re.IGNORECASE):
            extracted_skills.append(skill)
    return extracted_skills

def extract_certifications(text):
    """Collect lines that mention a certification keyword.

    Every line containing "certification", "certified", "certificate" or
    "accreditation" (case-insensitive substring match) is collected, stripped.
    Scanning stops at the first blank line seen after at least one line has
    been collected. Collected lines that mention "school", "examination" or
    "secondary" are then discarded as education entries.

    Args:
        text: Raw resume text.

    Returns:
        list[str] | None: The remaining lines, or None when there are none.
    """
    certification_keywords = ("certification", "certified", "certificate", "accreditation")
    education_terms = ("school", "examination", "secondary")

    matched_lines = []
    for raw_line in text.split('\n'):
        lowered = raw_line.lower()
        if any(keyword in lowered for keyword in certification_keywords):
            matched_lines.append(raw_line.strip())
        elif matched_lines and raw_line.strip() == '':
            # First blank line after a hit ends the scan.
            break

    certifications = []
    for entry in matched_lines:
        entry_lower = entry.lower()
        if any(term in entry_lower for term in education_terms):
            continue
        certifications.append(entry)

    if not certifications:
        return None
    return certifications

In [30]:
def process_resume(pdf_path):
    """Run every extractor on a resume PDF and assemble the results.

    Args:
        pdf_path: Path of the PDF, passed to extract_text_from_pdf.

    Returns:
        dict: Nested dictionary with the keys "PersonalData" (holding "Name",
        "ContactInformation" with "Email"/"Phone", and "ProfessionalSummary"),
        "Experience", "Education", "Skills" and "Certifications".
    """
    resume_text = extract_text_from_pdf(pdf_path)

    # Personal details, built in the same order as the final key layout.
    name = extract_name(resume_text)
    contact_information = {
        "Email": extract_email(resume_text),
        "Phone": extract_phone(resume_text),
    }
    summary = extract_professional_summary(resume_text)
    personal_data = {
        "Name": name,
        "ContactInformation": contact_information,
        "ProfessionalSummary": summary,
    }

    return {
        "PersonalData": personal_data,
        "Experience": extract_work_experience(resume_text),
        "Education": extract_education(resume_text),
        "Skills": extract_skills(resume_text),
        "Certifications": extract_certifications(resume_text),
    }


In [35]:
# Example usage: run the extraction pipeline on one resume PDF stored on the mounted Drive.
pdf_file_path = '/content/drive/MyDrive/GenAI/SHRIYA CHOWDHURY RESUME.pdf'  # Path to your resume PDF
resume_data = process_resume(pdf_file_path)

# Print the raw Python dict returned by process_resume.
print(resume_data)

# Print the same data again, pretty-printed as JSON.
import json
print(json.dumps(resume_data, indent=4))

# Save the extracted information to a JSON file under /content.
with open('/content/extracted_resume.json', 'w') as json_file:
    json.dump(resume_data, json_file, indent=4)

{'PersonalData': {'Name': 'Shriya Chowdhury', 'ContactInformation': {'Email': 'Singleshriyachowdhury24@gmail.com', 'Phone': '+919330563437'}, 'ProfessionalSummary': 'I am a pre-final year undergraduate student at VIT, Vellore pursuing B.Tech in Electronics and Communication Engineering. My primary areas of interest are Machine Learning, Data Science and Internet of Things. I have worked as a Product Design Intern and a Machine Learning Intern, which required me to polish my skills in Arduino Programming, building Deep Learning Models such as a Convolutional Neural Network Architecture for Binary Classification, Machine Learning Regression Models for Predicting House Prices, Image Processing and Optical Character Recognition for Text Feature Extraction, Relational Database Management using SQL and Natural Language Processing to build a Stacking Classifier for Fake News Prediction Modelling. Having been someone who loves continuous learning, I have always wanted to explore the horizons o