In [1]:
import re
import spacy
from pathlib import Path
#!python -m spacy download en_core_web_sm


In [2]:


# Read the extracted resume text into memory as a single string.
# NOTE(review): read_text() is called without an explicit encoding, so decoding
# follows the platform default - confirm how the extraction step wrote this file.
resume_text = Path("../data/extracted_texts/resume1_pdf.txt").read_text()


In [3]:
# Load the "en_core_web_sm" spaCy pipeline once at module level so later cells
# can reuse it (extract_skills calls nlp(text) to tokenize the resume).
nlp = spacy.load("en_core_web_sm")

def extract_email(text):
    """Return the first e-mail-like substring found in ``text``.

    Args:
        text: String to scan.

    Returns:
        The matched address as a string, or ``None`` when nothing matches.
    """
    email_pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    hit = re.search(email_pattern, text)
    if hit is None:
        return None
    return hit.group(0)

def extract_phone(text):
    """Return the first phone-number-like substring found in ``text``.

    Accepted shape: an optional ``+`` country code (1-3 digits) followed by
    ten digits grouped 3-3-4, with optional ``-``, ``.`` or whitespace
    separators and optional parentheses around the first group.

    Candidates that are merely a slice of a longer digit run (for example a
    14-digit order id) are skipped, and scanning continues with the next
    candidate.

    Args:
        text: String to scan.

    Returns:
        The matched number exactly as written in ``text``, or ``None`` when
        no acceptable candidate exists.
    """
    pattern = r"(\+\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}"
    for match in re.finditer(pattern, text):
        start, end = match.span()
        # A hit that begins on a digit directly after another digit, or that
        # is directly followed by a digit, is part of a longer number.
        glued_before = (
            start > 0 and text[start].isdigit() and text[start - 1].isdigit()
        )
        glued_after = end < len(text) and text[end].isdigit()
        if not (glued_before or glued_after):
            return match.group(0)
    return None

# Disabled NER-based alternative (kept for reference): returns the longest PERSON entity when it has at least two words
# def extract_name(text):
#     # NER first
#     doc = nlp(text)
#     names = [ent.text.strip() for ent in doc.ents if ent.label_ == "PERSON"]
#     if names:
#         longest = max(names, key=lambda n: len(n.split()))
#         if len(longest.split()) >= 2:
#             return longest
        
def extract_name_from_header(text):
    """Guess the candidate's name from the top of the document.

    Only the first five non-blank lines (whitespace-stripped) are examined.
    The first one that contains no ``@``, no digit, and consists of two to
    four whitespace-separated words is returned.

    Args:
        text: Full document text.

    Returns:
        The stripped line taken as the name, or ``None`` if no line qualifies.
    """
    stripped_lines = (raw.strip() for raw in text.splitlines())
    header = [entry for entry in stripped_lines if entry][:5]

    for candidate in header:
        # Lines holding an e-mail address or any digit are contact details.
        if "@" in candidate or re.search(r"\d", candidate):
            continue
        word_count = len(candidate.split())
        if 2 <= word_count <= 4:
            return candidate
    return None





# Example usage: run the three contact-field extractors on the loaded resume text
email, phone, name = (
    extract_email(resume_text),
    extract_phone(resume_text),
    extract_name_from_header(resume_text),
)

# One print call, one field per line.
print(
    f"👤 Name: {name}",
    f"📧 Email: {email}",
    f"📱 Phone: {phone}",
    sep="\n",
)


👤 Name: Vedururu Srinivasa Rao
📧 Email: srinivasvedururu@gmail.com
📱 Phone: +91-9032086597


In [4]:
import json

# Load the skills file that extract_skills() iterates over.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default encoding.
# NOTE(review): presumably a JSON array of skill-name strings - confirm.
with open("../data/skills.json", encoding="utf-8") as f:
    skill_list = json.load(f)

def extract_skills(text, skills):
    """Return the entries of ``skills`` that are mentioned in ``text``.

    The text is tokenized with the module-level ``nlp`` pipeline and compared
    case-insensitively. A single-word skill matches when it equals a token; a
    multi-word skill (for example "machine learning") matches when its words
    appear as consecutive tokens.

    Args:
        text: Document text to search.
        skills: Iterable of skill-name strings.

    Returns:
        A list of the matching skills, spelled as in ``skills``, without
        duplicates and in the order they appear in ``skills``.
    """
    # Whitespace-only tokens (line breaks, runs of spaces) are dropped so that
    # a phrase split across a line break is still seen as consecutive words.
    tokens = [tok.text.lower() for tok in nlp(text) if tok.text.strip()]
    token_set = set(tokens)  # constant-time lookup for single-word skills

    def _is_mentioned(skill):
        words = skill.lower().split()
        if not words:
            return False
        if len(words) == 1:
            return words[0] in token_set
        # Cheap rejection before the sliding-window scan.
        if any(word not in token_set for word in words):
            return False
        width = len(words)
        return any(
            tokens[i:i + width] == words
            for i in range(len(tokens) - width + 1)
        )

    # dict.fromkeys de-duplicates while preserving the order of ``skills``.
    return list(dict.fromkeys(skill for skill in skills if _is_mentioned(skill)))

# Match the loaded skill list against the resume text and show the hits.
skills = extract_skills(text=resume_text, skills=skill_list)
print(
    f"🛠️ Skills: {skills}"
)


🛠️ Skills: ['Excel', 'NumPy', 'Python', 'Communication', 'Git', 'SQL', 'Matplotlib', 'Pandas']


In [16]:
def extract_education(text):
    """Extract education entries from the EDUCATION section of ``text``.

    The section starts at the first case-insensitive occurrence of
    "EDUCATION" and runs up to the next "SKILLS" or "Experience" (also
    case-insensitive), or to the end of the text when neither follows.

    Each entry is expected in the form
    ``<institution> — <degree> (<yyyy>–<yyyy>)`` or
    ``<institution> — <degree> (Completed in <yyyy>)``, with an em dash
    between institution and degree.

    Args:
        text: Full document text.

    Returns:
        A list of strings formatted as ``"<degree> at <institution> (<year>)"``;
        empty when there is no EDUCATION heading or no entry matches.
    """
    # \Z lets the section run to end-of-text when EDUCATION is the last heading.
    edu_match = re.search(
        r'EDUCATION\s*([\s\S]*?)(?:SKILLS|Experience|\Z)', text, re.IGNORECASE
    )
    education_section = edu_match.group(1) if edu_match else ""

    # regex : capture institution, degree (with commas), year range or completed year
    pattern = r"([A-Za-z\s\.&,-]+?)\s*—\s*([A-Za-z\s,&\.-]+?)\s*\((?:\s*(\d{4}\s*[–-]\s*\d{4})|Completed in\s*(\d{4}))\s*\)"

    matches = re.findall(pattern, education_section)

    results = []
    for institution, degree, year_range, completed_year in matches:
        # Exactly one of the two year groups is non-empty for a given match.
        year = year_range if year_range else completed_year
        results.append(f"{degree.strip()} at {institution.strip()} ({year.strip()})")
    return results

def extract_experience(text, role_pattern=r"Accenture North America.*?Job Simulation"):
    """Extract "<role> - <Month yyyy>" experience entries from ``text``.

    Args:
        text: Full document text.
        role_pattern: Regular expression describing the role/title part of an
            entry. The default reproduces the original hard-coded pattern;
            pass another expression to pick up other entries. It must not
            define groups named ``role`` or ``date``.

    Returns:
        A list of strings formatted as ``"<role> (<Month yyyy>)"``, in order
        of appearance; empty when nothing matches.
    """
    # Built by concatenation (not an f-string) so the {4} quantifier stays
    # literal. Named groups keep extraction correct even if role_pattern
    # contains capture groups of its own.
    pattern = (
        "(?P<role>" + role_pattern + r")\s*[-–]\s*(?P<date>[A-Za-z]+\s+\d{4})"
    )
    return [
        f"{match.group('role').strip()} ({match.group('date').strip()})"
        for match in re.finditer(pattern, text)
    ]

# Run both section extractors on the resume text (education first, then experience)
education, experience = (
    extract_education(resume_text),
    extract_experience(resume_text),
)

# Show what was found
print("🎓 Education:", education)
print("🧑‍💼 Experience:", experience)


🎓 Education: ['B.Tech in Artificial Intelligence & Data Science at ST . Martins Engineering College, Hyderabad (2023 –2027)', 'Intermediate in MPC at Narayana Junior College, Hyderabad (2021 –2023)', 'SSC,Board of Secondary Education at Newton Techno High School, Hyderabad (2021)']
🧑‍💼 Experience: ['Accenture North America Data Analytics and Visualization Job Simulation (May 2025)']


In [17]:
from pprint import pprint

# Gather every field extracted by the earlier cells into one record.
candidate_profile = dict(
    name=name,
    email=email,
    phone=phone,
    skills=skills,
    education=education,
    experience=experience,
)

pprint(candidate_profile)


{'education': ['B.Tech in Artificial Intelligence & Data Science at ST . '
               'Martins Engineering College, Hyderabad (2023 –2027)',
               'Intermediate in MPC at Narayana Junior College, Hyderabad '
               '(2021 –2023)',
               'SSC,Board of Secondary Education at Newton Techno High School, '
               'Hyderabad (2021)'],
 'email': 'srinivasvedururu@gmail.com',
 'experience': ['Accenture North America Data Analytics and Visualization Job '
                'Simulation (May 2025)'],
 'name': 'Vedururu Srinivasa Rao',
 'phone': '+91-9032086597',
 'skills': ['Excel',
            'NumPy',
            'Python',
            'Communication',
            'Git',
            'SQL',
            'Matplotlib',
            'Pandas']}
