# In [6]:
import fitz  # PyMuPDF
import re
import json

def extract_information_from_pdf(pdf_file):
    """Return the concatenated plain text of every page in a PDF.

    Args:
        pdf_file: Path of the PDF document, passed straight to ``fitz.open``.

    Returns:
        A single string made of each page's ``get_text()`` output, in page
        order, with nothing inserted between pages.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction raises part-way through.
    with fitz.open(pdf_file) as doc:
        return "".join(page.get_text() for page in doc)

def _first_group(pattern, text, flags=re.IGNORECASE):
    """Return capture group 1 of the first match of *pattern* in *text*, else None."""
    match = re.search(pattern, text, flags)
    return match.group(1) if match else None


def _split_skills(skills_line):
    """Split a comma-separated skills line into stripped, non-empty items."""
    return [skill.strip() for skill in skills_line.split(",") if skill.strip()]


def extract_information(text):
    """Extract contact details, skills and one certification from CV text.

    The text is expected to use labelled fields such as ``Email:``,
    ``Phone: (123) 456-7890``, ``GitHub:``, ``LinkedIn:``, a ``Skills:``
    section containing ``Technical Skills:`` / ``Professional Skills:``
    comma-separated lines, and a ``Certifications:`` section containing
    ``Organization:``, ``Name:`` and ``Year:`` fields. All labels are matched
    case-insensitively.

    Args:
        text: The CV contents as a single string.

    Returns:
        A dict with the keys ``email``, ``phone_number``, ``github``,
        ``linkedin`` (each a string or ``None``), ``technical_skills`` and
        ``professional_skills`` (lists of strings, possibly empty) and
        ``certification`` (a dict with ``organization``, ``name`` and
        ``year``, each a string or ``None``). Fields that cannot be found
        keep their ``None`` / empty-list default.
    """
    section_flags = re.DOTALL | re.IGNORECASE

    extracted_info = {
        "email": None,
        "phone_number": None,
        "github": None,
        "linkedin": None,
        "technical_skills": [],
        "professional_skills": [],
        "certification": {
            "organization": None,
            "name": None,
            "year": None
        }
    }

    # --- Contact details -------------------------------------------------
    extracted_info["email"] = _first_group(
        r"Email:\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})", text
    )

    # The phone number has three capture groups, so it is handled directly
    # and re-assembled in a normalised "(AAA) BBB-CCCC" form.
    phone_match = re.search(
        r"Phone:\s*\((\d{3})\)\s*(\d{3})-(\d{4})", text, re.IGNORECASE
    )
    if phone_match:
        area, prefix, line = phone_match.groups()
        extracted_info["phone_number"] = f"({area}) {prefix}-{line}"

    # Dots are escaped so that only the real host names are accepted.
    extracted_info["github"] = _first_group(
        r"GitHub:\s*(https://github\.com/\S+)", text
    )
    extracted_info["linkedin"] = _first_group(
        r"LinkedIn:\s*(https://www\.linkedin\.com/\S+)", text
    )

    # --- Skills ----------------------------------------------------------
    # A section runs up to the next known header, or to the end of the text
    # (\Z) when it is the last section of the document.
    skills_section = _first_group(
        r"Skills:(.*?)(?:Certifications:|Projects:|References:|Education:|\Z)",
        text,
        section_flags,
    )
    if skills_section is not None:
        skills_section = skills_section.strip()

        # Each skills list is taken up to the end of its line.
        technical_line = _first_group(
            r"Technical Skills:\s*(.*?)(?:\n|$)", skills_section, section_flags
        )
        if technical_line is not None:
            extracted_info["technical_skills"] = _split_skills(technical_line)

        professional_line = _first_group(
            r"Professional Skills:\s*(.*?)(?:\n|$)", skills_section, section_flags
        )
        if professional_line is not None:
            extracted_info["professional_skills"] = _split_skills(professional_line)

    # --- Certification ---------------------------------------------------
    certifications_section = _first_group(
        r"Certifications:(.*?)(?:Projects:|References:|Education:|Skills:|\Z)",
        text,
        section_flags,
    )
    if certifications_section is not None:
        certifications_section = certifications_section.strip()
        certification = extracted_info["certification"]

        # Organization is delimited by the following "Name:" label and the
        # name by the following "Year:" label, so the fields must appear in
        # that order for these two to be picked up.
        organization = _first_group(
            r"Organization:\s*(.*?)\s*Name:", certifications_section, section_flags
        )
        if organization is not None:
            certification["organization"] = organization.strip()

        name = _first_group(
            r"Name:\s*(.*?)\s*Year:", certifications_section, section_flags
        )
        if name is not None:
            certification["name"] = name.strip()

        year = _first_group(
            r"Year:\s*(\d{4})", certifications_section, section_flags
        )
        if year is not None:
            certification["year"] = year.strip()

    return extracted_info

# Run the extraction only when this file is executed directly (as a script or
# a notebook cell), not as a side effect of importing it.
if __name__ == "__main__":
    # Replace with the path to your PDF file
    pdf_file = "/Users/omar.djebbi/Desktop/cv/sodapdf-converted.pdf"
    cv_text = extract_information_from_pdf(pdf_file)
    extracted_info = extract_information(cv_text)
    # Pretty-print the extracted fields as JSON.
    print(json.dumps(extracted_info, indent=2))


{
  "email": "johndoe@example.com",
  "phone_number": "(123) 456-7890",
  "github": "https://github.com/johndoe",
  "linkedin": "https://www.linkedin.com/in/johndoe",
  "technical_skills": [
    "JavaScript",
    "Python",
    "etc."
  ],
  "professional_skills": [
    "Data Analysis",
    "Project Management",
    "etc."
  ],
  "certification": {
    "organization": "Udemy",
    "name": "Certified JavaScript Developer",
    "year": "2019"
  }
}
