In [37]:
import os
import re
import json
import pdfplumber

## Patterns

### Personnal

In [38]:
full_name_pattern = r"^(?P<first_name>[A-Z][A-Za-zà-öø-ÿ\-']+)\s+(?P<last_name>[A-Z][a-zà-öø-ÿ\-']+)$"
headline_pattern = r"^(?:[A-Z][^\n]{3,80})$"
email_pattern = r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9_-]+\.[a-zA-Z0-9._-]+"
phone_pattern = r"(?:\+?\d{1,3})?\s?(?:\(?\d{2,3}\)?[\s.-]?){3,5}\d{2}"

### Experience

In [39]:
exp_title_line_pattern = r"^(?P<title>.+?)\s*(?:[-–—|,]\s*)(?P<company>.+?)\s*\((?P<dates>[^)]+)\)"
exp_date_pattern = r"(?P<start>(?:\(?\s?[a-zà-öø-ÿA-Z.]+\s)?\d{4}|\d{2}/\d{4})\s*[à-–—]\s*(?P<end>Actuel|Présent|\w+\s?\d{4}\s?\)?)"

### Education

In [40]:
edu_title_line_pattern = r"^(?P<degree>.+?)\s*(?:[-–—|,]\s*)(?P<school>.+?)\s*\((?P<dates>[^)]+)\)"
edu_date_pattern = r"\((?P<start>\d{4})\s*[-–—]\s*(?P<end>\d{4}|En cours)\)"

### Projects

### Skills

In [41]:
skills_pattern = r"•\s*([A-Za-zÀ-Öà-öø-ÿ0-9\s\-/().]+)"

### Soft skills

## Extraction functions

In [42]:
def extract_field(pattern, text, multi=False):
    pa = re.compile(pattern, flags=re.MULTILINE)
    if multi:
        match = []
        matches = pa.findall(text)
        for x in matches if matches else []:
            match.append(x.replace("\n", " ").strip())
    else:
        match = pa.search(text)
        match = match.group(0) if match else None
        match = match.replace("\n", " ").strip() if match else None

    return match if match else None

In [43]:
def extract_name(text):
    pa = re.compile(full_name_pattern, flags=re.MULTILINE)
    match = pa.search(text)
    if match:
        first_name = match.group('first_name').strip()
        last_name = match.group('last_name').strip()
        return first_name, last_name
    return None, None

In [44]:
def extract_raw_experiences(text):
    experiences = []
    raw_experience_lines = re.findall(exp_title_line_pattern, text, flags=re.MULTILINE)
    for title, company, dates in raw_experience_lines:
        date_match = re.search(exp_date_pattern, dates)
        if date_match:
            start_date = date_match.group('start').strip()
            end_date = date_match.group('end').strip()
        else:
            start_date = None
            end_date = None

        experiences.append({
            "title": title.strip(),
            "company": company.strip(),
            "date": dates.strip(),
            "start_date": start_date,
            "end_date": end_date,
            "raw_title_line": f"{title} at {company} ({dates})"
        })
        
    return experiences

In [45]:
def extract_raw_education(text):
    educations = []
    raw_education_lines = re.findall(edu_title_line_pattern, text, flags=re.MULTILINE)
    for degree, school, dates in raw_education_lines:
        date_match = re.search(edu_date_pattern, dates)
        if date_match:
            start_date = date_match.group('start').strip()
            end_date = date_match.group('end').strip()
        else:
            start_date = None
            end_date = None

        educations.append({
            "school": school.strip(),
            "degree": degree.strip(),
            "date": dates.strip(),
            "start": start_date,
            "end": end_date,
            "raw_title_line": f"{degree} at {school} ({dates})"
        })
        
    return educations

In [46]:
def parse_cv(path):
    with pdfplumber.open(path) as pdf:
        page = pdf.pages[0]
        text = page.extract_text()

        with open(f"output_{path.split('/')[-1]}.txt", "w") as f:
            f.write(text)

        first_name, last_name = extract_name(text)

    data = {
        "personal": {
            "first_name": first_name,
            "last_name": last_name,
            "headline": extract_field(headline_pattern, text),
            "email": extract_field(email_pattern, text),
            "phone": extract_field(phone_pattern, text, multi=True),
        },
        "experience": extract_raw_experiences(text),
        # "education": extract_raw_education(text),
        "skills": extract_field(skills_pattern, text, multi=True)
    }

    return data

## Process all files

In [47]:
data_folder = "data"
pdf_files = [f for f in os.listdir(data_folder) if f.lower().endswith('.pdf')]

In [48]:
for pdf_file in pdf_files:
    pdf_path = os.path.join(data_folder, pdf_file)
    print(f"Traitement de : {pdf_file}")
    
    try:
        with pdfplumber.open(pdf_path) as pdf:
            page = pdf.pages[0]
            text = page.extract_text()

            print(parse_cv(pdf_path), "\n")

            # stock in a json file
            with open(f"{pdf_file}.json", "w", encoding="utf-8") as f:
                json.dump(parse_cv(pdf_path), f, indent=4, ensure_ascii=False)
            
    except Exception as e:
        print(f"Erreur avec {pdf_file}: {e}")

Traitement de : cv_sample_5.pdf
{'personal': {'first_name': 'KOUASSI', 'last_name': 'Nanga', 'headline': 'KOUASSI Nanga', 'email': 'knanga+cv08@exemple.com', 'phone': ['0102030405', '0708091011']}, 'experience': [], 'skills': ['Câblage des Installations Électriques Domestiques et Industrielles', 'Maintenance des Équipements et Installations Électriques', 'Encadrement et Supervision Pédagogique', 'Élaboration des Référentiels de Formation / Programmes Pédagogiques dans les Établissements du METFPA', 'Révision des Programmes Pédagogiques dans les Établissements du METFPA', 'Rédaction Administrative', 'Méthodologie de Recherche', 'Sciences de l', 'Utilisation d', 'Ingénierie Pédagogique (PPO']} 

Traitement de : cv_sample_8.pdf
{'personal': {'first_name': None, 'last_name': None, 'headline': 'KOUASSI Moïse Armand', 'email': 'moise.kouassi+cv12@exemple.com', 'phone': ['+225 07 67 21 83 04', '2015-2016', '2016-2017', '2017-2018', '2018-2019', '2016-2017', '2013-2014', '2008-2009', '2012-201