## Import

In [972]:
import os
import re
import json
import pdfplumber
from langdetect import detect

## Patterns

### Personal

In [973]:
# Regular expressions for the personal details of a CV.
# Each pattern is written as adjacent raw-string fragments; Python joins them
# into a single string at compile time.

# A whole line made of two words that each start with A-Z:
# the first is captured as first_name, the second as last_name.
full_name_pattern = (
    r"^(?P<first_name>[A-Z][A-Za-zà-öø-ÿ\-']+)"
    r"\s+"
    r"(?P<last_name>[A-Z][a-zà-öø-ÿ\-']+)$"
)

# A whole line starting with A-Z followed by 3 to 80 non-newline characters.
headline_pattern = (
    r"^(?:[A-Z][^\n]{3,80})$"
)

# local-part "@" first domain label "." remainder of the domain.
email_pattern = (
    r"[a-zA-Z0-9_.+-]+"
    r"@"
    r"[a-zA-Z0-9_-]+"
    r"\."
    r"[a-zA-Z0-9._-]+"
)

# Optional prefix of 1-4 digits (with an optional "+", presumably a country
# code), then 4 or 5 groups of 2-3 digits, each optionally followed by a space.
phone_pattern = (
    r"(?:\+?\d{1,4})?"
    r"\s?"
    r"(?:\d{2,3}\s?){4,5}"
)

# Closed list of recognised city names.
location_pattern = (
    r"(?:"
    r"Abidjan|Lyon|Grand-Bassam"
    r"|Anyama|Toulouse|Bingerville|Bassam"
    r")"
)

### Experience

In [974]:
# Title line of an experience entry:
#   "<title> <separator> <company> (<dates>)"
# where <separator> is one of - – — | , and <dates> is everything between the
# parentheses (the parentheses themselves are not captured).
exp_title_line_pattern = r"^(?P<title>.+?)\s*(?:[-–—|,]\s*)(?P<company>.+?)\s*\((?P<dates>[^)]+)\)"

# Date range of an experience entry: "<start> <separator> <end>".
#   start: an optional word (e.g. a month name) followed by a 4-digit year,
#          or MM/YYYY.
#   separator: the word "à" or a hyphen / en dash / em dash. It is spelled as
#          an alternation because inside a character class "à-–" is read as a
#          range of code points, which excludes the plain ASCII hyphen.
#   end:   "Actuel", "Présent", MM/YYYY, or an optional word followed by a
#          4-digit year (so a bare year such as "2021" is accepted).
exp_date_pattern = (
    r"(?P<start>(?:\(?\s?[a-zà-öø-ÿA-Z.]+\s)?\d{4}|\d{2}/\d{4})"
    r"\s*(?:à|[-–—])\s*"
    r"(?P<end>Actuel|Présent|\d{2}/\d{4}|(?:\w+\s?)?\d{4}\s?\)?)"
)

### Education

In [975]:
# Title line of an education entry:
#   "<degree> <separator> <school> (<dates>)"
# where <separator> is one of - – — | , and <dates> is everything between the
# parentheses (the parentheses themselves are not captured).
edu_title_line_pattern = r"^(?P<degree>.+?)\s*(?:[-–—|,]\s*)(?P<school>.+?)\s*\((?P<dates>[^)]+)\)"

# Year range of an education entry: "<start year> <dash> <end year | En cours>".
# The surrounding parentheses are optional so the pattern matches both the raw
# "(2018 - 2020)" text and the "dates" capture of edu_title_line_pattern, which
# does not include the parentheses.
edu_date_pattern = r"\(?(?P<start>\d{4})\s*[-–—]\s*(?P<end>\d{4}|En cours)\)?"

### Projects

In [976]:
# A project entry: optional leading whitespace, a name that starts with A-Z
# and contains neither an em dash nor a newline, then optionally an em dash
# followed by a description running to the end of the line.
projects_pattern = (
    r"\s*"
    r"(?P<name>[A-Z][^—\n]+)"
    r"(?:—\s*(?P<desc>.+))?"
)

### Skills

In [977]:
# A bulleted skill: the "•" marker, optional whitespace, then the captured
# skill text (letters incl. accented ones, whitespace, and - / ( ) . ).
# Because the captured class contains \s, one capture can run across line
# breaks until a character outside the class is met.
skills_pattern = (
    r"•\s*"
    r"([A-Za-zÀ-Öà-öø-ÿ\s\-/().]+)"
)

### Soft skills

In [978]:
# Closed list of soft-skill phrases (French, plus "Communication" and
# "Leadership"), matched literally as one alternation.
soft_skills_pattern = (
    r"(?:"
    r"Communication|Leadership|Travail d'équipe"
    r"|Résolution de problèmes|Gestion du temps|Adaptabilité"
    r"|Pensée critique|Créativité|Esprit d'analyse"
    r"|Intelligence émotionnelle|Agilité|Gestion des priorités"
    r"|Assidu|Ponctuel|Rigoureux|Respectueux|Courageux"
    r"|Travail en groupe ou individuellement"
    r")"
)

## Segmentation

In [979]:
def segment_text(text):
    """Split CV text into header / experience / projects / education / skills.

    The text is read line by line. Every line is appended, followed by a
    newline, to the section that is currently active. The active section
    starts as "header" and changes whenever a line contains one of the
    section keywords (compared in lower case); the line that triggers the
    change is stored in the new section.

    Args:
        text: Full text of the CV.

    Returns:
        dict mapping each of the five section names to its accumulated text
        (an empty string when the section never became active).
    """
    # Tested from top to bottom: the first section owning a keyword found in
    # the line wins.
    keyword_table = (
        ("experience", ("experience", "expérience", "poste")),
        ("projects", ("projets", "projects")),
        ("education", ("éducation", "études", "diplôme", "education", "formation")),
        ("skills", ("skills", "compétences")),
    )

    section_names = ("header", "experience", "projects", "education", "skills")
    collected = {name: [] for name in section_names}
    active = "header"

    for raw_line in text.splitlines():
        lowered = raw_line.lower()
        for name, keywords in keyword_table:
            if any(word in lowered for word in keywords):
                active = name
                break
        collected[active].append(raw_line + "\n")

    return {name: "".join(chunks) for name, chunks in collected.items()}

## Extraction functions

In [980]:
def extract_field(pattern, text, multi=False):
    """Search `text` with `pattern` and return the cleaned match(es).

    The pattern is compiled with both re.MULTILINE and re.IGNORECASE, so
    "^" / "$" anchor on every line and letters match regardless of case.
    In every returned string, newlines are replaced by spaces and the
    surrounding whitespace is stripped.

    Args:
        pattern: Regular expression source. With `multi=True` it should
            contain at most one capturing group, because `findall` returns
            tuples otherwise.
        text: Text to search. `None` or an empty string yields no match.
        multi: When True, return every match; when False, only the first.

    Returns:
        With `multi=False`: the first full match as a string, or None when
        nothing matches. With `multi=True`: a list of strings (the single
        capturing group if the pattern has one, else the full match), empty
        when nothing matches.
    """
    # Nothing to search in (e.g. a PDF page without extractable text).
    if not text:
        return [] if multi else None

    # Flags are combined with "|". Using "&" here would evaluate to 0 and
    # silently disable both flags.
    compiled = re.compile(pattern, flags=re.MULTILINE | re.IGNORECASE)

    if multi:
        return [found.replace("\n", " ").strip() for found in compiled.findall(text)]

    first = compiled.search(text)
    if first is None:
        return None

    whole = first.group(0)
    # An empty match is reported as "no value".
    return whole.replace("\n", " ").strip() if whole else None

In [981]:
def extract_name(text):
    """Return the (first_name, last_name) pair found in `text`.

    `full_name_pattern` is searched with re.MULTILINE, so the first line that
    matches it as a whole provides the name.

    Args:
        text: Full text of the CV.

    Returns:
        A tuple (first_name, last_name) of stripped strings, or (None, None)
        when no line matches.
    """
    found = re.search(full_name_pattern, text, flags=re.MULTILINE)
    if found is None:
        return None, None

    return tuple(found.group(part).strip() for part in ("first_name", "last_name"))

## Normalization

In [None]:
# Lookup table used to normalise raw skill strings.
# Key: the skill in lower case (as produced by the cleaning step of
# normalized_skills). Value: (display label, category).
# Each key appears exactly once: a repeated key in a dict literal silently
# overrides the earlier entry.
SKILL_MAP = {
    # Office tools
    "excel": ("Excel", "tool"),
    "microsoft excel": ("Excel", "tool"),
    "word": ("Word", "tool"),
    "powerpoint": ("PowerPoint", "tool"),
    "ms office": ("Microsoft Office", "tool"),
    "ms project": ("MS Project", "project_management"),
    "access": ("Access", "tool"),

    # CAD / drawing
    "autocad": ("AutoCAD", "engineering"),
    "cad": ("CAD", "engineering"),
    "sketchup": ("SketchUp", "engineering"),

    # Energy software
    "pvsyst": ("PVsyst", "energy"),
    "homer": ("HOMER", "energy"),
    "meteorol": ("METEOROL", "energy"),

    # Programming / web / devops
    "python": ("Python", "programming"),
    "sql": ("SQL", "programming"),
    "java": ("Java", "programming"),
    "html": ("HTML", "web"),
    "bash": ("Bash", "programming"),
    "docker": ("Docker", "devops"),

    # Networking / security / IT management
    "cisco": ("Cisco", "networking"),
    "wireshark": ("Wireshark", "networking"),
    "kali": ("Kali Linux", "cybersecurity"),
    "nmap": ("Nmap", "cybersecurity"),
    "glpi": ("GLPI", "it_management"),

    # Management
    "gestion de projet": ("Project Management", "management"),
    "management": ("Management", "management"),
    "gestion des contrats": ("Contract Management", "management"),
    "ingénierie d'affaires": ("Business Engineering", "management"),

    # Energy domain
    "énergie solaire": ("Solar Energy", "energy"),
    "photovoltaïque": ("Photovoltaic", "energy"),
    "efficacité énergétique": ("Energy Efficiency", "energy"),
    "énergie renouvelable": ("Renewable Energy", "energy"),
    "dimensionnement": ("Sizing", "energy"),
    "audit énergétique": ("Energy Audit", "energy"),

    # Electrical engineering
    "génie électrique": ("Electrical Engineering", "engineering"),
    "électrotechnique": ("Electrotechnics", "engineering"),
    "câblage électrique": ("Electrical Wiring", "engineering"),
    "maintenance électrique": ("Electrical Maintenance", "engineering"),

    # Teaching / training
    "ingénierie pédagogique": ("Pedagogical Engineering", "education"),
    "référentiels de formation": ("Training Frameworks", "education"),
    "encadrement pédagogique": ("Pedagogical Supervision", "education"),
    "approche par compétences": ("Competency-Based Approach", "education"),

    # Collaboration tools and frameworks
    "teams": ("Microsoft Teams", "collaboration"),
    "zoom": ("Zoom", "collaboration"),
    "flask": ("Flask", "programming"),

    # Infrastructure
    "fibre optique": ("Fiber Optics", "networking"),
    "bim": ("BIM", "engineering"),
    "building information modeling": ("BIM", "engineering"),
    "active directory": ("Active Directory", "it_management"),

    # Spoken languages
    "français": ("French", "language"),
    "anglais": ("English", "language"),
    "allemand": ("German", "language"),
    "espagnol": ("Spanish", "language"),

    # Network services
    "dhcp": ("DHCP", "networking"),
    "dns": ("DNS", "networking"),
    "radius": ("RADIUS", "networking"),

    # Miscellaneous
    "orca ava": ("ORCA AVA", "specialized_software"),
    "supervision": ("Supervision", "management"),
    "qualité": ("Quality", "management"),
    "sécurité": ("Security", "management"),
    "hse": ("HSE", "management"),
    "qse": ("QSE", "management"),
}

In [983]:
def normalized_skills(skills_list, skill_map=None):
    """Map raw skill strings to their normalised label and category.

    Each raw skill is lower-cased, stripped of the characters - / ( ) . and
    has its whitespace collapsed before being looked up. Skills that are not
    in the lookup table are dropped.

    Args:
        skills_list: Iterable of raw skill strings. When empty or None,
            None is returned.
        skill_map: Optional lookup table {cleaned skill: (label, category)}.
            Defaults to the module-level SKILL_MAP.

    Returns:
        None when `skills_list` is empty, otherwise a list of
        {"normalized": label, "category": category} dicts in input order
        (possibly empty).
    """
    if not skills_list:
        return None

    lookup = SKILL_MAP if skill_map is None else skill_map

    def clean_skill(skill):
        without_punct = re.sub(r'[\-/().]', '', skill.lower())
        # Trim and collapse whitespace AFTER removing punctuation: otherwise
        # "Python -" would become "python " and miss the lookup.
        return " ".join(without_punct.split())

    normalized = []
    for skill in skills_list:
        entry = lookup.get(clean_skill(skill))
        if entry is not None:
            normalized.append({"normalized": entry[0], "category": entry[1]})

    return normalized

In [984]:
def extract_raw_experiences_1(text):
    """Extract experience entries by scanning `text` one line at a time.

    A non-blank line is kept when `exp_title_line_pattern` matches at its
    start. The text captured between parentheses is then searched with
    `exp_date_pattern` to split it into a start and an end date.

    Args:
        text: Text of the experience section.

    Returns:
        List of dicts with the keys "title", "company", "date", "start_date",
        "end_date" and "raw_title_line". "start_date" and "end_date" are None
        when the date text does not match `exp_date_pattern`.
    """
    experiences = []

    for raw_line in text.split('\n'):
        if not raw_line.strip():
            continue

        header = re.match(exp_title_line_pattern, raw_line)
        if header is None:
            continue

        period = header.group('dates').strip()
        span = re.search(exp_date_pattern, period)
        if span is None:
            start_date = end_date = None
        else:
            start_date = span.group('start').strip()
            end_date = span.group('end').strip()

        experiences.append({
            "title": header.group('title').strip(),
            "company": header.group('company').strip(),
            "date": period,
            "start_date": start_date,
            "end_date": end_date,
            "raw_title_line": raw_line.strip()
        })

    return experiences

In [985]:
def extract_raw_experiences_2(text):
    """Extract experience entries with a single multi-line scan of `text`.

    Every match of `exp_title_line_pattern` (compiled with re.MULTILINE) gives
    one entry. The captured date text is searched with `exp_date_pattern` to
    split it into a start and an end date.

    Args:
        text: Text of the experience section.

    Returns:
        List of dicts with the keys "title", "company", "date", "start_date",
        "end_date" and "raw_title_line". "raw_title_line" is rebuilt from the
        captured pieces as "<title> at <company> (<dates>)"; "start_date" and
        "end_date" are None when the date text does not match.
    """
    experiences = []

    for hit in re.finditer(exp_title_line_pattern, text, flags=re.MULTILINE):
        title, company, dates = hit.group('title', 'company', 'dates')

        span = re.search(exp_date_pattern, dates)
        start_date = span.group('start').strip() if span else None
        end_date = span.group('end').strip() if span else None

        entry = {
            "title": title.strip(),
            "company": company.strip(),
            "date": dates.strip(),
            "start_date": start_date,
            "end_date": end_date,
            "raw_title_line": f"{title} at {company} ({dates})"
        }
        experiences.append(entry)

    return experiences

In [986]:
def extract_raw_education_1(text):
    """Extract education entries by scanning `text` one line at a time.

    A non-blank line is kept when `edu_title_line_pattern` matches at its
    start. The text captured between parentheses is then searched with
    `edu_date_pattern` to split it into a start and an end year.

    Args:
        text: Text of the education section.

    Returns:
        List of dicts with the keys "school", "degree", "date", "start",
        "end" and "raw_title_line". "start" and "end" are None when the date
        text does not match `edu_date_pattern`.
    """
    educations = []

    for raw_line in text.split('\n'):
        if not raw_line.strip():
            continue

        header = re.match(edu_title_line_pattern, raw_line)
        if header is None:
            continue

        period = header.group('dates').strip()
        span = re.search(edu_date_pattern, period)
        if span is None:
            start_date = end_date = None
        else:
            start_date = span.group('start').strip()
            end_date = span.group('end').strip()

        educations.append({
            "school": header.group('school').strip(),
            "degree": header.group('degree').strip(),
            "date": period,
            "start": start_date,
            "end": end_date,
            "raw_title_line": raw_line.strip()
        })

    return educations

In [987]:
def extract_raw_education_2(text):
    """Extract education entries with a single multi-line scan of `text`.

    Every match of `edu_title_line_pattern` (compiled with re.MULTILINE) gives
    one entry. The captured date text is searched with `edu_date_pattern` to
    split it into a start and an end year.

    Args:
        text: Text of the education section.

    Returns:
        List of dicts with the keys "school", "degree", "date", "start",
        "end" and "raw_title_line". "raw_title_line" is rebuilt from the
        captured pieces as "<degree> at <school> (<dates>)"; "start" and
        "end" are None when the date text does not match.
    """
    educations = []

    for hit in re.finditer(edu_title_line_pattern, text, flags=re.MULTILINE):
        degree, school, dates = hit.group('degree', 'school', 'dates')

        span = re.search(edu_date_pattern, dates)
        start_date = span.group('start').strip() if span else None
        end_date = span.group('end').strip() if span else None

        entry = {
            "school": school.strip(),
            "degree": degree.strip(),
            "date": dates.strip(),
            "start": start_date,
            "end": end_date,
            "raw_title_line": f"{degree} at {school} ({dates})"
        }
        educations.append(entry)

    return educations

## Quality

In [988]:
def completness_score(data):
    """Store the share of filled fields in data["quality"]["completness_score"].

    Ten fields are checked for truthiness: the six personal fields
    (first_name, last_name, headline, email, phone, location) and the four
    top-level entries experience, education, skills and soft_skills.

    Args:
        data: Parsed CV dict; it is modified in place.

    Returns:
        The same `data` object, with the score set to filled / 10.
    """
    personal = data["personal"]
    checked_values = [
        personal["first_name"],
        personal["last_name"],
        personal["headline"],
        personal["email"],
        personal["phone"],
        personal["location"],
        data["experience"],
        data["education"],
        data["skills"],
        data["soft_skills"],
    ]

    filled = sum(1 for value in checked_values if value)
    data["quality"]["completness_score"] = filled / len(checked_values)

    return data

In [989]:
def fields_missing(data):
    """Store the names of the empty fields in data["quality"]["fields_missing"].

    The six personal fields are checked first, then the top-level entries
    experience, education, skills and soft_skills; a field is reported when
    its value is falsy.

    Args:
        data: Parsed CV dict; it is modified in place.

    Returns:
        The same `data` object, with the list of missing field names set.
    """
    personal_keys = ("first_name", "last_name", "headline", "email", "phone", "location")
    section_keys = ("experience", "education", "skills", "soft_skills")

    personal = data["personal"]
    absent = [key for key in personal_keys if not personal[key]]
    absent.extend(key for key in section_keys if not data[key])

    data["quality"]["fields_missing"] = absent

    return data

In [990]:
# TODO: implement field_confidence(parsed_data) (not written yet).

## Meta

In [991]:
def detect_language(text):
    """Return the language code detected for `text`, or None.

    Args:
        text: Text to analyse. May be None or empty (e.g. a PDF page without
            extractable text).

    Returns:
        The value returned by langdetect's `detect`, or None when the text is
        empty / whitespace-only or when langdetect cannot detect a language.
    """
    # langdetect raises on text without usable features; an undetectable
    # language is reported as None instead of aborting the whole parse.
    if not text or not text.strip():
        return None

    from langdetect.lang_detect_exception import LangDetectException

    try:
        return detect(text)
    except LangDetectException:
        return None

## Parser

In [None]:
def parse_cv(path):
    """Parse the CV stored in the PDF at `path` into a structured dict.

    NOTE: only the first page of the PDF is read.

    Args:
        path: Path of the PDF file.

    Returns:
        dict with the keys "personal", "experience", "education", "skills",
        "soft_skills", "quality" and "meta". "quality" is filled in by
        completness_score and fields_missing.
    """
    with pdfplumber.open(path) as pdf:
        text = pdf.pages[0].extract_text()

    sections = segment_text(text)
    first_name, last_name = extract_name(text)

    # Personal details are searched in the whole text, not only in the header.
    personal = {
        "first_name": first_name,
        "last_name": last_name,
        "headline": extract_field(headline_pattern, text),
        "email": extract_field(email_pattern, text),
        "phone": extract_field(phone_pattern, text, multi=True),
        "location": extract_field(location_pattern, text)
    }

    data = {
        "personal": personal,
        # Experience and education are parsed from their own section only.
        "experience": extract_raw_experiences_1(sections["experience"]),
        "education": extract_raw_education_1(sections["education"]),
        # Skills and soft skills are searched in the whole text.
        "skills": normalized_skills(extract_field(skills_pattern, text, multi=True)),
        "soft_skills": extract_field(soft_skills_pattern, text, multi=True),
        "quality": {
            "completness_score": None,
            "fields_missing": []
        },
        "meta": {
            "language": detect_language(text)
        }
    }

    return fields_missing(completness_score(data))

In [993]:
# TODO: implement field_confidence(parsed_data) (not written yet).

## Process all files

In [994]:
# Folder holding the CVs to process.
data_folder = "data"

# Names of the entries of data_folder ending with ".pdf" (case-insensitive).
pdf_files = list(filter(lambda name: name.lower().endswith('.pdf'), os.listdir(data_folder)))

In [995]:
# Create the output folders up front so a missing folder does not make every
# file fail.
os.makedirs("texts", exist_ok=True)
os.makedirs("json_files", exist_ok=True)

for pdf_file in pdf_files:
    pdf_path = os.path.join(data_folder, pdf_file)
    # splitext drops the extension whatever its case ("cv.PDF" -> "cv"),
    # consistent with the case-insensitive ".pdf" filter used to list files.
    stem = os.path.splitext(pdf_file)[0]

    try:
        with pdfplumber.open(pdf_path) as pdf:
            page = pdf.pages[0]
            # A page without extractable text is dumped as an empty file.
            text = page.extract_text() or ""

        # Dump the raw text of the first page; UTF-8 keeps accents and
        # bullets intact on every platform.
        with open(f"texts/output_{stem}.txt", "w", encoding="utf-8") as f:
            f.write(text)

        # Parse BEFORE opening the JSON file, so a parsing failure does not
        # leave an empty .json behind.
        parsed = parse_cv(pdf_path)
        with open(f"json_files/{stem}.json", "w", encoding="utf-8") as f:
            json.dump(parsed, f, indent=4, ensure_ascii=False)

    except Exception as e:
        # Best effort: report the failing file and continue with the next.
        print(f"Erreur avec {pdf_file}: {e}")