In [47]:
from pathlib import Path
from utils import *
import string
import re
import spacy
from pprint import pprint

# Folder handed to extract_personal_info(), which searches it recursively
# for transcript ``.txt`` files.
DATA_FOLDER = Path('../audio_database/personnal_data_test/')

In [51]:
def is_valid_name(name):
    """Tell whether a token is acceptable as a name candidate.

    The token is rejected when, once surrounding whitespace is removed,
    it has two characters or fewer, or when its lowercase form is one of
    the known non-name words listed below.
    """
    candidate = name.strip()

    # Very short tokens are never kept.
    if len(candidate) <= 2:
        return False

    # Words that must not be reported as names; extend as needed.
    rejected_words = (
        "sos", "oui", "allô", "médecin", "médecins", "bonjour", "covid",
    )
    return candidate.lower() not in rejected_words

# Loaded spaCy pipelines keyed by model name, so the model is read from
# disk only once per process instead of once per transcript.
_SPACY_PIPELINES = {}


def _get_spacy_pipeline(model_name='fr_dep_news_trf'):
    """Return the spaCy pipeline for *model_name*, loading it on first use."""
    if model_name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[model_name] = spacy.load(model_name)
    return _SPACY_PIPELINES[model_name]


def find_names_with_spacy(transcript):
    """Return the distinct proper nouns found in *transcript*.

    The text is run through the ``fr_dep_news_trf`` spaCy pipeline; every
    token tagged ``PROPN`` that also passes ``is_valid_name`` is kept.

    Args:
        transcript: Text to analyse.

    Returns:
        A list of unique token texts, in order of first appearance.
    """
    nlp = _get_spacy_pipeline()
    doc = nlp(transcript)
    candidates = (
        token.text
        for token in doc
        if token.pos_ == 'PROPN' and is_valid_name(token.text)
    )
    # dict.fromkeys removes duplicates while keeping a deterministic order.
    return list(dict.fromkeys(candidates))

def month_name_to_number(month_name):
    """Map a French month name to its 1-based number.

    The lookup ignores letter case. ``None`` is returned when the name is
    not one of the twelve month names listed below.
    """
    ordered_months = (
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre',
    )
    number_by_name = {
        label: position for position, label in enumerate(ordered_months, start=1)
    }
    return number_by_name.get(month_name.lower())
    

def extract_full_date(transcript):
    """Extract the first date found in *transcript* as ``DD/MM/YYYY``.

    Three formats are tried in order and the first hit wins:

    1. Numeric dates such as ``5/3/68`` or ``12-11-1968``.
    2. Dates with a French month name such as ``3 mars 1975``; the month
       name is matched case-insensitively.
    3. A bare four-digit year between 1900 and 2099, reported as
       ``01/01/YYYY``.

    Args:
        transcript: Text to search.

    Returns:
        The formatted date string, or ``None`` when nothing matches.
    """
    min_year = 1900
    month_names = ['janvier', 'février', 'mars', 'avril', 'mai', 'juin',
                   'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre']
    month_numbers = {name: number for number, name in enumerate(month_names, start=1)}

    numeric_pattern = r'\b(0?[1-9]|[12][0-9]|3[01])[-/](0?[1-9]|1[0-2])[-/](\d{2}|\d{4})\b'
    named_pattern = (r'\b(0?[1-9]|[12][0-9]|3[01])\s+('
                     + '|'.join(month_names) + r')\s+(\d{4})\b')
    year_pattern = r'\b(19\d{2}|20\d{2})\b'

    # 1. Numeric date: only the first occurrence is used.
    numeric_match = re.search(numeric_pattern, transcript)
    if numeric_match:
        day, month, year = numeric_match.groups()
        if len(year) == 2:
            # Two-digit years above 24 are read as 19xx, the others as 20xx.
            year = ("19" if year > "24" else "20") + year
        if int(year) < min_year:
            # Transcription errors (e.g. 1568 instead of 1968): keep the
            # last two digits and force the 20th century.
            year = "19" + year[-2:]
        return f"{day.zfill(2)}/{month.zfill(2)}/{year}"

    # 2. Date with a month name. IGNORECASE lets "3 Mars 1975" match; the
    # dictionary lookup guards against exotic case-insensitive matches
    # whose lowercase form is not one of the listed names.
    for named_match in re.finditer(named_pattern, transcript, flags=re.IGNORECASE):
        day, month_name, year = named_match.groups()
        month_number = month_numbers.get(month_name.lower())
        if month_number:
            return f"{day.zfill(2)}/{month_number:02d}/{year}"

    # 3. Year only. The pattern already restricts the year to 1900-2099,
    # so no min_year correction is needed here.
    year_match = re.search(year_pattern, transcript)
    if year_match:
        return f"01/01/{int(year_match.group(1))}"  # Default to 1 January

    return None

def extract_phone_numbers(transcript):
    """Find the first mobile-style number (06/07 followed by 8 digits).

    Spaces and ASCII punctuation are removed from the text before
    searching, so a number written as ``06 12 34 56 78`` or
    ``06.12.34.56.78`` is found as ``0612345678``.

    Returns the ten-digit string of the first match, or ``None`` when
    there is no match.
    """
    # One translate pass drops both the spaces and the punctuation.
    removal_table = str.maketrans('', '', ' ' + string.punctuation)
    compact_text = transcript.translate(removal_table)

    found = re.search(r'(06|07)(\d{8})', compact_text)
    if found is None:
        return None
    return found.group()

def extract_postcodes(transcript):
    """Collect postcodes mentioned in *transcript*.

    Two forms are recognised:

    * Five-digit codes starting with 75, 77, 78, 91, 92, 93, 94 or 95,
      optionally written with a hyphen or whitespace after the first two
      digits (``75015``, ``75-015``).
    * Paris arrondissements written as an ordinal (``19ème``, ``5eme``),
      which are converted to the matching ``750NN`` code.

    Args:
        transcript: Text to search.

    Returns:
        A list of five-digit strings: explicit postcodes first, then the
        codes derived from arrondissements, each in order of appearance.
    """
    postcode_pattern = r'\b(75|77|78|91|92|93|94|95)(?:[-\s]?)(\d{3})\b'
    postcodes = [
        department + suffix
        for department, suffix in re.findall(postcode_pattern, transcript)
    ]

    arrondissement_pattern = r'\b(\d{1,2})[eè]me\b'
    for number in re.findall(arrondissement_pattern, transcript):
        # Paris has arrondissements 1 to 20 only; other ordinals (e.g.
        # "50ème anniversaire") would yield postcodes that do not exist.
        if 1 <= int(number) <= 20:
            postcodes.append('75' + number.zfill(3))

    return postcodes

def extract_personal_info(data_folder):
    """Build a per-file summary of personal data found in transcripts.

    Every ``*.txt`` file below *data_folder* (searched recursively) is read
    as UTF-8 and passed to the individual extractors.

    Returns a dict keyed by the file path as a string. Each value is a
    dict with the keys ``date_of_birth`` (itself a dict holding
    ``full_date``), ``addresses``, ``postcodes``, ``phone_numbers`` and
    ``names``.
    """
    # A street number, a street-type keyword, then the street name.
    street_regex = r'\b(\d+\s+(?:rue|rues|boulevard|avenue)\s+[a-zA-Zéèàêûôâîç\s-]+)\b'

    results = {}
    for txt_path in data_folder.rglob('*.txt'):
        text = txt_path.read_text(encoding='utf-8')

        birth_date = {'full_date': extract_full_date(text)}
        street_hits = re.findall(street_regex, text)
        unique_names = list(set(find_names_with_spacy(text)))

        results[str(txt_path)] = {
            'date_of_birth': birth_date,
            'addresses': street_hits,
            'postcodes': extract_postcodes(text),
            'phone_numbers': extract_phone_numbers(text),
            'names': unique_names,
        }

    return results

# Run the extraction over every transcript found under DATA_FOLDER and
# pretty-print the resulting {file path: extracted fields} mapping.
personnal_data = extract_personal_info(DATA_FOLDER)
pprint(personnal_data)

{'..\\audio_database\\personnal_data_test\\2023_1_9_7_15_2_ch30\\2023_1_9_7_15_2_ch30_transcription.txt': {'addresses': [],
                                                                                                           'date_of_birth': {'full_date': None},
                                                                                                           'names': ['Paris',
                                                                                                                     'Bourges'],
                                                                                                           'phone_numbers': None,
                                                                                                           'postcodes': []},
 '..\\audio_database\\personnal_data_test\\2023_1_9_7_17_26_ch30\\2023_1_9_7_17_26_ch30_transcription.txt': {'addresses': [],
                                                                                               