In [17]:
from pathlib import Path
from utils import *
import pandas as pd
import string
import re
import spacy
from pprint import pprint
from tqdm import tqdm
import fuzzywuzzy
from fuzzywuzzy import fuzz, process
from unidecode import unidecode
from datetime import timedelta


DATA_FOLDER = Path("../audio_database/big_db_test/")

### Identify and save personnal information from transcripts

In [58]:
def is_valid_name(name):
    """Check if the name is valid for matching."""
    cleaned_name = name.strip()
        # Exclude names that are too short
    if len(cleaned_name) <= 2:
        return False
    
    # Exclude specific non-name words or symbols
    non_name_keywords = ["sos", "oui", "allô", "médecin", "médecins", "bonjour", "covid"]  # Add more keywords as needed
    if cleaned_name.lower() in non_name_keywords:
        return False

    return True

def find_names_with_spacy(transcript):
    nlp = spacy.load('fr_dep_news_trf')
    doc = nlp(transcript)
    names = list(set([w.text for w in doc if w.pos_ == 'PROPN' and is_valid_name(w.text)]))
    
    return names

def month_name_to_number(month_name):
    month_names = ['janvier', 'février', 'mars', 'avril', 'mai', 'juin', 
                   'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre']
    try:
        return month_names.index(month_name.lower()) + 1
    except ValueError:
        return None
    

def extract_full_date(transcript):
    min_year = 1900

    # Pattern for numerical full date
    full_date_pattern = r'\b(0?[1-9]|[12][0-9]|3[01])[-/](0?[1-9]|1[0-2])[-/](\d{2}|\d{4})\b'
    
    # Pattern for full date with month name
    month_names = '|'.join(['janvier', 'février', 'mars', 'avril', 'mai', 'juin', 
                            'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'])
    date_with_month_pattern = r'\b(0?[1-9]|[12][0-9]|3[01])\s+(' + month_names + r')\s+(\d{4})\b'

    # Try to extract full date in numerical format
    full_dates = re.findall(full_date_pattern, transcript)
    formatted_dates = []
    if full_dates:
        for date in full_dates:
            day, month, year = date
            if len(year) == 2:  # Convert two-digit year to four digits
                year = "19" + year if year > "24" else "20" + year
            # Handle transcript erros (ex: 1568 instead of 1968)

            if int(year) < min_year:
                year = int("19" + year[-2:])

            formatted_dates.append(f"{day.zfill(2)}/{month.zfill(2)}/{str(year)}")
    
        return formatted_dates

    # Try to extract full date with month name
    dates_with_month = re.findall(date_with_month_pattern, transcript)
    if dates_with_month:
        for date in dates_with_month:
            day, month_name, year = date
            month_number = month_name_to_number(month_name)
            if month_number:
                formatted_dates.append(f"{day.zfill(2)}/{month_number:02d}/{year}")
        
        return formatted_dates

    # Extract just the year
    year_pattern = r'\b(19\d{2}|20\d{2})\b'
    years = re.findall(year_pattern, transcript)
    if years:
        year = int(years[0])
        if year < min_year:
            year = int("19" + year[-2:])
        return f"01/01/{str(year)}"  # Defaulting to '01/01/YYYY' for year-only cases

    return None

def extract_phone_numbers(transcript):
    phone_regex = r'(06|07)(\d{8})'

    transcript = transcript.replace(" ", "")
    transcript = transcript.translate(str.maketrans('', '', string.punctuation))
    match = re.search(phone_regex, transcript)
    match = match.group() if match else match
    return match

def extract_postcodes(transcript):
    # Existing pattern for postcodes like '75015' or '75-015'
    postcode_pattern = r'\b(75|77|78|91|92|93|94|95)(?:[-\s]?)(\d{3})\b'
    raw_postcodes = re.findall(postcode_pattern, transcript)

    # Additional pattern for postcodes written as '19ème'
    arrondissement_pattern = r'\b(\d{1,2})[eè]me\b'
    arrondissements = re.findall(arrondissement_pattern, transcript)

    # Formatting arrondissements to Paris postcodes (e.g., '75019' for '19ème')
    formatted_arrondissements = ['75' + arr.zfill(3) for arr in arrondissements]

    # Combining and returning all found postcodes
    return [''.join(match) for match in raw_postcodes] + formatted_arrondissements


def normalize_transcript(transcript):
    # Define a mapping of abbreviations to expansions
    abbreviations = {
        " dr ": " docteur ",
        " st ": " saint ",
        " ste ": " sainte ",
        # Add more abbreviations and their expansions as needed
    }
    # Ensure the address is a lowercase string
    normalized_transcript = unidecode(str(transcript).lower())
    # Replace abbreviations with their expanded form
    for abbr, expansion in abbreviations.items():
        normalized_transcript = normalized_transcript.replace(abbr, expansion)
    # Replace hyphens with spaces
    normalized_transcript = normalized_transcript.replace("-", " ")
    return normalized_transcript

def clean_trailing_words(address):
    stop_words = ['dans le', "c'est ca", " alors ", " a ", " et ", " avec ", '.', ',', '!', '?']

    for stop_word in stop_words:
        if stop_word in address:
            # Remove the stop word and anything that follows
            address = address.split(stop_word)[0]
            break
    return address.strip()

def extract_personal_info(data_folder):
    personal_info = {}

    # address_pattern = r'\b(\d+\s+(?:rue|rues|boulevard|avenue|place)\s+[a-zA-Zéèàêûôâîç\s-]+)\b'
    address_pattern = r'\b(\d+\s+(?:rue|rues|boulevard|avenue|place|square|villa|quai|allée|chaussée|passage)\s+[a-zA-Zéèàêûôâîç\s-]+(?:\'[a-zA-Zéèàêûôâîç\s-]+)?)\b'

    # address_pattern = r'\b\d+\s*(rue|rues|avenue|boulevard|place)\s*[^\d,]+\b'
    number_sequence_pattern = r'\b\d+\b'

    for file_path in tqdm(data_folder.rglob('*.txt')):
        with open(file_path, 'r', encoding='utf-8') as file:
            transcript = file.read()
            transcript = normalize_transcript(transcript)

            personal_info[str(Path(file_path.name))] = {
                'date_of_birth': {
                    'full_date': extract_full_date(transcript)
                },
                'addresses': list(set([clean_trailing_words(address) for address in re.findall(address_pattern, transcript)])),
                'postcodes': extract_postcodes(transcript),
                'phone_numbers': extract_phone_numbers(transcript),
                # 'names' : list(set(find_names_with_spacy(transcript))),
                # 'other_findings': re.findall(number_sequence_pattern, transcript),
            }
        
    return personal_info

personnal_data = extract_personal_info(DATA_FOLDER)

0it [00:00, ?it/s]

17162it [00:14, 1161.33it/s]


In [60]:
def calculate_personal_data_stats(personal_data):
    stats = {
        'total_files': 0,
        'non_empty_files': 0,
        'date_of_birth': 0,
        'addresses': 0,
        'postcodes': 0,
        'phone_numbers': 0
    }

    for _, info in personal_data.items():
        stats['total_files'] += 1
        non_empty = False

        for key, value in info.items():
            # Special handling for 'date_of_birth' as it contains a nested dictionary
            if key == 'date_of_birth':
                if value.get('full_date'):  # Check if 'full_date' is not empty
                    non_empty = True
                    stats[key] += 1
            elif value:  # Checks if the item is non-empty for other keys
                non_empty = True
                stats[key] += len(value) if isinstance(value, list) else 1

        if non_empty:
            stats['non_empty_files'] += 1

    return stats

# Usage
stats = calculate_personal_data_stats(personnal_data)
print(stats)

{'total_files': 17162, 'non_empty_files': 11580, 'date_of_birth': 5722, 'addresses': 8922, 'postcodes': 8489, 'phone_numbers': 2283}


In [82]:
def print_missing_keys(personnal_data, label_csv):
    missed_matches = {}
    # Get the unique values in the "file_path" column of label_csv
    label_file_paths = set(label_csv["file_path"])

    # Iterate over the keys in the personnal_data dictionary
    for key, info in personnal_data.items():
        # Check if the key is not present in the label_file_paths set
        if key not in label_file_paths and info['addresses']:
            print(info['addresses'])
            missed_matches[key] = info
    return missed_matches

label_csv = pd.read_csv("./output_labels.csv", sep=',', encoding='utf-8')

missed_matches = print_missing_keys(personnal_data, label_csv)
print(len(missed_matches))

['33 rue des ardennes']
['1 rue firmin genier']
['8 rue d o e']
['120 avenue du general de gaulle']
['75 rue saint charles']
['54 rue du theatre  alors', '54 rue du theatre']
['25 rue rennes neuquin']
['58 rue le thor']
['17 rue du faux bourg saint denis']
['16 rue frederic passy']
['72 avenue du dauphine']
['7 rue paul verlaine']
['145 avenue pasteur']
['158 rue victor hugo']
['49 rue de romainville']
['26 avenue de la resistance']
['78 rue michel ange']
['6 rue de la source bleue']
['217 rue de paris', "217 rue de paris vous m'avez dit"]
['4 avenue pasteur', '85 boulevard de port royal']
["22 avenue de l'orient", "22 avenue de l'omiere"]
['8 rue saint lucie']
['10 rue paulin talabot']
['7 rue bois le vent']
['5 rue remy dunissel']
["10 rue d'eterne"]
['17 rue voisin']
['29 boulevard mortier']
['104 rue regnault']
['32 avenue de la mode piquee']
["44 rue de l'egalite"]
['44 rue du bac']
['11 rue edmond roger']
['40 rue vosgelat']
['3 rues emile zola']
['6 rue robert desnos']
['1 rue g

### Data matching

In [86]:
import pandas as pd
from unidecode import unidecode
from fuzzywuzzy import process, fuzz
import re

def extract_road_number(address):
    match = re.search(r'\b\d+\b', address)
    return match.group(0) if match else None

def road_numbers_match(csv_road_number, input_road_number):
    # If the CSV address has 'nan' for the road number, we treat it as a match
    if csv_road_number == 'nan':
        return True
    # Check if the input road number is contained within the CSV road number or vice versa
    return input_road_number in csv_road_number or csv_road_number in input_road_number

def find_matching_row(csv_path, data_dict):
    # Define the columns to read
    cols_to_use = ['Date', 'Annee', 'Mois', 'Jour', 'Heure', 'Min', 'FullAdresse', 'TelPatient', 'DateNaiss', 'CodePostal', 'Nom', 'Prenom', 'Devenir', 'NumRue']

    # Read the CSV just once
    df = pd.read_csv(csv_path, usecols=cols_to_use, sep=',', encoding='utf-8')
    df.dropna(subset=['Devenir'],inplace=True)

    df.fillna({'Annee': 0, 'Mois': 0, 'Jour': 0, 'Heure': 0, 'Min': 0, 'Sec': 0}, inplace=True)
    df['DateTime'] = pd.to_datetime(df['Annee'].astype(int).astype(str) + '-' +
                                    df['Mois'].astype(int).astype(str).str.zfill(2) + '-' +
                                    df['Jour'].astype(int).astype(str).str.zfill(2) + ' ' +
                                    df['Heure'].astype(int).astype(str).str.zfill(2) + ':' +
                                    df['Min'].astype(int).astype(str).str.zfill(2),
                                    errors='coerce')
    df['DateNaiss'] = pd.to_datetime(df['DateNaiss'], format='%Y-%m-%d', errors='coerce')

    # Preprocess 'FullAdresse'
    df['FullAdresse'] = df['FullAdresse'].apply(lambda x: unidecode(str(x).lower()).replace("-", " "))

    # Prepare results dictionary
    results = {}

    for file_path, info in tqdm(data_dict.items()):
        # Extract and convert full datetime from file_path
        datetime_parts = file_path.split('_')[:6]
        datetime_str = '-'.join(datetime_parts[:3]) + ' ' + ':'.join(datetime_parts[3:6])
        file_datetime = pd.to_datetime(datetime_str)

        # Calculate 4-hour window
        end_time = file_datetime + timedelta(hours=2)

        # Filter the dataframe by the 4-hour window
        date_filtered_df = df[(df['DateTime'] >= file_datetime) & (df['DateTime'] <= end_time)]

        if info.get('phone_numbers'):
            transcript_phone_number = info['phone_numbers']
            medical_db_phone_number = date_filtered_df['TelPatient'].astype('str').str.contains(transcript_phone_number, na=False)
            match = date_filtered_df[medical_db_phone_number]
            if not match.empty:
                results[file_path] = (match, "phone_numbers")
                continue

        if info.get('addresses'):
            for address in info['addresses']:
                list_adresse = date_filtered_df['FullAdresse'].tolist()
                best_match = process.extractOne(address, list_adresse, scorer=fuzz.token_sort_ratio)
                if best_match:
                    # print(file_path, address, best_match)

                    # If we have a perfect match, we're done
                    if best_match[1] == 100:
                        match = date_filtered_df[date_filtered_df['FullAdresse'].str.contains(re.escape(best_match[0]), na=False)]
                        if not match.empty:
                            results[file_path] = (match, "addresses")
                            break

                    # For high-scoring matches, we check the road numbers
                    elif best_match[1] > 80:
                        input_road_number = extract_road_number(address)
                        csv_road_number = extract_road_number(best_match[0]) or 'nan'  # Treat missing numbers as 'nan'
                        if road_numbers_match(csv_road_number, input_road_number):
                            match = date_filtered_df[date_filtered_df['FullAdresse'].str.contains(re.escape(best_match[0]), na=False)]
                            if not match.empty:
                                results[file_path] = (match, "addresses")
                                break

        if info.get('date_of_birth', {}).get('full_date'):
            for dob in info['date_of_birth']['full_date']:
                dob = pd.to_datetime(dob, dayfirst=True, errors='coerce')
                if dob is not pd.NaT:
                    match = date_filtered_df[date_filtered_df['DateNaiss'] == dob]
                    if not match.empty:
                        results[file_path] = (match, "date_of_birth")
                        continue

        if info.get('postcodes'):
            for postcode in info['postcodes']:
                match = date_filtered_df[date_filtered_df['CodePostal'] == str(postcode)]
                if not match.empty:
                    results[file_path] = (match, "postcodes")
                    break

    return results

# Example usage:
medical_db_path = "../audio_database/base_medicale//BaseMed2022LR_cleaned.csv"

matches = find_matching_row(medical_db_path, missed_matches)
print(len(matches))
# for file_path, matched_rows in matches.items():
#     print(f"File: {file_path}")
#     print(matched_rows[0]["IdAppel"], matched_rows[1])

  df = pd.read_csv(csv_path, usecols=cols_to_use, sep=',', encoding='utf-8')
100%|██████████| 4576/4576 [00:11<00:00, 388.42it/s]

2





In [12]:
label_csv = pd.read_csv("./output_labels.csv", sep=',', encoding='utf-8')

# Group the data by the "Devenir" column and sample the same number of samples for each value
min_samples = min(len(label_csv[label_csv['Devenir'] == 0]), len(label_csv[label_csv['Devenir'] == 1]))
sampled_df = label_csv.groupby('Devenir').apply(lambda x: x.sample(n=min_samples))

# Reset the index of the sampled dataframe
sampled_df.reset_index(drop=True, inplace=True)

# Save the dataframe to a CSV file
sampled_df.to_csv('sampled_data.csv', index=False)

In [81]:
print(f"On réussi a matcher {len(matches)} fichiers sur {stats['total_files']} au total, soit : {len(matches) / stats['total_files'] * 100:.02f} % de la base")
print(f"On réussi a matcher {len(matches)} fichiers sur {stats['non_empty_files']} fichier ayant des infos personnelles récupèrable, soit : {len(matches) / stats['non_empty_files'] * 100:.02f} % de la base")
print(f"On réussi a matcher {len(matches)} fichiers sur {stats['addresses']} fichier avec adresse récupèrable, soit : {len(matches) / stats['addresses'] * 100:.02f} % de la base")

On réussi a matcher 4128 fichiers sur 17162 au total, soit : 24.05 % de la base
On réussi a matcher 4128 fichiers sur 11351 fichier ayant des infos personnelles récupèrable, soit : 36.37 % de la base
On réussi a matcher 4128 fichiers sur 8425 fichier avec adresse récupèrable, soit : 49.00 % de la base
