In [1]:
from pathlib import Path
from utils import *
import pandas as pd
import pyodbc
import string
import re
import spacy
from pprint import pprint
from tqdm import tqdm

DATA_FOLDER = Path("C:/Users/Jean-BaptistePERNEY/Documents/ECHO/DATA/test_1h/raw_transcriptions/")

### Identify and save personnal information from transcripts

In [12]:
def is_valid_name(name):
    """Check if the name is valid for matching."""
    cleaned_name = name.strip()
        # Exclude names that are too short
    if len(cleaned_name) <= 2:
        return False
    
    # Exclude specific non-name words or symbols
    non_name_keywords = ["sos", "oui", "allô", "médecin", "médecins", "bonjour", "covid"]  # Add more keywords as needed
    if cleaned_name.lower() in non_name_keywords:
        return False

    return True

def find_names_with_spacy(transcript):
    nlp = spacy.load('fr_dep_news_trf')
    doc = nlp(transcript)
    names = list(set([w.text for w in doc if w.pos_ == 'PROPN' and is_valid_name(w.text)]))
    
    return names

def month_name_to_number(month_name):
    month_names = ['janvier', 'février', 'mars', 'avril', 'mai', 'juin', 
                   'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre']
    try:
        return month_names.index(month_name.lower()) + 1
    except ValueError:
        return None
    

def extract_full_date(transcript):
    min_year = 1900

    # Pattern for numerical full date
    full_date_pattern = r'\b(0?[1-9]|[12][0-9]|3[01])[-/](0?[1-9]|1[0-2])[-/](\d{2}|\d{4})\b'
    
    # Pattern for full date with month name
    month_names = '|'.join(['janvier', 'février', 'mars', 'avril', 'mai', 'juin', 
                            'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'])
    date_with_month_pattern = r'\b(0?[1-9]|[12][0-9]|3[01])\s+(' + month_names + r')\s+(\d{4})\b'

    # Try to extract full date in numerical format
    full_dates = re.findall(full_date_pattern, transcript)
    for date in full_dates:
        day, month, year = date
        if len(year) == 2:  # Convert two-digit year to four digits
            year = "19" + year if year > "24" else "20" + year
         # Handle transcript erros (ex: 1568 instead of 1968)

        if int(year) < min_year:
            year = int("19" + year[-2:])

        formatted_date = f"{day.zfill(2)}/{month.zfill(2)}/{str(year)}"
        return formatted_date

    # Try to extract full date with month name
    dates_with_month = re.findall(date_with_month_pattern, transcript)
    for date in dates_with_month:
        day, month_name, year = date
        month_number = month_name_to_number(month_name)
        if month_number:
            formatted_date = f"{day.zfill(2)}/{month_number:02d}/{year}"
            return formatted_date

    # Extract just the year
    year_pattern = r'\b(19\d{2}|20\d{2})\b'
    years = re.findall(year_pattern, transcript)
    if years:
        year = int(years[0])
        if year < min_year:
            year = int("19" + year[-2:])
        return f"01/01/{str(year)}"  # Defaulting to '01/01/YYYY' for year-only cases

    return None

def extract_phone_numbers(transcript):
    phone_regex = r'(06|07)(\d{8})'

    transcript = transcript.replace(" ", "")
    transcript = transcript.translate(str.maketrans('', '', string.punctuation))
    match = re.search(phone_regex, transcript)
    match = match.group() if match else match
    return match

def extract_postcodes(transcript):
    # Existing pattern for postcodes like '75015' or '75-015'
    postcode_pattern = r'\b(75|77|78|91|92|93|94|95)(?:[-\s]?)(\d{3})\b'
    raw_postcodes = re.findall(postcode_pattern, transcript)

    # Additional pattern for postcodes written as '19ème'
    arrondissement_pattern = r'\b(\d{1,2})[eè]me\b'
    arrondissements = re.findall(arrondissement_pattern, transcript)

    # Formatting arrondissements to Paris postcodes (e.g., '75019' for '19ème')
    formatted_arrondissements = ['75' + arr.zfill(3) for arr in arrondissements]

    # Combining and returning all found postcodes
    return [''.join(match) for match in raw_postcodes] + formatted_arrondissements

def extract_personal_info(data_folder):
    personal_info = {}

    address_pattern = r'\b(\d+\s+(?:rue|rues|boulevard|avenue)\s+[a-zA-Zéèàêûôâîç\s-]+)\b'
    number_sequence_pattern = r'\b\d+\b'

    for file_path in tqdm(data_folder.rglob('*.txt')):
        with open(file_path, 'r', encoding='utf-8') as file:
            transcript = file.read()

            personal_info[str(file_path)] = {
                'date_of_birth': {
                    'full_date': extract_full_date(transcript)
                },
                'addresses': re.findall(address_pattern, transcript),
                'postcodes': extract_postcodes(transcript),
                'phone_numbers': extract_phone_numbers(transcript),
                'names' : list(set(find_names_with_spacy(transcript))),
                # 'other_findings': re.findall(number_sequence_pattern, transcript),
            }
        
    return personal_info

personnal_data = extract_personal_info(DATA_FOLDER)
pprint(personnal_data)

75it [02:01,  1.62s/it]

{'C:\\Users\\Jean-BaptistePERNEY\\Documents\\ECHO\\DATA\\test_1h\\raw_transcriptions\\2022_1_4_10_10_16_ch11\\2022_1_4_10_10_16_ch11_transcription.txt': {'addresses': ['7 '
                                                                                                                                                                        'rue '
                                                                                                                                                                        'de '
                                                                                                                                                                        'Champagne'],
                                                                                                                                                          'date_of_birth': {'full_date': '15/05/1982'},
                                                                                                




### Load telecom log database

In [4]:
def load_mdb_data(file_path, table_name="InboundVoiceCalls"):
    try:
        # Connection string for MDB file
        conn_str = f'DRIVER={{Microsoft Access Driver (*.mdb, *.accdb)}};DBQ={file_path};'
        
        # Establishing connection to the MDB file
        conn = pyodbc.connect(conn_str)
        cursor = conn.cursor()
        
        # Reading each table into a pandas DataFrame
        df = pd.read_sql(f'SELECT * FROM [{table_name}]', conn)
        
        cursor.close()
        conn.close()

        return df
        
    except Exception as e:
        print(f"Error occurred: {e}")
        return None

file_path = "/Users/Jean-BaptistePERNEY/Documents/ECHO/DATA/telecom_log_2022/01022022.mdb"
telecom_log_df = load_mdb_data(file_path, table_name="InboundVoiceCalls")

telecom_log_df = telecom_log_df[telecom_log_df['CallAgentCommunicationDuration'] != 0]
telecom_log_df = telecom_log_df[telecom_log_df['CallServiceID'] != 'SOS']

columns_to_keep = ["CallTime", "CallCLID", "AgentID", "CallAgentCommunicationDuration"]
telecom_log_df = telecom_log_df[columns_to_keep]

telecom_log_df.head()

  df = pd.read_sql(f'SELECT * FROM [{table_name}]', conn)


Unnamed: 0,CallTime,CallCLID,AgentID,CallAgentCommunicationDuration
38,2021-12-11 08:24:43,667513808,martynowski2438,118
49,2021-12-11 09:00:07,689634034,martynowski2438,99
72,2021-12-11 10:15:33,614477661,martynowski2438,165
100,2021-12-11 10:53:13,783847741,martynowski2438,224
103,2021-12-11 10:57:41,619819855,martynowski2438,115


In [33]:
personnal_data.keys()

dict_keys(['C:\\Users\\Jean-BaptistePERNEY\\Documents\\ECHO\\DATA\\test_1h\\raw_transcriptions\\2022_1_4_10_10_16_ch11\\2022_1_4_10_10_16_ch11_transcription.txt', 'C:\\Users\\Jean-BaptistePERNEY\\Documents\\ECHO\\DATA\\test_1h\\raw_transcriptions\\2022_1_4_10_10_8_ch9\\2022_1_4_10_10_8_ch9_transcription.txt', 'C:\\Users\\Jean-BaptistePERNEY\\Documents\\ECHO\\DATA\\test_1h\\raw_transcriptions\\2022_1_4_10_11_3_ch17\\2022_1_4_10_11_3_ch17_transcription.txt', 'C:\\Users\\Jean-BaptistePERNEY\\Documents\\ECHO\\DATA\\test_1h\\raw_transcriptions\\2022_1_4_10_11_41_ch11\\2022_1_4_10_11_41_ch11_transcription.txt', 'C:\\Users\\Jean-BaptistePERNEY\\Documents\\ECHO\\DATA\\test_1h\\raw_transcriptions\\2022_1_4_10_12_49_ch11\\2022_1_4_10_12_49_ch11_transcription.txt', 'C:\\Users\\Jean-BaptistePERNEY\\Documents\\ECHO\\DATA\\test_1h\\raw_transcriptions\\2022_1_4_10_13_33_ch30\\2022_1_4_10_13_33_ch30_transcription.txt', 'C:\\Users\\Jean-BaptistePERNEY\\Documents\\ECHO\\DATA\\test_1h\\raw_transcriptions

In [40]:
df = pd.read_csv(medical_db_path, delimiter=';', dayfirst=True)

# Explicitly convert the 'DateAppel' column to datetime objects
df['DateAppel'] = pd.to_datetime(df['DateAppel'], errors='coerce', dayfirst=True)

# Drop any rows where 'DateAppel' could not be converted
df = df.dropna(subset=['DateAppel'])
file_name = 'C:\\Users\\Jean-BaptistePERNEY\\Documents\\ECHO\\DATA\\test_1h\\raw_transcriptions\\2022_1_4_10_10_16_ch11\\2022_1_4_10_10_16_ch11_transcription.txt'
timestamp_parts = file_name.split('\\')[-2].split('_')
print(timestamp_parts)
date_str = f"{timestamp_parts[0]}-{timestamp_parts[1]}-{timestamp_parts[2]}"
search_date = pd.to_datetime(date_str).date()
print(search_date)
# Filter the dataframe by the date
date_filtered_df = df[df['DateAppel'].dt.date == search_date]
date_filtered_df

['2022', '1', '4', '10', '10', '16', 'ch11']
2022-01-04


  df = pd.read_csv(medical_db_path, delimiter=';', dayfirst=True)


Unnamed: 0,IdAppel,TelAppelant,TelPatient,IdUtilisateur_DmdAmbu,IdDevenir,LientPatient,IdProvenance,NumDossier,LettreMedTr,DateAppel,...,SignePositif,PatientVIP,IdMoyen,CommentaireSamu,IdAppelDiagnostic,IdAppel.1,IdUtilisateur_C,DH_Creation,IdDiagnostic,IdDiagnostic.1
1379,6355940,07.66.53.49.51,07.66.53.49.51,0.0,1.0,,71,,0,2022-01-04,...,,0.0,,,3873183,6355940,1151,04/01/2022 02:16,672,672
1380,6355941,06.60.84.32.10,06.60.84.32.10,0.0,1.0,,71,,0,2022-01-04,...,,0.0,,,3873203,6355941,1194,04/01/2022 03:22,381,381
1381,6355951,06.50.15.74.50,06.50.15.74.50,0.0,1.0,,71,,0,2022-01-04,...,,0.0,,,3873208,6355951,1110,04/01/2022 03:33,1578,1578
1382,6355959,07.86.09.49.77,07.86.09.49.77,0.0,1.0,,71,,0,2022-01-04,...,,0.0,,,3873219,6355959,1151,04/01/2022 05:19,1396,1396
1383,6355962,06.62.69.31.42,06.62.69.31.42,0.0,1.0,,71,,0,2022-01-04,...,,0.0,,,3873210,6355962,1194,04/01/2022 03:53,381,381
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1946,6357876,06.44.89.19.99,06.44.89.19.99,0.0,1.0,,71,,0,2022-01-04,...,,0.0,,,3874850,6357876,1110,05/01/2022 02:24,1097,1097
1947,6357887,06.86.97.72.62,06.86.97.72.62,0.0,1.0,,71,,0,2022-01-04,...,,0.0,,,3874835,6357887,1151,05/01/2022 01:01,677,677
1948,6357891,06.58.59.90.70,06.58.59.90.70,0.0,1.0,,71,,0,2022-01-04,...,,0.0,,,3874836,6357891,1110,05/01/2022 01:03,826,826
1949,6357893,07.61.89.22.62,07.61.89.22.62,0.0,1.0,,71,,0,2022-01-04,...,,0.0,,,3874829,6357893,1151,05/01/2022 00:24,670,670


In [37]:
df['DateAppel']

0       2022-01-01
1       2022-01-01
2       2022-01-01
3       2022-01-01
4       2022-01-01
           ...    
15821   2022-01-31
15824   2022-01-31
15825   2022-01-31
15826   2022-01-31
15829   2022-01-31
Name: DateAppel, Length: 13945, dtype: datetime64[ns]

### Data matching

In [45]:
import pandas as pd
from datetime import datetime
import re

def find_matching_row(csv_path, data_dict):
    # Read the CSV just once
    df = pd.read_csv(csv_path, delimiter=';', dayfirst=True)

    # Explicitly convert the 'DateAppel' column to datetime objects
    df['DateAppel'] = pd.to_datetime(df['DateAppel'], errors='coerce', dayfirst=True)

    # Drop any rows where 'DateAppel' could not be converted
    df = df.dropna(subset=['DateAppel'])

    # Prepare results dictionary
    results = {}

    for file_path, info in data_dict.items():
        # Parse the timestamp from the file_path
        timestamp_parts = file_path.split('\\')[-2].split('_')
        date_str = f"{timestamp_parts[0]}-{timestamp_parts[1]}-{timestamp_parts[2]}"
        search_date = pd.to_datetime(date_str).date()
        
        # Filter the dataframe by the date
        date_filtered_df = df[df['DateAppel'].dt.date == search_date]

        # Attempt to match phone number
        if 'phone_numbers' in info and info['phone_numbers']:
            phone_number = info['phone_numbers']
            match = date_filtered_df[date_filtered_df['TelAppelant'].str.contains(phone_number, na=False)]
            if not match.empty:
                results[file_path] = (match, "phone_numbers")
                continue  # Found a match with phone number, skip to next entry

        # Attempt to match addresses
        if 'addresses' in info and info['addresses']:
            for address in info['addresses']:
                match = date_filtered_df[date_filtered_df['Adresse'].str.contains(address, na=False)]
                if not match.empty:
                    results[file_path] = (match, "addresses")
                    break  # Found a match with address, skip to next entry

        # Attempt to match postcodes
        if 'postcodes' in info and info['postcodes']:
            for postcode in info['postcodes']:
                match = date_filtered_df[date_filtered_df['CodePostal'] == postcode]
                if not match.empty:
                    results[file_path] = (match, "postcodes")
                    break  # Found a match with postcode, skip to next entry

        # Attempt to match date of birth
        if 'date_of_birth' in info and info['date_of_birth']:
            dob = pd.to_datetime(info['date_of_birth']['full_date'], dayfirst=True, errors='coerce')
            if dob is not None:
                match = date_filtered_df[date_filtered_df['DateNaiss'] == dob.date()]
                if not match.empty:
                    results[file_path] = (match, "date_of_birth")
                    continue  # Found a match with date of birth, skip to next entry

        # Attempt to match names
        if 'names' in info and info['names']:
            for name in info['names']:
                match = date_filtered_df[(date_filtered_df['Nom'].str.contains(name, na=False)) | 
                                         (date_filtered_df['Prenom'].str.contains(name, na=False))]
                if not match.empty:
                    results[file_path] = (match, "names")
                    break  # Found a match with name, skip to next entry

    return results

# Example usage:
medical_db_path = "/Users/Jean-BaptistePERNEY/Documents/ECHO/DATA/medical_database_2022/calls_diag/janvier 2022 appels avec diag.csv"

matches = find_matching_row(medical_db_path, personnal_data)
print(len(matches))
for file_path, matched_rows in matches.items():
    print(f"File: {file_path}")
    print(matched_rows[0]["IdAppel"], matched_rows[1])

  df = pd.read_csv(csv_path, delimiter=';', dayfirst=True)


26
File: C:\Users\Jean-BaptistePERNEY\Documents\ECHO\DATA\test_1h\raw_transcriptions\2022_1_4_10_11_3_ch17\2022_1_4_10_11_3_ch17_transcription.txt
1609    6356541
Name: IdAppel, dtype: int64 names
File: C:\Users\Jean-BaptistePERNEY\Documents\ECHO\DATA\test_1h\raw_transcriptions\2022_1_4_10_13_33_ch30\2022_1_4_10_13_33_ch30_transcription.txt
1529    6356274
1777    6357278
Name: IdAppel, dtype: int64 names
File: C:\Users\Jean-BaptistePERNEY\Documents\ECHO\DATA\test_1h\raw_transcriptions\2022_1_4_10_17_47_ch9\2022_1_4_10_17_47_ch9_transcription.txt
1512    6356232
Name: IdAppel, dtype: int64 names
File: C:\Users\Jean-BaptistePERNEY\Documents\ECHO\DATA\test_1h\raw_transcriptions\2022_1_4_10_18_31_ch15\2022_1_4_10_18_31_ch15_transcription.txt
1608    6356533
Name: IdAppel, dtype: int64 names
File: C:\Users\Jean-BaptistePERNEY\Documents\ECHO\DATA\test_1h\raw_transcriptions\2022_1_4_10_18_4_ch11\2022_1_4_10_18_4_ch11_transcription.txt
1611    6356545
1642    6356647
1643    6356647
1900    6