In [1]:
#%load_ext autoreload
#%autoreload 2

In [2]:
TABLE_FOLDER_PATH = "../tables"
GROUND_TRUTH_PATH   = "../gold/cta_gt.csv"
LANGUAGE_METADATA_PATH = "../gold/language_metadata.csv"
OUTPUT_PATH = "../sherlock_data_processing"

In [5]:
train_ratio= 0.8
val_ratio= 0.1
test_ratio= 0.1

In [6]:
import os
import shutil
import numpy as np
import pandas as pd


from data_processing_sherlock import DataProcessingSherlock

### Ground truth label processing
Modifying ground truth labels to one standard. Removing labels that counts is less than 7 since they are creating noise and could not be correctly disrtibuted to the train/test/val

In [7]:
LABEL_MAP = {
    # Date
    "Vaccination_date": "Date",
    "Date_report":"Date",
    "Date_onset":  "Date",
    "Date_confirmation": "Date",
    "Date_of_first_consultation":"Date",
    "Date_hospitalisation":  "Date",
    "Date_discharge_hospital": "Date",
    "Date_admission_ICU":   "Date",
    "Date_discharge_ICU":  "Date",
    "Date_isolation":  "Date",
    "Date_death":  "Date",
    "Date_recovered":  "Date",
    "Travel_history_entry": "Date",
    "Travel_history_start":  "Date",
    "Date_entry":  "Date",
    "Date_last_modified": "Date",

    # ID
    "Contact_ID": "ID",
    "ID": "ID",

    #Gender
    "Gender": "Gender",
    "Sex_at_birth": "Gender",
    "Gender_other": "Gender",
    "Sex_at_birth_other": "Gender",

    #Location
    "Travel_history_location": "Location",
    "Location_information": "Location",

    # Contact setting
    "Contact_setting": "Contact_setting",
    "Contact_setting_other": "Contact_setting",

    # demographic
    "Race": "Demographic",
    "Ehtnicity": "Demographic",

    # Medical Boolean
    "Healthcare_worker": "Medical_boolean",
    "Previous_infection": "Medical_boolean",
    "Pregnancy_Status": "Medical_boolean",
    "Vaccination":  "Medical_boolean",
    "Hospitalised":  "Medical_boolean",
    "Intensive_care":  "Medical_boolean",
    "Home_monitoring":  "Medical_boolean",
    "Isolated": "Medical_boolean",
    "Contact_with_case": "Medical_boolean",
    "Travel_history": "Medical_boolean",

    # Sourec
    "Source": "Source",
    "Source_II": "Source",
    "Source_III": "Source",
    "Source_IV": "Source",
}

In [8]:
LABEL_MAP_LC = {k.lower(): v.lower() for k, v in LABEL_MAP.items()}

def remap_labels(arr, mapping=LABEL_MAP_LC):
    """
    • forces each element to lower-case
    • replaces it if the key exists in `mapping`
    • otherwise leaves it as lower-case original
    """
    return np.array([mapping.get(x.lower(), x.lower()) for x in arr])

In [9]:
data = pd.read_csv(GROUND_TRUTH_PATH)

data['label_mapped'] = remap_labels(data['label'].values)

label_counts = data['label_mapped'].value_counts()

data = data.drop('label', axis=1)

# 4. find all labels with count < 7
small_labels = label_counts[label_counts < 7].index

# 5. set those labels to '__none__'
data.loc[data['label_mapped'].isin(small_labels), 'label_mapped'] = '__none__'

data = data.rename(columns={'label_mapped': 'label'})

data.to_csv(GROUND_TRUTH_PATH, index=False)

print(data["label"].value_counts())

label
__none__           1175
medical_boolean     210
date                128
location             57
id                   48
case_status          27
age                  24
gender               23
outcome              22
contact_setting      13
symptoms             11
occupation           11
Name: count, dtype: int64


In [10]:
def data_cleaning(data):
    """
    Clean the data in dataframes, removes hashes tha was broken during processing

    :param data: dataframe to clean
    :return: cleaned dataframe
    """

    for col in data.select_dtypes(include="object"):

        # make a bool mask of rows where the cell contains 'x000D'
        mask = data[col].str.contains("x000D", na=False)
        data.loc[mask, col] = np.nan

    return data

In [11]:
data_processing_sherlock = DataProcessingSherlock()

if os.path.exists(OUTPUT_PATH):
    shutil.rmtree(OUTPUT_PATH)

for filename in os.listdir(TABLE_FOLDER_PATH):
    if not filename.lower().endswith(".csv"):
        continue

    table_csv_path = os.path.join(TABLE_FOLDER_PATH, filename)
    table_name = filename  # matches the 'table_name' column in GT

    try:
        # Example usage:
        data, labels = data_processing_sherlock.load_table_with_labels(
            table_csv_path=TABLE_FOLDER_PATH + "/" + filename,
            gt_csv_path   =GROUND_TRUTH_PATH,
            table_name    =table_name
        )

        #cleaned_data = data_cleaning(data)

        data_processing_sherlock.flatten_and_save(data, labels, OUTPUT_PATH, table_name, LANGUAGE_METADATA_PATH,
                                                  train_ratio=train_ratio, val_ratio=val_ratio, test_ratio=test_ratio)

        #print(f"Processed {filename} -> {data_path}")
    except Exception as e:
        print(f"Skipping {filename}: {e}")

Combined data length: 52
Combined labels length: 52
Combined lang length: 52
Combined data length: 55
Combined labels length: 55
Combined lang length: 55
Combined data length: 72
Combined labels length: 72
Combined lang length: 72
Combined data length: 96
Combined labels length: 96
Combined lang length: 96
Combined data length: 117
Combined labels length: 117
Combined lang length: 117
Combined data length: 164
Combined labels length: 164
Combined lang length: 164
Combined data length: 186
Combined labels length: 186
Combined lang length: 186
Combined data length: 196
Combined labels length: 196
Combined lang length: 196
Combined data length: 213
Combined labels length: 213
Combined lang length: 213
Combined data length: 225
Combined labels length: 225
Combined lang length: 225
Combined data length: 233
Combined labels length: 233
Combined lang length: 233
Combined data length: 250
Combined labels length: 250
Combined lang length: 250
Combined data length: 255
Combined labels length: 25

In [12]:
import pyarrow.parquet as pq
tbl = pq.read_table(f"{OUTPUT_PATH}/data.parquet")
print(tbl.schema)

__index_level_0__: int64
values: string


In [13]:
for row in tbl.column("values"):
    print(row.as_py(), type(row.as_py()))

Grace,Naïm,Jeanne,Lethokuhle,Meriem,Modibo,Olivia,Halim,Salma,Khawla,Habiba,Marcelle,Thomas,Mahmoud,Ali,Benjamin,Claudine,Aïcha,Agathe,Mariam,Dieudonné,Aminata,Odette,Fabrice,Marwa,Christian,Sara,Imene,Jacqueline,Said,Omphile,Ursule,Rachelle,Amina,Mohamed,Nesreen,Abdel-Rahman,Sarah,Antonio,Omar,Yasmine,Urbain,Georges,Xavier,Ashraqat,Hubert,Sonia,Zacharie,Monique,Frédéric,Marc,Gamalat,Amahle,Théo,Norbert,Milagrosa,Albert <class 'str'>
William Mbaiki,Adrienne Sibut,Sébastien Ndélé,Rachelle Maloum,Jacques Kabo,Henriette Goumba,Henri Paoua,Séraphine Dacko,Dorothée Bossembélé,Raoul Maloum,Roland Carnot,Bossembélé,Bangassou,Lucienne Nzale,Fabienne Malébo,Jacqueline Yamissou,Claire Ndakala,Paoua,Bokassa,Marcelle Bangui,Sylvie Yaloke,Gbaya,Kévin Damara,Samuel Djotodia,Bossangoa,Lucien Bambari,Paul Yamissou,Yakoma,Madeleine Bimbo,Valérie Malébo,Goumba,Mbaiki,Aurélie Gamboula,Noël Ngbandi,Bouar,Quitterie Tombalbaye,André Bouar,Tatiana Bozizé,Jeanne Maloum,Boganda,Théo Bossangoa,Rachel Yaloke,Bou

In [15]:
df = tbl.to_pandas()
print(df)

     __index_level_0__                                             values
0                    0  Grace,Naïm,Jeanne,Lethokuhle,Meriem,Modibo,Oli...
1                    1  William Mbaiki,Adrienne Sibut,Sébastien Ndélé,...
2                    2  Quentin Bozizé,Luc Nola,Régis Kabo,Sébastien N...
3                    3  communaute,fosa,reco,cte,lui-meme,recherche_ac...
4                    4                       Democratic Republic of Congo
..                 ...                                                ...
926                926                                              0,X,1
927                927                                              X,1,x
928                928                                              x,X,1
929                929                                              X,x,3
930                930                              21,0,18,13,14,12,17,7

[931 rows x 2 columns]


In [16]:
py_lists = [s.split(",") for s in tbl.column("values").to_pylist()]
for L in py_lists:
    print(L)

['Grace', 'Naïm', 'Jeanne', 'Lethokuhle', 'Meriem', 'Modibo', 'Olivia', 'Halim', 'Salma', 'Khawla', 'Habiba', 'Marcelle', 'Thomas', 'Mahmoud', 'Ali', 'Benjamin', 'Claudine', 'Aïcha', 'Agathe', 'Mariam', 'Dieudonné', 'Aminata', 'Odette', 'Fabrice', 'Marwa', 'Christian', 'Sara', 'Imene', 'Jacqueline', 'Said', 'Omphile', 'Ursule', 'Rachelle', 'Amina', 'Mohamed', 'Nesreen', 'Abdel-Rahman', 'Sarah', 'Antonio', 'Omar', 'Yasmine', 'Urbain', 'Georges', 'Xavier', 'Ashraqat', 'Hubert', 'Sonia', 'Zacharie', 'Monique', 'Frédéric', 'Marc', 'Gamalat', 'Amahle', 'Théo', 'Norbert', 'Milagrosa', 'Albert']
['William Mbaiki', 'Adrienne Sibut', 'Sébastien Ndélé', 'Rachelle Maloum', 'Jacques Kabo', 'Henriette Goumba', 'Henri Paoua', 'Séraphine Dacko', 'Dorothée Bossembélé', 'Raoul Maloum', 'Roland Carnot', 'Bossembélé', 'Bangassou', 'Lucienne Nzale', 'Fabienne Malébo', 'Jacqueline Yamissou', 'Claire Ndakala', 'Paoua', 'Bokassa', 'Marcelle Bangui', 'Sylvie Yaloke', 'Gbaya', 'Kévin Damara', 'Samuel Djotodia'

## Labels

In [17]:
import pandas as pd
labels = pd.read_parquet(f"{OUTPUT_PATH}/labels.parquet")
print(labels.columns)
print(labels.shape)
print(labels[:5])
print(labels.index.name)

Index(['type'], dtype='object')
(931, 1)
       type
0  __none__
1  __none__
2  __none__
3  __none__
4  __none__
None


In [18]:
y_train = np.array([x.lower() for x in labels])
print(y_train)

['type']


### Unique labels

In [19]:
from itertools import count, groupby
import pandas as pd
data = pd.read_csv(GROUND_TRUTH_PATH)

unique_labels = data["label"].unique()
unique_count = len(unique_labels)

label_count = (data.groupby("label").size().reset_index(name="count").sort_values("count", ascending=False) )

print(label_count)


              label  count
0          __none__   1175
8   medical_boolean    210
4              date    128
7          location     57
6                id     48
2       case_status     27
1               age     24
5            gender     23
10          outcome     22
3   contact_setting     13
9        occupation     11
11         symptoms     11


### Labels per split

In [20]:
import pandas as pd

data_dir = OUTPUT_PATH
# 1. load each split
train = pd.read_parquet(f"{data_dir}/train_labels.parquet")
val   = pd.read_parquet(f"{data_dir}/validation_labels.parquet")
test  = pd.read_parquet(f"{data_dir}/test_labels.parquet")

# 2. get raw counts per label in each split
train_counts = train['type'].value_counts().rename('train')
val_counts   = val  ['type'].value_counts().rename('validation')
test_counts  = test ['type'].value_counts().rename('test')

# 3. combine into one DataFrame and fill missing labels with 0
dist = pd.concat([train_counts, val_counts, test_counts], axis=1).fillna(0).astype(int)

print(dist)

          train  validation  test
type                             
__none__    744          93    94


In [21]:
len(test)

94

In [22]:
data = pd.read_parquet(f"{OUTPUT_PATH}/data.parquet")

data.columns

Index(['__index_level_0__', 'values'], dtype='object')

In [23]:
labels = pd.read_parquet(f"{OUTPUT_PATH}/labels.parquet")
labels.columns

Index(['type'], dtype='object')

### Finding unique values for label

In [21]:
import pandas as pd

In [22]:
# 1. Read in both files
data = pd.read_parquet(f"{OUTPUT_PATH}/data.parquet")
labels = pd.read_parquet(f"{OUTPUT_PATH}/labels.parquet")

# 2. If your labels DataFrame has its index-level stored as the index, bring it into a column:
if labels.index.name == "__index_level_0__" or labels.index.name is None:
    labels = labels.reset_index().rename(columns={"index": "__index_level_0__"})

# 3. Make sure the merge key is the same dtype on both sides:
data["__index_level_0__"] = data["__index_level_0__"].astype(labels["__index_level_0__"].dtype)

# 4. Merge on that key:
merged = data.merge(labels, on="__index_level_0__")

In [23]:
merged

Unnamed: 0,__index_level_0__,values,type
0,0,"e_4800uq0,046_q0u0e,ue_009q06,u_018q0e0,qe4_00...",id
1,1,"0C101DE5RTQ2,E15R0T0CQ52D,RC18E0T03Q2D,03QRTED...",id
2,2,"bik094,mbk228,bik074,bik001,mbk461,mbk055,mbk2...",id
3,3,"89T1R7D9QK010BCME06--,410030ER-0QKBDCT2-6R,4CD...",id
4,4,"45,4,60,65,22,8,30,19,2,46,18,3,35,54,33,85,31...",age
...,...,...,...
537,537,"0R46-B4K000310,R020-BK4004610,970B09000-R0K0,-...",id
538,538,"44,64,70,31,2S,8,63,1992,16,9M,39,20,15,57,77,...",age
539,539,"m,f,A,F,M",gender
540,540,Confirmé,case_status


In [24]:
symptoms_values = merged.loc[merged["type"] == "symptoms",
merged.columns.difference(["__index_level_0__", "type"])].squeeze().unique()

print(symptoms_values)

['JAUNDICE;PAINABDO;WEAK,FEVER;RASH;VOMIT;WEAK,DARKU;DIARR;FEVER;JAUNDICE;PAINABDO;STOPAL;VOMIT;WEAK,WEAK,DIARR;FEVER;JAUNDICE;PAINABDO;VOMIT,FEVER;HEAD;PAINABDO;VOMIT;WEAK,ANO;DIARR;FEVER;HEPATOME;PAINABDO;RASH;RESP;SORETHR;VOMIT,JAUNDICE;PAINABDO;RESP;STOPAL;WEAK,DIARR;PAINABDO;RASH,JAUNDICE;STOPAL;VOMIT;WEAK,FEVER,DIARR;JAUNDICE;STOPAL;WEAK,DIARR;FEVER;JAUNDICE;STOPAL;VOMIT;WEAK,PAINABDO,FEVER;O;STOBLO;VOMIT,FEVER;PAINABDO;RESP;VOMIT;WEAK,DIARR;HEAD;JAUNDICE;PAINABDO;WEAK,DIARR;JAUNDICE;VOMIT;WEAK,FEVER;HEAD;JAUNDICE;PAINABDO;VOMIT,DIARR;FEVER;JAUNDICE;O;PAINABDO;RASH;RESP;VOMIT;WEAK,DIARR;PAINABDO;RESP;VOMIT;WEAK,O,DARKU;HEPATOME;JAUNDICE;O;PAINABDO;VOMIT,JAUNDICE;O;PAINABDO;STOPAL;VOMIT;WEAK,DIARR;FEVER;PAINABDO;RESP;VOMIT;WEAK,FEVER;O;VOMIT,DARKU;JAUNDICE;PAINABDO;RESP;STOPAL;VOMIT;WEAK,ANO;O;RESP;WEAK,DIARR;FEVER;VOMIT,JAUNDICE;PAINABDO;VOMIT,FEVER;JAUNDICE;PAINABDO;STOPAL,DIARR;JAUNDICE;PAINABDO;STOPAL;VOMIT;WEAK,DIARR;FEVER;PAINABDO;VOMIT;WEAK,DARKU;JAUNDICE;PAINABDO;VOMIT,DAR

Symptoms contains:

'NO,YES' 'YES,NO' 'NO'
1.0, 0.0
Non, Oui

Pre_existing condition:

"Yes, No"
"Yes", "NO"
