In [14]:
train_ratio= 0.8
val_ratio= 0.1
test_ratio= 0.1
OUTPUT_PATH = "../sherlock_data_processing"

In [15]:
TABLE_FOLDER_PATH = "../tables"
GROUND_TRUTH_PATH   = "../gold/cta_gt.csv"
LANGUAGE_METADATA_PATH = "../gold/language_metadata.csv"

In [16]:
import os
import shutil
import numpy as np
import pandas as pd
import re

from data_processing_sherlock import DataProcessingSherlock

In [17]:
LABEL_MAP = {
    # Date
    "Vaccination_date": "Date",
    "Date_report":"Date",
    "Date_onset":  "Date",
    "Date_confirmation": "Date",
    "Date_of_first_consultation":"Date",
    "Date_hospitalisation":  "Date",
    "Date_discharge_hospital": "Date",
    "Date_admission_ICU":   "Date",
    "Date_discharge_ICU":  "Date",
    "Date_isolation":  "Date",
    "Date_death":  "Date",
    "Date_recovered":  "Date",
    "Travel_history_entry": "Date",
    "Travel_history_start":  "Date",
    "Date_entry":  "Date",
    "Date_last_modified": "Date",

    # ID
    "Contact_ID": "ID",
    "ID": "ID",

    #Gender
    "Gender": "Gender",
    "Sex_at_birth": "Gender",
    "Gender_other": "Gender",
    "Sex_at_birth_other": "Gender",

    #Location
    "Travel_history_location": "Location",
    "Location_information": "Location",

    # Contact setting
    "Contact_setting": "Contact_setting",
    "Contact_setting_other": "Contact_setting",

    # demographic
    "Race": "Demographic",
    "Ehtnicity": "Demographic",

    # Medical Boolean
    "Healthcare_worker": "Medical_boolean",
    "Previous_infection": "Medical_boolean",
    "Pregnancy_Status": "Medical_boolean",
    "Vaccination":  "Medical_boolean",
    "Hospitalised":  "Medical_boolean",
    "Intensive_care":  "Medical_boolean",
    "Home_monitoring":  "Medical_boolean",
    "Isolated": "Medical_boolean",
    "Contact_with_case": "Medical_boolean",
    "Travel_history": "Medical_boolean",

    # Sourec
    "Source": "Source",
    "Source_II": "Source",
    "Source_III": "Source",
    "Source_IV": "Source",
}

In [18]:
LABEL_MAP_LC = {k.lower(): v.lower() for k, v in LABEL_MAP.items()}

def remap_labels(arr, mapping=LABEL_MAP_LC):
    """
    • forces each element to lower-case
    • replaces it if the key exists in `mapping`
    • otherwise leaves it as lower-case original
    """
    return np.array([mapping.get(x.lower(), x.lower()) for x in arr])

In [19]:
data = pd.read_csv(GROUND_TRUTH_PATH)

data['label_mapped'] = remap_labels(data['label'].values)

label_counts = data['label_mapped'].value_counts()

data = data.drop('label', axis=1)

# 4. find all labels with count < 7
small_labels = label_counts[label_counts < 7].index

# 5. set those labels to '__none__'
data.loc[data['label_mapped'].isin(small_labels), 'label_mapped'] = '__none__'

data = data.rename(columns={'label_mapped': 'label'})

data.to_csv(GROUND_TRUTH_PATH, index=False)

print(data["label"].value_counts())

label
__none__           1175
medical_boolean     210
date                128
location             57
id                   48
case_status          27
age                  24
gender               23
outcome              22
contact_setting      13
symptoms             11
occupation           11
Name: count, dtype: int64


In [20]:
def data_cleaning(data):
    """
    Clean the data in dataframes, removes hashes tha was broken during processing

    :param data: dataframe to clean
    :return: cleaned dataframe
    """

    for col in data.select_dtypes(include="object"):

        # make a bool mask of rows where the cell contains 'x000D'
        mask = data[col].str.contains("x000D", na=False)
        data.loc[mask, col] = np.nan

    return data

In [26]:
def generic_clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    For every object-dtype column in `df`:
      • lowercases
      • replaces | and ; with commas
      • strips out non-ASCII characters
      • trims whitespace
      • turns empty strings into NaN
    """
    def _clean_cell(val):
        if pd.isnull(val):
            return val
        s = str(val).lower()
        # unify delimiters
        s = s.replace('|', ',').replace(';', ',')
        # remove any non-ASCII
        s = s.encode('ascii', 'ignore').decode('ascii')
        # collapse multiple commas & strip
        s = re.sub(r',+', ',', s).strip(' ,')
        return s or np.nan

    obj_cols = df.select_dtypes(include="object").columns
    df[obj_cols] = df[obj_cols].applymap(_clean_cell)
    return df

In [27]:
data_processing_sherlock = DataProcessingSherlock()

if os.path.exists(OUTPUT_PATH):
    shutil.rmtree(OUTPUT_PATH)

for filename in os.listdir(TABLE_FOLDER_PATH):
    if not filename.lower().endswith(".csv"):
        continue

    table_csv_path = os.path.join(TABLE_FOLDER_PATH, filename)
    table_name = filename  # matches the 'table_name' column in GT

    try:
        # Example usage:
        data, labels = data_processing_sherlock.load_table_with_labels(
            table_csv_path=TABLE_FOLDER_PATH + "/" + filename,
            gt_csv_path   =GROUND_TRUTH_PATH,
            table_name    =table_name
        )

        #cleaned_data = data_cleaning(data)

        cleaned_data = generic_clean_data(data)

        data_processing_sherlock.flatten_and_save(cleaned_data, labels, OUTPUT_PATH, table_name, LANGUAGE_METADATA_PATH,
                                                  train_ratio=train_ratio, val_ratio=val_ratio, test_ratio=test_ratio)

        #print(f"Processed {filename} -> {data_path}")
    except Exception as e:
        print(f"Skipping {filename}: {e}")

  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)


Combined data length: 9
Combined labels length: 9
Combined lang length: 9
Skipping WHO_022.csv: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.
Combined data length: 11
Combined labels length: 11
Combined lang length: 11
Skipping WHO_025.csv: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.
Combined data length: 13
Combined labels length: 13
Combined lang length: 13
Skipping WHO_010.csv: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.
Combined data length: 43
Combined labels length: 43
Combined lang length: 43
Skipping WHO_017.csv: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.
Combined data length: 47
Combined labels length: 47
Combined lang lengt

  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)


Combined data length: 101
Combined labels length: 101
Combined lang length: 101
Skipping WHO_033.csv: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.
Combined data length: 103
Combined labels length: 103
Combined lang length: 103
Skipping WHO_018.csv: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.
Combined data length: 109
Combined labels length: 109
Combined lang length: 109
Skipping WHO_016.csv: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.
Combined data length: 111
Combined labels length: 111
Combined lang length: 111
Skipping WHO_011.csv: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.
Combined data length: 181
Combined labels length: 181
Co

  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)


Combined data length: 318
Combined labels length: 318
Combined lang length: 318
Skipping WHO_020.csv: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.
Combined data length: 328
Combined labels length: 328
Combined lang length: 328
Skipping WHO_027.csv: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.
Combined data length: 363
Combined labels length: 363
Combined lang length: 363
Skipping WHO_029.csv: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.
Combined data length: 383
Combined labels length: 383
Combined lang length: 383
Skipping WHO_002.csv: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.
Combined data length: 390
Combined labels length: 390
Co

  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)


Combined data length: 506
Combined labels length: 506
Combined lang length: 506
Combined data length: 537
Combined labels length: 537
Combined lang length: 537
Combined data length: 540
Combined labels length: 540
Combined lang length: 540
Combined data length: 542
Combined labels length: 542
Combined lang length: 542


  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)


In [28]:
import pyarrow.parquet as pq
tbl = pq.read_table(f"{OUTPUT_PATH}/data.parquet")
print(tbl.schema)

__index_level_0__: int64
values: string


In [29]:
for row in tbl.column("values"):
    print(row.as_py(), type(row.as_py()))

n820e-v5b1,21v69nb5-e,-e4b652nv2,32n-2ve5b4,eb232-v5n5,e22b4n34-v,2vb-0ne223,e52n4b-v52,2-e46bn12v,nb1v-324e2,2bv26-e07n,9en-31v3b2,v2-1b1e48n,12nv-2be53,b1n5v-e529,e1n8-7bv35,-27ne91bv2,61n2veb27- <class 'str'>
benv-21955-ben-337,benv-22716-ben-965,benv-22146-ben-503,benv-21955-ben-340,benv-22146-ben-502,benv-22076-ben-430,benv-21933-ben-383,benv-21508-ben-1000,benv-22135-ben-201,benv-21955-ben-410,benv-21927-ben-18,benv-22716-ben-913,benv-22146-ben-533,benv-21955-ben-123,benv-21955-ben-102,benv-21955-ben-99,benv-21933-ben-274,benv-22076-ben-448,benv-21955-ben-103,benv-22354-ben-674,benv-21956-ben-36,benv-22146-ben-570,benv-22146-ben-624,benv-22716-ben-920,benv-21955-ben-795,benv-21927-ben-17,benv-21841-ben-10,benv-22146-ben-574,benv-22076-ben-431,benv-21955-ben-53,benv-21955-ben-403,benv-21955-ben-111,benv-22146-ben-652,benv-22135-ben-215,benv-21955-ben-327,benv-22076-ben-423,benv-22076-ben-994,benv-22465-ben-862,benv-22135-ben-190,benv-22146-ben-837,benv-21955-ben-147,benv-22146-ben

In [30]:
df = tbl.to_pandas()
print(df)

     __index_level_0__                                             values
0                    0  n820e-v5b1,21v69nb5-e,-e4b652nv2,32n-2ve5b4,eb...
1                    1  benv-21955-ben-337,benv-22716-ben-965,benv-221...
2                    2  2,24,32,37,53,1,20,10,38,11,0m,11m,39,46,8,45,...
3                    3                                                f,m
4                    4  boguila,mbaiki,kabo,boganda,berberati,bria,car...
..                 ...                                                ...
537                537             argentina,austria,australia,bangladesh
538                538                                    bgd,arg,aut,aus
539                539  2020-06-08,2020-06-27,2020-04-14,2020-02-29,20...
540                540                       french polynesia,philippines
541                541  2022-01-17,2022-05-23,2022-01-31,2022-04-10,20...

[542 rows x 2 columns]


In [31]:
py_lists = [s.split(",") for s in tbl.column("values").to_pylist()]
for L in py_lists:
    print(L)

['n820e-v5b1', '21v69nb5-e', '-e4b652nv2', '32n-2ve5b4', 'eb232-v5n5', 'e22b4n34-v', '2vb-0ne223', 'e52n4b-v52', '2-e46bn12v', 'nb1v-324e2', '2bv26-e07n', '9en-31v3b2', 'v2-1b1e48n', '12nv-2be53', 'b1n5v-e529', 'e1n8-7bv35', '-27ne91bv2', '61n2veb27-']
['benv-21955-ben-337', 'benv-22716-ben-965', 'benv-22146-ben-503', 'benv-21955-ben-340', 'benv-22146-ben-502', 'benv-22076-ben-430', 'benv-21933-ben-383', 'benv-21508-ben-1000', 'benv-22135-ben-201', 'benv-21955-ben-410', 'benv-21927-ben-18', 'benv-22716-ben-913', 'benv-22146-ben-533', 'benv-21955-ben-123', 'benv-21955-ben-102', 'benv-21955-ben-99', 'benv-21933-ben-274', 'benv-22076-ben-448', 'benv-21955-ben-103', 'benv-22354-ben-674', 'benv-21956-ben-36', 'benv-22146-ben-570', 'benv-22146-ben-624', 'benv-22716-ben-920', 'benv-21955-ben-795', 'benv-21927-ben-17', 'benv-21841-ben-10', 'benv-22146-ben-574', 'benv-22076-ben-431', 'benv-21955-ben-53', 'benv-21955-ben-403', 'benv-21955-ben-111', 'benv-22146-ben-652', 'benv-22135-ben-215', 'be

## Labels

In [32]:
import pandas as pd
labels = pd.read_parquet(f"{OUTPUT_PATH}/labels.parquet")
print(labels.columns)
print(labels.shape)
print(labels[:5])
print(labels.index.name)

Index(['type'], dtype='object')
(542, 1)
       type
0        id
1        id
2       age
3    gender
4  location
None


In [33]:
y_train = np.array([x.lower() for x in labels])
print(y_train)

['type']


### Unique labels

In [34]:
from itertools import count, groupby
import pandas as pd
data = pd.read_csv(GROUND_TRUTH_PATH)

unique_labels = data["label"].unique()
unique_count = len(unique_labels)

label_count = (data.groupby("label").size().reset_index(name="count").sort_values("count", ascending=False) )

print(label_count)


              label  count
0          __none__   1175
8   medical_boolean    210
4              date    128
7          location     57
6                id     48
2       case_status     27
1               age     24
5            gender     23
10          outcome     22
3   contact_setting     13
9        occupation     11
11         symptoms     11


### Labels per split

In [35]:
import pandas as pd

data_dir = OUTPUT_PATH
# 1. load each split
train = pd.read_parquet(f"{data_dir}/train_labels.parquet")
val   = pd.read_parquet(f"{data_dir}/validation_labels.parquet")
test  = pd.read_parquet(f"{data_dir}/test_labels.parquet")

# 2. get raw counts per label in each split
train_counts = train['type'].value_counts().rename('train')
val_counts   = val  ['type'].value_counts().rename('validation')
test_counts  = test ['type'].value_counts().rename('test')

# 3. combine into one DataFrame and fill missing labels with 0
dist = pd.concat([train_counts, val_counts, test_counts], axis=1).fillna(0).astype(int)

print(dist)

                 train  validation  test
type                                    
medical_boolean    157          20    20
date                97          12    12
location            45           5     6
id                  34           4     5
case_status         20           3     2
gender              18           2     2
age                 18           2     3
outcome             17           2     2
contact_setting     10           2     1
occupation           9           1     1
symptoms             8           1     1


In [36]:
len(test)

55

In [37]:
data = pd.read_parquet(f"{OUTPUT_PATH}/data.parquet")

data.columns

Index(['__index_level_0__', 'values'], dtype='object')

In [38]:
labels = pd.read_parquet(f"{OUTPUT_PATH}/labels.parquet")
labels.columns

Index(['type'], dtype='object')

### Finding unique values for label

In [39]:
import pandas as pd

In [40]:
# 1. Read in both files
data = pd.read_parquet(f"{OUTPUT_PATH}/data.parquet")
labels = pd.read_parquet(f"{OUTPUT_PATH}/labels.parquet")

# 2. If your labels DataFrame has its index-level stored as the index, bring it into a column:
if labels.index.name == "__index_level_0__" or labels.index.name is None:
    labels = labels.reset_index().rename(columns={"index": "__index_level_0__"})

# 3. Make sure the merge key is the same dtype on both sides:
data["__index_level_0__"] = data["__index_level_0__"].astype(labels["__index_level_0__"].dtype)

# 4. Merge on that key:
merged = data.merge(labels, on="__index_level_0__")

In [41]:
merged

Unnamed: 0,__index_level_0__,values,type
0,0,"n820e-v5b1,21v69nb5-e,-e4b652nv2,32n-2ve5b4,eb...",id
1,1,"benv-21955-ben-337,benv-22716-ben-965,benv-221...",id
2,2,"2,24,32,37,53,1,20,10,38,11,0m,11m,39,46,8,45,...",age
3,3,"f,m",gender
4,4,"boguila,mbaiki,kabo,boganda,berberati,bria,car...",location
...,...,...,...
537,537,"argentina,austria,australia,bangladesh",location
538,538,"bgd,arg,aut,aus",location
539,539,"2020-06-08,2020-06-27,2020-04-14,2020-02-29,20...",date
540,540,"french polynesia,philippines",location


In [48]:
symptoms_values = merged.loc[merged["type"] == "contact_setting",
merged.columns.difference(["__index_level_0__", "type"])].squeeze().unique()

print(symptoms_values)

['famille,fille,pre,niece,voisin,mre,ami'
 'bozoum,gamboula,baboua,bossangoa,batangafo' 'yes,no'
 'ndl health center,ndl community hospital' 'no,yes'
 'soignant,pre-mere,ecolier,visiteur,malade,enfant,cousin,pre,voisin,enseignant,fidele,osp,grand mere,transporteur,neveu,collegue,soeur,ami,niece,co-patient,membre famille,tante,frere,oncle,mere'
 'communautaire,funerailles,nosocomial'
 'famille,funerailles,famille, funerailles,communaute, funerailles,nosocomial,communaute'
 'co-hospitalisation,patient<>soignant,travail,voisins,amis,nosocomial,famille'
 'communautaire,nosocomial,funrailles'
 "pointe d'entre,structure sanitaire,rercherche active,reco,communaut"]


Symptoms contains:

'NO,YES' 'YES,NO' 'NO'
1.0, 0.0
Non, Oui

Pre_existing condition:

"Yes, No"
"Yes", "NO"
