In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import shutil

from data_processing_sherlock import DataProcessingSherlock

In [3]:
TABLE_FOLDER_PATH = "../tables"
GROUND_TRUTH_PATH   = "../gold/cta_gt.csv"
LANGUAGE_METADATA_PATH = "../gold/language_metadata.csv"
OUTPUT_PATH = "../sherlock_data_processing"

In [4]:
import numpy as np


def data_cleaning(data):
    """
    Clean the data in dataframes, removes hashes tha was broken during processing

    :param data: dataframe to clean
    :return: cleaned dataframe
    """

    for col in data.select_dtypes(include="object"):

        # make a bool mask of rows where the cell contains 'x000D'
        mask = data[col].str.contains("x000D", na=False)
        data.loc[mask, col] = np.nan

    return data

In [5]:
data_processing_sherlock = DataProcessingSherlock()

if os.path.exists(OUTPUT_PATH):
    shutil.rmtree(OUTPUT_PATH)

for filename in os.listdir(TABLE_FOLDER_PATH):
    if not filename.lower().endswith(".csv"):
        continue

    table_csv_path = os.path.join(TABLE_FOLDER_PATH, filename)
    table_name = filename  # matches the 'table_name' column in GT

    try:
        # Example usage:
        data, labels = data_processing_sherlock.load_table_with_labels(
            table_csv_path=TABLE_FOLDER_PATH + "/" + filename,
            gt_csv_path   =GROUND_TRUTH_PATH,
            table_name    =table_name
        )

        #cleaned_data = data_cleaning(data)

        data_processing_sherlock.flatten_and_save(data, labels, OUTPUT_PATH, table_name, LANGUAGE_METADATA_PATH)

        #print(f"Processed {filename} -> {data_path}")
    except Exception as e:
        print(f"Skipping {filename}: {e}")

Combined data length: 21
Combined labels length: 21
Combined lang length: 21
Combined data length: 29
Combined labels length: 29
Combined lang length: 29
Combined data length: 61
Combined labels length: 61
Combined lang length: 61
Combined data length: 81
Combined labels length: 81
Combined lang length: 81
Combined data length: 97
Combined labels length: 97
Combined lang length: 97
Combined data length: 130
Combined labels length: 130
Combined lang length: 130
Combined data length: 135
Combined labels length: 135
Combined lang length: 135
Combined data length: 137
Combined labels length: 137
Combined lang length: 137
Combined data length: 140
Combined labels length: 140
Combined lang length: 140
Combined data length: 148
Combined labels length: 148
Combined lang length: 148
Combined data length: 150
Combined labels length: 150
Combined lang length: 150
Combined data length: 152
Combined labels length: 152
Combined lang length: 152
Combined data length: 154
Combined labels length: 154
C

In [37]:
import pyarrow.parquet as pq
tbl = pq.read_table("../sherlock_data_processing/data.parquet")
print(tbl.schema)

__index_level_0__: int64
values: string


In [38]:
for row in tbl.column("values"):
    print(row.as_py(), type(row.as_py()))

e_4800uq0,046_q0u0e,ue_009q06,u_018q0e0,qe4_0060u,1_0q002eu,0u040_7eq,0ueq0_202,80u_00qe6,_q97eu000,0u_e000q8,004qu_0e7,u1_060e0q,25q0eu0_0,_900qe40u,q408e0_u0,6u000q0_e,u190q0e1_,1e0_u00q1,4e00_0u1q,0_u00q0e5,_00eq200u,5p0000_br,010_u7eq0,e1_0uq206,05u0e0_q6,010u2e_5q,ue80_70q0,80q_e110u,0uq0e07_0,e0800uq_1,_ue080q01,11qu_e010,_q102eu40,_301q0eu0,e0003q_u3,6_0u0q30e,u_003eq90,00u_e0q49,90_0e9qu0,000q_32ue,00uq5e_70,u_q0310e1,00q660e_u,0_9ue0q02,_qe83000u,e0u14_1q0,0u003e_q9,02_050qeu,805q0u0_e,070eq20u_,e90u8q0_0,uqe0_0060,q80_008ue,q2000eu0_,0u70eq06_,5_u0040qe,1003q0_ue,q00e400u_,e010_q90u,09eu_000q,0e700_qu0,2eq10u_00,_0u0qe404,e680u0_0q,0q57u0e_0,eq0050_u3,1q_e10u02,069e_0uq0,1_1uq000e,q090u_0e0,uq300e0_1,00u0e_q30,q3e200u0_,00u0q_16e,_u0qe0127,_e0qu1500,026_q0e0u,q20eu101_,u00_eq001 <class 'str'>
0C101DE5RTQ2,E15R0T0CQ52D,RC18E0T03Q2D,03QRTED050C0,02RE0QT1C4D1,0D3ERT002Q0C,T0CQDE00100R,RC0E2T7Q01D2,6E80DC09QR0T,0QR3T0D0E1C0,RD3TE012QC01,Q012260DRECT,101RE27QTDC0,QE32CDR0T106,72Q5

In [39]:
df = tbl.to_pandas()
print(df)

    __index_level_0__                                             values
0                   0  e_4800uq0,046_q0u0e,ue_009q06,u_018q0e0,qe4_00...
1                   1  0C101DE5RTQ2,E15R0T0CQ52D,RC18E0T03Q2D,03QRTED...
2                   2  bik094,mbk228,bik074,bik001,mbk461,mbk055,mbk2...
3                   3  89T1R7D9QK010BCME06--,410030ER-0QKBDCT2-6R,4CD...
4                   4  45,4,60,65,22,8,30,19,2,46,18,3,35,54,33,85,31...
..                ...                                                ...
80                 80  2022-05-19,2022-05-09,2022-05-17,2022-05-16,20...
81                 81                                             Yes,No
82                 82                         we dont have this variable
83                 83  2022-05-25,2022-06-01,2022-05-31,2022-05-27,20...
84                 84                                                  N

[85 rows x 2 columns]


In [40]:
py_lists = [s.split(",") for s in tbl.column("values").to_pylist()]
for L in py_lists:
    print(L)

['e_4800uq0', '046_q0u0e', 'ue_009q06', 'u_018q0e0', 'qe4_0060u', '1_0q002eu', '0u040_7eq', '0ueq0_202', '80u_00qe6', '_q97eu000', '0u_e000q8', '004qu_0e7', 'u1_060e0q', '25q0eu0_0', '_900qe40u', 'q408e0_u0', '6u000q0_e', 'u190q0e1_', '1e0_u00q1', '4e00_0u1q', '0_u00q0e5', '_00eq200u', '5p0000_br', '010_u7eq0', 'e1_0uq206', '05u0e0_q6', '010u2e_5q', 'ue80_70q0', '80q_e110u', '0uq0e07_0', 'e0800uq_1', '_ue080q01', '11qu_e010', '_q102eu40', '_301q0eu0', 'e0003q_u3', '6_0u0q30e', 'u_003eq90', '00u_e0q49', '90_0e9qu0', '000q_32ue', '00uq5e_70', 'u_q0310e1', '00q660e_u', '0_9ue0q02', '_qe83000u', 'e0u14_1q0', '0u003e_q9', '02_050qeu', '805q0u0_e', '070eq20u_', 'e90u8q0_0', 'uqe0_0060', 'q80_008ue', 'q2000eu0_', '0u70eq06_', '5_u0040qe', '1003q0_ue', 'q00e400u_', 'e010_q90u', '09eu_000q', '0e700_qu0', '2eq10u_00', '_0u0qe404', 'e680u0_0q', '0q57u0e_0', 'eq0050_u3', '1q_e10u02', '069e_0uq0', '1_1uq000e', 'q090u_0e0', 'uq300e0_1', '00u0e_q30', 'q3e200u0_', '00u0q_16e', '_u0qe0127', '_e0qu1500'

In [10]:
# TODO: Note that we still have problem with a not found values, we need to clean more deeply and better.
# TODO: Moreover, we can see that sherlock french word embeddings NaN

## Labels

In [41]:
import pandas as pd
labels = pd.read_parquet("../sherlock_data_processing/labels.parquet")
print(labels.columns)
print(labels.shape)
print(labels[:5])
print(labels.index.name)

Index(['type'], dtype='object')
(85, 1)
         type
0          ID
1          ID
2          ID
3  Contact_ID
4         Age
None


In [43]:
y_train = np.array([x.lower() for x in labels])
print(y_train)

['type']
