### Import data into database

In [42]:
from activity_tracker import utils
from activity_tracker.preprocess import normalize_subject_id
import sqlite3
import pandas as pd
import pathlib
import re
import warnings
warnings.filterwarnings("ignore")

# Import raw data into raw schema
data_dir = pathlib.Path("../data/raw/")
db_path = "../data/sqlite/activity_tracker.db"
mapper_path = pathlib.Path("../data/data_mapper.yml")
excel_path = pathlib.Path("../data/raw/MDE clinical data.xlsx")
mde_study_log_path = pathlib.Path("../data/raw/mde_study_log.csv")
excluded_files = [
    "fitbitBreathingRate_merged.csv"
    "fitbitCoreTemperature_merged.csv",
    "fitbitSkinTemperature_merged.csv",
    "mde_clinical_data.csv",
    "dailyCardioFitnessScore_merged.csv"
    "weightLogInfo_merged.csv",
]

### Create activity_tracker database

In [4]:
def camel_to_snake(name: str) -> str:
    name = name.replace("_merged", "")
    return re.sub(r'(?<=[a-z0-9])([A-Z])', r'_\1', name).lower()

conn = sqlite3.connect(db_path)

for file_path in data_dir.glob("*.csv"):
    if file_path.name in excluded_files:
        continue

    table_name = camel_to_snake(file_path.stem)
    df = pd.read_csv(file_path)
    
    # Convert date and time columns to datetime
    for col in df.columns:
        col_lower = col.lower()
        if ("date" in col_lower or "day" in col_lower):
            df[col] = pd.to_datetime(df[col]).dt.date 
        elif col_lower == "time":
            df[col] = pd.to_datetime(df[col])
    try:
        df.to_sql(table_name, conn, if_exists="fail", index=False)
        print(f"Table {table_name} created with {len(df)} rows.")
    except ValueError:
        print(f"Skipping {table_name} — table already exists.")
conn.close()

Skipping fitbit_wear_time_via_hr — table already exists.
Skipping daily_activity — table already exists.
Skipping sleep_day — table already exists.
Skipping fitbit_daily_hrv — table already exists.
Skipping heart_rate_zones — table already exists.
Skipping fitbit_daily_sp_o2 — table already exists.
Skipping fitbit_breathing_rate — table already exists.
Skipping daily_calories — table already exists.
Skipping daily_steps — table already exists.
Skipping daily_cardio_fitness_score — table already exists.
Skipping sleep_stages_day — table already exists.
Skipping mde_study_log — table already exists.
Skipping daily_fitbit_active_zone_minutes — table already exists.
Skipping daily_intensities — table already exists.


### Create a subject table

In [43]:
xls = pd.ExcelFile(excel_path)
df_control = pd.read_excel(xls, sheet_name=xls.sheet_names[0])
df_exercise = pd.read_excel(xls, sheet_name=xls.sheet_names[1])

# Add group labels
df_control["group"] = "control"
df_exercise["group"] = "exercise"
df_combined = pd.concat([df_control, df_exercise], ignore_index=True)

subject_columns = [
    "Participant ID", "group", "Sex", "Age", "Eth", "Race",
    "Mth Inc", "Educ", "Mari", "Liv Sit"
]
df_subject = df_combined[subject_columns].copy()

df_subject.columns = (
    df_subject.columns.str.strip()
    .str.lower()
    .str.replace(" ", "_")
    .str.replace(".", "_")
)

# Rename columns
rename_map = {
    "participant_id": "subject_id",
    "eth": "ethnicity",
    "mth_inc": "monthly_income",
    "educ": "education",
    "mari": "marital_status",
    "liv_sit": "living_situation"
}
df_subject = df_subject.rename(columns=rename_map)
df_subject = df_subject.sort_values(by="subject_id").reset_index(drop=True)

data_dictionary = utils.load_data_mapper(mapper_path)

# Apply data dictionary to map integer values to strings
for column in data_dictionary:
    if column in df_subject.columns:
        df_subject[column] = df_subject[column].map(data_dictionary[column])
df_subject["subject_id"] = df_subject["subject_id"].apply(normalize_subject_id)

conn = sqlite3.connect(db_path)
df_subject.to_sql("subject", conn, if_exists="replace", index=False)
conn.commit()
conn.close()

In [44]:
df_subject

Unnamed: 0,subject_id,group,sex,age,ethnicity,race,monthly_income,education,marital_status,living_situation
0,MDE001,control,m,83.0,non_hisp,black,3000.0,18.0,never_married,with_others
1,MDE002,exercise,m,61.0,non_hisp,white,1033.0,3.0,never_married,alone
2,MDE003,exercise,f,69.0,hisp,white,1349.0,13.0,widowed,with_family
3,MDE004,control,f,65.0,non_hisp,black,900.0,11.0,widowed,with_family
4,MDE005,control,f,84.0,non_hisp,black,792.0,3.0,widowed,with_family
...,...,...,...,...,...,...,...,...,...,...
84,MDE085,control,f,90.0,hisp,white,,6.0,widowed,with_family
85,MDE086,control,f,85.0,hisp,white,0.0,6.0,widowed,with_family
86,MDE087,control,f,99.0,hisp,unknown,0.0,3.0,widowed,with_family
87,MDE088,control,m,71.0,hisp,unknown,700.0,2.0,married,with_family


### Create a visit table

In [45]:
def make_unique_columns(cols):
    seen = {}
    result = []
    for col in cols:
        base = col
        if base not in seen:
            seen[base] = 1
            result.append(base)
        else:
            count = seen[base]
            new_col = f"{base}_{count}"
            while new_col in seen:
                count += 1
                new_col = f"{base}_{count}"
            seen[base] = count + 1
            seen[new_col] = 1
            result.append(new_col)
    return result

def sanitize_column(col):
    base = col.split(".")[0]
    # Remove anything in parentheses (and the parentheses themselves)
    base = re.sub(r"\(.*?\)", "", base)
    # Normalize spacing and other characters
    name = "_".join(base.strip().split()).lower()
    # Remove remaining special characters (slashes, dashes, percent signs)
    name = name.replace("/", "").replace("-", "").replace("%", "")
    return name

def stack_visits(df, group_label):
    df = df.rename(columns={"Participant ID": "subject_id"})
    visit_markers = sorted([col for col in df.columns if col.startswith("V") and len(col) == 2 and col[1].isdigit()],
                           key=lambda x: int(x[1:]))
    
    all_visits = []

    for i, marker in enumerate(visit_markers):
        start = df.columns.get_loc(marker) + 1
        end = df.columns.get_loc(visit_markers[i + 1]) if i + 1 < len(visit_markers) else len(df.columns)
        visit_cols = df.columns[start:end].tolist()

        visit_df = df[["subject_id"] + visit_cols].copy()
        cleaned_cols = [sanitize_column(col) for col in visit_cols]
        visit_df.columns = ["subject_id"] + make_unique_columns(cleaned_cols)

        visit_df.insert(1, "group", group_label)
        visit_df.insert(2, "visit_id", int(marker[1:]))

        all_visits.append(visit_df)

    return pd.concat(all_visits, ignore_index=True)

# Load data
xls = pd.ExcelFile(excel_path)

df_visits_control = stack_visits(pd.read_excel(xls, sheet_name=0), "control")
df_visits_exercise = stack_visits(pd.read_excel(xls, sheet_name=1), "exercise")

ffp_cols = ["wt_loss", "weak", "slow", "exhaust", "phys_act"]
for df in [df_visits_control, df_visits_exercise]:
    df["ffp_score"] = df[ffp_cols].sum(axis=1)
    df["gait"] = 4 / df["walk"].replace(0, pd.NA)

# Combine
df = pd.concat([df_visits_control, df_visits_exercise], ignore_index=True)
df = df.sort_values(by=["subject_id", "visit_id"]).reset_index(drop=True)
df = df[["subject_id", "group", "visit_id", "ffp_status", "ffp_score"]]
df["visit_id"] = df["visit_id"].astype("category")
df["ffp_status"] = df["ffp_status"].apply(lambda x: int(x) if pd.notnull(x) else pd.NA).astype("category")
df["ffp_score"] = df["ffp_score"].astype(float)
df["subject_id"] = df["subject_id"].apply(normalize_subject_id)

### Process study log

In [46]:
study_log_df = pd.read_csv(mde_study_log_path)

# Normalize subject IDs
study_log_df["subject_id"] = study_log_df["PID"].astype(str).str.replace("-", "").str.upper()
study_log_df["subject_id"] = study_log_df["subject_id"].apply(normalize_subject_id)

visit_date_cols = {
    "Screening": 1,
    "Week 4 (V2)": 2,
    "Week 8 (V3)": 3,
    "Week 12 (V4)": 4,
    "Week 24 (EOS)": 5
}
visit_dates_df = study_log_df[["subject_id"] + list(visit_date_cols.keys())].rename(columns=visit_date_cols)
visit_dates_df = visit_dates_df.dropna(subset=list(visit_date_cols.values()), how="all")

# Convert all visit date columns to datetime safely
for col in visit_date_cols.values():
    visit_dates_df[col] = pd.to_datetime(visit_dates_df[col].astype(str).str.strip().replace("OS", pd.NA), errors='coerce')

visit_dates_df

Unnamed: 0,subject_id,1,2,3,4,5
0,MDE001,2021-07-19,2021-08-23,2021-09-15,2021-10-07,2022-01-19
1,MDE002,2021-07-21,2021-08-30,2021-09-23,2021-10-13,2022-01-19
2,MDE003,2021-07-21,2021-08-30,2021-09-22,2021-10-11,2022-01-19
3,MDE004,2021-08-24,2021-09-22,NaT,NaT,NaT
4,MDE005,2021-09-16,2021-10-18,2021-11-11,2021-12-08,2022-02-14
...,...,...,...,...,...,...
96,MDE097,2024-08-15,2024-10-02,2024-10-28,2024-12-04,2025-02-27
97,MDE098,2024-10-14,2024-11-26,2025-01-07,2025-02-11,2025-05-06
98,MDE099,2024-10-23,NaT,NaT,NaT,NaT
99,MDE100,2024-10-24,2024-12-05,2025-01-07,2025-01-30,2025-05-01


In [47]:
# Merge visit dates with df
visit_dates_df = visit_dates_df.melt(
    id_vars="subject_id",
    var_name="visit_id",
    value_name="date"
)
df = df.merge(visit_dates_df, on=["subject_id", "visit_id"], how="left")
df["date"] = pd.to_datetime(df["date"])
df

Unnamed: 0,subject_id,group,visit_id,ffp_status,ffp_score,date
0,MDE001,control,1,0,4.0,2021-07-19
1,MDE001,control,2,0,4.0,2021-08-23
2,MDE001,control,3,1,2.0,2021-09-15
3,MDE001,control,4,0,4.0,2021-10-07
4,MDE001,control,5,0,3.0,2022-01-19
...,...,...,...,...,...,...
440,MDE089,control,1,1,1.0,2023-11-01
441,MDE089,control,2,,0.0,2023-12-29
442,MDE089,control,3,1,2.0,2024-01-25
443,MDE089,control,4,1,1.0,2024-02-29


In [48]:
# Create visit database
conn = sqlite3.connect(db_path)
df.to_sql("visit", conn, if_exists="replace", index=False)

conn.commit()
conn.close()

### Create daily measurement table

In [69]:
def camel_to_snake(name):
    name = re.sub(r'(?<=[a-z0-9])([A-Z])', r'_\1', name)
    name = name.replace(" ", "_").replace("-", "_").replace("/", "_")
    return name.lower()

query = """
SELECT *
FROM fitbit_wear_time_via_hr wt
  LEFT JOIN daily_activity da
    ON wt.ID = da.ID AND wt.Day = da.ActivityDate
LEFT JOIN daily_intensities di
  ON da.ID = di.ID AND da.ActivityDate = di.ActivityDay
LEFT JOIN fitbit_daily_sp_o2 fb
  ON da.ID = fb.ID AND da.ActivityDate = fb.SleepDay
"""

conn = sqlite3.connect(db_path)
df_measurement = pd.read_sql_query(query, conn)
conn.close()

# Column name sanitization
df_measurement.columns = [camel_to_snake(col) for col in df_measurement.columns]
df_measurement = df_measurement.loc[:, ~df_measurement.columns.duplicated()]
df_measurement = df_measurement.drop(columns=["activity_date", "activity_day", "sleep_day"]).rename({"id": "subject_id", "day": "date"}, axis=1)
df_measurement["subject_id"] = df_measurement["subject_id"].apply(normalize_subject_id)
df_measurement["date"] = pd.to_datetime(df_measurement["date"])

# Filter measurements to only include days with measurements
start_col = df_measurement.columns.get_loc("percentage_wear_time") + 1
right_cols = df_measurement.columns[start_col:]
df_measurement = df_measurement[~df_measurement[right_cols].isna().all(axis=1)].reset_index(drop=True)
df_measurement.sort_values(by=["subject_id", "date"], inplace=True)

In [76]:
conn = sqlite3.connect(db_path)
df_measurement.to_sql("daily_measurement", conn, if_exists="replace", index=False)
conn.commit()
conn.close()

In [77]:
df_measurement.columns

Index(['subject_id', 'date', 'total_minutes_wear_time', 'percentage_wear_time',
       'total_steps', 'total_distance', 'tracker_distance',
       'logged_activities_distance', 'very_active_distance',
       'moderately_active_distance', 'light_active_distance',
       'sedentary_active_distance', 'very_active_minutes',
       'fairly_active_minutes', 'lightly_active_minutes', 'sedentary_minutes',
       'calories', 'floors', 'calories_bmr', 'marginal_calories',
       'resting_heart_rate', 'average_sp_o2', 'min_sp_o2', 'max_sp_o2'],
      dtype='object')

### Create analysis db

In [78]:
analysis_db_path = "../data/sqlite/analysis.db"
conn = sqlite3.connect(analysis_db_path)

conn.execute(f"ATTACH DATABASE '{db_path}' AS raw")
conn.execute("DROP TABLE IF EXISTS subject")
conn.execute("CREATE TABLE subject AS SELECT * FROM raw.subject")
conn.execute("DROP TABLE IF EXISTS visit")
conn.execute("CREATE TABLE visit AS SELECT * FROM raw.visit")
conn.execute("DROP TABLE IF EXISTS daily_measurement")
conn.execute("CREATE TABLE daily_measurement AS SELECT * FROM raw.daily_measurement")

conn.commit()
conn.close()