### Import data into database

In [None]:
from activity_tracker import utils
import sqlite3
import pandas as pd
import pathlib
import re

# Locations of the SQLite database and of the raw inputs loaded into it
db_path = "../data/sqlite/activity_tracker.db"
data_dir = pathlib.Path("../data/raw/")
excel_path = pathlib.Path("../data/raw/MDE clinical data.xlsx")
mapper_path = pathlib.Path("../data/data_mapper.yml")

# CSV file names that the bulk CSV import leaves out
excluded_files = [
    "fitbitCoreTemperature_merged.csv",
    "fitbitSkinTemperature_merged.csv",
    "weightLogInfo_merged.csv",
    "mde_clinical_data.csv",
]

In [None]:
def camel_to_snake(name: str) -> str:
    """Turn a CSV file stem into a snake_case table name.

    Every "_merged" occurrence is removed, an underscore is inserted before
    each capital letter that directly follows a lowercase letter or digit,
    and the result is lowercased.
    """
    stem = name.replace("_merged", "")
    with_breaks = re.sub(r'(?<=[a-z0-9])([A-Z])', r'_\1', stem)
    return with_breaks.lower()

# Load every non-excluded CSV in data_dir into its own SQLite table, named
# after the file stem. Existing tables are left untouched.
conn = sqlite3.connect(db_path)
try:
    for file_path in data_dir.glob("*.csv"):
        if file_path.name in excluded_files:
            continue

        table_name = camel_to_snake(file_path.stem)
        df = pd.read_csv(file_path)
        try:
            # if_exists="fail" makes pandas raise ValueError when the table
            # is already present, which is treated as "nothing to do".
            df.to_sql(table_name, conn, if_exists="fail", index=False)
        except ValueError:
            print(f"Skipping {table_name} — table already exists.")
        else:
            print(f"Table {table_name} created with {len(df)} rows.")
finally:
    # Close the connection even if reading or writing a file fails part-way.
    conn.close()

### Create a subject table

In [None]:
# Read the first two sheets of the clinical workbook (control, then exercise).
# The context manager closes the workbook handle once both sheets are loaded.
with pd.ExcelFile(excel_path) as xls:
    df_control = pd.read_excel(xls, sheet_name=xls.sheet_names[0])
    df_exercise = pd.read_excel(xls, sheet_name=xls.sheet_names[1])

# Add group labels
df_control["group"] = "control"
df_exercise["group"] = "exercise"
df_combined = pd.concat([df_control, df_exercise], ignore_index=True)

# Per-subject (visit-independent) columns kept for the subject table
subject_columns = [
    "Participant ID", "group", "Sex", "Age", "Eth", "Race",
    "Mth Inc", "Educ", "Mari", "Liv Sit"
]
df_subject = df_combined[subject_columns].copy()

# Normalise headers to snake_case. regex=False makes "." a literal dot
# regardless of the pandas version's default for `regex`.
df_subject.columns = (
    df_subject.columns.str.strip()
    .str.lower()
    .str.replace(" ", "_", regex=False)
    .str.replace(".", "_", regex=False)
)

# Rename columns
rename_map = {
    "participant_id": "subject_id",
    "eth": "ethnicity",
    "mth_inc": "monthly_income",
    "educ": "education",
    "mari": "marital_status",
    "liv_sit": "living_situation"
}
df_subject = df_subject.rename(columns=rename_map)
df_subject = df_subject.sort_values(by="subject_id").reset_index(drop=True)

data_dictionary = utils.load_data_mapper(mapper_path)

# Apply data dictionary to map integer values to strings
# NOTE(review): Series.map turns any value missing from the mapping into NaN;
# confirm the data dictionary covers every code present in the sheet.
for column in data_dictionary:
    if column in df_subject.columns:
        df_subject[column] = df_subject[column].map(data_dictionary[column])

# Replace the subject table; the connection is closed even if the write fails.
conn = sqlite3.connect(db_path)
try:
    df_subject.to_sql("subject", conn, if_exists="replace", index=False)
    conn.commit()
finally:
    conn.close()

In [None]:
# Show the subject table as the cell output
df_subject

### Create a visit table

In [None]:
def make_unique_columns(cols):
    """Return the names in ``cols`` with repeats made unique.

    The first occurrence of a name is kept unchanged. Each later occurrence
    gets a numeric suffix ("name_1", "name_2", ...), skipping any suffixed
    name that has already been emitted. Order is preserved.
    """
    # Maps every emitted name to the next suffix to try for it.
    next_suffix = {}
    unique = []
    for name in cols:
        if name in next_suffix:
            suffix = next_suffix[name]
            candidate = f"{name}_{suffix}"
            while candidate in next_suffix:
                suffix += 1
                candidate = f"{name}_{suffix}"
            next_suffix[name] = suffix + 1
            name = candidate
        next_suffix[name] = 1
        unique.append(name)
    return unique

def sanitize_column(col):
    """Reduce a raw column header to a lowercase, underscore-joined name.

    Everything from the first "." onwards is discarded, parenthesised text is
    removed, whitespace runs become single underscores, and "/", "-" and "%"
    characters are dropped.
    """
    head, _, _ = col.partition(".")
    # Remove parenthesised text together with the parentheses
    without_parens = re.sub(r"\(.*?\)", "", head)
    words = without_parens.strip().split()
    name = "_".join(words).lower()
    # Strip the special characters that remain
    for unwanted in ("/", "-", "%"):
        name = name.replace(unwanted, "")
    return name

def stack_visits(df, group_label):
    """Reshape a wide one-row-per-participant sheet into one row per visit.

    Columns named "V" plus a single digit are treated as visit markers. The
    columns after a marker, up to the next marker in visit order (or the end
    of the sheet for the last one), form that visit's block. Each block is
    copied with sanitized, de-duplicated column names plus "group" and
    "visit_id" columns, and all blocks are concatenated.
    """
    df = df.rename(columns={"Participant ID": "subject_id"})

    def _is_visit_marker(label):
        return label.startswith("V") and len(label) == 2 and label[1].isdigit()

    markers = [label for label in df.columns if _is_visit_marker(label)]
    markers.sort(key=lambda label: int(label[1:]))

    frames = []
    # Pair each marker with its successor; the last marker has none.
    for marker, next_marker in zip(markers, markers[1:] + [None]):
        first = df.columns.get_loc(marker) + 1
        if next_marker is None:
            last = len(df.columns)
        else:
            last = df.columns.get_loc(next_marker)
        block_cols = df.columns[first:last].tolist()

        frame = df[["subject_id", *block_cols]].copy()
        clean_names = make_unique_columns([sanitize_column(c) for c in block_cols])
        frame.columns = ["subject_id", *clean_names]

        frame.insert(1, "group", group_label)
        frame.insert(2, "visit_id", int(marker[1:]))
        frames.append(frame)

    return pd.concat(frames, ignore_index=True)

# Load data. The context manager closes the workbook handle once both sheets
# have been read and stacked.
with pd.ExcelFile(excel_path) as xls:
    df_visits_control = stack_visits(pd.read_excel(xls, sheet_name=0), "control")
    df_visits_exercise = stack_visits(pd.read_excel(xls, sheet_name=1), "exercise")

# Derived per-visit measures, added to each group's frame in place
ffp_cols = ["wt_loss", "weak", "slow", "exhaust", "phys_act"]
for df in [df_visits_control, df_visits_exercise]:
    # NOTE(review): sum() skips missing values, so a visit with every
    # component missing gets a score of 0 rather than NaN; confirm intended.
    df["ffp_score"] = df[ffp_cols].sum(axis=1)
    # Zero walk values become NaN so the division yields a missing value
    # instead of inf. Float NaN (rather than pd.NA) keeps the column numeric.
    # Presumably 4 is the walk distance and "walk" the elapsed time; TODO confirm.
    df["gait"] = 4 / df["walk"].replace(0, float("nan"))

# Combine
df_visits_all = pd.concat([df_visits_control, df_visits_exercise], ignore_index=True)
df_visits_all = df_visits_all.sort_values(by=["subject_id", "visit_id"]).reset_index(drop=True)

# Replace the visit table; the connection is closed even if the write fails.
conn = sqlite3.connect(db_path)
try:
    df_visits_all.to_sql("visit", conn, if_exists="replace", index=False)
    conn.commit()
finally:
    conn.close()

In [None]:
# Show the combined visit table as the cell output
df_visits_all

### Create daily data table

In [None]:
def camel_to_snake(name):
    """Convert a column name to snake_case.

    An underscore is inserted before each capital letter that directly follows
    a lowercase letter or digit; spaces, hyphens and slashes become
    underscores; the result is lowercased.
    """
    separated = re.sub(r'(?<=[a-z0-9])([A-Z])', r'_\1', name)
    for delimiter in (" ", "-", "/"):
        separated = separated.replace(delimiter, "_")
    return separated.lower()

# Wear-time rows are the left side of every join; the other daily tables are
# attached by subject ID and date.
# NOTE(review): the later joins key on da/dc rather than wt, so a day missing
# from daily_activity or daily_calories also loses its intensities, steps and
# SpO2 values; confirm that chaining is intended.
query = """
SELECT *
FROM fitbit_wear_time_via_hr wt
  LEFT JOIN daily_activity da
    ON wt.ID = da.ID AND wt.Day = da.ActivityDate
LEFT JOIN daily_calories dc
  ON da.ID = dc.ID AND da.ActivityDate = dc.ActivityDay
LEFT JOIN daily_intensities di
  ON dc.ID = di.ID AND dc.ActivityDay = di.ActivityDay
LEFT JOIN daily_steps ds
  ON dc.ID = ds.ID AND dc.ActivityDay = ds.ActivityDay
LEFT JOIN fitbit_daily_sp_o2 fb
  ON dc.ID = fb.ID AND dc.ActivityDay = fb.SleepDay
"""

# Close the connection even if the query fails (e.g. a table is missing).
conn = sqlite3.connect(db_path)
try:
    df = pd.read_sql_query(query, conn)
finally:
    conn.close()

# Column name sanitization
df.columns = [camel_to_snake(col) for col in df.columns]
# SELECT * repeats the join keys and any shared column names; keep only the
# first occurrence of each name.
df = df.loc[:, ~df.columns.duplicated()]
df = df.drop(columns=["activity_date", "activity_day", "sleep_day"]).rename({"id": "subject_id", "day": "date"}, axis=1)

In [None]:
# Columns positioned after "percentage_wear_time" are the ones checked below
start_col = df.columns.get_loc("percentage_wear_time") + 1
right_cols = df.columns[start_col:]

# Keep a row only if at least one of those columns holds a value
has_right_side_data = df[right_cols].notna().any(axis=1)
filtered_df = df[has_right_side_data].reset_index(drop=True)
filtered_df