### Run data quality check on new data

In [1]:
import pandas as pd
import pathlib
import warnings
warnings.filterwarnings("ignore")

# Import raw data into raw schema
data_dir = pathlib.Path("../data/raw/")
excel_path = pathlib.Path("../data/raw/MDE clinical data.xlsx")
excel_path_old = pathlib.Path("../data/raw/MDE clinical data_old.xlsx")


xls = pd.ExcelFile(excel_path)
xls_old = pd.ExcelFile(excel_path_old)

df_control = pd.read_excel(xls, sheet_name=xls.sheet_names[0])
df_exercise = pd.read_excel(xls, sheet_name=xls.sheet_names[1])

df_control_old = pd.read_excel(xls_old, sheet_name=xls_old.sheet_names[0])
df_exercise_old = pd.read_excel(xls_old, sheet_name=xls_old.sheet_names[1])

In [2]:
new_control_subjects = set(df_control["Participant ID"].tolist()) - set(df_control_old["Participant ID"].tolist())

new_exercise_subjects = set(df_exercise["Participant ID"].tolist()) - set(df_exercise_old["Participant ID"].tolist())

print(f"New control subjects: {new_control_subjects}")
print(f"New exercise subjects: {new_exercise_subjects}")

New control subjects: {'MDE100', 'MDE101', 'MDE99', 'MDE97', 'MDE92'}
New exercise subjects: {'MDE90', 'MDE94', 'MDE95', 'MDE93', 'MDE96', 'MDE98', 'MDE91'}


In [3]:
# Remove new subjects from both datasets to compare only existing subjects
df_control_existing = df_control[~df_control["Participant ID"].isin(new_control_subjects)]
df_exercise_existing = df_exercise[~df_exercise["Participant ID"].isin(new_exercise_subjects)]

# Sort both dataframes by Participant ID to ensure proper comparison
df_control_existing_sorted = df_control_existing.sort_values("Participant ID").reset_index(drop=True)
df_control_old_sorted = df_control_old.sort_values("Participant ID").reset_index(drop=True)

df_exercise_existing_sorted = df_exercise_existing.sort_values("Participant ID").reset_index(drop=True)
df_exercise_old_sorted = df_exercise_old.sort_values("Participant ID").reset_index(drop=True)

# Check if dataframes are identical
control_identical = df_control_existing_sorted.equals(df_control_old_sorted)
exercise_identical = df_exercise_existing_sorted.equals(df_exercise_old_sorted)

print(f"Control data (excluding new subjects) is identical: {control_identical}")
print(f"Exercise data (excluding new subjects) is identical: {exercise_identical}")

# If not identical, show differences
if not control_identical:
    print("\nControl data differences:")
    print("Columns with differences:")
    for col in df_control_existing_sorted.columns:
        if not df_control_existing_sorted[col].equals(df_control_old_sorted[col]):
            print(f"  - {col}")
    
    # Show sample differences for first few columns with differences
    diff_cols = [col for col in df_control_existing_sorted.columns 
                  if not df_control_existing_sorted[col].equals(df_control_old_sorted[col])]
    if diff_cols:
        print(f"\nSample differences in {diff_cols[0]}:")
        comparison = pd.concat([
            df_control_old_sorted[["Participant ID", diff_cols[0]]].rename(columns={diff_cols[0]: "Old"}),
            df_control_existing_sorted[["Participant ID", diff_cols[0]]].rename(columns={diff_cols[0]: "New"})
        ], axis=1)
        print(comparison.head(10))

if not exercise_identical:
    print("\nExercise data differences:")
    # Similar analysis for exercise data

Control data (excluding new subjects) is identical: False
Exercise data (excluding new subjects) is identical: False

Control data differences:
Columns with differences:
  - Gait(m/s)


KeyError: 'Avg Daily Steps'

In [4]:
set(df_control.columns) - set(df_control_old.columns)


{'Avg Daily Steps',
 'Avg Daily Steps.1',
 'Avg Daily Steps.2',
 'Avg Daily Steps.3',
 'Avg Daily Steps.4',
 'Fecal',
 'Fecal.1',
 'Fecal.2',
 'Mem Cat',
 'Mem Cat.1',
 'Mem Cat.2'}