In [None]:
import jupyter_black
from IPython.display import display


# Load the jupyter_black extension for this kernel with a 999-character line
# length (presumably so the formatter never wraps long lines -- TODO confirm).
jupyter_black.load(line_length=999)

In [None]:
import os
import pandas as pd
from tqdm.notebook import tqdm

In [None]:
# Build the questionnaire table in one pass:
#   1. read the export and keep the submission time, the account and the three
#      scale totals;
#   2. parse the submission time and reduce it to a calendar date;
#   3. keep only the accounts "gzj" and "hm";
#   4. switch to the English column names used by the rest of the notebook;
#   5. order the rows by date.
scores = (
    pd.read_excel("量表最终版.xlsx")
    .loc[:, ["提交答卷时间", "账号", "PHQ-9总分", "GAD-7总分", "AIS总分"]]
    .assign(
        **{
            "提交答卷时间": lambda frame: pd.to_datetime(
                frame["提交答卷时间"], format="%Y/%m/%d %H:%M:%S"
            ).dt.date
        }
    )
    .loc[lambda frame: frame["账号"].isin(["gzj", "hm"])]
    .rename(
        columns={
            "提交答卷时间": "Date",
            "账号": "SID",
            "PHQ-9总分": "PHQ-9",
            "GAD-7总分": "GAD-7",
            "AIS总分": "AIS",
        }
    )
    .sort_values(by="Date")
)
scores

In [None]:
def interpolate_scores(df: pd.DataFrame) -> pd.DataFrame:
    """Expand sparse questionnaire scores to one row per subject per day.

    For every ``SID`` the PHQ-9 / GAD-7 / AIS values are placed on a daily grid
    that spans the subject's first to last submission, and the days without a
    submission are filled by time-weighted linear interpolation.

    Args:
        df: Frame with ``Date``, ``SID``, ``PHQ-9``, ``GAD-7`` and ``AIS``
            columns. ``Date`` may hold anything ``pd.to_datetime`` accepts.
            The frame is not modified.

    Returns:
        A new frame with the columns ``Date`` (datetime64), ``SID``, ``PHQ-9``,
        ``GAD-7`` and ``AIS`` and a fresh integer index. Other input columns
        are not carried over. An empty input yields an empty frame with the
        same columns.
    """
    score_cols = ["PHQ-9", "GAD-7", "AIS"]

    # Work on a private copy so the caller's frame keeps its original "Date"
    # values and dtypes.
    frame = df.copy()
    frame["Date"] = pd.to_datetime(frame["Date"])
    for col in score_cols:
        # Non-numeric answers become NaN and are filled by the interpolation.
        frame[col] = pd.to_numeric(frame[col], errors="coerce")

    pieces = []
    for sid, group in frame.groupby("SID"):
        # Only the numeric score columns are resampled: interpolating a frame
        # that still contains the string "SID" column relies on object-dtype
        # interpolation, which newer pandas versions deprecate.
        # .mean() puts the data on the daily grid and collapses several
        # submissions made on the same day into one value; without that step a
        # duplicated date makes the up-sampling fail on reindexing.
        # NOTE(review): averaging same-day submissions is an assumption --
        # confirm whether the latest submission should win instead.
        daily = group.set_index("Date")[score_cols].resample("D").mean().interpolate(method="time")
        daily.insert(0, "SID", sid)
        pieces.append(daily)

    if not pieces:
        # pd.concat refuses an empty list; hand back an empty, well-formed frame.
        return pd.DataFrame(columns=["Date", "SID", *score_cols])

    return pd.concat(pieces).reset_index()


# Pass a copy so the helper works on its own frame, then rebind `scores` to the
# daily, interpolated table; the bare expression below displays it.
scores = interpolate_scores(scores.copy())
scores

In [None]:
def add_scores(scores: pd.DataFrame, input_dir: str = "data_csv", output_dir: str = "data_add", bar: bool = False) -> None:
    """Attach the daily questionnaire scores to the per-day feature CSV files.

    Walks ``input_dir/<feature>/<date>.csv``. For every ``SID`` present in a
    file, the PHQ-9 / GAD-7 / AIS values recorded for that subject on the date
    encoded in the file name are rounded to 4 decimals and written into the
    columns ``PHQ-9``, ``GAD-7`` and ``AIS`` of that subject's rows. Every CSV
    is then saved under ``output_dir/<feature>/`` with the same file name,
    whether or not any subject matched.

    Args:
        scores: Frame with ``SID``, ``Date``, ``PHQ-9``, ``GAD-7`` and ``AIS``
            columns. ``Date`` may hold anything ``pd.to_datetime`` accepts.
            The frame is not modified.
        input_dir: Directory containing one sub-directory per feature.
        output_dir: Directory that receives the augmented files; created if
            missing.
        bar: Wrap the feature loop in a tqdm progress bar.

    Raises:
        KeyError: If ``scores`` lacks one of the columns above, or a CSV has no
            ``SID`` column.
        ValueError: If a CSV file name (without extension) cannot be parsed as
            a date by ``pd.to_datetime``.
    """
    score_cols = ["PHQ-9", "GAD-7", "AIS"]

    # Normalise the dates on a private copy: rewriting ``scores["Date"]`` in
    # place would silently turn the caller's datetime column into strings.
    table = scores[["SID", "Date", *score_cols]].copy()
    table["Date"] = pd.to_datetime(table["Date"]).dt.strftime("%Y-%m-%d")
    # Rows without a subject or a date can never match a file; dropping them
    # also keeps NaN out of the lookup keys.
    table = table.dropna(subset=["SID", "Date"])

    # (SID, "YYYY-MM-DD") -> [PHQ-9, GAD-7, AIS], built once instead of
    # filtering the whole table for every subject of every file.
    # setdefault keeps the first row when a (SID, Date) pair is repeated.
    lookup = {}
    for sid, day, *values in table.itertuples(index=False, name=None):
        lookup.setdefault((sid, day), values)

    os.makedirs(output_dir, exist_ok=True)

    entries = os.listdir(input_dir)
    feature_dirs = tqdm(entries, desc="Processing by features") if bar else entries

    for feature_dir in feature_dirs:
        feature_path = os.path.join(input_dir, feature_dir)
        if not os.path.isdir(feature_path):
            continue

        output_feature_path = os.path.join(output_dir, feature_dir)
        os.makedirs(output_feature_path, exist_ok=True)

        for filename in os.listdir(feature_path):
            if not filename.endswith(".csv"):
                continue

            # The file name (minus ".csv") is the recording date.
            date_str = pd.to_datetime(os.path.splitext(filename)[0]).strftime("%Y-%m-%d")

            # Everything is read as text so the sensor values are written back
            # exactly as they were stored.
            df = pd.read_csv(os.path.join(feature_path, filename), dtype=str)

            for sid in df["SID"].unique():
                values = lookup.get((sid, date_str))
                if values is None:
                    continue

                rows = df["SID"] == sid
                for col, value in zip(score_cols, values):
                    df.loc[rows, col] = round(value, 4)

            df.to_csv(os.path.join(output_feature_path, filename), index=False)

In [None]:
# Write the score-augmented CSVs with the default directories
# ("data_csv" -> "data_add"), showing a progress bar over the feature folders.
add_scores(scores,bar=True)

In [None]:
import torch


def conv_pool(feature: str, input_dir: str = "data_add", output_dir: str = "dataset", bar: bool = False):
    """Placeholder for the per-feature processing step (not implemented yet).

    At the moment the function only lists the entries of
    ``input_dir/feature`` -- wrapped in a tqdm progress bar when ``bar`` is
    true -- and returns ``None``. Nothing is read from the listed files and
    ``output_dir`` is not used.

    Args:
        feature: Name of the feature sub-directory inside ``input_dir``.
        input_dir: Directory holding one sub-directory per feature.
        output_dir: Currently unused.
        bar: Wrap the directory listing in a tqdm progress bar.
    """
    feature_root = os.path.join(input_dir, feature)
    entries = os.listdir(feature_root)

    if bar:
        date_entries = tqdm(entries, desc="Processing by dates")
    else:
        date_entries = entries

    # TODO: iterate over ``date_entries`` once the processing is written.
    return None

In [None]:
input_dir = "data_add"
output_dir = "dataset"
feature = "ACCELERATION"

# Disabled draft of the per-date loop. clean_incremental_timestamps and
# process_time_window are not defined in the visible cells, so the draft stays
# commented out until they are available.
# data_path = os.path.join(input_dir, feature)
# dates_path = tqdm(os.listdir(data_path), desc="Processing by dates")
# for date_path in dates_path:
#     date_str=date_path
#     date_path = os.path.join(data_path, date_path)
#     date = pd.read_csv(date_path)
#     date = clean_incremental_timestamps(date)
#     processed_df = process_time_window(date.copy(), csv_file_path, window="1min", interpolate=True)

# Spot-check one augmented file. The path is built from the settings above so
# it follows them when they change, and the preview is the cell's last
# expression so it renders as a table like the other cells in this notebook.
date = pd.read_csv(os.path.join(input_dir, feature, "2024.11.05.csv"))
date.head()