In [None]:
import os
from datetime import timedelta
from copy import deepcopy
import pandas as pd

# Registry of local data folders, keyed by folder kind -> institution -> phase.
# Every path ends with a slash because the loaders append the file name directly.
data_factory = {}

# Folders holding the daily sensor feature tables.
data_factory["feature_folder"] = {
    "INS-W": {
        1: "/Users/kwang/Data/globem/INS-W_1/FeatureData/",
        2: "/Users/kwang/Data/globem/INS-W_2/FeatureData/",
        3: "/Users/kwang/Data/globem/INS-W_3/FeatureData/",
        4: "/Users/kwang/Data/globem/INS-W_4/FeatureData/",
    },
}

# Folders holding the survey (label) tables.
data_factory["survey_folder"] = {
    "INS-W": {
        1: "/Users/kwang/Data/globem/INS-W_1/SurveyData/",
        2: "/Users/kwang/Data/globem/INS-W_2/SurveyData/",
        3: "/Users/kwang/Data/globem/INS-W_3/SurveyData/",
        4: "/Users/kwang/Data/globem/INS-W_4/SurveyData/",
    },
}

# Folders holding per-participant metadata.
data_factory["participants_info_folder"] = {
    "INS-W": {
        1: "/Users/kwang/Data/globem/INS-W_1/ParticipantsInfoData/",
        2: "/Users/kwang/Data/globem/INS-W_2/ParticipantsInfoData/",
        3: "/Users/kwang/Data/globem/INS-W_3/ParticipantsInfoData/",
        4: "/Users/kwang/Data/globem/INS-W_4/ParticipantsInfoData/",
    },
}

# Left empty while only dep_weekly / dep_endterm are loaded. Other survey
# tasks would register their thresholds here, for example:
# "some_other_survey_task": {"threshold_as_true":10, "threshold_as_false":5}
data_factory["threshold_book"] = {}
def data_loader_read_label_file(institution: str, phase: int, prediction_target: str):
    """Read the label CSV for one institution/phase and normalise its keys.

    Args:
        institution: Key into ``data_factory["survey_folder"]``.
        phase: Phase key under that institution.
        prediction_target: ``"dep_weekly"`` or ``"dep_endterm"``.

    Returns:
        A ``(df_label, prediction_target_col)`` tuple. ``df_label`` has a
        datetime ``date`` column, ``pid`` values rewritten to
        ``<pid>#<institution>_<phase>``, and at most one row per
        ``(pid, date)`` (the last one wins). ``prediction_target_col`` is the
        name of the label column inside ``df_label``.

    Raises:
        ValueError: If ``prediction_target`` is not a supported target.
    """
    # target name -> (CSV file inside the survey folder, label column name)
    label_sources = {
        "dep_weekly": ("dep_weekly.csv", "dep"),
        "dep_endterm": ("dep_endterm.csv", "dep"),
    }
    if prediction_target not in label_sources:
        raise ValueError(f"Unsupported prediction target: {prediction_target}")
    csv_name, prediction_target_col = label_sources[prediction_target]

    survey_dir = data_factory["survey_folder"][institution][phase]
    df_label = pd.read_csv(survey_dir + csv_name)

    # Parse dates and tag every pid with its institution and phase.
    df_label["date"] = pd.to_datetime(df_label["date"])
    df_label["pid"] = df_label["pid"].map(
        lambda raw_pid: f"{raw_pid}#{institution}_{phase}"
    )

    # Keep only the most recent row for each (pid, date) pair.
    deduplicated = df_label.drop_duplicates(subset=["pid", "date"], keep="last")
    return deduplicated, prediction_target_col


def data_loader_single_dataset_label_based(
    institution: str,
    phase: int,
    prediction_target: str,
    flag_more_feat_types: bool = False,
    window_days: int = 28,
) -> pd.DataFrame:
    """Build one datapoint per label from the daily features preceding it.

    For every label row, the daily feature rows of that participant whose
    date lies in ``[label date - (window_days - 1), label date]`` are
    collected from ``rapids.csv`` and padded so that every calendar day of
    the window has a row (days without data hold NaN features).

    Args:
        institution: Key into the ``data_factory`` folder tables.
        phase: Phase key under that institution.
        prediction_target: Passed to ``data_loader_read_label_file``.
        flag_more_feat_types: When False keep only columns starting with
            ``f_loc``, ``f_screen``, ``f_slp`` or ``f_steps``; when True also
            keep ``f_blue`` and ``f_call`` columns.
        window_days: Number of calendar days in each window, label date
            included. Defaults to 28 (four weeks).

    Returns:
        DataFrame with columns ``[pid, date, X_raw, y_raw, device_type]``.
        ``X_raw`` holds a DataFrame of ``window_days`` daily rows,
        ``y_raw`` the label value, and ``device_type`` the first
        ``;``-separated entry of the participant's ``platform`` field.
        Labels with no feature row inside their window are skipped. For
        ``dep_weekly``, participants left with fewer than 2 datapoints are
        removed. The frame is empty (but keeps its columns) when nothing
        qualifies.

    Raises:
        ValueError: If ``window_days`` is smaller than 1.
        KeyError: If a labelled participant is missing from ``platform.csv``.
    """
    if window_days < 1:
        raise ValueError(f"window_days must be >= 1, got {window_days}")

    # --- 1) Read rapids.csv as features ---
    df_full_rawdata = pd.read_csv(
        data_factory["feature_folder"][institution][phase] + "rapids.csv",
        low_memory=False
    )
    df_full_rawdata["date"] = pd.to_datetime(df_full_rawdata["date"])
    # Same pid format as the label file so the two tables can be matched.
    df_full_rawdata["pid"] = df_full_rawdata["pid"].apply(lambda x: f"{x}#{institution}_{phase}")

    # --- 2) Read participant info (platform.csv) for device_type ---
    df_participant_file = pd.read_csv(
        data_factory["participants_info_folder"][institution][phase] + "platform.csv",
        low_memory=False
    )
    df_participant_file["pid"] = df_participant_file["pid"].apply(lambda x: f"{x}#{institution}_{phase}")
    df_participant_file = df_participant_file.set_index("pid")

    # --- 3) Load label file (dep_weekly or dep_endterm) ---
    df_label, prediction_target_col = data_loader_read_label_file(institution, phase, prediction_target)

    # --- 4) Decide which sensor columns to keep ---
    sensor_prefixes = ("f_loc", "f_screen", "f_slp", "f_steps")
    if flag_more_feat_types:
        sensor_prefixes += ("f_blue", "f_call")
    retained_features = ["pid", "date"] + [
        col for col in df_full_rawdata.columns if col.startswith(sensor_prefixes)
    ]

    # --- 5) Build the window of daily rows for each label date ---
    # Split the feature table by participant once, instead of rescanning the
    # whole table for every label row.
    features_by_pid = {
        pid: df_pid
        for pid, df_pid in df_full_rawdata[retained_features].groupby("pid", sort=False)
    }
    window_span = timedelta(days=window_days - 1)  # both end points are inclusive

    datapoints = []
    for _, row in df_label.iterrows():
        pid = row["pid"]
        df_pid = features_by_pid.get(pid)
        if df_pid is None:
            # Labelled participant without any feature rows.
            continue

        date_end = row["date"]
        date_start = date_end - window_span
        df_data_window = df_pid[df_pid["date"].between(date_start, date_end)]
        if df_data_window.empty:
            continue

        # Left-merge onto the full date range so that every day of the window
        # is present, with NaN features on days that have no data.
        df_placeholder = pd.DataFrame({"date": pd.date_range(date_start, date_end)})
        df_placeholder["pid"] = pid
        df_data_window = pd.merge(
            df_placeholder,
            df_data_window,
            on=["pid", "date"],
            how="left"
        )

        datapoints.append({
            "pid": pid,
            "date": date_end,
            "X_raw": df_data_window[retained_features],
            "y_raw": row[prediction_target_col],
            "device_type": df_participant_file.loc[pid]["platform"].split(";")[0]
        })

    # Name the columns explicitly so an empty result still has a "pid" column
    # for the filter below (an empty list would otherwise give a frame without
    # columns and make the groupby raise KeyError).
    df_datapoints = pd.DataFrame(
        datapoints, columns=["pid", "date", "X_raw", "y_raw", "device_type"]
    )

    # Remove participants with fewer than 2 label points for dep_weekly.
    if prediction_target == "dep_weekly":
        pids_few_response = df_datapoints.groupby("pid").size()
        pids_few_response = pids_few_response[pids_few_response < 2].index
        df_datapoints = df_datapoints[~df_datapoints["pid"].isin(pids_few_response)]

    return df_datapoints

In [None]:
# Build the dep_weekly datapoints for INS-W phase 1, keeping only the four
# basic sensor families (location, screen, sleep, steps).
df_datapoints = data_loader_single_dataset_label_based(
    "INS-W", 1, "dep_weekly", flag_more_feat_types=False
)

# Report the table size; the trailing expression renders a preview in the notebook.
print(df_datapoints.shape)
df_datapoints.head()

(2354, 5)


Unnamed: 0,pid,date,X_raw,y_raw,device_type
0,INS-W_001#INS-W_1,2018-04-04,pid date \ 0 INS-W_...,False,android
1,INS-W_001#INS-W_1,2018-04-08,pid date \ 0 INS-W_...,False,android
2,INS-W_001#INS-W_1,2018-04-11,pid date \ 0 INS-W_...,False,android
3,INS-W_001#INS-W_1,2018-04-18,pid date \ 0 INS-W_...,False,android
4,INS-W_001#INS-W_1,2018-04-22,pid date \ 0 INS-W_...,False,android


In [None]:
import re

def contains_dis(col_name: str) -> bool:
    """Return True when ``dis`` is one of the ``:``/``_``-separated parts of *col_name*.

    Only whole parts count, so ``"a_dis:b"`` matches while ``"distance"`` does not.
    """
    return any(token == "dis" for token in re.split("[:_]", col_name))

# List every column of the first datapoint's feature window whose name
# carries a standalone "dis" token.
sample_X = df_datapoints["X_raw"].iloc[0]
for col in sample_X.columns:
    if not contains_dis(col):
        continue
    print(col)

In [8]:
# Shape (rows, columns) of the feature window stored for the third datapoint.
df_datapoints["X_raw"].iloc[2].shape

(28, 3755)

In [9]:
# Number of datapoints per label value.
df_datapoints["y_raw"].value_counts()

False    1307
True     1047
Name: y_raw, dtype: int64

# Attempt ARM (Association Rule Mining)

In [23]:
import re
import pandas as pd

# For demonstration, take the first datapoint of the loaded dataset and show
# which participant and label date it belongs to.
example = df_datapoints.iloc[0]
print("Example subject PID:", example["pid"])
print("Label date:", example["date"])

# The window of daily sensor rows attached to that datapoint.
df_sensor = example["X_raw"]

# Decide from the column name alone whether it holds a discretized feature.
def is_discretized_column(col_name: str) -> bool:
    """Return True when *col_name* has ``dis`` as a whole ``:``/``_``-separated part."""
    for part in re.split("[:_]", col_name):
        if part == "dis":
            return True
    return False

# Best-effort numeric encoding of one discretized column.
def robust_encode(series: pd.Series) -> pd.Series:
    """Encode a column of ``l``/``m``/``h`` levels as 0/1/2.

    When the non-null values are not exclusively those three levels (or there
    are no non-null values at all), the column is converted with
    ``pd.to_numeric`` instead; if that conversion fails the input series is
    returned unchanged.
    """
    level_codes = {"l": 0, "m": 1, "h": 2}
    observed = set(series.dropna().unique())

    # Pure l/m/h column: translate the levels into ordinal codes.
    if observed and observed <= set(level_codes):
        return series.map(level_codes)

    # Anything else: it may already be numeric, otherwise leave it untouched.
    try:
        return pd.to_numeric(series)
    except Exception:
        return series

# Encode a copy of df_sensor (the X_raw window of one datapoint) so the
# original window stays untouched.
df_encoded = df_sensor.copy()

# Identify sensor columns (exclude metadata like "pid" and "date")
sensor_cols = [col for col in df_sensor.columns if col not in ["pid", "date"]]

for col in sensor_cols:
    if is_discretized_column(col):
        # Columns with a "dis" token: map l/m/h levels to 0/1/2 where possible.
        df_encoded[col] = robust_encode(df_sensor[col])
    else:
        # Other columns: convert to numeric when possible, otherwise keep the
        # raw values. `except Exception` (rather than a bare `except:`) keeps
        # the best-effort fallback without also swallowing KeyboardInterrupt
        # or SystemExit.
        try:
            df_encoded[col] = pd.to_numeric(df_sensor[col])
        except Exception:
            df_encoded[col] = df_sensor[col]

# Show the first rows of the encoded window.
print("Encoded sensor data (first few rows):")
display(df_encoded.head(2))


Example subject PID: INS-W_001#INS-W_1
Label date: 2018-04-04 00:00:00
Encoded sensor data (first few rows):


Unnamed: 0,pid,date,f_slp:fitbit_sleep_summary_rapids_sumdurationafterwakeupmain:14dhist,f_slp:fitbit_sleep_summary_rapids_sumdurationasleepmain:14dhist,f_slp:fitbit_sleep_summary_rapids_sumdurationawakemain:14dhist,f_slp:fitbit_sleep_summary_rapids_sumdurationtofallasleepmain:14dhist,f_slp:fitbit_sleep_summary_rapids_sumdurationinbedmain:14dhist,f_slp:fitbit_sleep_summary_rapids_avgefficiencymain:14dhist,f_slp:fitbit_sleep_summary_rapids_avgdurationafterwakeupmain:14dhist,f_slp:fitbit_sleep_summary_rapids_avgdurationasleepmain:14dhist,...,f_loc:phone_locations_doryab_timeattop2location_norm:weekend,f_loc:phone_locations_doryab_timeattop3location_norm:weekend,f_loc:phone_locations_doryab_totaldistance_norm:weekend,f_loc:phone_locations_doryab_varspeed_norm:weekend,f_loc:phone_locations_locmap_duration_in_locmap_study_norm:weekend,f_loc:phone_locations_locmap_percent_in_locmap_study_norm:weekend,f_loc:phone_locations_locmap_duration_in_locmap_exercise_norm:weekend,f_loc:phone_locations_locmap_percent_in_locmap_exercise_norm:weekend,f_loc:phone_locations_locmap_duration_in_locmap_greens_norm:weekend,f_loc:phone_locations_locmap_percent_in_locmap_greens_norm:weekend
0,INS-W_001#INS-W_1,2018-03-08,,,,,,,,,...,,,,,,,,,,
1,INS-W_001#INS-W_1,2018-03-09,,,,,,,,,...,,,,,,,,,,


In [24]:
# Dimensions (rows, columns) of the encoded example window.
df_encoded.shape

(28, 3755)

# Try ARM (Association Rule Mining) with Spark FPGrowth

In [26]:
import pandas as pd
import numpy as np
from datetime import timedelta

# Bucket a numeric series into "l" / "m" / "h" using its own 33rd and 66th percentiles.
def encode_low_med_high(series):
    """Label each value of *series* as low, medium or high.

    Values below the 33rd percentile become ``"l"``, values below the 66th
    percentile become ``"m"`` and everything else becomes ``"h"``. Missing
    values stay NaN. A series without any non-null value is returned as is.
    """
    if not series.notna().any():
        return series

    lower_cut = series.quantile(0.33)
    upper_cut = series.quantile(0.66)

    def to_level(value):
        if pd.isna(value):
            return np.nan
        if value >= upper_cut:
            return "h"
        return "l" if value < lower_cut else "m"

    return series.apply(to_level)

# --- STEP 1: Select one subject's data and encode sensor features ---
# (Assumes df_datapoints was already produced by the data loader; the first
# datapoint is used as the example.)
example = df_datapoints.iloc[0]
print("Example subject PID:", example["pid"])
print("Label date:", example["date"])

# The window of daily sensor rows attached to this datapoint.
df_sensor = example["X_raw"]

# Make a copy for encoding
df_encoded = df_sensor.copy()

# Every column except "pid" and "date" is treated as a sensor feature.
sensor_cols = [col for col in df_sensor.columns if col not in ["pid", "date"]]

# (Optional) Numeric columns that are not yet encoded as "l"/"m"/"h" could be
# bucketed column-wise first, e.g.:
# df_encoded[col] = encode_low_med_high(df_sensor[col])

# --- STEP 2: Create a mapping dictionary for sensor columns ---
# Each column gets three consecutive item codes of its own
# (l -> counter, m -> counter + 1, h -> counter + 2), so an item code
# identifies both the feature and its level.
mapping_dict = {}
counter = 1
for col in sensor_cols:
    mapping_dict[col] = {"l": counter, "m": counter + 1, "h": counter + 2}
    counter += 3


def map_levels_to_codes(series, level_codes):
    """Translate l/m/h cells of *series* into item codes; anything else becomes NaN."""
    return series.apply(
        lambda val: np.nan if pd.isna(val) else level_codes.get(str(val).lower(), np.nan)
    )


# Numeric version of the sensor data: item codes where a level is present,
# NaN everywhere else (missing values and values that are not l/m/h).
df_numeric = df_encoded.copy()
for col in sensor_cols:
    df_numeric[col] = map_levels_to_codes(df_numeric[col], mapping_dict[col])

print("\nExample numeric sensor data (first few rows):")
display(df_numeric.head())

# --- STEP 3: Create transactions for ARM ---
# One transaction per day: the item codes present on that day, in column
# order. Days without any item are dropped. Built with a plain loop so that
# a window with no usable day yields an empty list (DataFrame.apply on an
# empty frame returns a DataFrame, which has no .tolist()).
transactions = [
    [int(code) for code in day_codes.dropna()]
    for _, day_codes in df_numeric[sensor_cols].dropna(how="all").iterrows()
]

print("Sample transactions (first 5 days):")
for t in transactions[:5]:
    print(t)

# Only transactions with more than one item can contribute to a rule.
transaction_rows = [(i, t) for i, t in enumerate(transactions) if len(t) > 1]
if not transaction_rows:
    # Fail early with a clear message instead of letting Spark fail on
    # schema inference for an empty dataset.
    raise ValueError("No transactions with more than one item; nothing to mine.")

# --- STEP 4: Run Association Rule Mining (ARM) using Spark's FPGrowth ---
from pyspark.sql import SparkSession
from pyspark.ml.fpm import FPGrowth

# Initialize a local SparkSession
spark = (
    SparkSession.builder
    .appName("ARMExample")
    .master("local[*]")
    .getOrCreate()
)

# NOTE(review): the recorded run of this cell failed while collecting the
# rules (Py4JError at collectToPython). Each transaction here carries a very
# large number of items, which may make frequent-itemset enumeration
# extremely expensive - consider mining fewer columns or raising
# min_support. TODO confirm the root cause.
try:
    # Each transaction gets its position in `transactions` as its id.
    df_transactions = spark.createDataFrame(transaction_rows, ["id", "items"])

    # Thresholds for FPGrowth (tweak these as needed)
    min_support = 0.2  # an itemset must appear in at least 20% of the transactions
    min_confidence = 0.6

    fpGrowth = FPGrowth(itemsCol="items", minSupport=min_support, minConfidence=min_confidence)
    model = fpGrowth.fit(df_transactions)

    # Retrieve the association rules as a Pandas DataFrame.
    rules = model.associationRules.toPandas()
    print("\nAssociation Rules:")
    print(rules)
finally:
    # Always release the Spark session, including when mining fails.
    spark.stop()


Example subject PID: INS-W_001#INS-W_1
Label date: 2018-04-04 00:00:00

Example numeric sensor data (first few rows):


Unnamed: 0,pid,date,f_slp:fitbit_sleep_summary_rapids_sumdurationafterwakeupmain:14dhist,f_slp:fitbit_sleep_summary_rapids_sumdurationasleepmain:14dhist,f_slp:fitbit_sleep_summary_rapids_sumdurationawakemain:14dhist,f_slp:fitbit_sleep_summary_rapids_sumdurationtofallasleepmain:14dhist,f_slp:fitbit_sleep_summary_rapids_sumdurationinbedmain:14dhist,f_slp:fitbit_sleep_summary_rapids_avgefficiencymain:14dhist,f_slp:fitbit_sleep_summary_rapids_avgdurationafterwakeupmain:14dhist,f_slp:fitbit_sleep_summary_rapids_avgdurationasleepmain:14dhist,...,f_loc:phone_locations_doryab_timeattop2location_norm:weekend,f_loc:phone_locations_doryab_timeattop3location_norm:weekend,f_loc:phone_locations_doryab_totaldistance_norm:weekend,f_loc:phone_locations_doryab_varspeed_norm:weekend,f_loc:phone_locations_locmap_duration_in_locmap_study_norm:weekend,f_loc:phone_locations_locmap_percent_in_locmap_study_norm:weekend,f_loc:phone_locations_locmap_duration_in_locmap_exercise_norm:weekend,f_loc:phone_locations_locmap_percent_in_locmap_exercise_norm:weekend,f_loc:phone_locations_locmap_duration_in_locmap_greens_norm:weekend,f_loc:phone_locations_locmap_percent_in_locmap_greens_norm:weekend
0,INS-W_001#INS-W_1,2018-03-08,,,,,,,,,...,,,,,,,,,,
1,INS-W_001#INS-W_1,2018-03-09,,,,,,,,,...,,,,,,,,,,
2,INS-W_001#INS-W_1,2018-03-10,,,,,,,,,...,,,,,,,,,,
3,INS-W_001#INS-W_1,2018-03-11,,,,,,,,,...,,,,,,,,,,
4,INS-W_001#INS-W_1,2018-03-12,,,,,,,,,...,,,,,,,,,,


Sample transactions (first 5 days):
[418, 421, 424, 428, 430, 435, 436, 439, 442, 446, 448, 451, 456, 457, 646, 651, 654, 657, 661, 665, 668, 672, 675, 676, 679, 682, 686, 688, 691, 694, 697, 700, 704, 706, 710, 1669, 1672, 1675, 1679, 1681, 1686, 1687, 1690, 1693, 1697, 1699, 1702, 1707, 1709, 1897, 1902, 1905, 1908, 1912, 1916, 1919, 1923, 1926, 1927, 1930, 1933, 1937, 1939, 1942, 1945, 1948, 1951, 1955, 1957, 1961, 3165, 3168, 3170, 3174, 3177, 3180, 3181, 3184, 3188, 3190, 3193, 3198, 3201, 3204, 3206, 3209, 3213, 4172, 4174, 4177, 4181, 4183, 4187, 4190, 4192, 4195, 4199, 4201, 4205, 4209, 4212, 4219, 4222, 4227, 4228, 4232, 4235, 4238, 4241, 4245, 4247, 4249, 4253, 4255, 4259, 4261, 4265, 4268, 4271, 4401, 4404, 4407, 4410, 4416, 4419, 4421, 4425, 4428, 4431, 4432, 4435, 4439, 4441, 4444, 4449, 4452, 4455, 4457, 4460, 4463, 5666, 5669, 5672, 5675, 5678, 5682, 5684, 5687, 5690, 5692, 5696, 5700, 5702, 5705, 5708, 5710, 5713, 6721, 6724, 6729, 6731, 6735, 6738, 6741, 6744, 6747, 67

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/Users/kwang/opt/anaconda3/envs/globem/lib/python3.8/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
RuntimeError: reentrant call inside <_io.BufferedReader name=68>

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/kwang/opt/anaconda3/envs/globem/lib/python3.8/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/Users/kwang/opt/anaconda3/envs/globem/lib/python3.8/site-packages/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/Users/kwang/opt/anaconda3/envs/globem/lib/python3.8/site-packages/py4j/clientserver.py", line 

Py4JError: An error occurred while calling o72.collectToPython