# Data Loading

In [14]:
import os
from datetime import timedelta
from copy import deepcopy
import pandas as pd
# For workstation
# data_dir = "/home/willkewang/Datasets/GLOBEM/"
# For macbook
data_dir = '/Users/kwang/Data/globem'

# Sub-folder (inside "<data_dir>/INS-W_<phase>/") behind each data_factory key.
_FOLDER_KINDS = {
    "feature_folder": "FeatureData",
    "survey_folder": "SurveyData",
    "participants_info_folder": "ParticipantsInfoData",
}

# A minimal data_factory dict with folder paths, laid out as
# data_factory[folder_key][institution][phase] -> "<data_dir>/INS-W_<phase>/<sub-folder>/"
data_factory = {
    folder_key: {
        "INS-W": {
            phase: "{}/INS-W_{}/{}/".format(data_dir, phase, subdir)
            for phase in (1, 2, 3, 4)
        }
    }
    for folder_key, subdir in _FOLDER_KINDS.items()
}

# If only loading dep_weekly or dep_endterm, threshold_book is an empty
# placeholder. Example entry for other tasks beyond dep_weekly / dep_endterm:
# "some_other_survey_task": {"threshold_as_true":10, "threshold_as_false":5}
data_factory["threshold_book"] = {}
def data_loader_read_label_file(institution: str, phase: int, prediction_target: str):
    """
    Read the label CSV of one institution/phase and normalise it.

    Supported targets are "dep_weekly" and "dep_endterm"; each is read from
    "<prediction_target>.csv" inside the survey folder and carries its label
    in the "dep" column.

    Returns a tuple (df_label, prediction_target_col) where df_label has a
    datetime "date" column, pids rewritten as "<pid>#<institution>_<phase>",
    and at most one row per (pid, date) -- the last one wins.

    Raises ValueError for any other prediction target.
    """
    # Guard first: other custom tasks (other CSVs / threshold_book) are not wired up.
    if prediction_target not in ("dep_weekly", "dep_endterm"):
        raise ValueError(f"Unsupported prediction target: {prediction_target}")

    prediction_target_col = "dep"  # label column shared by both CSVs
    survey_dir = data_factory["survey_folder"][institution][phase]
    df_label = pd.read_csv(survey_dir + prediction_target + ".csv")

    # Dates become datetimes; pids get the same "#<institution>_<phase>" tag
    # that the feature loader applies, so the two tables can be matched.
    df_label["date"] = pd.to_datetime(df_label["date"])
    pid_tag = f"#{institution}_{phase}"
    df_label["pid"] = df_label["pid"].apply(lambda raw_pid: f"{raw_pid}{pid_tag}")

    deduplicated = df_label.drop_duplicates(["pid", "date"], keep="last")
    return deduplicated, prediction_target_col


def data_loader_single_dataset_label_based(
    institution: str,
    phase: int,
    prediction_target: str,
    flag_more_feat_types: bool = False
) -> pd.DataFrame:
    """
    Build one datapoint per label row from the 28 days of daily features
    (rapids.csv) ending on the label date.

    Args:
        institution: key into data_factory, e.g. "INS-W".
        phase: phase number used as key into data_factory.
        prediction_target: "dep_weekly" or "dep_endterm" (see
            data_loader_read_label_file).
        flag_more_feat_types: False keeps only location / screen / sleep /
            steps columns; True additionally keeps Bluetooth and call columns.

    Returns:
        DataFrame with columns [pid, date, X_raw, y_raw, device_type].
        X_raw is a DataFrame covering every calendar day of the window (days
        without a sensor record have NaN features). Labels whose window
        contains no sensor rows are skipped. For "dep_weekly", participants
        left with fewer than 2 datapoints are removed. When no datapoint can
        be built at all, an empty DataFrame with the same columns is returned.

    Raises:
        ValueError: from data_loader_read_label_file for unsupported targets.
        KeyError: if a labelled pid has no row in platform.csv.
    """
    datapoint_columns = ["pid", "date", "X_raw", "y_raw", "device_type"]
    pid_tag = f"#{institution}_{phase}"

    # --- 1) Read rapids.csv as features ---
    df_full_rawdata = pd.read_csv(
        data_factory["feature_folder"][institution][phase] + "rapids.csv",
        low_memory=False
    )
    df_full_rawdata["date"] = pd.to_datetime(df_full_rawdata["date"])
    # unify the pid format
    df_full_rawdata["pid"] = df_full_rawdata["pid"].apply(lambda x: f"{x}{pid_tag}")

    # --- 2) Read participant info (platform.csv) for device_type
    df_participant_file = pd.read_csv(
        data_factory["participants_info_folder"][institution][phase] + "platform.csv",
        low_memory=False
    )
    df_participant_file["pid"] = df_participant_file["pid"].apply(lambda x: f"{x}{pid_tag}")
    df_participant_file = df_participant_file.set_index("pid")

    # --- 3) Load label file (dep_weekly or dep_endterm) ---
    df_label, prediction_target_col = data_loader_read_label_file(institution, phase, prediction_target)

    # --- 4) Decide which sensor columns to keep
    # Basic four sensor types (location, screen, sleep, steps); Bluetooth and
    # call features are added only on request.
    sensor_prefixes = ('f_loc', 'f_screen', 'f_slp', 'f_steps')
    if flag_more_feat_types:
        sensor_prefixes += ('f_blue', 'f_call')

    retained_features = ["pid", "date"] + [
        col for col in df_full_rawdata.columns if col.startswith(sensor_prefixes)
    ]

    # Split the raw table by participant once, instead of masking the whole
    # table again for every label row.
    rawdata_by_pid = {
        pid: df_pid
        for pid, df_pid in df_full_rawdata[retained_features].groupby("pid", sort=False)
    }

    # --- 5) Build a 4-week window of data for each label date ---
    datapoints = []
    for _, row in df_label.iterrows():
        pid = row["pid"]
        df_pid = rawdata_by_pid.get(pid)
        if df_pid is None:
            # no sensor data at all for this participant
            continue

        date_end = row["date"]
        # 27 days back + the label day itself = 28 days, both ends inclusive
        date_start = date_end - timedelta(days=27)

        df_data_window = df_pid[
            (df_pid["date"] >= date_start) &
            (df_pid["date"] <= date_end)
        ]
        if df_data_window.empty:
            continue

        # left-merge onto the full date range so every day of the window is
        # present in X_raw, even when the sensor file has gaps
        df_placeholder = pd.DataFrame({"date": pd.date_range(date_start, date_end)})
        df_placeholder["pid"] = pid
        df_data_window = pd.merge(
            df_placeholder,
            df_data_window,
            on=["pid", "date"],
            how="left"
        )

        datapoints.append({
            "pid": pid,
            "date": date_end,
            "X_raw": df_data_window[retained_features],  # 4-week daily rows
            "y_raw": row[prediction_target_col],         # label value
            # platform may list several ";"-separated entries; keep the first
            "device_type": df_participant_file.loc[pid]["platform"].split(";")[0]
        })

    if not datapoints:
        # pd.DataFrame([]) has no columns, which would make the groupby below
        # fail with KeyError; return a well-formed empty result instead.
        return pd.DataFrame(columns=datapoint_columns)

    df_datapoints = pd.DataFrame(datapoints)

    # remove participants with fewer than 2 label points for the weekly task
    if prediction_target == "dep_weekly":
        pids_few_response = df_datapoints.groupby("pid").size()
        pids_few_response = pids_few_response[pids_few_response < 2].index
        df_datapoints = df_datapoints[~df_datapoints["pid"].isin(pids_few_response)]

    return df_datapoints

In [15]:
# Weekly-depression datapoints for INS-W phase 1, restricted to the four
# basic sensor types (flag_more_feat_types=False).
loader_kwargs = {
    "institution": "INS-W",
    "phase": 1,
    "prediction_target": "dep_weekly",
    "flag_more_feat_types": False,
}
df_datapoints = data_loader_single_dataset_label_based(**loader_kwargs)

print(df_datapoints.shape)

(2354, 5)


# Spark implementation

## Filter for only 3 features

In [None]:
import pandas as pd
import numpy as np
import gc, time, warnings
from datetime import timedelta
warnings.filterwarnings("ignore")

# -------------------------------
# SETTINGS & HELPER FUNCTIONS
# -------------------------------

# The three base features (taken from Xu's list) that the mining is limited to.
xu_base_features = [
    "f_screen:phone_screen_rapids_avgdurationunlock",
    "f_slp:fitbit_sleep_intraday_rapids_maxdurationasleepunifiedmain",
    "f_steps:fitbit_steps_intraday_rapids_sumsteps"
]

# Xu uses four time epochs.
time_epochs = ["morning", "afternoon", "evening", "night"]

# Full list of discretized feature names: every base feature crossed with
# every epoch, named "{base_feature}_dis:{epoch}", grouped by base feature.
selected_discretized_features = []
for feat in xu_base_features:
    selected_discretized_features.extend(f"{feat}_dis:{epoch}" for epoch in time_epochs)

def drop_duplicate_days(df_sensor):
    """
    Return df_sensor with at most one row per value of its 'date' column.

    When several rows share a date, the last one (in row order) is kept.
    The input DataFrame is not modified.
    """
    superseded = df_sensor.duplicated(subset="date", keep="last")
    return df_sensor[~superseded]

def build_transaction(row, feature_cols):
    """
    Build the transaction (list of "col=value" strings) for one day.

    A column contributes an item only when it is present in row.index and
    its value is not missing; item order follows feature_cols.
    """
    return [
        f"{col}={row[col]}"
        for col in feature_cols
        if col in row.index and pd.notna(row[col])
    ]

def cleanup_spark(spark):
    """
    Best-effort cleanup: clear the Spark cache, then force a garbage collection.

    Any exception raised while clearing the cache is printed rather than
    propagated, and garbage collection still runs afterwards.
    """
    try:
        catalog = spark.catalog
        catalog.clearCache()
    except Exception as err:
        print("Error clearing Spark cache:", err)
    gc.collect()

# -------------------------------
# MAIN SCRIPT: Stack the sensor windows of up to MAX_SUBJECTS subjects
# -------------------------------

# Assumes df_datapoints is already loaded (e.g. via the data loader above)
# and has at least the columns 'pid', 'date' and 'X_raw'.
# Each row of df_datapoints is one label datapoint, so a subject can own
# several rows; "X_raw" holds that datapoint's daily sensor DataFrame.

# Upper bound on the number of subjects to include (first unique pids, in
# order of appearance); adjust as needed.
MAX_SUBJECTS = 1000

all_subjects = df_datapoints["pid"].unique()
selected_subjects = all_subjects[:MAX_SUBJECTS]
df_subset = df_datapoints[df_datapoints["pid"].isin(selected_subjects)].copy()
print(df_subset.shape)

# NOTE: only the FIRST datapoint (window) of each subject is stacked; the
# subject's later windows are not used here.
first_windows = df_subset.drop_duplicates(subset="pid", keep="first")

# 1. Collect one sensor DataFrame per subject.
list_of_dfs = []
for subject, x_raw in zip(first_windows["pid"], first_windows["X_raw"]):
    # At most one record per day.
    df_sensor = drop_duplicate_days(x_raw)

    # Only keep columns of interest (including date).
    df_sensor = df_sensor[["date"] + selected_discretized_features].copy()

    # Tag the rows with their subject so they stay identifiable once stacked.
    df_sensor["pid"] = subject

    list_of_dfs.append(df_sensor)

# 2. Concatenate all the collected DataFrames.
df_stacked = pd.concat(list_of_dfs, ignore_index=True)
print(df_stacked.shape)

(2354, 5)
(4284, 14)


# Spark execution

## Adding weekday vs weekend slices:

In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.fpm import FPGrowth
import json

##################################
# 2. Create SparkSession
##################################
# Local session using all available cores ("local[*]").
spark = (
    SparkSession.builder
    .appName("SparkFPGrowthExample")
    .master("local[*]")
    .getOrCreate()
)


# df_stacked is expected to have the columns:
#   - date (pd.Timestamp)
#   - pid
#   - one "<base_feature>_dis:<epoch>" column per base feature and epoch, e.g.
#     f_screen:phone_screen_rapids_avgdurationunlock_dis:morning
#     ...
#     f_steps:fitbit_steps_intraday_rapids_sumsteps_dis:night
# with rows for both weekdays and weekends.

# pandas dayofweek values per day group: 0-4 are Mon-Fri, 5-6 are Sat-Sun.
_day_groups = {"wkdy": range(0, 5), "wkend": range(5, 7)}

# One slice per (day group, epoch): "<group>_<epoch>" -> filter description.
slices = {
    f"{group}_{epoch}": {"dayofweek": days, "epoch": epoch}
    for group, days in _day_groups.items()
    for epoch in ("morning", "afternoon", "evening", "night")
}

############################################################
# 2) Integer-Encoding Items
############################################################
def build_item_mapping(df_stacked, discretized_cols):
    """
    Assign an integer ID to every distinct 'col=value' item in df_stacked.

    Only non-missing values of the given discretized columns are considered.
    IDs start at 1 and follow the sorted order of the item strings.

    Side effects: writes both mappings to "item_mapping.json" in the current
    working directory (the ID keys of id2item are stored as strings there)
    and prints a confirmation line.

    Returns (item2id, id2item): 'col=value' -> int ID, and int ID -> 'col=value'.
    """
    # Every distinct 'col=value' string across all requested columns.
    distinct_items = {
        f"{col}={val}"
        for col in discretized_cols
        for val in df_stacked[col].dropna().unique()
    }

    # Enumerate in sorted order so the numbering is reproducible.
    id2item = dict(enumerate(sorted(distinct_items), start=1))
    item2id = {item_str: idx for idx, item_str in id2item.items()}

    mapping_dict = {
        "item2id": item2id,
        "id2item": {str(idx): item_str for idx, item_str in id2item.items()},
    }
    with open("item_mapping.json", "w") as f:
        json.dump(mapping_dict, f, indent=2)
    print("Saved item mappings to item_mapping.json")

    return item2id, id2item


def build_transaction_int(row, feature_cols, item2id):
    """
    Encode one row as a list of integer item IDs.

    Each column in feature_cols with a non-missing value yields the item
    string "col=value" (e.g. "f_screen_dis:morning=low"); the item is
    translated through item2id and silently dropped when it has no ID.
    The output order follows feature_cols.
    """
    present_items = (
        f"{col}={row[col]}" for col in feature_cols if pd.notna(row[col])
    )
    return [item2id[item_str] for item_str in present_items if item_str in item2id]

############################################################
# 3) Prepare the Master Mapping + Helper Lists
############################################################
# The discretized feature columns of df_stacked that are turned into items.
discretized_cols = selected_discretized_features

# Build the mapping for all possible "column=value" pairs.
item2id, id2item = build_item_mapping(df_stacked, discretized_cols)
print(f"Total unique discrete items: {len(item2id)}")

############################################################
# 4) Run Spark FPGrowth Per Slice
############################################################
# One session shared by every slice (getOrCreate returns the session that is
# already running, if there is one).
spark = SparkSession.builder \
    .appName("SparkFPGrowth_xuStyle") \
    .master("local[*]") \
    .getOrCreate()

# Results keyed by slice name; each value is a pandas DataFrame.
all_rules_dfs = {}
all_freq_itemsets_dfs = {}

# Mining thresholds: 12.5% minimum support, 50% minimum confidence.
min_support = 0.125
min_confidence = 0.5

# Model of the most recent slice that had transactions. Stays None when every
# slice is skipped, so the summary after the loop can be guarded.
model = None


def _ids_to_items(ids):
    """Translate a sequence of integer item IDs back to 'col=value' strings."""
    return [id2item[item_id] for item_id in ids]


for slice_key, slice_info in slices.items():
    dayofweek_range = slice_info["dayofweek"]  # e.g. range(0,5) for M-F
    epoch = slice_info["epoch"]               # "morning", etc.

    # Rows whose date falls on one of the slice's days of the week.
    df_slice = df_stacked[
        df_stacked["date"].dt.dayofweek.isin(dayofweek_range)
    ].copy()

    # Columns of this epoch: names end in "_dis:<epoch>". Matching the suffix
    # (rather than any substring) keeps a base feature whose name happens to
    # contain an epoch word from leaking into the wrong slice.
    relevant_cols = [c for c in discretized_cols if c.endswith(f"_dis:{epoch}")]

    # Build transactions as lists of integers; transactions with fewer than
    # two items are dropped.
    transactions_int = []
    for i, row in df_slice.iterrows():
        tx = build_transaction_int(row, relevant_cols, item2id)
        if len(tx) > 1:
            transactions_int.append((i, tx))  # (transactionID, items)

    if not transactions_int:
        print(f"No transactions for slice {slice_key}, skipping.")
        continue

    # Create Spark DataFrame with columns: "id" and "items"
    df_transactions = spark.createDataFrame(
        transactions_int, ["id", "items"]
    )

    print(f"\n--- Slice: {slice_key} ---")
    print("Num transactions:", df_transactions.count())

    # Run FPGrowth
    fpGrowth = FPGrowth(
        itemsCol="items",
        minSupport=min_support,
        minConfidence=min_confidence
    )
    model = fpGrowth.fit(df_transactions)

    # Collect frequent itemsets, with item IDs converted back to strings
    freq_itemsets_pd = model.freqItemsets.toPandas()
    freq_itemsets_pd["items_str"] = freq_itemsets_pd["items"].apply(_ids_to_items)
    all_freq_itemsets_dfs[slice_key] = freq_itemsets_pd

    # Collect association rules, with item IDs converted back to strings
    rules_pd = model.associationRules.toPandas()
    rules_pd["antecedent_str"] = rules_pd["antecedent"].apply(_ids_to_items)
    rules_pd["consequent_str"] = rules_pd["consequent"].apply(_ids_to_items)
    all_rules_dfs[slice_key] = rules_pd

    # Print every rule of this slice, one field per line
    for idx, row in rules_pd.iterrows():
        print(f"\n---- Row {idx} ----")
        row_dict = row.to_dict()
        for k, v in row_dict.items():
            print(k, v)

        print("####################################################")

# Sections 5-7 summarise ONLY the last slice that produced a model; the
# per-slice results live in all_freq_itemsets_dfs / all_rules_dfs.
if model is None:
    print("\nNo slice produced any transactions; nothing to summarise.")
else:
    ##################################
    # 5. Extract Frequent Itemsets
    ##################################
    freq_itemsets = model.freqItemsets  # This is a Spark DataFrame
    freq_itemsets_pandas = freq_itemsets.toPandas()  # Convert to Pandas
    print("\nFrequent Itemsets:")
    print(freq_itemsets_pandas)

    ##################################
    # 6. Extract Association Rules
    ##################################
    rules = model.associationRules  # Spark DataFrame
    rules_pandas = rules.toPandas() # Convert to Pandas
    print("\nAssociation Rules:")
    print(rules_pandas)

    ##################################
    # 7. (Optional) Filter or Sort Rules
    ##################################
    # Example filter: keep rules with confidence >= 0.5 and lift > 2
    filtered_rules = rules_pandas[
        (rules_pandas["confidence"] >= 0.5) &
        (rules_pandas["lift"] > 2)
    ]
    print("\nFiltered Rules (conf >= 0.5, lift > 2):")
    print(filtered_rules)

##################################
# 8. Stop Spark
##################################
spark.stop()


Saved item mappings to item_mapping.json
Total unique discrete items: 36

--- Slice: wkdy_morning ---
Num transactions: 241

---- Row 0 ----
antecedent [9]
consequent [31]
confidence 0.5581395348837209
lift 1.026806319900586
support 0.0995850622406639
antecedent_str ['f_screen:phone_screen_rapids_avgdurationunlock_dis:morning=m']
consequent_str ['f_steps:fitbit_steps_intraday_rapids_sumsteps_dis:morning=h']
####################################################

---- Row 1 ----
antecedent [32, 20]
consequent [7]
confidence 0.6666666666666666
lift 3.3472222222222223
support 0.016597510373443983
antecedent_str ['f_steps:fitbit_steps_intraday_rapids_sumsteps_dis:morning=l', 'f_slp:fitbit_sleep_intraday_rapids_maxdurationasleepunifiedmain_dis:morning=l']
consequent_str ['f_screen:phone_screen_rapids_avgdurationunlock_dis:morning=h']
####################################################

---- Row 2 ----
antecedent [9, 19]
consequent [31]
confidence 0.5555555555555556
lift 1.0220525869380832
su

## Spark clean up

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.fpm import FPGrowth

# Obtain a local SparkSession handle (getOrCreate) and stop it right away.
spark = (
    SparkSession.builder
    .appName("ARMExample")
    .master("local[*]")
    .getOrCreate()
)

spark.stop()

import gc
gc.collect()

import os
import shutil

# Directory scanned for leftover Spark folders; adjust this path if you have
# a custom setting.
spark_temp_dir = "/tmp"

# Entries whose name starts with 'spark-' (typical prefix for Spark temp directories).
spark_leftovers = [
    os.path.join(spark_temp_dir, entry)
    for entry in os.listdir(spark_temp_dir)
    if entry.startswith("spark-")
]

# Best-effort removal: report a failure and carry on with the next entry.
for full_path in spark_leftovers:
    try:
        shutil.rmtree(full_path)
        print(f"Removed temporary folder: {full_path}")
    except Exception as e:
        print(f"Could not remove {full_path}: {e}")

178

# Older implementation with one-hot encoding

In [4]:
import pandas as pd
import numpy as np
import time
import gc
import warnings

# MLxtend
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

warnings.filterwarnings("ignore")

def build_transaction(row, feature_cols):
    """
    Turn a single row of data into a transaction.

    Every selected discretized column that is present in the row with a
    valid (non-NaN) value yields one item of the form "col=value".
    Returns the items as a list of strings, ordered like feature_cols.
    """
    return [
        f"{col}={row[col]}"
        for col in feature_cols
        if col in row and pd.notna(row[col])
    ]

# One transaction (list of discrete "col=value" items) per (pid, date) row.
transactions = [
    build_transaction(row, selected_discretized_features)
    for _, row in df_stacked.iterrows()
]

print(f"Number of transactions: {len(transactions)}")

# One-hot encode the transactions: one boolean column per distinct item.
te = TransactionEncoder()
te.fit(transactions)
te_ary = te.transform(transactions)

df_onehot = pd.DataFrame(te_ary, columns=te.columns_)
df_onehot.head()

Number of transactions: 4284


Unnamed: 0,f_screen:phone_screen_rapids_avgdurationunlock_dis:afternoon=h,f_screen:phone_screen_rapids_avgdurationunlock_dis:afternoon=l,f_screen:phone_screen_rapids_avgdurationunlock_dis:afternoon=m,f_screen:phone_screen_rapids_avgdurationunlock_dis:evening=h,f_screen:phone_screen_rapids_avgdurationunlock_dis:evening=l,f_screen:phone_screen_rapids_avgdurationunlock_dis:evening=m,f_screen:phone_screen_rapids_avgdurationunlock_dis:morning=h,f_screen:phone_screen_rapids_avgdurationunlock_dis:morning=l,f_screen:phone_screen_rapids_avgdurationunlock_dis:morning=m,f_screen:phone_screen_rapids_avgdurationunlock_dis:night=h,...,f_steps:fitbit_steps_intraday_rapids_sumsteps_dis:afternoon=m,f_steps:fitbit_steps_intraday_rapids_sumsteps_dis:evening=h,f_steps:fitbit_steps_intraday_rapids_sumsteps_dis:evening=l,f_steps:fitbit_steps_intraday_rapids_sumsteps_dis:evening=m,f_steps:fitbit_steps_intraday_rapids_sumsteps_dis:morning=h,f_steps:fitbit_steps_intraday_rapids_sumsteps_dis:morning=l,f_steps:fitbit_steps_intraday_rapids_sumsteps_dis:morning=m,f_steps:fitbit_steps_intraday_rapids_sumsteps_dis:night=h,f_steps:fitbit_steps_intraday_rapids_sumsteps_dis:night=l,f_steps:fitbit_steps_intraday_rapids_sumsteps_dis:night=m
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [12]:
min_support = 0.01  # Example: require at least 1% support

# Frequent itemsets (named by column), most frequent first.
frequent_itemsets = apriori(
    df_onehot, min_support=min_support, use_colnames=True
).sort_values("support", ascending=False)
frequent_itemsets.shape

(46, 2)

In [13]:
# Rules with confidence >= 0.5, strongest confidence first.
rules = association_rules(
    frequent_itemsets, metric="confidence", min_threshold=0.5
).sort_values("confidence", ascending=False)
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
5,(f_slp:fitbit_sleep_intraday_rapids_maxduratio...,(f_steps:fitbit_steps_intraday_rapids_sumsteps...,0.019841,0.031513,0.012838,0.647059,20.533333,1.0,0.012213,2.744048,0.970556,0.333333,0.635575,0.527233
4,(f_slp:fitbit_sleep_intraday_rapids_maxduratio...,(f_steps:fitbit_steps_intraday_rapids_sumsteps...,0.021475,0.031513,0.013072,0.608696,19.315942,1.0,0.012395,2.475023,0.96904,0.327485,0.595963,0.511755
0,(f_steps:fitbit_steps_intraday_rapids_sumsteps...,(f_steps:fitbit_steps_intraday_rapids_sumsteps...,0.027077,0.031513,0.01564,0.577586,18.328736,1.0,0.014786,2.292746,0.971754,0.36413,0.563842,0.536941
7,(f_steps:fitbit_steps_intraday_rapids_sumsteps...,(f_steps:fitbit_steps_intraday_rapids_sumsteps...,0.019608,0.031513,0.011204,0.571429,18.133333,1.0,0.010587,2.259804,0.96375,0.280702,0.557484,0.463492
6,(f_slp:fitbit_sleep_intraday_rapids_maxduratio...,(f_steps:fitbit_steps_intraday_rapids_sumsteps...,0.020308,0.031513,0.011438,0.563218,17.872797,1.0,0.010798,2.217326,0.963618,0.283237,0.549006,0.463091


In [14]:
rules.shape

(14, 14)

In [17]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.fpm import FPGrowth

##################################
# 2. Create SparkSession
##################################
spark = (
    SparkSession.builder
    .appName("SparkFPGrowthExample")
    .master("local[*]")
    .getOrCreate()
)

##################################
# 3. Create Spark DataFrame from transactions
##################################
# The Spark DataFrame needs:
#   - a unique transaction ID column ("id")
#   - a column holding each transaction's item strings ("items")
# Enumerating the `transactions` list supplies the IDs.
df_transactions = spark.createDataFrame(
    list(enumerate(transactions)),
    ["id", "items"]
)

# Report how many transactions were loaded and show the first few
num_transactions = df_transactions.count()
print(f"Number of transactions: {num_transactions}")
df_transactions.show(5, truncate=False)

##################################
# 4. Fit FPGrowth Model
##################################
# Thresholds used for this run:
min_support = 0.01
min_confidence = 0.125

fpGrowth = FPGrowth(
    itemsCol="items",
    minSupport=min_support,
    minConfidence=min_confidence,
)
model = fpGrowth.fit(df_transactions)

##################################
# 5. Extract Frequent Itemsets
##################################
# model.freqItemsets is a Spark DataFrame with columns "items" and "freq".
freq_itemsets = model.freqItemsets
freq_itemsets_pandas = freq_itemsets.toPandas()
print("\nFrequent Itemsets:")
print(freq_itemsets_pandas)

##################################
# 6. Extract Association Rules
##################################
# model.associationRules is a Spark DataFrame with columns "antecedent",
# "consequent", "confidence", "lift", (and "support" in newer Spark versions).
rules = model.associationRules
rules_pandas = rules.toPandas()
print("\nAssociation Rules:")
print(rules_pandas)

##################################
# 7. (Optional) Filter or Sort Rules
##################################
# Keep only rules that are both reasonably confident and well above chance.
confident_enough = rules_pandas["confidence"] >= 0.5
strong_lift = rules_pandas["lift"] > 2
filtered_rules = rules_pandas[confident_enough & strong_lift]
print("\nFiltered Rules (conf >= 0.5, lift > 2):")
print(filtered_rules)

##################################
# 8. Stop Spark
##################################
spark.stop()

Number of transactions: 4284
+---+-----+
|id |items|
+---+-----+
|0  |[]   |
|1  |[]   |
|2  |[]   |
|3  |[]   |
|4  |[]   |
+---+-----+
only showing top 5 rows


Frequent Itemsets:
                                                items  freq
0   [f_steps:fitbit_steps_intraday_rapids_sumsteps...    84
1   [f_steps:fitbit_steps_intraday_rapids_sumsteps...    48
2   [f_slp:fitbit_sleep_intraday_rapids_maxduratio...    52
3   [f_steps:fitbit_steps_intraday_rapids_sumsteps...   135
4   [f_steps:fitbit_steps_intraday_rapids_sumsteps...    81
5   [f_screen:phone_screen_rapids_avgdurationunloc...    51
6   [f_steps:fitbit_steps_intraday_rapids_sumsteps...   116
7   [f_steps:fitbit_steps_intraday_rapids_sumsteps...    67
8   [f_steps:fitbit_steps_intraday_rapids_sumsteps...   112
9   [f_steps:fitbit_steps_intraday_rapids_sumsteps...    55
10  [f_steps:fitbit_steps_intraday_rapids_sumsteps...    59
11  [f_screen:phone_screen_rapids_avgdurationunloc...    79
12  [f_screen:phone_screen_rapids_avgd

In [None]:
filtered_rules

Unnamed: 0,antecedent,consequent,confidence,lift,support
0,[f_steps:fitbit_steps_intraday_rapids_sumsteps...,[f_steps:fitbit_steps_intraday_rapids_sumsteps...,0.505882,16.053333,0.010037
2,[f_screen:phone_screen_rapids_avgdurationunloc...,[f_screen:phone_screen_rapids_avgdurationunloc...,0.553398,21.750067,0.013305
15,[f_slp:fitbit_sleep_intraday_rapids_maxduratio...,[f_steps:fitbit_steps_intraday_rapids_sumsteps...,0.535714,22.067308,0.010504
16,[f_slp:fitbit_sleep_intraday_rapids_maxduratio...,[f_steps:fitbit_steps_intraday_rapids_sumsteps...,0.52381,16.622222,0.010271
17,[f_slp:fitbit_sleep_intraday_rapids_maxduratio...,[f_steps:fitbit_steps_intraday_rapids_sumsteps...,0.647059,20.533333,0.012838
19,[f_steps:fitbit_steps_intraday_rapids_sumsteps...,[f_steps:fitbit_steps_intraday_rapids_sumsteps...,0.526786,16.716667,0.013772
22,[f_steps:fitbit_steps_intraday_rapids_sumsteps...,[f_steps:fitbit_steps_intraday_rapids_sumsteps...,0.577586,18.328736,0.01564
31,[f_slp:fitbit_sleep_intraday_rapids_maxduratio...,[f_steps:fitbit_steps_intraday_rapids_sumsteps...,0.51087,18.866942,0.010971
32,[f_slp:fitbit_sleep_intraday_rapids_maxduratio...,[f_steps:fitbit_steps_intraday_rapids_sumsteps...,0.608696,19.315942,0.013072
33,[f_slp:fitbit_sleep_intraday_rapids_maxduratio...,[f_steps:fitbit_steps_intraday_rapids_sumsteps...,0.54023,19.951249,0.010971


In [22]:
# Print every filtered rule in full, one "column = value" line per field.
for idx, row in filtered_rules.iterrows():
    print(f"\nRow {idx}:")
    for col, value in row.items():
        print(f"  {col} = {value}")


Row 0:
  antecedent = ['f_steps:fitbit_steps_intraday_rapids_sumsteps_dis:afternoon=m']
  consequent = ['f_steps:fitbit_steps_intraday_rapids_sumsteps_dis:morning=h']
  confidence = 0.5058823529411764
  lift = 16.053333333333335
  support = 0.010037348272642391

Row 2:
  antecedent = ['f_screen:phone_screen_rapids_avgdurationunlock_dis:morning=l']
  consequent = ['f_screen:phone_screen_rapids_avgdurationunlock_dis:afternoon=l']
  confidence = 0.5533980582524272
  lift = 21.750066803242184
  support = 0.01330532212885154

Row 15:
  antecedent = ['f_slp:fitbit_sleep_intraday_rapids_maxdurationasleepunifiedmain_dis:morning=m']
  consequent = ['f_steps:fitbit_steps_intraday_rapids_sumsteps_dis:evening=h']
  confidence = 0.5357142857142857
  lift = 22.06730769230769
  support = 0.01050420168067227

Row 16:
  antecedent = ['f_slp:fitbit_sleep_intraday_rapids_maxdurationasleepunifiedmain_dis:morning=m']
  consequent = ['f_steps:fitbit_steps_intraday_rapids_sumsteps_dis:morning=h']
  confiden