In [1]:
import pandas as pd
from sklearn.cluster import MeanShift
import numpy as np

# Load the calorie readings; "vt" is parsed into a datetime column.
df = pd.read_csv("datasets/fitbit/processed_data/calories.csv", parse_dates=["vt"])
# Keep only participant p01 and the two columns used below. A single .loc
# selection followed by .copy() gives an independent frame, which avoids the
# chained indexing (df[mask][cols]) that can trigger SettingWithCopyWarning
# when the "seconds" column is added.
df = df.loc[df["participant"] == "p01", ["vt", "calories"]].copy()
# Seconds elapsed since the earliest timestamp of this participant
# (vectorized; no per-row Python call).
df["seconds"] = (df["vt"] - df["vt"].min()).dt.total_seconds()
df.head()

Unnamed: 0,vt,calories,seconds
0,2019-11-01 00:00:00,1.39,0.0
1,2019-11-01 00:01:00,1.39,60.0
2,2019-11-01 00:02:00,1.39,120.0
3,2019-11-01 00:03:00,1.39,180.0
4,2019-11-01 00:04:00,1.39,240.0


A function that takes a dataframe and creates buckets based on the values in the list `bucket_values`.

In [2]:
def create_buckets(df, bucket_values, group_name, value_name, prefix):
    """Collect the values of one column per bucket of another column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified; the bucket column is added to a copy.
    bucket_values : sequence of numbers
        Interior cut points (must be increasing, as ``pd.cut`` requires).
        ``-inf`` and ``+inf`` are added as the outer edges, so
        ``len(bucket_values) + 1`` buckets are produced. With ``pd.cut``'s
        default settings each bucket includes its right edge.
    group_name : str
        Name of the column whose values decide the bucket of each row.
    value_name : str
        Name of the column whose values are gathered for each bucket.
    prefix : str
        Label prefix; buckets are named ``<prefix>_1`` .. ``<prefix>_<n>``.

    Returns
    -------
    pandas.DataFrame
        One row per bucket with the columns ``bucket`` and
        ``values_in_bucket`` (a list of the ``value_name`` entries).
    """
    # Accept any sequence (list, tuple, array), not only a list that can be
    # concatenated with ``+``.
    cut_points = list(bucket_values)
    edges = [-float('inf')] + cut_points + [float('inf')]
    labels = ['{}_{}'.format(prefix, i) for i in range(1, len(cut_points) + 2)]

    # ``assign`` returns a new frame, so the caller's ``df`` keeps its columns.
    bucketed = df.assign(bucket=pd.cut(df[group_name], bins=edges, labels=labels))

    # observed=False keeps every bucket label in the result, including
    # buckets that received no rows.
    result_df = (
        bucketed.groupby('bucket', observed=False)[value_name]
        .apply(list)
        .reset_index(name='values_in_bucket')
    )
    return result_df

# Example usage: bucket the elapsed seconds by calorie level.
bucket_df = create_buckets(
    df,
    bucket_values=[1.40, 4.50, 10],
    group_name='calories',
    value_name='seconds',
    prefix='cal',
)
bucket_df

Unnamed: 0,bucket,values_in_bucket
0,cat_1,"[0.0, 60.0, 120.0, 180.0, 240.0, 300.0, 360.0,..."
1,cat_2,"[6360.0, 8940.0, 11700.0, 12480.0, 12780.0, 12..."
2,cat_3,"[23580.0, 23640.0, 23700.0, 23760.0, 23820.0, ..."
3,cat_4,"[24540.0, 24600.0, 24660.0, 24720.0, 24780.0, ..."


#### Function for Creating Interval pairs based on MeanShift clustering

In [3]:

def bucket_to_interval(lst): #we suppose that the list has a single feature of the values in the bucket
    """Turn the values of one bucket into ``[start, end]`` pairs via MeanShift.

    Every value is treated as a one-feature sample. MeanShift assigns a
    cluster label to each value, and each maximal run of consecutive equal
    labels (in the order the values appear in ``lst``) becomes one
    ``[first_value, last_value]`` pair. Because runs are taken in input
    order, the same cluster label yields several pairs when its members are
    not adjacent in ``lst``.

    Parameters
    ----------
    lst : sequence of numbers
        Values of a single bucket.

    Returns
    -------
    list of [start, end]
        One pair per run of equal labels; an empty list for empty input.
    """
    # Materialise once so indexing below is positional for any sequence type.
    values = list(lst)
    if not values:
        # Nothing to cluster; avoids failing inside fit / labels[0].
        return []

    # bandwidth=None lets scikit-learn estimate the bandwidth from the data.
    ms = MeanShift(bandwidth=None, bin_seeding=True)
    ms.fit(np.array(values).reshape(-1, 1))
    labels = ms.labels_

    # Positions at which the label differs from the previous one: each such
    # position starts a new run and ends the previous run one step earlier.
    run_starts = [0] + [
        position for position in range(1, len(labels))
        if labels[position] != labels[position - 1]
    ]
    run_ends = [start - 1 for start in run_starts[1:]] + [len(labels) - 1]

    # Translate index runs into the first and last value of each run.
    return [[values[start], values[end]] for start, end in zip(run_starts, run_ends)]

# Example usage:
# Take at most the first 24*60*2 values of the third and fourth bucket.
example_lst, example_lst2 = (
    bucket_df.values_in_bucket.iloc[position][:24*60*2] for position in (2, 3)
)
# Illustration of the idea: for the values
#   [0.0, 60.0, 120.0, 180.0, 240.0, 300.0, 360.0, 420.0, 480.0, 540.0]
# and the labels
#   [2, 2, 2, 4, 4, 1, 0, 0, 3, 3]
# every run of equal labels is reported as one [start, end] pair.
result1, result2 = (
    bucket_to_interval(values) for values in (example_lst, example_lst2)
)

result1 #,result2

[[23580.0, 485760.0], [488940.0, 998280.0], [998460.0, 1442760.0]]

In [13]:
# Cluster at most the first 1000 values of every bucket and store the
# resulting [start, end] pairs next to the raw values.
bucket_df["intervals_in_bucket"] = bucket_df["values_in_bucket"].apply(
    lambda values: bucket_to_interval(values[:1000])
)
bucket_df

Unnamed: 0,bucket,values_in_bucket,intervals_in_bucket
0,cat_1,"[0.0, 60.0, 120.0, 180.0, 240.0, 300.0, 360.0,...","[[0.0, 28080.0], [28140.0, 68940.0], [69000.0,..."
1,cat_2,"[6360.0, 8940.0, 11700.0, 12480.0, 12780.0, 12...","[[6360.0, 88860.0], [96900.0, 173220.0], [1733..."
2,cat_3,"[23580.0, 23640.0, 23700.0, 23760.0, 23820.0, ...","[[23580.0, 140340.0], [140400.0, 258180.0], [2..."
3,cat_4,"[24540.0, 24600.0, 24660.0, 24720.0, 24780.0, ...","[[24540.0, 1455780.0], [1496340.0, 2186640.0]]"


In [21]:
# Long format: one row per (bucket, interval) pair. A small frame is built
# for each bucket and the frames are stacked with a fresh 0..n-1 index.
a = pd.concat(
    [
        pd.DataFrame(
            [
                {"bucket": row.bucket, "interval": interval}
                for interval in row.intervals_in_bucket
            ]
        )
        for _, row in bucket_df.iterrows()
    ]
).reset_index(drop=True)
a

Unnamed: 0,bucket,interval
0,cat_1,"[0.0, 28080.0]"
1,cat_1,"[28140.0, 68940.0]"
2,cat_1,"[69000.0, 106260.0]"
3,cat_2,"[6360.0, 88860.0]"
4,cat_2,"[96900.0, 173220.0]"
5,cat_2,"[173340.0, 215400.0]"
6,cat_3,"[23580.0, 140340.0]"
7,cat_3,"[140400.0, 258180.0]"
8,cat_3,"[283080.0, 465960.0]"
9,cat_4,"[24540.0, 1455780.0]"


In [25]:
def create_intervals(created_buckets):
    """Expand bucketed values into one row per (bucket, interval) pair.

    Parameters
    ----------
    created_buckets : pandas.DataFrame
        Frame with a ``bucket`` column and a ``values_in_bucket`` column
        holding one list of values per row. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``bucket`` and ``interval``, where ``interval`` is a
        ``[start, end]`` pair produced by ``bucket_to_interval``. Rows whose
        value list is empty contribute nothing; if no row has values, an
        empty frame with the same two columns is returned.
    """
    # Gather every record first and build the frame once, instead of creating
    # and concatenating one small frame per bucket (which also fails with
    # "No objects to concatenate" when there is nothing to stack).
    records = [
        {"bucket": bucket, "interval": interval}
        for bucket, values in zip(
            created_buckets["bucket"], created_buckets["values_in_bucket"]
        )
        if len(values) > 0  # empty buckets cannot be clustered
        for interval in bucket_to_interval(values)
    ]
    return pd.DataFrame(records, columns=["bucket", "interval"])
    

In [28]:
# Independent copy of the two input columns, so the experiment below does not touch bucket_df.
test = bucket_df.loc[:, ["bucket", "values_in_bucket"]].copy()

In [30]:
# Keep at most the first 1000 values of each bucket.
test["values_in_bucket"] = test["values_in_bucket"].map(lambda values: values[:1000])

In [31]:
# Display one row per (bucket, interval) pair for the truncated buckets.
create_intervals(created_buckets=test)

Unnamed: 0,bucket,interval
0,cat_1,"[0.0, 28080.0]"
1,cat_1,"[28140.0, 68940.0]"
2,cat_1,"[69000.0, 106260.0]"
3,cat_2,"[6360.0, 88860.0]"
4,cat_2,"[96900.0, 173220.0]"
5,cat_2,"[173340.0, 215400.0]"
6,cat_3,"[23580.0, 140340.0]"
7,cat_3,"[140400.0, 258180.0]"
8,cat_3,"[283080.0, 465960.0]"
9,cat_4,"[24540.0, 1455780.0]"
