In [4]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


In [6]:
# Project paths
#
# The project root can be overridden with the RINGCONN_PROJECT_ROOT environment
# variable so the notebook is not tied to a single machine. When the variable is
# unset, the original location below is used, so existing runs are unaffected.
_DEFAULT_PROJECT_ROOT = "/Users/vineeth/Desktop/RingCon/RingConn-Analysis"

PROJECT_ROOT = Path(os.environ.get("RINGCONN_PROJECT_ROOT", _DEFAULT_PROJECT_ROOT))
DATA_RAW = PROJECT_ROOT / "data" / "raw"              # input CSV files are read from here
DATA_PROCESSED = PROJECT_ROOT / "data" / "processed"  # merged output is written here


In [7]:
# Read the three raw CSV files (activity, vital signs, sleep) from DATA_RAW,
# in that order, into one DataFrame each.
df_activity, df_vitals, df_sleep = (
    pd.read_csv(DATA_RAW / csv_name)
    for csv_name in ("V_activity.csv", "V_vital_signs.csv", "V_sleep.csv")
)

In [8]:
def clean_columns(df):
    """Return a copy of ``df`` with normalised column labels.

    Each label is stripped of surrounding whitespace, lower-cased, has every
    space replaced by an underscore, and has the characters ``( ) / % . -``
    removed. The frame passed in is left untouched, so the function can be
    called repeatedly on the same input without side effects.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are strings.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the same data under the cleaned labels.
    """
    cleaned_labels = (
        df.columns
        .str.strip()
        .str.lower()
        # Literal replacement; stated explicitly so it never depends on the
        # pandas-version default for ``regex``.
        .str.replace(" ", "_", regex=False)
        # Inside the character class the trailing "-" and the "." are literals.
        .str.replace(r"[()/%.-]", "", regex=True)
    )
    relabelled = df.copy()
    relabelled.columns = cleaned_labels
    return relabelled

# Normalise the column labels of all three frames and rebind each name to the result.
df_activity, df_vitals, df_sleep = (
    clean_columns(frame) for frame in (df_activity, df_vitals, df_sleep)
)







In [9]:
# Correct data types

def _percent_to_float(values):
    """Return ``values`` as floats, dropping a trailing "%" from string entries.

    A column that is already numeric is only cast to float. This keeps the cell
    re-runnable: on a second execution the columns are floats, and calling the
    ``.str`` accessor on them would raise.
    """
    if pd.api.types.is_numeric_dtype(values):
        return values.astype(float)
    return values.str.rstrip("%").astype(float)


# SpO2 columns: strip the trailing "%" and store as float.
columns = ["avg_spo2", "min_spo2", "max_spo2"]
for col in columns:
    df_vitals[col] = _percent_to_float(df_vitals[col])

# Daily date keys; values that cannot be parsed become NaT (errors="coerce").
df_vitals["date"] = pd.to_datetime(df_vitals["date"], errors="coerce")
df_activity["date"] = pd.to_datetime(df_activity["date"], errors="coerce")

# Sleep-session timestamps, parsed the same way.
date_columns = ["start_time", "end_time", "falling_asleep_time", "wakeup_time"]
for col in date_columns:
    df_sleep[col] = pd.to_datetime(df_sleep[col], errors="coerce")

# Sleep-time ratio: same "%"-suffixed representation as the SpO2 columns.
df_sleep["sleep_time_ratio"] = _percent_to_float(df_sleep["sleep_time_ratio"])


In [10]:
# Feature engineering on the sleep sessions

# a) Time to fall asleep: session start -> falling asleep
df_sleep["time_to_fall_asleep"] = df_sleep["falling_asleep_time"] - df_sleep["start_time"]

# b) Time to wake up: wake-up -> session end
df_sleep["time_to_wake_up"] = df_sleep["end_time"] - df_sleep["wakeup_time"]

# c) Time asleep in hours
df_sleep["time_asleep_hours"] = df_sleep["time_asleepmin"] / 60

# d-f) Share of the time asleep spent in REM, deep and light sleep.
# Despite the "_pct" suffix these are fractions (0-1), not percentages; the
# column names are kept because the validation cell and the exported CSV use them.
for stage in ("rem", "deep_sleep", "light_sleep"):
    df_sleep[f"{stage}_pct"] = (
        df_sleep[f"sleep_stages__{stage}min"] / df_sleep["time_asleepmin"]
    )

# g) Sleep midpoint: halfway between session start and session end
session_length = df_sleep["end_time"] - df_sleep["start_time"]
df_sleep["sleep_midpoint"] = df_sleep["start_time"] + session_length / 2

# h) Time in bed: full length of the session
df_sleep["time_in_bed"] = session_length

# i) Date of sleep: calendar day of the wake-up time.
# normalize() keeps the column as datetime64 (time set to midnight), so it can be
# merged against the datetime64 `date` key directly; `.dt.date` would produce an
# object column of datetime.date values that needs a second conversion.
df_sleep["date_of_sleep"] = df_sleep["wakeup_time"].dt.normalize()








In [11]:
# Correct the data type of the engineered date_of_sleep column: it must be
# datetime64 so it can be merged against the `date` column.
df_sleep["date_of_sleep"] = df_sleep["date_of_sleep"].pipe(pd.to_datetime, errors="coerce")

In [12]:
# Preview the first rows of the sleep frame, including the engineered columns.
df_sleep.head()

Unnamed: 0,start_time,end_time,falling_asleep_time,wakeup_time,sleep_time_ratio,time_asleepmin,sleep_stages__awakemin,sleep_stages__remmin,sleep_stages__light_sleepmin,sleep_stages__deep_sleepmin,time_to_fall_asleep,time_to_wake_up,time_asleep_hours,rem_pct,deep_sleep_pct,light_sleep_pct,sleep_midpoint,time_in_bed,date_of_sleep
0,2026-01-01 05:59:27,2026-01-01 12:47:01,2026-01-01 06:14:28,2026-01-01 12:44:27,91.0,372,18,82,230,60,0 days 00:15:01,0 days 00:02:34,6.2,0.22043,0.16129,0.61828,2026-01-01 09:23:14.000,0 days 06:47:34,2026-01-01
1,2026-01-01 23:37:49,2026-01-02 09:05:19,2026-01-02 00:00:19,2026-01-02 08:55:19,88.0,502,33,120,300,82,0 days 00:22:30,0 days 00:10:00,8.366667,0.239044,0.163347,0.59761,2026-01-02 04:21:34.000,0 days 09:27:30,2026-01-02
2,2026-01-03 01:47:44,2026-01-03 11:30:14,2026-01-03 02:02:44,2026-01-03 11:17:44,91.0,531,24,108,328,95,0 days 00:15:00,0 days 00:12:30,8.85,0.20339,0.178908,0.617702,2026-01-03 06:38:59.000,0 days 09:42:30,2026-01-03
3,2026-01-03 22:55:19,2026-01-04 09:02:49,2026-01-03 23:15:19,2026-01-04 08:55:19,87.0,531,49,115,338,78,0 days 00:20:00,0 days 00:07:30,8.85,0.216573,0.146893,0.636535,2026-01-04 03:59:04.000,0 days 10:07:30,2026-01-04
4,2026-01-04 23:35:11,2026-01-05 08:27:50,2026-01-05 00:02:42,2026-01-05 08:17:50,90.0,480,16,100,285,95,0 days 00:27:31,0 days 00:10:00,8.0,0.208333,0.197917,0.59375,2026-01-05 04:01:30.500,0 days 08:52:39,2026-01-05


#### Check how many dates have more than one sleep session: the merge below yields one row per session, so such a date appears more than once in the merged data

In [13]:
# Merge the data sets.
# Activity is joined to vitals on `date`, then to the sleep sessions on the
# session's wake-up date (`date_of_sleep`). All columns are kept, so no
# intermediate column-subset frames are created.
# Both joins are inner joins: a date missing from any source is dropped, and a
# date with several sleep sessions contributes one row per session.
daily_df = (
    df_activity
    .merge(df_vitals, on="date", how="inner")
    .merge(df_sleep, left_on="date", right_on="date_of_sleep", how="inner")
)

In [14]:
# Remove pandas' display limits so every column and every row is shown.
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, None)

In [15]:
# Summarise the merged frame: row count, column dtypes and non-null counts.
daily_df.info()

<class 'pandas.DataFrame'>
RangeIndex: 33 entries, 0 to 32
Data columns (total 31 columns):
 #   Column                        Non-Null Count  Dtype          
---  ------                        --------------  -----          
 0   date                          33 non-null     datetime64[us] 
 1   steps                         33 non-null     int64          
 2   calorieskcal                  33 non-null     int64          
 3   avg_heart_ratebpm             33 non-null     int64          
 4   min_heart_ratebpm             33 non-null     int64          
 5   max_heart_ratebpm             33 non-null     int64          
 6   avg_spo2                      33 non-null     float64        
 7   min_spo2                      33 non-null     float64        
 8   max_spo2                      33 non-null     float64        
 9   avg_hrvms                     33 non-null     int64          
 10  min_hrvms                     33 non-null     int64          
 11  max_hrvms                     33

In [32]:
# Inspect the last rows of the merged frame; a date with two sleep sessions
# appears as two rows that repeat the same activity and vitals values.
daily_df.tail()

Unnamed: 0,date,steps,calorieskcal,avg_heart_ratebpm,min_heart_ratebpm,max_heart_ratebpm,avg_spo2,min_spo2,max_spo2,avg_hrvms,min_hrvms,max_hrvms,start_time,end_time,falling_asleep_time,wakeup_time,sleep_time_ratio,time_asleepmin,sleep_stages__awakemin,sleep_stages__remmin,sleep_stages__light_sleepmin,sleep_stages__deep_sleepmin,time_to_fall_asleep,time_to_wake_up,time_asleep_hours,rem_pct,deep_sleep_pct,light_sleep_pct,sleep_midpoint,time_in_bed,date_of_sleep
28,2026-01-29,3506,2378,79,52,123,96.0,82.0,99.0,53,13,145,2026-01-29 01:35:23,2026-01-29 09:02:53,2026-01-29 01:50:23,2026-01-29 08:55:23,88.0,395,30,90,230,75,0 days 00:15:00,0 days 00:07:30,6.583333,0.227848,0.189873,0.582278,2026-01-29 05:19:08,0 days 07:27:30,2026-01-29
29,2026-01-30,5454,2343,78,55,127,96.0,84.0,100.0,41,13,89,2026-01-30 01:09:11,2026-01-30 09:11:41,2026-01-30 01:21:41,2026-01-30 09:06:41,90.0,433,32,75,290,68,0 days 00:12:30,0 days 00:05:00,7.216667,0.17321,0.157044,0.669746,2026-01-30 05:10:26,0 days 08:02:30,2026-01-30
30,2026-01-31,6447,2623,78,53,117,97.0,92.0,100.0,49,11,124,2026-01-31 01:00:21,2026-01-31 10:52:51,2026-01-31 01:35:21,2026-01-31 10:50:21,89.0,527,28,90,352,85,0 days 00:35:00,0 days 00:02:30,8.783333,0.170778,0.16129,0.667932,2026-01-31 05:56:36,0 days 09:52:30,2026-01-31
31,2026-02-01,1496,2019,81,56,127,96.0,88.0,99.0,38,11,122,2026-02-01 03:56:30,2026-02-01 12:54:02,2026-02-01 04:09:00,2026-02-01 11:54:00,79.0,425,40,105,260,60,0 days 00:12:30,0 days 01:00:02,7.083333,0.247059,0.141176,0.611765,2026-02-01 08:25:16,0 days 08:57:32,2026-02-01
32,2026-02-01,1496,2019,81,56,127,96.0,88.0,99.0,38,11,122,2026-02-01 16:41:32,2026-02-01 19:09:02,2026-02-01 17:29:02,2026-02-01 19:04:02,61.0,90,5,25,40,25,0 days 00:47:30,0 days 00:05:00,1.5,0.277778,0.277778,0.444444,2026-02-01 17:55:17,0 days 02:27:30,2026-02-01


In [30]:
# Validation checks on the merged frame

# 1) Shape / row count: non-empty, and at most three rows per activity row
n_rows = len(daily_df)
assert n_rows > 0, "Merged df should not be empty"
assert n_rows <= 3 * df_activity.shape[0], "Row count should not explode vs activity rows"

# 2) Key columns must be fully populated
for key_col in ("date", "time_asleepmin", "avg_spo2"):
    assert daily_df[key_col].notna().all(), f"{key_col} should have no nulls"

# 3) Sanity bounds: stage shares are fractions, minutes are non-negative,
#    average SpO2 lies within 70-100
for share_col in ("rem_pct", "deep_sleep_pct", "light_sleep_pct"):
    assert daily_df[share_col].between(0, 1).all(), f"{share_col} should be 0-1"
assert daily_df["time_asleepmin"].ge(0).all(), "time_asleepmin should be non-negative"
assert daily_df["avg_spo2"].between(70, 100).all(), "avg_spo2 should be in 70-100"

In [31]:
# Lock the dataset: write the merged frame to the processed-data folder.
# to_csv does not create missing directories, so make sure the folder exists
# first (a no-op when it is already there).
DATA_PROCESSED.mkdir(parents=True, exist_ok=True)
daily_df.to_csv(DATA_PROCESSED / "daily_metrics.csv", index=False)