In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np

In [2]:
def remove_initial_neg(df: pd.DataFrame) -> pd.DataFrame:
    """Drop the leading rows whose ``sleepStage`` equals -1.

    Args:
        df: Frame containing a ``sleepStage`` column. It is left untouched;
            the work happens on a re-indexed copy.

    Returns:
        The rows from the first one whose ``sleepStage`` is not -1 up to the
        end, labelled with their 0-based position in ``df``. An empty frame
        (same columns) is returned when ``df`` is empty or when every
        ``sleepStage`` value is -1.
    """
    # reset_index without inplace=True so the caller's frame keeps its index.
    renumbered = df.reset_index(drop=True)
    has_stage = (renumbered['sleepStage'] != -1).to_numpy()

    # argmax/idxmax answer 0 when nothing matches, which would silently keep
    # every row; handle the "no valid stage at all" case explicitly.
    if not has_stage.any():
        return renumbered.iloc[0:0]

    first_non_neg_position = int(has_stage.argmax())
    return renumbered.iloc[first_non_neg_position:]

In [3]:
# Read the files and add the city name to each row - All files
def read_data(files_name: list[str], data_dir: str = "./soundless-data") -> pd.DataFrame:
    """Load the 'Tarragona_province*' CSV files into one frame with a ``city`` column.

    Args:
        files_name: Candidate file names (not paths). Names that do not start
            with 'Tarragona_province' are ignored.
        data_dir: Directory the files are read from.

    Returns:
        The concatenation of all matching files with a fresh 0-based index.
        ``city`` is built from the first two '_'-separated pieces of the file
        name, with '.csv' removed from the second piece.

    Raises:
        ValueError: If no name in ``files_name`` matches the prefix.
    """
    frames = []
    for file_name in files_name:
        if not file_name.startswith('Tarragona_province'):
            continue
        name_parts = file_name.split('_')
        city = name_parts[0] + '_' + name_parts[1].replace('.csv', '')
        frames.append(pd.read_csv(os.path.join(data_dir, file_name)).assign(city=city))

    # pd.concat rejects an empty list with an unhelpful message; say what is missing.
    if not frames:
        raise ValueError(
            f"No file starting with 'Tarragona_province' among the {len(files_name)} name(s) given"
        )

    all_data_not_filtered = pd.concat(frames, ignore_index=True)
    return all_data_not_filtered

## Get and process data

In [None]:
# Read all files in the ./soundless-data/ directory and put them in a single DataFrame.
# os.listdir returns names in arbitrary order, so sort them to keep the row
# order of df reproducible from one run (or machine) to the next.
df = read_data(sorted(os.listdir("./soundless-data/")))
df.shape

In [None]:
## Filtering
# Delete duplicates
df = df.drop_duplicates()

# Delete the rows whose heartRate is -1
df = df[df['heartRate'] != -1]

# Delete uuids with fewer than 100 rows
uuid_counts = df['uuid'].value_counts()
to_keep = uuid_counts[uuid_counts >= 100].index
df = df[df['uuid'].isin(to_keep)]

# Delete uuids whose average dB value is lower than or equal to 30
uuid_group = df.groupby('uuid')['dB'].agg(['mean'])
uuid_group = uuid_group[uuid_group['mean'] > 30]
df = df[df['uuid'].isin(uuid_group.index)]

# Empty frame with the core columns; nothing in this cell fills it.
df_filtered = pd.DataFrame(
        columns=['uuid', 'timestamp', 'dB', 'heartRate', 'sleepStage'])

# Use the remove_initial_neg function for every uuid.
# The trimmed groups have to be stitched back together and assigned to df:
# calling groupby(...).apply(...) and ignoring its return value leaves df as it was.
# Each group is copied first so the helper never works on groupby's own slices.
# After this step the rows of df are grouped by uuid (in order of first appearance).
trimmed_groups = [
    remove_initial_neg(group.copy())
    for _, group in df.groupby('uuid', sort=False)
]
if trimmed_groups:
    df = pd.concat(trimmed_groups, ignore_index=True)

print(df.shape)

In [6]:
## Get all uuids of the user history
# Maps each file name found in ./soundless-history to the list of uuids it holds
history_uuids = {}

# Loop through each file in the soundless-history folder
for filename in os.listdir('./soundless-history'):
    # Read the file
    with open(os.path.join('./soundless-history', filename), 'r') as f:
        data = f.read()

    # The content is a comma-separated list of uuids. Strip every token so a
    # trailing newline or a space after a comma does not produce a uuid that can
    # never match df['uuid'], and drop tokens that end up empty.
    uuids = [token.strip() for token in data.split(',') if token.strip()]

    # Files without any uuid are left out of the dictionary
    if uuids:
        history_uuids[filename] = uuids

# history_uuids

In [None]:
## Add 'user' column to df
# Invert history_uuids (user -> uuids) into a uuid -> user lookup.
# When a uuid is listed under several users, the last user iterated wins.
uuid_to_user = {uuid: user for user, uuids in history_uuids.items() for uuid in uuids}

# Build the column with assign() instead of df['user'] = ...: df may be a
# filtered slice of an earlier frame, and writing into such a slice in place can
# raise SettingWithCopyWarning. assign() returns a new frame, so re-running this
# cell is also safe.
df = df.assign(user=df['uuid'].map(uuid_to_user))

# df now has a 'user' column; uuids absent from history_uuids get NaN
df

#

# Extract the sleep incidents from the dataset

In [8]:
## IMPORTANT: This value is the number of recordings taken per user from each end
## of the mean-dB ranking (the N loudest and the N quietest) to extract the sleep incidents
number_of_recordings = 7

In [9]:
# uuids of the loudest / quietest recordings, accumulated over all users
max_dB_list = []
min_dB_list = []

## Rank every user's recordings by mean dB and keep both ends of the ranking
for user, user_uuids in history_uuids.items():
    # Work on a private copy of this user's rows
    recordings = df.loc[df['uuid'].isin(user_uuids)].copy()
    if recordings.empty:
        continue

    # Millisecond timestamps -> datetimes -> zero-padded 'HH:MM:SS' strings,
    # which compare correctly as plain text
    recordings['timestamp'] = pd.to_datetime(recordings['timestamp'], unit='ms')
    recordings['time_str'] = recordings['timestamp'].dt.strftime('%H:%M:%S')

    # Keep the samples taken at or before 08:00:00 that have a dB value
    early_enough = recordings['time_str'] <= '08:00:00'
    has_level = recordings['dB'].notna()
    recordings = recordings[early_enough & has_level]
    if recordings.empty:
        continue

    # Mean dB per recording, loudest first
    ranked = recordings.groupby('uuid')['dB'].mean().sort_values(ascending=False)
    if ranked.empty:
        continue

    # First number_of_recordings entries are the loudest, last ones the quietest.
    # NOTE: a user with fewer than 2 * number_of_recordings recordings has
    # overlapping head and tail, so the same uuid ends up in both lists.
    max_dB_list.extend(ranked.head(number_of_recordings).index)
    min_dB_list.extend(ranked.tail(number_of_recordings).index)

In [10]:
def incidents_sleep(sleepStage: list[int], window: int = 4) -> list[int]:
    """Build a 0/1 incident signal from a sequence of sleep-stage values.

    A position is flagged with 1 when all of the following hold:
      * its stage is <= 0,
      * a stage >= 2 has already been seen, and
      * it lies at most ``window`` positions after the most recent stage >= 2.
    Every other position (including the stage >= 2 positions themselves) is 0.

    Args:
        sleepStage: Sleep-stage values in recording order (any iterable works).
        window: How many positions after a stage >= 2 sample a stage <= 0
            sample still counts as an incident. Defaults to 4.

    Returns:
        A list of 0/1 flags with one entry per input value.
    """
    data_list = []
    last_deep_sleep_position = None  # no stage >= 2 seen so far

    for position, stage in enumerate(sleepStage):
        if stage >= 2:
            last_deep_sleep_position = position
            data_list.append(0)
        elif (
            stage <= 0
            and last_deep_sleep_position is not None
            and position - last_deep_sleep_position <= window
        ):
            data_list.append(1)
        else:
            data_list.append(0)

    return data_list

In [11]:
# Get the timestamps of the signal changes
def get_timestamps_incident(signals: list[int], timestamps: list[int]) -> list[int]:
    """Return the timestamps at which the signal rises from 0 to 1.

    Position ``i`` (starting at 1) is reported when ``signals[i - 1]`` is 0 and
    ``signals[i]`` is 1; the matching ``timestamps[i]`` is converted with
    ``int()``. Position 0 has no predecessor and is never reported.
    """
    return [
        int(timestamps[idx])
        for idx in range(1, len(signals))
        if signals[idx - 1] == 0 and signals[idx] == 1
    ]

In [12]:
## Get the incidents
def _incident_timestamps_by_uuid(uuids):
    """Map every uuid to the timestamps where its incident signal rises from 0 to 1."""
    incidents = {}
    for uuid in uuids:
        # Rows of this recording, in the order they appear in df
        rows = df[df.uuid == uuid]
        signal = incidents_sleep(rows['sleepStage'].values)
        incidents[uuid] = get_timestamps_incident(signal, rows['timestamp'].values)
    return incidents


# uuid -> incident timestamps, for the loudest and for the quietest recordings
max_sleepStage_incidents = _incident_timestamps_by_uuid(max_dB_list)
min_sleepStage_incidents = _incident_timestamps_by_uuid(min_dB_list)

In [None]:
# Sanity check: how many recordings and how many incidents each selection holds
max_uuid_total = len(max_sleepStage_incidents)
max_incident_total = sum(map(len, max_sleepStage_incidents.values()))
print("Number of uuids in max_sleepStage_db incidents: ", max_uuid_total)
print("Number of max_sleepStage_db incidents: ", max_incident_total)
print("")

min_uuid_total = len(min_sleepStage_incidents)
min_incident_total = sum(map(len, min_sleepStage_incidents.values()))
print("Number of uuids in min_sleepStage_db incidents: ", min_uuid_total)
print("Number of min_sleepStage_db incidents: ", min_incident_total)

In [14]:
# Turn the incident timestamps (milliseconds) into datetimes, then into the
# two-digit hour of the day in 24-hour format.
# NOTE(review): the hours come straight from pd.to_datetime(..., unit='ms') with
# no timezone conversion — presumably UTC rather than local time; confirm.
def _as_datetimes(incidents):
    """uuid -> list of pandas timestamps built from millisecond values."""
    return {
        uuid: [pd.to_datetime(stamp, unit='ms') for stamp in stamps]
        for uuid, stamps in incidents.items()
    }


def _as_hours(incidents_datetime):
    """uuid -> list of 'HH' strings, one per incident."""
    return {
        uuid: [moment.strftime('%H') for moment in moments]
        for uuid, moments in incidents_datetime.items()
    }


max_sleepStage_incidents_datetime = _as_datetimes(max_sleepStage_incidents)
max_sleepStage_incidents_time = _as_hours(max_sleepStage_incidents_datetime)

min_sleepStage_incidents_datetime = _as_datetimes(min_sleepStage_incidents)
min_sleepStage_incidents_time = _as_hours(min_sleepStage_incidents_datetime)

#

# Generate the plots

In [None]:
# Bar chart: awakenings per hour of the day, summed over the loudest recordings
fig, ax = plt.subplots(figsize=(15, 7))

# Flatten the per-uuid 'HH' strings into a single list of integer hours
all_hours = [
    int(hour)
    for hours in max_sleepStage_incidents_time.values()
    for hour in hours
]

# Hours on the x-axis in overnight order: 23:00 first, then 00:00 through 08:00.
# Incidents falling in any other hour are neither plotted nor counted below.
hour_order = [23] + list(range(0, 9))

# Number of incidents in each displayed hour
counts = [all_hours.count(h) for h in hour_order]

# One bar per displayed hour, drawn in hour_order
positions = list(range(len(hour_order)))
bars = ax.bar(positions, counts)

# Labels and title
ax.set_xlabel('Hour')
ax.set_ylabel('Number of awakenings')
ax.set_title(f'Number of Awakenings per Hour - {sum(counts)} total - {number_of_recordings} noisy nights')

# Tick labels follow hour_order, formatted as HH:00
ax.set_xticks(positions)
ax.set_xticklabels([f"{h:02d}:00" for h in hour_order])
# ax.set_ylim(0, 25) # 25 / 35

print("Number of awakenings total: ", sum(counts))

plt.show()

In [None]:
# Bar chart: awakenings per hour of the day, summed over the quietest recordings
fig, ax = plt.subplots(figsize=(15, 7))

# Flatten the per-uuid 'HH' strings into a single list of integer hours
all_hours = [
    int(hour)
    for hours in min_sleepStage_incidents_time.values()
    for hour in hours
]

# Hours on the x-axis in overnight order: 23:00 first, then 00:00 through 08:00.
# Incidents falling in any other hour are neither plotted nor counted below.
hour_order = [23] + list(range(0, 9))

# Number of incidents in each displayed hour
counts = [all_hours.count(h) for h in hour_order]

# One bar per displayed hour, drawn in hour_order
positions = list(range(len(hour_order)))
bars = ax.bar(positions, counts)

# Labels and title
ax.set_xlabel('Hour')
ax.set_ylabel('Number of awakenings')
ax.set_title(f'Number of Awakenings per Hour - {sum(counts)} total - {number_of_recordings} quiet nights')

# Tick labels follow hour_order, formatted as HH:00
ax.set_xticks(positions)
ax.set_xticklabels([f"{h:02d}:00" for h in hour_order])
# ax.set_ylim(0, 25) # 25 / 35

print("Number of awakenings total: ", sum(counts))

plt.show()