# **Raw Data Calculations**
### **Notebook Objective:** 
This notebook performs the necessary calculations on raw data to obtain metrics such as the number of awakenings, and the average decibel and heart rate levels during the noisiest and quietest nights, among others.

All these data are read and processed in this notebook and finally saved to a file named *data.pkl*. This file contains the final data, which will later be used in another notebook (*plot_data.ipynb*) to generate the corresponding plots.

In [None]:
import pandas as pd
import os
import pickle

In [None]:
def remove_initial_neg(df: pd.DataFrame) -> pd.DataFrame:
    """Drop the leading rows whose ``sleepStage`` is -1.

    Rows are kept from the first row whose ``sleepStage`` differs from -1
    up to the end of the frame; -1 rows that appear later are preserved.

    Args:
        df: Frame with a ``sleepStage`` column (typically the rows of a
            single uuid). It is not modified.

    Returns:
        The remaining rows, labelled with their 0-based position in ``df``.
        The result is empty when ``df`` is empty or when every row has
        ``sleepStage == -1``.
    """
    # Re-index a new frame instead of resetting in place, so the caller's
    # frame is never mutated
    renumbered = df.reset_index(drop=True)

    has_stage = (renumbered['sleepStage'] != -1).to_numpy()

    # Nothing but leading -1 rows (or no rows at all): nothing to keep
    if not has_stage.any():
        return renumbered.iloc[0:0]

    # argmax gives the position of the first True value
    first_valid_position = int(has_stage.argmax())
    return renumbered.iloc[first_valid_position:]

In [None]:
# Read the matching files and tag every row with the city taken from the file name
def read_data(files_name: list[str]) -> pd.DataFrame:
    """Load the Tarragona province CSV files into a single DataFrame.

    Only names starting with ``Tarragona_province`` are read (from
    ``./soundless/data/``); every other name is ignored. Each loaded row gets
    a ``city`` column built from the first two underscore-separated pieces of
    the file name, with any ``.csv`` removed from the second piece.

    Args:
        files_name: File names (not paths) to consider.

    Returns:
        The concatenation of all loaded files with a fresh index.

    Raises:
        ValueError: raised by ``pd.concat`` when no name matches.
    """
    frames = []
    for file_name in files_name:
        if not file_name.startswith('Tarragona_province'):
            continue

        frame = pd.read_csv(f"./soundless/data/{file_name}")
        pieces = file_name.split('_')
        city = pieces[0] + '_' + pieces[1].replace('.csv', '')
        frames.append(frame.assign(city=city))

    all_data_not_filtered = pd.concat(frames, ignore_index=True)
    return all_data_not_filtered

## Get and process data

In [None]:
# Read every file listed in ./soundless/data/ and put them in a single DataFrame.
# os.listdir() returns the names in arbitrary order, so they are sorted to make
# the row order of df reproducible between runs and machines.
df = read_data(sorted(os.listdir("./soundless/data/")))
df.shape

In [None]:
## Filtering
# Delete duplicated rows
df = df.drop_duplicates()

# Delete the rows whose heartRate is -1
df = df[df['heartRate'] != -1]

# Delete uuids with fewer than 100 rows
uuid_counts = df['uuid'].value_counts()
to_keep = uuid_counts[uuid_counts >= 100].index
df = df[df['uuid'].isin(to_keep)]

# Delete uuids whose average dB value is lower than or equal to 30
uuid_group = df.groupby('uuid')['dB'].agg(['mean'])
uuid_group = uuid_group[uuid_group['mean'] > 30]
df = df[df['uuid'].isin(uuid_group.index)]

# NOTE: df_filtered is not referenced by any other visible cell
df_filtered = pd.DataFrame(
        columns=['uuid', 'timestamp', 'dB', 'heartRate', 'sleepStage'])

# Use the remove_initial_neg function for every uuid and keep the result
# (the trimmed frames used to be discarded, so this step had no effect).
# The groups are iterated explicitly so the 'uuid' column is always retained,
# and sort=False keeps the uuids in order of first appearance.
trimmed_groups = [
    remove_initial_neg(group) for _, group in df.groupby('uuid', sort=False)
]
if trimmed_groups:
    df = pd.concat(trimmed_groups, ignore_index=True)

print(df.shape)

In [None]:
## Get all uuids of the user history
# Maps each history file name to the list of uuids stored in that file
history_uuids = {}

history_dir = './soundless/history'

# Loop through each file in the soundless/history folder (sorted, because
# os.listdir() gives no ordering guarantee)
for filename in sorted(os.listdir(history_dir)):
    path = os.path.join(history_dir, filename)

    # Sub-directories cannot be opened as files, skip them
    if not os.path.isfile(path):
        continue

    # Read the file
    with open(path, 'r') as f:
        data = f.read()

    # Split by commas; strip the surrounding whitespace/newlines and drop the
    # empty pieces, so that e.g. a trailing newline does not corrupt the last
    # uuid and prevent it from matching df['uuid']
    uuids = [piece.strip() for piece in data.split(',') if piece.strip()]

    # Files without any uuid are left out of the dictionary
    if uuids:
        history_uuids[filename] = uuids

# history_uuids

In [None]:
## Add 'user' column to df
# Build the reverse lookup uuid -> user key; when a uuid is listed for several
# users, the user that comes last in history_uuids wins
uuid_to_user = {}
for user_key, user_uuids in history_uuids.items():
    uuid_to_user.update(dict.fromkeys(user_uuids, user_key))

# Translate every uuid into its user key; uuids that appear in no history
# are not in the lookup and therefore get a missing value
df['user'] = df['uuid'].map(uuid_to_user)

# Display df, which now carries the 'user' column
df

#

# Auxiliary functions

In [None]:
## Sleep stages
# 0 / -1 -> Awake
# 1 -> Light sleep
# 2 -> Deep sleep
# 3 -> REM sleep

def incidents_sleep(sleepStage, timestamps):
    """Return the timestamps at which deep/REM sleep is directly followed by an awake sample.

    The two sequences are walked in parallel. A timestamp is reported when its
    stage is awake (``<= 0``) and the sample right before it was deep sleep or
    REM (``>= 2``). Only the first awake sample after such a phase counts.

    Args:
        sleepStage: Sequence of sleep stage codes.
        timestamps: Sequence of timestamps, aligned with ``sleepStage``.

    Returns:
        List with the timestamps of the detected awakenings, in input order.
    """
    awakenings = []
    previous_was_deep = False

    for stage, timestamp in zip(sleepStage, timestamps):
        # Deep sleep or REM: remember it for the next sample
        if stage >= 2:
            previous_was_deep = True
            continue

        # Awake right after a deep sleep / REM sample
        if previous_was_deep and stage <= 0:
            awakenings.append(timestamp)

        # Any other stage breaks the "directly after deep sleep" condition
        previous_was_deep = False

    return awakenings

#

# Final Nights with Illegal levels

In [None]:
# Count the nights (uuids) whose sound level went above 40 dB
# Only the rows with sleepStage != -1 are taken into account
sleeping_df = df.loc[df['sleepStage'].ne(-1)]
max_db_per_uuid = sleeping_df.groupby('uuid')['dB'].max()

# Nights whose loudest reading is above 40 dB vs. at most 40 dB
nights_higher_40 = max_db_per_uuid.gt(40).sum()
nights_lower_40 = max_db_per_uuid.le(40).sum()

#

## Plot and extract the user data -- Final mean dB

In [None]:
# MEAN dB FOR EVERY MINUTE OF THE NIGHT, OVER ALL THE RECORDINGS
all_dB_df = df.copy()

# Turn the millisecond timestamps into datetimes rounded to the nearest minute,
# then keep only the time of day as an 'HH:MM:SS' string
minute_stamps = pd.to_datetime(all_dB_df['timestamp'], unit='ms').dt.round('1min')
all_dB_df['timestamp'] = minute_stamps.dt.strftime('%H:%M:%S')
all_dB_df = all_dB_df.sort_values(by=['timestamp'])

# Keep the readings from 00:00:00 to 08:00:00 (plain string comparison is
# enough because the values are zero-padded)
not_before_midnight = all_dB_df['timestamp'] >= '00:00:00'
not_after_eight = all_dB_df['timestamp'] <= '08:00:00'
all_dB_df = all_dB_df[not_before_midnight & not_after_eight]

# Subset without the rows whose sleepStage is -1
all_dB_df_sleeping = all_dB_df[all_dB_df['sleepStage'] != -1]

# Mean dB per time of day, for all the rows and for the subset
mean = all_dB_df.groupby('timestamp')['dB'].mean()
mean_sleeping = all_dB_df_sleeping.groupby('timestamp')['dB'].mean()

## Bar chart

In [None]:
# MEAN dB FOR EVERY HOUR OF THE NIGHT, OVER ALL THE RECORDINGS
all_dB_df = df.copy()

# Millisecond timestamps -> datetimes, plus an 'HH:00' label used for grouping
all_dB_df['timestamp'] = pd.to_datetime(all_dB_df['timestamp'], unit='ms')
all_dB_df['hour'] = all_dB_df['timestamp'].dt.strftime('%H:00')

# Keep hours 0 through 8 inclusive (so readings up to 08:59:59 are kept)
hour_of_day = all_dB_df['timestamp'].dt.hour
all_dB_df = all_dB_df[hour_of_day.between(0, 8)]

# Subset without the rows whose sleepStage is -1
all_dB_df_sleeping = all_dB_df[all_dB_df['sleepStage'] != -1]

# Mean dB per hour label, for all the rows and for the subset
hourly_mean = all_dB_df.groupby('hour')['dB'].mean()
hourly_mean_sleeping = all_dB_df_sleeping.groupby('hour')['dB'].mean()

#

# Extract the sleep incidents from the dataset --> Final Awakenings

In [None]:
## IMPORTANT: number of recordings taken per user from each end of the mean-dB
## ranking (the noisiest and the quietest ones) to extract the sleep incidents
number_of_recordings = 4

In [None]:
max_dB_list = []
min_dB_list = []

## Pick, for every user, the noisiest and the quietest recordings
for user, user_uuids in history_uuids.items():
    # Rows of the recordings listed in this user's history
    user_data = df[df.uuid.isin(user_uuids)].copy()
    if user_data.empty:
        continue

    # Time of day as a zero-padded 'HH:MM:SS' string
    user_data['timestamp'] = pd.to_datetime(user_data['timestamp'], unit='ms')
    user_data['time_str'] = user_data['timestamp'].dt.strftime('%H:%M:%S')

    # Keep the rows up to 08:00:00 that have a dB value
    early_enough = user_data['time_str'] <= '08:00:00'
    has_dB = user_data['dB'].notna()
    user_data = user_data[early_enough & has_dB]
    if user_data.empty:
        continue

    # Mean dB of each recording
    mean_dB = user_data.groupby('uuid')['dB'].mean()
    if mean_dB.empty:
        continue

    # Loudest recording first
    sorted_uuids = mean_dB.sort_values(ascending=False)

    # First number_of_recordings uuids -> noisiest, last ones -> quietest.
    # NOTE(review): with fewer than 2 * number_of_recordings recordings the
    # same uuid ends up in both lists -- confirm this is intended.
    max_dB_list.extend(sorted_uuids.head(number_of_recordings).index)
    min_dB_list.extend(sorted_uuids.tail(number_of_recordings).index)

In [None]:
## Get the incidents
def _incidents_per_uuid(uuid_list):
    """Map every uuid of ``uuid_list`` to the incidents found by ``incidents_sleep``.

    The rows of each uuid are read from the global ``df`` and put in
    timestamp order first, because ``incidents_sleep`` compares every sample
    with the one right before it. The sort is stable, so rows that share a
    timestamp keep their relative order.
    """
    incidents = {}
    for uuid in uuid_list:
        # Data of this uuid, in chronological order
        data_uuid = df[df.uuid == uuid].sort_values(by='timestamp', kind='stable')

        # Run function to detect incidents in sleepStage
        incidents[uuid] = incidents_sleep(
            data_uuid['sleepStage'].values,
            data_uuid['timestamp'].values,
        )
    return incidents


# Dictionaries with uuids and their incidents (noisiest / quietest recordings)
max_sleepStage_incidents = _incidents_per_uuid(max_dB_list)
min_sleepStage_incidents = _incidents_per_uuid(min_dB_list)

In [None]:
# Replace every incident timestamp (milliseconds) by its hour of the day,
# as an 'HH' string in 24-hour format
# NOTE(review): expects the raw millisecond values, so this cell is presumably
# meant to run only once per computation of the incidents -- confirm
max_sleepStage_incidents = {
    uuid_key: [pd.to_datetime(ms, unit='ms').strftime('%H') for ms in stamps]
    for uuid_key, stamps in max_sleepStage_incidents.items()
}

min_sleepStage_incidents = {
    uuid_key: [pd.to_datetime(ms, unit='ms').strftime('%H') for ms in stamps]
    for uuid_key, stamps in min_sleepStage_incidents.items()
}

#

# Generate the plots -- Final noisy quiet db and hr levels

In [None]:
# Create one DataFrame with the dB / heartRate rows of the noisiest recordings
# and another one with the rows of the quietest recordings
_NIGHT_COLUMNS = ['uuid', 'timestamp', 'dB', 'heartRate']


def _collect_recordings(uuid_list):
    """Return the rows up to 08:00:00 of every recording in ``uuid_list``.

    The rows are read from the global ``df``. For each uuid they are sorted by
    timestamp, the millisecond timestamp is converted to a datetime, and rows
    whose time of day is later than 08:00:00 are dropped. Only the columns
    uuid | timestamp | dB | heartRate are kept.

    The per-uuid frames are concatenated once at the end instead of being
    appended one by one to an empty frame. When nothing is collected, an empty
    frame with a datetime ``timestamp`` column is returned so that the ``.dt``
    accessor still works on it.
    """
    frames = []
    for uuid in uuid_list:
        data = df[df.uuid == uuid]

        # Check if the data is empty
        if data.empty:
            continue

        # Sort by timestamp
        data = data.sort_values(by=['timestamp'])

        # Convert the timestamp to datetime; assign() returns a new frame, so
        # the slice taken from df is never written to
        data = data.assign(timestamp=pd.to_datetime(data['timestamp'], unit='ms'))

        # Delete timestamps that are later than 08:00:00
        data = data[data['timestamp'].dt.strftime('%H:%M:%S') <= '08:00:00']

        frames.append(data[_NIGHT_COLUMNS])

    if not frames:
        return pd.DataFrame({
            'uuid': pd.Series(dtype='object'),
            'timestamp': pd.Series(dtype='datetime64[ns]'),
            'dB': pd.Series(dtype='float64'),
            'heartRate': pd.Series(dtype='float64'),
        })

    return pd.concat(frames)


max_dB_df = _collect_recordings(max_dB_list)
min_dB_df = _collect_recordings(min_dB_list)

In [None]:
# Reduce every timestamp to its time of day, rounded to the nearest minute
# and formatted as 'HH:MM:SS' (the date part is dropped)
max_dB_df['timestamp'] = max_dB_df['timestamp'].dt.round('1min').dt.strftime('%H:%M:%S')
min_dB_df['timestamp'] = min_dB_df['timestamp'].dt.round('1min').dt.strftime('%H:%M:%S')

# Group the rows of each DataFrame by time of day
max_by_minute = max_dB_df.groupby('timestamp')
min_by_minute = min_dB_df.groupby('timestamp')

# Mean dB on each timestamp (noisiest / quietest recordings)
mean_max_dB = max_by_minute['dB'].mean()
mean_dB = min_by_minute['dB'].mean()

# Mean heart rate on each timestamp (noisiest / quietest recordings)
mean_max_hR = max_by_minute['heartRate'].mean()
mean_hR = min_by_minute['heartRate'].mean()

#

# Save the data to plot it

In [None]:
# Gather every computed result in one dictionary, section by section
data = {}

# Nights with illegal levels of noise
data.update({
    "nights_higher_40": nights_higher_40,
    "nights_lower_40": nights_lower_40,
})

# Mean dB values
data.update({
    "mean": mean,
    "mean_sleeping": mean_sleeping,
    "hourly_mean": hourly_mean,
    "hourly_mean_sleeping": hourly_mean_sleeping,
})

# Number of awakenings
data.update({
    "number_of_recordings": number_of_recordings,
    "max_sleepStage_incidents": max_sleepStage_incidents,
    "min_sleepStage_incidents": min_sleepStage_incidents,
})

# Noisy quiet dB hR levels
data.update({
    "mean_max_dB": mean_max_dB,
    "mean_dB": mean_dB,
    "mean_max_hR": mean_max_hR,
    "mean_hR": mean_hR,
})

# Write the dictionary to 'data.pkl' (the file the plotting notebook reads)
with open('data.pkl', 'wb') as pickle_file:
    pickle.dump(data, pickle_file)