# **Average dB value**

This notebook shows the average dB level across the whole dataset. The average is computed in two ways: using only the samples where the user is confirmed to be sleeping, and using all samples regardless of whether the user is known to be asleep. Each case is shown with two types of plots: a line plot and a bar plot.

In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt

In [2]:
def remove_initial_neg(df: pd.DataFrame) -> pd.DataFrame:
    """Drop the leading rows whose ``sleepStage`` is -1.

    Rows are examined in their current order. Everything before the first row
    with ``sleepStage != -1`` is removed; -1 rows that appear later are kept.

    Args:
        df: Frame with a ``sleepStage`` column. It is not modified.

    Returns:
        The remaining rows, carrying a fresh positional index (each label is
        the row's position in ``df``). The result is empty when ``df`` is
        empty or when every row has ``sleepStage == -1``.
    """
    # Re-index a copy instead of using inplace=True, so the caller's frame
    # (e.g. a group handed over by groupby) is left untouched.
    renumbered = df.reset_index(drop=True)

    has_stage = renumbered['sleepStage'] != -1
    if not has_stage.any():
        # idxmax() would return 0 here (or raise on an empty frame) and keep
        # every row; with no known stage at all, every row is "initial".
        return renumbered.iloc[0:0]

    # On a boolean Series idxmax() is the label of the first True, which equals
    # its position after the reset above.
    first_non_neg_index = has_stage.idxmax()
    return renumbered.iloc[first_non_neg_index:]

In [3]:
# Read the files whose name starts with `prefix` and add the city name to each row
def read_data(files_name: list[str], data_dir: str = "./soundless-data",
              prefix: str = "Tarragona_province") -> pd.DataFrame:
    """Load the matching CSV files into one DataFrame tagged with a ``city`` column.

    Args:
        files_name: Candidate file names (not paths) located in ``data_dir``.
        data_dir: Directory that contains the files.
        prefix: Only file names starting with this prefix are read.

    Returns:
        The concatenation of all matching files with a fresh 0..n-1 index and
        an extra ``city`` column built from the first two ``_``-separated parts
        of the file name (``.csv`` removed), e.g. ``Tarragona_province``.

    Raises:
        ValueError: If no file name in ``files_name`` starts with ``prefix``.
    """
    frames = []
    for file_name in files_name:
        if not file_name.startswith(prefix):
            continue
        # "<part0>_<part1>_...csv" -> "<part0>_<part1>"
        city = '_'.join(file_name.split('_')[:2]).replace('.csv', '')
        frames.append(pd.read_csv(os.path.join(data_dir, file_name)).assign(city=city))

    if not frames:
        # pd.concat([]) raises an opaque "No objects to concatenate"; say what is missing
        raise ValueError(f"No file starting with '{prefix}' found for directory '{data_dir}'")

    return pd.concat(frames, ignore_index=True)

## Get and process data

In [None]:
# Read the matching files of the ./soundless-data/ directory and put them in a single DataFrame.
# os.listdir returns the entries in arbitrary order, so the listing is sorted to keep
# the row order of the concatenated frame reproducible between runs.
df = read_data(sorted(os.listdir("./soundless-data/")))
df.shape

In [None]:
## Filtering
# Delete duplicated rows
df = df.drop_duplicates()

# Delete rows without heartRate (stored as -1)
df = df[df['heartRate'] != -1]

# Delete uuids with fewer than 100 rows
uuid_counts = df['uuid'].value_counts()
to_keep = uuid_counts[uuid_counts >= 100].index
df = df[df['uuid'].isin(to_keep)]

# Delete uuids whose average dB value is lower than or equal to 30
uuid_group = df.groupby('uuid')['dB'].agg(['mean'])
uuid_group = uuid_group[uuid_group['mean'] > 30]
df = df[df['uuid'].isin(uuid_group.index)]

# Use the remove_initial_neg function for every uuid.
# The trimmed groups must be assigned back to df: the previous
# `df.groupby('uuid').apply(remove_initial_neg)` call discarded its result, so the
# leading -1 rows were never removed. Iterating over the groups (instead of
# groupby.apply) keeps the 'uuid' column in every group.
# NOTE(review): "initial" refers to the current row order within each uuid;
# this presumably matches chronological order - confirm against the raw files.
trimmed_groups = [remove_initial_neg(group) for _, group in df.groupby('uuid')]
# pd.concat refuses an empty list, so keep an empty frame when nothing survived the filters
df = pd.concat(trimmed_groups, ignore_index=True) if trimmed_groups else df.iloc[0:0]

print(df.shape)

In [6]:
## Get all uuids of the user history
# Maps each history file name to the list of uuids stored in that file
history_uuids = {}

# Loop through each file in the soundless-history folder (sorted for a reproducible order)
for filename in sorted(os.listdir('./soundless-history')):
    history_path = os.path.join('./soundless-history', filename)
    # Skip sub-directories and other entries that cannot be opened as a file
    if not os.path.isfile(history_path):
        continue

    # Read the file
    with open(history_path, 'r') as f:
        data = f.read()

    # Split the string by commas to create a list of uuids. Each piece is stripped so
    # that a trailing newline or a space after a comma does not end up inside a uuid
    # (which would make the later lookup by uuid miss); empty pieces are dropped.
    uuids = [piece.strip() for piece in data.split(',') if piece.strip()]

    # Files that are empty (or hold no uuid at all) are not added to the dictionary
    if uuids:
        history_uuids[filename] = uuids

In [None]:
## Add 'user' column to df
# Invert the history_uuids dictionary to easily map uuids to user keys
uuid_to_user = {uuid: user for user, uuids in history_uuids.items() for uuid in uuids}

# Create the 'user' column by mapping the 'uuid' column through uuid_to_user.
# assign() builds a new frame instead of writing into df, which may be a filtered
# slice of an earlier frame (that would raise SettingWithCopyWarning); re-running
# the cell gives the same result. uuids missing from uuid_to_user get NaN.
df = df.assign(user=df['uuid'].map(uuid_to_user))

# Preview the first rows only instead of dumping the whole frame
df.head()

## Plot and extract the user data

## Plot with the user sleeping

In [None]:
# PLOT THE MEAN dB LEVEL PER MINUTE OF THE NIGHT (ROWS WITH A KNOWN SLEEP STAGE ONLY)
all_dB_df = df.copy()

# Convert the millisecond timestamp to datetime and round it to 1-minute intervals
# NOTE(review): to_datetime(..., unit='ms') does not apply any time zone, so the
# 00:00-08:00 window below is presumably UTC rather than local time - confirm.
all_dB_df['timestamp'] = pd.to_datetime(all_dB_df['timestamp'], unit='ms').dt.round('1min')

# Delete the day from the timestamp so that all nights share the same time axis
all_dB_df['timestamp'] = all_dB_df['timestamp'].dt.strftime('%H:%M:%S')
all_dB_df = all_dB_df.sort_values(by=['timestamp'])

# Get data from 00:00:00 to 08:00:00 (zero-padded HH:MM:SS strings compare chronologically)
all_dB_df = all_dB_df[(all_dB_df['timestamp'] >= '00:00:00') & (all_dB_df['timestamp'] <= '08:00:00')]

# Delete rows without sleepStage
all_dB_df = all_dB_df[all_dB_df['sleepStage'] != -1]

# Group by timestamp and calculate the mean of the dB values on each timestamp
mean = all_dB_df.groupby('timestamp')['dB'].mean()

# Plot the results
fig, ax = plt.subplots(figsize=(20, 10))

# Plot the mean line
ax.plot(mean.index, mean, label='Mean dB', color='red')

# Label about ten evenly spaced ticks; the step must be at least 1 for range() and slicing
tick_spacing = max(len(mean.index) // 10, 1)
ax.set_xticks(range(0, len(mean.index), tick_spacing))
ax.set_xticklabels(mean.index[::tick_spacing], fontsize=10, rotation=45)

ax.set_title("Mean of dB level with the users sleeping", fontsize=20)

ax.set_xlabel('Time', fontsize=15)
ax.set_ylabel('dB', fontsize=15)

# Add grid
ax.grid()

ax.legend(fontsize=15)
plt.show()

# Mean dB value (average of the per-minute means); the label used to say "heartRate"
print(f"Mean dB value: {mean.mean()}")


## Plot with the user sleeping or not

In [None]:
# PLOT THE MEAN dB LEVEL PER MINUTE OF THE NIGHT (ALL ROWS, SLEEP STAGE KNOWN OR NOT)
all_dB_df = df.copy()

# Convert the millisecond timestamp to datetime and round it to 1-minute intervals
# NOTE(review): to_datetime(..., unit='ms') does not apply any time zone, so the
# 00:00-08:00 window below is presumably UTC rather than local time - confirm.
all_dB_df['timestamp'] = pd.to_datetime(all_dB_df['timestamp'], unit='ms').dt.round('1min')

# Delete the day from the timestamp so that all nights share the same time axis
all_dB_df['timestamp'] = all_dB_df['timestamp'].dt.strftime('%H:%M:%S')
all_dB_df = all_dB_df.sort_values(by=['timestamp'])

# Get data from 00:00:00 to 08:00:00 (zero-padded HH:MM:SS strings compare chronologically)
all_dB_df = all_dB_df[(all_dB_df['timestamp'] >= '00:00:00') & (all_dB_df['timestamp'] <= '08:00:00')]

# Group by timestamp and calculate the mean of the dB values on each timestamp
mean = all_dB_df.groupby('timestamp')['dB'].mean()

# Plot the results
fig, ax = plt.subplots(figsize=(20, 10))

# Plot the mean line
ax.plot(mean.index, mean, label='Mean dB', color='red')

# Label about ten evenly spaced ticks; the step must be at least 1 for range() and slicing
tick_spacing = max(len(mean.index) // 10, 1)
ax.set_xticks(range(0, len(mean.index), tick_spacing))
ax.set_xticklabels(mean.index[::tick_spacing], fontsize=10, rotation=45)

ax.set_title("Mean of dB level", fontsize=20)

ax.set_xlabel('Time', fontsize=15)
ax.set_ylabel('dB', fontsize=15)

# Add grid
ax.grid()

ax.legend(fontsize=15)
plt.show()

# Mean dB value (average of the per-minute means); the label used to say "heartRate"
print(f"Mean dB value: {mean.mean()}")


#

# Plot with bar chart

## Plot with the user sleeping

In [None]:
# BAR PLOT OF THE MEAN dB LEVEL PER HOUR OF THE NIGHT (ROWS WITH A KNOWN SLEEP STAGE ONLY)
all_dB_df = df.copy()

# Convert the millisecond timestamp to datetime and extract hour only
# NOTE(review): to_datetime(..., unit='ms') does not apply any time zone, so the
# hours below are presumably UTC rather than local time - confirm.
all_dB_df['timestamp'] = pd.to_datetime(all_dB_df['timestamp'], unit='ms')
all_dB_df['hour'] = all_dB_df['timestamp'].dt.strftime('%H:00')

# Get data from 00:00:00 to 08:00:00, i.e. the hour buckets 00:00 .. 07:00.
# The previous `hour <= 8` also took in the whole 08:00-08:59 hour, which lies outside
# the window used by the line plots (`hour >= 0` is always true and was dropped).
all_dB_df = all_dB_df[all_dB_df['timestamp'].dt.hour < 8]

# Delete rows without sleepStage
all_dB_df = all_dB_df[all_dB_df['sleepStage'] != -1]

# Group by hour and calculate mean
hourly_mean = all_dB_df.groupby('hour')['dB'].mean()

# Create bar plot
fig, ax = plt.subplots(figsize=(20, 10))

bar_positions = range(len(hourly_mean))
ax.bar(bar_positions, hourly_mean.values)

# Customize the plot
ax.set_title("Average dB Level by Hour with the Users Sleeping", fontsize=20)
ax.set_xlabel('Hour', fontsize=15)
ax.set_ylabel('Average dB', fontsize=15)

# Set x-axis ticks to show hours
ax.set_xticks(bar_positions)
ax.set_xticklabels(hourly_mean.index, rotation=45, fontsize=12)

# Add value labels on top of each bar
for i, v in enumerate(hourly_mean.values):
    ax.text(i, v + 0.5, f'{v:.1f}', ha='center', fontsize=12)

# Zoom the y axis to the 30-45 dB range (bars or labels outside this range are clipped)
ax.set_ylim(30, 45)

fig.tight_layout()
plt.show()

# Print overall mean (unweighted average of the hourly means)
print(f"Overall mean dB value: {hourly_mean.mean():.2f}")

## Plot with the user sleeping or not

In [None]:
# BAR PLOT OF THE MEAN dB LEVEL PER HOUR OF THE NIGHT (ALL ROWS, SLEEP STAGE KNOWN OR NOT)
all_dB_df = df.copy()

# Convert the millisecond timestamp to datetime and extract hour only
# NOTE(review): to_datetime(..., unit='ms') does not apply any time zone, so the
# hours below are presumably UTC rather than local time - confirm.
all_dB_df['timestamp'] = pd.to_datetime(all_dB_df['timestamp'], unit='ms')
all_dB_df['hour'] = all_dB_df['timestamp'].dt.strftime('%H:00')

# Get data from 00:00:00 to 08:00:00, i.e. the hour buckets 00:00 .. 07:00.
# The previous `hour <= 8` also took in the whole 08:00-08:59 hour, which lies outside
# the window used by the line plots (`hour >= 0` is always true and was dropped).
all_dB_df = all_dB_df[all_dB_df['timestamp'].dt.hour < 8]

# Group by hour and calculate mean
hourly_mean = all_dB_df.groupby('hour')['dB'].mean()

# Create bar plot
fig, ax = plt.subplots(figsize=(20, 10))

bar_positions = range(len(hourly_mean))
ax.bar(bar_positions, hourly_mean.values)

# Customize the plot
ax.set_title("Average dB Level by Hour", fontsize=20)
ax.set_xlabel('Hour', fontsize=15)
ax.set_ylabel('Average dB', fontsize=15)

# Set x-axis ticks to show hours
ax.set_xticks(bar_positions)
ax.set_xticklabels(hourly_mean.index, rotation=45, fontsize=12)

# Add value labels on top of each bar
for i, v in enumerate(hourly_mean.values):
    ax.text(i, v + 0.5, f'{v:.1f}', ha='center', fontsize=12)

# Zoom the y axis to the 30-45 dB range (bars or labels outside this range are clipped)
ax.set_ylim(30, 45)

fig.tight_layout()
plt.show()

# Print overall mean (unweighted average of the hourly means)
print(f"Overall mean dB value: {hourly_mean.mean():.2f}")