# **Comparison between quiet and noisy nights**
### **Objective of this Notebook:** Show the differences between the quiet and noisy nights of our dataset.

This notebook builds plots that compare the average noise levels of each user's noisiest and quietest nights. It also compares the average heart rate levels recorded on those same nights, to illustrate the relationship between noise and health impact.

In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
from datetime import datetime

In [2]:
def remove_initial_neg(df: pd.DataFrame) -> pd.DataFrame:
    """Drop the leading rows whose ``sleepStage`` is -1.

    Rows are taken in their current order and renumbered from 0. Everything
    before the first row with ``sleepStage != -1`` is discarded; -1 rows that
    appear after that first row are kept.

    Args:
        df: Frame with a ``sleepStage`` column. It is not modified.

    Returns:
        The renumbered frame from the first non -1 row onwards, or an empty
        frame with the same columns when no row has a stage other than -1.
    """
    # Renumber a copy so the caller's frame keeps its own index.
    renumbered = df.reset_index(drop=True)
    has_stage = (renumbered['sleepStage'] != -1).to_numpy()

    # argmax() on an all-False mask returns 0, which would keep every row, so
    # the "no valid row" case (and the empty frame) is handled explicitly.
    if not has_stage.any():
        return renumbered.iloc[0:0]

    return renumbered.iloc[int(has_stage.argmax()):]

In [3]:
# Load the 'Tarragona_province*' CSV files and tag every row with its city label
def read_data(files_name: list[str]) -> pd.DataFrame:
    """Read the matching CSV files from ./soundless-data/ into one frame.

    Only names starting with 'Tarragona_province' are read. Each file's rows
    get a ``city`` column built from the first two '_'-separated parts of the
    file name, with '.csv' removed from the second part.

    Args:
        files_name: File names (not paths) relative to ./soundless-data/.

    Returns:
        All matching files stacked in the given order, with a fresh 0..n-1 index.

    Raises:
        ValueError: From ``pd.concat`` when no name matches.
    """
    frames = []
    for file_name in files_name:
        if not file_name.startswith('Tarragona_province'):
            continue
        name_parts = file_name.split('_')
        city_label = name_parts[0] + '_' + name_parts[1].replace('.csv', '')
        frames.append(pd.read_csv(f"./soundless-data/{file_name}").assign(city=city_label))

    all_data_not_filtered = pd.concat(frames, ignore_index=True)
    return all_data_not_filtered

## Get and process data

In [None]:
# Read the data files found in ./soundless-data/ into a single DataFrame
# (read_data only keeps the names starting with 'Tarragona_province').
# os.listdir() returns names in arbitrary order, so they are sorted to keep
# the row order of the combined frame reproducible between runs.
df = read_data(sorted(os.listdir("./soundless-data/")))
df.shape

In [None]:
## Filtering
# Delete duplicated rows
df = df.drop_duplicates()

# Delete rows whose heartRate is -1 (presumably the "no reading" marker - confirm)
df = df[df['heartRate'] != -1]

# Delete uuids with fewer than 100 rows
uuid_counts = df['uuid'].value_counts()
to_keep = uuid_counts[uuid_counts >= 100].index
df = df[df['uuid'].isin(to_keep)]

# Delete uuids whose average dB value is 30 or lower
uuid_group = df.groupby('uuid')['dB'].agg(['mean'])
uuid_group = uuid_group[uuid_group['mean'] > 30]
df = df[df['uuid'].isin(uuid_group.index)]

# NOTE(review): df_filtered is not read by any cell visible here; it is kept
# only in case a later cell relies on the name.
df_filtered = pd.DataFrame(
        columns=['uuid', 'timestamp', 'dB', 'heartRate', 'sleepStage'])

# Apply remove_initial_neg to every uuid and keep the result (a bare
# groupby(...).apply(...) call discards it, leaving df untouched).
# Iterating over the groups keeps the 'uuid' column regardless of how the
# installed pandas version treats grouping columns inside apply().
# Assumes each uuid's rows are already in chronological order - TODO confirm.
trimmed_groups = [remove_initial_neg(group) for _, group in df.groupby('uuid', sort=False)]
if trimmed_groups:
    # pd.concat refuses an empty list; with no groups df is already empty.
    df = pd.concat(trimmed_groups, ignore_index=True)

print(df.shape)
df

In [6]:
## Get all uuids of the user history
# Maps each history file name (used as the user key) to its list of uuids
history_uuids = {}

# Loop through each entry in the soundless-history folder
for filename in os.listdir('./soundless-history'):
    history_path = os.path.join('./soundless-history', filename)
    # Skip sub-directories: open() would raise on them
    if not os.path.isfile(history_path):
        continue
    # Read the file
    with open(history_path, 'r') as f:
        data = f.read()
    # Split by commas and trim each piece: a trailing newline or a space after
    # a comma would otherwise produce ids that never equal a value of df.uuid.
    # Empty pieces (e.g. from a trailing comma) are dropped.
    uuids = [token.strip() for token in data.split(',') if token.strip()]
    # Files without any uuid are not added to the dictionary
    if uuids:
        history_uuids[filename] = uuids

# history_uuids

In [None]:
## Add 'user' column to df
# Reverse lookup built from history_uuids: uuid -> user key.
# A uuid listed under several users ends up with the last user that lists it.
uuid_to_user = {}
for user, uuids in history_uuids.items():
    uuid_to_user.update(dict.fromkeys(uuids, user))

# Translate every uuid of df into its user key; uuids absent from the lookup get NaN
df['user'] = df['uuid'].map(uuid_to_user)

# Display df, which now carries the extra 'user' column
df

# Generate the plots

In [8]:
# Upper bound on the nights picked at each extreme (noisiest / quietest) per user
NIGHTS_PER_EXTREME = 4

# user -> list of (uuid, mean dB) pairs, noisiest first / quietest first
max_dB_by_users = {}
min_dB_by_users = {}

## Rank the recordings of every user by their mean dB
for user in history_uuids:
    mean_dB_uuids = {}

    for uuid in history_uuids[user]:
        data = df[df.uuid == uuid]

        # Skip uuids that have no rows in df
        if data.shape[0] == 0:
            continue

        # Sort by timestamp
        data = data.sort_values(by=['timestamp'])

        # Convert the timestamp (epoch milliseconds) to datetime
        data['timestamp'] = pd.to_datetime(data['timestamp'], unit='ms')

        # Keep only rows whose time of day is at or before 08:00:00
        # (zero-padded HH:MM:SS strings compare in chronological order)
        data = data[data.timestamp.dt.strftime('%H:%M:%S') <= '08:00:00']

        # Delete rows with NaN values
        data = data.dropna()

        # Skip uuids left without rows after the filters
        if data.shape[0] == 0:
            continue

        # Remember the mean dB of this recording
        mean_dB_uuids[uuid] = data['dB'].mean()

    # A user needs at least two recordings to have distinct noisy and quiet nights
    if len(mean_dB_uuids) < 2:
        continue

    # Rank once (quietest first) and take both ends of the same ranking, never
    # more than half of it per end: a recording must not be counted as both a
    # noisy and a quiet night, which happens when two independent top-4 and
    # bottom-4 picks are made for a user with fewer than 8 recordings.
    ranked = sorted(mean_dB_uuids.items(), key=lambda x: x[1])
    picks = min(NIGHTS_PER_EXTREME, len(ranked) // 2)
    min_dB_by_users[user] = ranked[:picks]
    max_dB_by_users[user] = ranked[len(ranked) - picks:][::-1]

In [None]:
# Columns kept for every selected recording
NIGHT_COLUMNS = ['uuid', 'timestamp', 'dB', 'heartRate']


def _collect_nights(selection_by_user):
    """Stack the rows at or before 08:00:00 of every selected recording.

    Args:
        selection_by_user: Mapping user -> list of (uuid, mean dB) pairs;
            only the uuid of each pair is used.

    Returns:
        A frame with the NIGHT_COLUMNS columns and ``timestamp`` converted
        from epoch milliseconds to datetime. The rows keep their index labels
        from ``df``. Empty (with a datetime ``timestamp`` column) when nothing
        is selected.
    """
    frames = []
    for user in selection_by_user:
        for uuid, _ in selection_by_user[user]:
            data = df[df.uuid == uuid]

            # Skip uuids that have no rows in df
            if data.shape[0] == 0:
                continue

            # Sort by timestamp
            data = data.sort_values(by=['timestamp'])

            # Convert the timestamp to datetime
            data['timestamp'] = pd.to_datetime(data['timestamp'], unit='ms')

            # Keep only rows whose time of day is at or before 08:00:00
            data = data[data.timestamp.dt.strftime('%H:%M:%S') <= '08:00:00']

            frames.append(data[NIGHT_COLUMNS])

    if not frames:
        # pd.concat refuses an empty list; return an empty frame whose
        # timestamp column still supports the .dt accessor.
        empty = pd.DataFrame(columns=NIGHT_COLUMNS)
        empty['timestamp'] = pd.to_datetime(empty['timestamp'])
        return empty

    # One concat at the end instead of growing a frame inside the loop: it
    # avoids re-copying the accumulated rows on every iteration and does not
    # mix an empty object-dtype frame into the result.
    return pd.concat(frames)


# Rows (uuid | timestamp | dB | heartRate) of the noisiest and the quietest recordings
max_dB_df = _collect_nights(max_dB_by_users)
min_dB_df = _collect_nights(min_dB_by_users)

In [10]:
# For both selections: round every timestamp to a 1-minute interval, then keep
# only the time of day as HH:MM:SS text (the date part is dropped).
for nights_df in (max_dB_df, min_dB_df):
    minute_marks = nights_df['timestamp'].dt.round('1min')
    nights_df['timestamp'] = minute_marks.dt.strftime('%H:%M:%S')

In [None]:
# Mean dB per time of day, over the noisy and over the quiet selections
mean_max = max_dB_df.groupby('timestamp')['dB'].mean()
mean = min_dB_df.groupby('timestamp')['dB'].mean()

# Put both series on one shared, sorted time axis. The x values are strings,
# which matplotlib places on a categorical axis in order of first appearance:
# a time present in only one series would land out of chronological order,
# and tick labels derived from mean.index would no longer match the axis
# positions. Times missing from a series become NaN, shown as gaps in its line.
shared_times = mean_max.index.union(mean.index)
mean_max = mean_max.reindex(shared_times)
mean = mean.reindex(shared_times)

# Plot the results
plt.figure(figsize=(20, 10))

# Plot the mean lines
plt.plot(mean_max.index, mean_max, label='Noisy Night - Mean', color='red')
plt.plot(mean.index, mean, label='Quiet Night - Mean', color='blue', linestyle='--')

# Convert timestamps to HH:MM format
def format_time(timestamp):
    """Return *timestamp* as 'HH:MM' text.

    Strings are parsed as '%H:%M:%S' first, then as '%Y-%m-%d %H:%M:%S';
    a string matching neither is cut to its first five characters.
    Any other value is formatted through its own ``strftime``.
    """
    if not isinstance(timestamp, str):
        return timestamp.strftime('%H:%M')

    for layout in ('%H:%M:%S', '%Y-%m-%d %H:%M:%S'):
        try:
            parsed = datetime.strptime(timestamp, layout)
        except ValueError:
            # Not this layout; try the next one
            continue
        return parsed.strftime('%H:%M')

    # No known layout matched: fall back to the leading 'HH:MM' characters
    return timestamp[:5]

# Set ticks with formatted time
tick_spacing = max(len(mean.index) // 10, 1)
formatted_times = [format_time(t) for t in mean.index[::tick_spacing]]
plt.xticks(range(0, len(mean.index), tick_spacing), formatted_times, fontsize=10, rotation=45)

plt.title("Quiet Night vs Noisy Night Decibels Level", fontsize=20)
plt.xlabel('Time (hours)', fontsize=15)
plt.ylabel('dB', fontsize=15)
plt.legend(fontsize=15)
# plt.grid()
plt.show()

# Mean dB value
print(f"Mean heartRate value: {mean.mean()}")


In [None]:
# Mean heart rate per time of day, over the noisy and over the quiet selections
mean_max = max_dB_df.groupby('timestamp')['heartRate'].mean()
mean = min_dB_df.groupby('timestamp')['heartRate'].mean()

# Put both series on one shared, sorted time axis. The x values are strings,
# which matplotlib places on a categorical axis in order of first appearance:
# a time present in only one series would land out of chronological order,
# and tick labels derived from mean.index would no longer match the axis
# positions. Times missing from a series become NaN, shown as gaps in its line.
shared_times = mean_max.index.union(mean.index)
mean_max = mean_max.reindex(shared_times)
mean = mean.reindex(shared_times)

# Plot the results
plt.figure(figsize=(20, 10))

# Plot the mean lines
plt.plot(mean_max.index, mean_max, label='Noisy Night - Mean', color='red')
plt.plot(mean.index, mean, label='Quiet Night - Mean', color='blue', linestyle='--')

# Convert timestamps to HH:MM format
def format_time(timestamp):
    """Format *timestamp* as 'HH:MM'.

    A string is parsed with '%H:%M:%S', then with '%Y-%m-%d %H:%M:%S'; when
    both fail, its first five characters are returned unchanged. Non-string
    values are expected to provide ``strftime``.
    """
    if isinstance(timestamp, str):
        parsed = None
        for pattern in ('%H:%M:%S', '%Y-%m-%d %H:%M:%S'):
            try:
                parsed = datetime.strptime(timestamp, pattern)
            except ValueError:
                # Pattern did not match; move on to the next one
                continue
            break
        if parsed is None:
            # Unparseable text: keep only the leading 'HH:MM' characters
            return timestamp[:5]
        timestamp = parsed
    return timestamp.strftime('%H:%M')

# Set ticks with formatted time
tick_spacing = max(len(mean.index) // 10, 1)
formatted_times = [format_time(t) for t in mean.index[::tick_spacing]]
plt.xticks(range(0, len(mean.index), tick_spacing), formatted_times, fontsize=10, rotation=45)

plt.title("Quiet Night vs Noisy Night Heart Rate Level", fontsize=20)
plt.xlabel('Time (hours)', fontsize=15)
plt.ylabel('heartRate', fontsize=15)
# plt.grid()
plt.legend(fontsize=15)
plt.show()


# Mean dB value
print(f"Mean heartRate value: {mean.mean()}")
