# **Number of nights with illegal noise**

This notebook shows the number of nights on which the decibel level exceeds the legal limit set by the WHO (40 dB). Only the noise recorded while the user is sleeping is taken into account.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import os

In [2]:
def remove_initial_neg(df: pd.DataFrame) -> pd.DataFrame:
    """Drop the leading rows whose ``sleepStage`` is -1.

    Rows are kept from the first one whose ``sleepStage`` differs from -1
    up to the end of the frame, so any -1 that appears later is preserved.

    Args:
        df: Frame with a ``sleepStage`` column. It is left untouched.

    Returns:
        The trailing part of a 0-based re-indexed copy of ``df``, starting
        at the first row whose ``sleepStage`` is not -1. The result is empty
        when ``df`` has no rows or when every ``sleepStage`` is -1.
    """
    # Re-index a copy instead of using inplace=True, so the caller's frame
    # keeps its own index.
    renumbered = df.reset_index(drop=True)
    has_stage = renumbered['sleepStage'] != -1

    # idxmax() on an all-False mask returns the first label (which would keep
    # every row) and raises on an empty Series, so handle both cases here.
    if not has_stage.any():
        return renumbered.iloc[0:0]

    # After the reset the index label equals the row position.
    first_non_neg_index = has_stage.idxmax()
    return renumbered.iloc[first_non_neg_index:]

In [3]:
# Read the Tarragona_province files and add the city name to each row
def read_data(files_name: list[str]) -> pd.DataFrame:
    """Load the ``Tarragona_province*`` CSV files into a single frame.

    Args:
        files_name: File names located inside ``./soundless-data/``. Names
            that do not start with ``Tarragona_province`` are skipped.

    Returns:
        All matching files concatenated with a fresh index, plus a ``city``
        column built from the first two ``_``-separated parts of the file
        name (with ``.csv`` removed from the second part).

    Raises:
        ValueError: Raised by ``pd.concat`` when no file name matches.
    """
    frames = []
    for file_name in files_name:
        if not file_name.startswith('Tarragona_province'):
            continue

        content = pd.read_csv(f"./soundless-data/{file_name}")
        name_parts = file_name.split('_')
        city = name_parts[0] + '_' + name_parts[1].replace('.csv', '')
        frames.append(content.assign(city=city))

    return pd.concat(frames, ignore_index=True)

## Get and process data

In [None]:
# List every file in the ./soundless-data/ directory and load them into a
# single DataFrame (read_data keeps only the names it accepts)
df = read_data(os.listdir("./soundless-data/"))
# Last expression of the cell: shows (rows, columns) of the loaded data
df.shape

In [None]:
def calculate_recording_duration(df):
    """Print and return the recorded time covered by ``df``.

    The duration of one recording is the span between the smallest and the
    largest ``timestamp`` of its ``uuid``, divided by 3,600,000 (the
    timestamps are presumably in milliseconds, which makes the result hours).

    Args:
        df: Frame with ``uuid`` and ``timestamp`` columns.

    Returns:
        Tuple ``(total_hours, avg_hours)``: the sum of the per-uuid durations
        and that sum divided by the number of uuids.
    """
    ms_per_hour = 1000 * 60 * 60

    def span_in_hours(stamps):
        # Distance between the first and last sample of one recording
        return (stamps.max() - stamps.min()) / ms_per_hour

    hours_by_uuid = df.groupby('uuid')['timestamp'].agg(span_in_hours)
    recordings = len(hours_by_uuid)

    total_hours = hours_by_uuid.sum()
    avg_hours = total_hours / recordings

    print(f"Total recording hours: {total_hours:.2f}")
    print(f"Average hours per UUID: {avg_hours:.2f}")
    print(f"Number of recordings (UUIDs): {recordings}")

    return total_hours, avg_hours

# Calculate the recording durations of the current df.
# NOTE(review): the original comment said "after filtering", but this cell is
# placed before the filtering cell, so a top-to-bottom run measures the
# unfiltered data - confirm which one is intended.
total_hours, avg_hours = calculate_recording_duration(df)

In [None]:
## Filtering
# Delete duplicates
df = df.drop_duplicates()

# Delete the rows without heartRate (stored as -1)
df = df[df['heartRate'] != -1]

# Delete uuids with fewer than 100 rows
uuid_counts = df['uuid'].value_counts()
to_keep = uuid_counts[uuid_counts >= 100].index
df = df[df['uuid'].isin(to_keep)]

# Delete uuids that have a value lower or equal than 30 as the average value of the dB column
uuid_group = df.groupby('uuid')['dB'].agg(['mean'])
uuid_group = uuid_group[uuid_group['mean'] > 30]
df = df[df['uuid'].isin(uuid_group.index)]

# NOTE(review): df_filtered is never filled or read in the cells visible
# here; it is kept only so that the name keeps existing.
df_filtered = pd.DataFrame(
        columns=['uuid', 'timestamp', 'dB', 'heartRate', 'sleepStage'])

# Use the remove_initial_neg function for every uuid and keep the result.
# The previous groupby(...).apply(...) call discarded its return value, so
# the leading -1 rows were never removed. The groups are iterated explicitly
# so every column (including 'uuid') survives, and the guard avoids
# pd.concat failing on an empty list when the filters removed every row.
if not df.empty:
    df = pd.concat(
        [remove_initial_neg(night) for _, night in df.groupby('uuid')],
        ignore_index=True,
    )

print(df.shape)

In [7]:
## Get all uuids of the user history
# Create an empty dictionary to store the uuids lists
# (key: file name inside ./soundless-history, value: list of uuids)
history_uuids = {}

# Loop through each file in the soundless-history folder
for filename in os.listdir('./soundless-history'):
    # Read the file
    with open(os.path.join('./soundless-history', filename), 'r') as f:
        data = f.read()
    # Check if the file is empty
    if data.strip():
        # Split the string by commas to create a list of uuids. Each entry is
        # stripped because a trailing newline or a space after a comma would
        # otherwise become part of the uuid and it would never match the
        # 'uuid' column later; entries that are only whitespace are dropped.
        uuids = [entry.strip() for entry in data.split(',') if entry.strip()]
        # Add the uuids to the dictionary
        history_uuids[filename] = uuids

# history_uuids

In [None]:
## Add 'user' column to df
# Invert the history_uuids dictionary to easily map uuids to user keys.
# The user key is the history file name; if a uuid is listed under several
# keys, the one iterated last overwrites the previous ones.
uuid_to_user = {uuid: user for user, uuids in history_uuids.items() for uuid in uuids}

# Create a new column 'user' in df by mapping the 'uuid' column using the uuid_to_user dictionary
# (uuids that are not in the dictionary get NaN)
df['user'] = df['uuid'].map(uuid_to_user)

# Now df has a new column 'user' with the corresponding user key from history_uuids
# Last expression of the cell: displays the resulting DataFrame
df

#

# Extract the sleep incidents from the dataset

In [9]:
def incidents_sleep(sleepStage: list[int]) -> list[int]:
    """Flag the samples that count as a sleep incident.

    A sample is flagged with 1 when its stage is 0 or lower and a stage of 2
    or higher was seen at most 4 positions earlier. Every other sample,
    including the ones with stage 2 or higher, is flagged with 0.

    Args:
        sleepStage: Sleep stage of each sample, in recording order.

    Returns:
        List of 0/1 flags with the same length as ``sleepStage``.
    """
    flags = []
    last_deep_idx = None  # position of the latest sample with stage >= 2

    for idx, stage in enumerate(sleepStage):
        if stage >= 2:
            last_deep_idx = idx
            flags.append(0)
            continue

        # Only a low stage shortly after a deep one counts as an incident
        is_incident = (
            stage <= 0
            and last_deep_idx is not None
            and idx - last_deep_idx <= 4
        )
        flags.append(1 if is_incident else 0)

    return flags

In [10]:
# Get the timestamps of the signal changes
def get_timestamps_incident(signals: list[int], timestamps: list[int]) -> list[int]:
    """Return the timestamps at which the signal switches from 0 to 1.

    Args:
        signals: Sequence of 0/1 flags.
        timestamps: Timestamp of each flag, aligned by position with
            ``signals``.

    Returns:
        The timestamps (converted with ``int``) of every position whose flag
        is 1 while the previous flag is 0. The first position is never
        reported because it has no previous flag.
    """
    # Walk over consecutive (previous, current) pairs; idx is the position of
    # the current flag
    return [
        int(timestamps[idx])
        for idx, (previous, current) in enumerate(zip(signals, signals[1:]), start=1)
        if previous == 0 and current == 1
    ]

In [11]:
## Get the incidents
# Dictionary with uuids and their incidents
# (key: uuid, value: list of timestamps where an incident starts)
sleepStage_incidents = {}

# For every uuid
for uuid in df['uuid'].unique():
    # Data of this uuid (rows keep the order they have in df)
    data_uuid = df[df.uuid == uuid]
    timestamps_uuid = data_uuid['timestamp'].values
    y_sleepStage = data_uuid['sleepStage'].values

    # Run function to detect incidents in sleepStage (one 0/1 flag per row)
    result = incidents_sleep(y_sleepStage)
    # Keep only the timestamps where the flag goes from 0 to 1
    sleepStage_incidents[uuid] = get_timestamps_incident(result, timestamps_uuid)

In [None]:
# Summary of the sleepStage incidents: number of uuids, total, average,
# minimum and maximum number of incidents per uuid.
# NOTE(review): only sleepStage_incidents is used here, no heartRate
# incidents are computed in the visible cells. The average divides by the
# number of uuids, so it fails if the dictionary is empty.
print("Number of uuids in max_sleepStage_db incidents: ", len(sleepStage_incidents.keys()))
print("Number of sleepStage_db incidents: ", sum(len(v) for v in sleepStage_incidents.values()))
print("Average number of sleepStage_db incidents per uuid: ", sum(len(v) for v in sleepStage_incidents.values()) / len(sleepStage_incidents.keys()))
print("Minimum number of sleepStage_db incidents: ", min(len(v) for v in sleepStage_incidents.values()))
print("Maximum number of sleepStage_db incidents: ", max(len(v) for v in sleepStage_incidents.values()))

#

# Generate the plot

In [13]:
# Get the number of nights that the sound level was higher than 40 dB
# (each uuid is counted as one night)
# These initial values are placeholders; both are overwritten below
nights_higher_40 = 0
nights_lower_40 = 0

# Keep only the rows with a sleep stage (sleepStage different from -1)
sleeping_df = df[df['sleepStage'] != -1]

# Calculate max dB per uuid
max_db_per_uuid = sleeping_df.groupby('uuid')['dB'].max()

# Count nights above/below 40dB
# A night counts as "higher" as soon as its loudest sample exceeds 40
nights_higher_40 = (max_db_per_uuid > 40).sum()
nights_lower_40 = (max_db_per_uuid <= 40).sum()

In [None]:
# Pie chart with the share of nights above / not above 40 dB
# Create custom colors and explode effect
colors = ['#FF6B6B', '#4ECDC4']  # Coral red and turquoise
explode = (0.05, 0)  # Pull out first slice slightly

# Create the plot with improved styling
fig, ax = plt.subplots(figsize=(10, 8))

# Create pie chart with enhanced features
# (values, labels and colors are given in the same order: higher, lower)
wedges, texts, autotexts = ax.pie(
    [nights_higher_40, nights_lower_40],
    explode=explode,
    labels=['Higher than 40 dB', 'Lower or equal to 40 dB'],
    autopct='%1.1f%%',
    startangle=90,
    colors=colors
)

# Style the text elements
# (autotexts are the percentage labels, texts are the slice labels)
plt.setp(autotexts, size=10, weight="bold")
plt.setp(texts, size=12)

# Add title
ax.set_title('Distribution of Nights by Sound Level with the users sleeping', pad=20, size=14, weight='bold')

# Equal aspect ratio ensures that pie is drawn as a circle
ax.axis('equal')

# Add legend
# (anchored to the right of the axes so it does not cover the pie)
plt.legend(
    wedges,
    ['Higher than 40 dB', 'Lower or equal to 40 dB'],
    title="Sound Levels",
    loc="center left",
    bbox_to_anchor=(1, 0, 0.5, 1)
)

plt.tight_layout()
plt.show()