In [1]:
# Import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
import scipy

In [2]:
# Read each recording's full CSV export into its own DataFrame
# (one file per label: noa, light, medium, loud).
recording_files = ('noa-full.csv', 'light-full.csv', 'medium-full.csv', 'loud-full.csv')
noa, light, medium, loud = (pd.read_csv(csv_path) for csv_path in recording_files)




In [3]:
# Report, for every column of the raw 'noa' frame, how many entries are
# missing and how long the column is.
for col_name in noa.columns:
    series = noa[col_name]
    print(f"Column '{col_name}': {series.isna().sum()} NaNs, Length: {len(series)}")

Column 'acc_t': 14567724 NaNs, Length: 14596800
Column 'acc_x': 14567724 NaNs, Length: 14596800
Column 'acc_y': 14567724 NaNs, Length: 14596800
Column 'acc_z': 14567724 NaNs, Length: 14596800
Column 'acc_abs': 14567724 NaNs, Length: 14596800
Column 'gyro_x': 14567724 NaNs, Length: 14596800
Column 'gyro_y': 14567724 NaNs, Length: 14596800
Column 'gyro_z': 14567724 NaNs, Length: 14596800
Column 'pressure': 14596568 NaNs, Length: 14596800
Column 'audio': 0 NaNs, Length: 14596800
Column 'gyro_t': 14567724 NaNs, Length: 14596800
Column 'pressure_t': 14596568 NaNs, Length: 14596800
Column 'audio_rate': 14593725 NaNs, Length: 14596800


In [4]:
# Report, for every column of the raw 'light' frame, how many entries are
# missing and how long the column is.
for col_name in light.columns:
    series = light[col_name]
    print(f"Column '{col_name}': {series.isna().sum()} NaNs, Length: {len(series)}")

Column 'acc_t': 14567751 NaNs, Length: 14596800
Column 'acc_x': 14567751 NaNs, Length: 14596800
Column 'acc_y': 14567751 NaNs, Length: 14596800
Column 'acc_z': 14567751 NaNs, Length: 14596800
Column 'acc_abs': 14567751 NaNs, Length: 14596800
Column 'gyro_x': 14567751 NaNs, Length: 14596800
Column 'gyro_y': 14567751 NaNs, Length: 14596800
Column 'gyro_z': 14567751 NaNs, Length: 14596800
Column 'pressure': 14596568 NaNs, Length: 14596800
Column 'audio': 0 NaNs, Length: 14596800
Column 'gyro_t': 14567751 NaNs, Length: 14596800
Column 'pressure_t': 14596568 NaNs, Length: 14596800
Column 'audio_rate': 14593725 NaNs, Length: 14596800


In [5]:
# Report, for every column of the raw 'medium' frame, how many entries are
# missing and how long the column is.
for col_name in medium.columns:
    series = medium[col_name]
    print(f"Column '{col_name}': {series.isna().sum()} NaNs, Length: {len(series)}")

Column 'acc_t': 15065948 NaNs, Length: 15096000
Column 'acc_x': 15065948 NaNs, Length: 15096000
Column 'acc_y': 15065948 NaNs, Length: 15096000
Column 'acc_z': 15065948 NaNs, Length: 15096000
Column 'acc_abs': 15065948 NaNs, Length: 15096000
Column 'gyro_x': 15065948 NaNs, Length: 15096000
Column 'gyro_y': 15065948 NaNs, Length: 15096000
Column 'gyro_z': 15065948 NaNs, Length: 15096000
Column 'pressure': 15095760 NaNs, Length: 15096000
Column 'audio': 0 NaNs, Length: 15096000
Column 'gyro_t': 15065948 NaNs, Length: 15096000
Column 'pressure_t': 15095760 NaNs, Length: 15096000
Column 'audio_rate': 15092820 NaNs, Length: 15096000


In [6]:
# Report, for every column of the raw 'loud' frame, how many entries are
# missing and how long the column is.
for col_name in loud.columns:
    series = loud[col_name]
    print(f"Column '{col_name}': {series.isna().sum()} NaNs, Length: {len(series)}")

Column 'acc_t': 15089943 NaNs, Length: 15120000
Column 'acc_x': 15089943 NaNs, Length: 15120000
Column 'acc_y': 15089943 NaNs, Length: 15120000
Column 'acc_z': 15089943 NaNs, Length: 15120000
Column 'acc_abs': 15089943 NaNs, Length: 15120000
Column 'gyro_x': 15089943 NaNs, Length: 15120000
Column 'gyro_y': 15089943 NaNs, Length: 15120000
Column 'gyro_z': 15089943 NaNs, Length: 15120000
Column 'pressure': 15119760 NaNs, Length: 15120000
Column 'audio': 0 NaNs, Length: 15120000
Column 'gyro_t': 15089943 NaNs, Length: 15120000
Column 'pressure_t': 15119760 NaNs, Length: 15120000
Column 'audio_rate': 15116820 NaNs, Length: 15120000


# Cleaning

In [7]:
# Helper functions to downsample the audio and clean the sensor columns:

def process_dataframe(df, name, audio_rate_hz=48000, target_rate_hz=100):
    """
    Downsample the 'audio' column and trim the frame to the downsampled length.

    The audio is decimated by keeping every
    ``audio_rate_hz // target_rate_hz``-th sample, so row ``i`` of the result
    holds input audio sample ``i * factor``. All other columns keep their
    first ``len(downsampled audio)`` rows unchanged.

    NOTE: this is plain striding; no low-pass filter is applied before
    decimation, so audio content above ``target_rate_hz / 2`` will alias.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an 'audio' column. It is not modified.
    name : str
        Label used only in the printed diagnostics.
    audio_rate_hz : int, default 48000
        Sampling rate of the 'audio' column.
    target_rate_hz : int, default 100
        Rate to downsample to (per the original comment, the rate of the
        other sensors — confirm against the recording settings).

    Returns
    -------
    pandas.DataFrame
        A copy of the leading rows of ``df`` (original index labels kept)
        whose 'audio' column holds the downsampled audio.

    Raises
    ------
    ValueError
        If ``target_rate_hz`` is not positive or exceeds ``audio_rate_hz``.
    """
    if target_rate_hz <= 0 or audio_rate_hz < target_rate_hz:
        raise ValueError(
            "target_rate_hz must be positive and not larger than audio_rate_hz"
        )
    downsample_factor = audio_rate_hz // target_rate_hz

    # Take every `downsample_factor`-th sample by position. Converting to a
    # NumPy array drops the index, so the assignment below is positional and
    # cannot silently produce NaNs when `df` has a non-default index.
    audio_downsampled = df['audio'].iloc[::downsample_factor].to_numpy()

    # Length of downsampled column, because original is filled with NA
    audio_length = len(audio_downsampled)

    # Trim the remaining columns so they match the audio length
    df_trimmed = df.iloc[:audio_length].copy()

    # Finish df with the downsampled audio in it
    df_trimmed['audio'] = audio_downsampled

    # Check
    print(f"Length of downsampled audio in {name}:", audio_length)
    print(f"Length of trimmed dataframe in {name}:", len(df_trimmed))

    # Check quality and compare columns
    for column in df_trimmed.columns:
        na_count = df_trimmed[column].isna().sum()
        column_length = len(df_trimmed[column])
        print(f"Column '{column}' in {name}: {na_count} NaNs, Length: {column_length}")

    return df_trimmed


def clean_dataframe(df, name):
    """
    Fill gaps in the sensor columns and drop the helper columns.

    Steps, in order:
      1. forward-fill 'pressure';
      2. interpolate the accelerometer and gyroscope columns with the pandas
         default (linear) interpolation;
      3. drop 'pressure_t' and 'audio_rate';
      4. print the remaining NaN count and length of every column.

    The input frame is left untouched; all work happens on a copy, so
    re-running the calling cell gives the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing 'pressure', 'pressure_t', 'audio_rate' and the
        acc_* / gyro_* columns listed below.
    name : str
        Label used only in the printed diagnostics.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy, without 'pressure_t' and 'audio_rate'.
    """
    # Copy first: assigning columns on `df` directly would mutate the caller's
    # frame as a hidden side effect.
    cleaned = df.copy()

    # Forward fill pressure data. `.ffill()` replaces the deprecated
    # `fillna(method='ffill')` spelling.
    cleaned['pressure'] = cleaned['pressure'].ffill()

    # Perform interpolation on the missing values for accelerometer and gyroscope data.
    # NOTE(review): with the default forward direction, trailing NaNs are
    # filled by repeating the last valid value and leading NaNs stay NaN —
    # confirm this is acceptable for the tail of each recording.
    columns_to_interpolate = ['acc_t', 'acc_x', 'acc_y', 'acc_z', 'acc_abs', 'gyro_t', 'gyro_x', 'gyro_y', 'gyro_z']
    for column in columns_to_interpolate:
        cleaned[column] = cleaned[column].interpolate()

    # Drop specified columns
    columns_to_drop = ['pressure_t', 'audio_rate']
    cleaned = cleaned.drop(columns=columns_to_drop)

    # Check quality and compare columns
    for column in cleaned.columns:
        na_count = cleaned[column].isna().sum()
        column_length = len(cleaned[column])
        print(f"Column '{column}' in {name}: {na_count} NaNs, Length: {column_length}")

    return cleaned



## cleaning noa


In [8]:
# Downsample the 'noa' audio and trim the frame to the downsampled length.
noa_down = process_dataframe(df=noa, name="noa")

Length of downsampled audio in noa: 30410
Length of trimmed dataframe in noa: 30410
Column 'acc_t' in noa: 1334 NaNs, Length: 30410
Column 'acc_x' in noa: 1334 NaNs, Length: 30410
Column 'acc_y' in noa: 1334 NaNs, Length: 30410
Column 'acc_z' in noa: 1334 NaNs, Length: 30410
Column 'acc_abs' in noa: 1334 NaNs, Length: 30410
Column 'gyro_x' in noa: 1334 NaNs, Length: 30410
Column 'gyro_y' in noa: 1334 NaNs, Length: 30410
Column 'gyro_z' in noa: 1334 NaNs, Length: 30410
Column 'pressure' in noa: 30178 NaNs, Length: 30410
Column 'audio' in noa: 0 NaNs, Length: 30410
Column 'gyro_t' in noa: 1334 NaNs, Length: 30410
Column 'pressure_t' in noa: 30178 NaNs, Length: 30410
Column 'audio_rate' in noa: 27335 NaNs, Length: 30410


In [9]:
# Fill the gaps in the downsampled 'noa' frame and drop the helper columns.
noa_cleaned = clean_dataframe(df=noa_down, name="noa_down")

Column 'acc_t' in noa_down: 0 NaNs, Length: 30410
Column 'acc_x' in noa_down: 0 NaNs, Length: 30410
Column 'acc_y' in noa_down: 0 NaNs, Length: 30410
Column 'acc_z' in noa_down: 0 NaNs, Length: 30410
Column 'acc_abs' in noa_down: 0 NaNs, Length: 30410
Column 'gyro_x' in noa_down: 0 NaNs, Length: 30410
Column 'gyro_y' in noa_down: 0 NaNs, Length: 30410
Column 'gyro_z' in noa_down: 0 NaNs, Length: 30410
Column 'pressure' in noa_down: 0 NaNs, Length: 30410
Column 'audio' in noa_down: 0 NaNs, Length: 30410
Column 'gyro_t' in noa_down: 0 NaNs, Length: 30410


In [10]:
# Preview the first five rows of the cleaned 'noa' frame.
noa_cleaned.head(n=5)

Unnamed: 0,acc_t,acc_x,acc_y,acc_z,acc_abs,gyro_x,gyro_y,gyro_z,pressure,audio,gyro_t
0,0.017443,-0.103828,-0.036572,0.013185,0.110867,0.00614,-0.020941,-0.021615,1015.177155,-1.103038e-07,0.017443
1,0.027423,0.016966,0.000734,0.003513,0.017341,0.01423,-0.05123,-0.034906,1015.178909,-8.869381e-06,0.027423
2,0.037402,0.052361,-0.00583,0.03783,0.06486,0.011364,-0.050322,-0.027478,1015.182571,-1.116034e-05,0.037402
3,0.047381,0.028602,0.017169,0.010993,0.035124,0.002096,-0.024613,-0.019981,1015.182571,-4.920312e-06,0.047381
4,0.05736,-0.001354,0.054812,-0.05532,0.077888,-0.002218,0.000642,-0.015214,1015.179596,-4.233477e-05,0.05736


## cleaning light

In [11]:
# Downsample the 'light' audio and trim the frame to the downsampled length.
light_down = process_dataframe(df=light, name="light")

Length of downsampled audio in light: 30410
Length of trimmed dataframe in light: 30410
Column 'acc_t' in light: 1361 NaNs, Length: 30410
Column 'acc_x' in light: 1361 NaNs, Length: 30410
Column 'acc_y' in light: 1361 NaNs, Length: 30410
Column 'acc_z' in light: 1361 NaNs, Length: 30410
Column 'acc_abs' in light: 1361 NaNs, Length: 30410
Column 'gyro_x' in light: 1361 NaNs, Length: 30410
Column 'gyro_y' in light: 1361 NaNs, Length: 30410
Column 'gyro_z' in light: 1361 NaNs, Length: 30410
Column 'pressure' in light: 30178 NaNs, Length: 30410
Column 'audio' in light: 0 NaNs, Length: 30410
Column 'gyro_t' in light: 1361 NaNs, Length: 30410
Column 'pressure_t' in light: 30178 NaNs, Length: 30410
Column 'audio_rate' in light: 27335 NaNs, Length: 30410


In [12]:
# Fill the gaps in the downsampled 'light' frame and drop the helper columns.
light_cleaned = clean_dataframe(df=light_down, name="light")

Column 'acc_t' in light: 0 NaNs, Length: 30410
Column 'acc_x' in light: 0 NaNs, Length: 30410
Column 'acc_y' in light: 0 NaNs, Length: 30410
Column 'acc_z' in light: 0 NaNs, Length: 30410
Column 'acc_abs' in light: 0 NaNs, Length: 30410
Column 'gyro_x' in light: 0 NaNs, Length: 30410
Column 'gyro_y' in light: 0 NaNs, Length: 30410
Column 'gyro_z' in light: 0 NaNs, Length: 30410
Column 'pressure' in light: 0 NaNs, Length: 30410
Column 'audio' in light: 0 NaNs, Length: 30410
Column 'gyro_t' in light: 0 NaNs, Length: 30410


In [13]:
# Preview the first five rows of the cleaned 'light' frame.
light_cleaned.head(n=5)

Unnamed: 0,acc_t,acc_x,acc_y,acc_z,acc_abs,gyro_x,gyro_y,gyro_z,pressure,audio,gyro_t
0,0.012373,0.039617,-0.009938,0.052368,0.066413,0.010968,-0.000997,-0.003727,1019.220123,-3.329422e-07,0.012373
1,0.022352,0.030913,-0.009669,0.024244,0.040458,0.009328,-0.005989,-0.000227,1019.21669,2.869374e-05,0.022352
2,0.032332,-0.016788,0.001436,0.057522,0.059939,0.008435,-0.014573,0.006058,1019.208374,-0.0002319325,0.032332
3,0.042312,-0.032674,0.009824,0.050177,0.060678,0.006781,-0.02178,0.010552,1019.207306,0.0004460109,0.042312
4,0.052292,-0.034788,-0.003498,0.0286,0.045171,0.00551,-0.023343,0.013885,1019.207306,0.0003474597,0.052292


## cleaning medium

In [14]:
# Downsample the 'medium' audio and trim the frame to the downsampled length.
medium_down = process_dataframe(df=medium, name="medium")

Length of downsampled audio in medium: 31450
Length of trimmed dataframe in medium: 31450
Column 'acc_t' in medium: 1398 NaNs, Length: 31450
Column 'acc_x' in medium: 1398 NaNs, Length: 31450
Column 'acc_y' in medium: 1398 NaNs, Length: 31450
Column 'acc_z' in medium: 1398 NaNs, Length: 31450
Column 'acc_abs' in medium: 1398 NaNs, Length: 31450
Column 'gyro_x' in medium: 1398 NaNs, Length: 31450
Column 'gyro_y' in medium: 1398 NaNs, Length: 31450
Column 'gyro_z' in medium: 1398 NaNs, Length: 31450
Column 'pressure' in medium: 31210 NaNs, Length: 31450
Column 'audio' in medium: 0 NaNs, Length: 31450
Column 'gyro_t' in medium: 1398 NaNs, Length: 31450
Column 'pressure_t' in medium: 31210 NaNs, Length: 31450
Column 'audio_rate' in medium: 28270 NaNs, Length: 31450


In [15]:
# Fill the gaps in the downsampled 'medium' frame and drop the helper columns.
medium_cleaned = clean_dataframe(df=medium_down, name="medium")

Column 'acc_t' in medium: 0 NaNs, Length: 31450
Column 'acc_x' in medium: 0 NaNs, Length: 31450
Column 'acc_y' in medium: 0 NaNs, Length: 31450
Column 'acc_z' in medium: 0 NaNs, Length: 31450
Column 'acc_abs' in medium: 0 NaNs, Length: 31450
Column 'gyro_x' in medium: 0 NaNs, Length: 31450
Column 'gyro_y' in medium: 0 NaNs, Length: 31450
Column 'gyro_z' in medium: 0 NaNs, Length: 31450
Column 'pressure' in medium: 0 NaNs, Length: 31450
Column 'audio' in medium: 0 NaNs, Length: 31450
Column 'gyro_t' in medium: 0 NaNs, Length: 31450


In [16]:
# Preview the first five rows of the cleaned 'medium' frame.
medium_cleaned.head(n=5)

Unnamed: 0,acc_t,acc_x,acc_y,acc_z,acc_abs,gyro_x,gyro_y,gyro_z,pressure,audio,gyro_t
0,0.015982,-0.020588,0.110642,0.016016,0.113675,0.016927,0.029854,-0.008035,1019.400635,3.970814e-07,0.015982
1,0.025961,0.0138,0.052366,0.040128,0.067401,0.010479,0.042972,-0.012995,1019.398499,3.649469e-05,0.025961
2,0.035941,0.037712,0.048042,0.029093,0.067651,0.006417,0.049971,-0.019349,1019.398117,0.0003263162,0.035941
3,0.04592,0.031038,0.057012,0.028047,0.070714,0.006728,0.052481,-0.02773,1019.392853,0.0007769451,0.04592
4,0.0559,0.017268,0.027473,0.001009,0.032464,0.008562,0.048546,-0.035543,1019.392853,0.001113705,0.0559


## cleaning loud

In [17]:
# Downsample the 'loud' audio and trim the frame to the downsampled length.
loud_down = process_dataframe(df=loud, name="loud")


Length of downsampled audio in loud: 31500
Length of trimmed dataframe in loud: 31500
Column 'acc_t' in loud: 1443 NaNs, Length: 31500
Column 'acc_x' in loud: 1443 NaNs, Length: 31500
Column 'acc_y' in loud: 1443 NaNs, Length: 31500
Column 'acc_z' in loud: 1443 NaNs, Length: 31500
Column 'acc_abs' in loud: 1443 NaNs, Length: 31500
Column 'gyro_x' in loud: 1443 NaNs, Length: 31500
Column 'gyro_y' in loud: 1443 NaNs, Length: 31500
Column 'gyro_z' in loud: 1443 NaNs, Length: 31500
Column 'pressure' in loud: 31260 NaNs, Length: 31500
Column 'audio' in loud: 0 NaNs, Length: 31500
Column 'gyro_t' in loud: 1443 NaNs, Length: 31500
Column 'pressure_t' in loud: 31260 NaNs, Length: 31500
Column 'audio_rate' in loud: 28320 NaNs, Length: 31500


In [18]:
# Fill the gaps in the downsampled 'loud' frame and drop the helper columns.
loud_cleaned = clean_dataframe(df=loud_down, name="loud")

Column 'acc_t' in loud: 0 NaNs, Length: 31500
Column 'acc_x' in loud: 0 NaNs, Length: 31500
Column 'acc_y' in loud: 0 NaNs, Length: 31500
Column 'acc_z' in loud: 0 NaNs, Length: 31500
Column 'acc_abs' in loud: 0 NaNs, Length: 31500
Column 'gyro_x' in loud: 0 NaNs, Length: 31500
Column 'gyro_y' in loud: 0 NaNs, Length: 31500
Column 'gyro_z' in loud: 0 NaNs, Length: 31500
Column 'pressure' in loud: 0 NaNs, Length: 31500
Column 'audio' in loud: 0 NaNs, Length: 31500
Column 'gyro_t' in loud: 0 NaNs, Length: 31500


In [19]:
# Preview the first five rows of the cleaned 'loud' frame.
loud_cleaned.head(n=5)

Unnamed: 0,acc_t,acc_x,acc_y,acc_z,acc_abs,gyro_x,gyro_y,gyro_z,pressure,audio,gyro_t
0,0.00645,-3.065662,-1.057909,-1.533705,3.587437,-0.550138,-0.206621,-0.100526,1019.268036,-0.001747,0.00645
1,0.016431,-1.986884,-1.497531,-1.611219,2.964175,-0.806182,1.131129,0.43915,1019.255142,-0.008878,0.016431
2,0.026412,1.326233,-0.968634,2.801568,3.247449,-0.842933,1.554489,0.416276,1019.244537,0.007614,0.026412
3,0.036392,1.942921,-0.735363,0.216575,2.088685,-0.885478,1.342746,0.385167,1019.240494,0.065324,0.036392
4,0.046373,1.641204,0.019911,-1.032943,1.939309,-0.581892,0.812823,0.440728,1019.22821,-0.003682,0.046373


# create final dataframe

In [20]:
def truncate_dataframe(df, length):
    """
    Return the leading `length` rows of `df`, selected by position.

    Frames shorter than `length` are returned with all of their rows.
    """
    leading_rows = slice(None, length)
    return df.iloc[leading_rows]

def add_labels_and_ids(df_list, labels, samples_per_recording=10 * 100):
    """
    Split each dataframe into fixed-length recordings, tag them, and combine them.

    Every dataframe is cut into consecutive, non-overlapping segments of
    ``samples_per_recording`` rows. Rows left over after the last full
    segment are discarded. Each segment gets two extra columns: 'label'
    (the label paired with its source dataframe) and 'recording_id' (a
    counter that starts at 0 and keeps increasing across all dataframes).

    Parameters
    ----------
    df_list : iterable of pandas.DataFrame
        Source frames, one per label.
    labels : iterable
        Labels, in the same order and of the same length as ``df_list``.
    samples_per_recording : int, default 10 * 100
        Rows per recording (the default corresponds to the original
        "10-second recordings" at 100 rows per second).

    Returns
    -------
    pandas.DataFrame
        All segments stacked with a fresh 0..n-1 index; an empty DataFrame
        when no full segment exists.

    Raises
    ------
    ValueError
        If ``df_list`` and ``labels`` differ in length, or if
        ``samples_per_recording`` is not positive.
    """
    df_list = list(df_list)
    labels = list(labels)
    if len(df_list) != len(labels):
        # zip() would silently ignore the surplus entries.
        raise ValueError("df_list and labels must have the same length")
    if samples_per_recording <= 0:
        raise ValueError("samples_per_recording must be a positive integer")

    # Collect the segments and concatenate once at the end; concatenating
    # inside the loop re-copies the accumulated frame on every iteration.
    segments = []
    recording_id = 0
    for df, label in zip(df_list, labels):
        num_records = len(df) // samples_per_recording  # full recordings only
        for i in range(num_records):
            start = i * samples_per_recording
            df_segment = df.iloc[start:start + samples_per_recording].copy()
            df_segment['label'] = label
            df_segment['recording_id'] = recording_id
            segments.append(df_segment)
            recording_id += 1

    if not segments:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()
    return pd.concat(segments, ignore_index=True)


In [21]:
# Cut every cleaned frame to the same number of rows.
# 30 * 10 * 100 rows: presumably 30 recordings of 10 s at 100 rows per second.
required_length = 30 * 10 * 100
noa_truncated, light_truncated, medium_truncated, loud_truncated = (
    truncate_dataframe(cleaned_frame, required_length)
    for cleaned_frame in (noa_cleaned, light_cleaned, medium_cleaned, loud_cleaned)
)

In [22]:
# Show the row count of each truncated frame; all four should match.
truncated_frames = (noa_truncated, light_truncated, medium_truncated, loud_truncated)
print(*map(len, truncated_frames))

30000 30000 30000 30000


In [23]:
# Stack the four truncated frames into one table, tagging every row with its
# label and a running recording ID.
labels = ['noa', 'light', 'medium', 'loud']
dataframes = [noa_truncated, light_truncated, medium_truncated, loud_truncated]

final_df = add_labels_and_ids(df_list=dataframes, labels=labels)

In [24]:
# Preview the first five rows of the combined frame.
final_df.head(n=5)

Unnamed: 0,acc_t,acc_x,acc_y,acc_z,acc_abs,gyro_x,gyro_y,gyro_z,pressure,audio,gyro_t,label,recording_id
0,0.017443,-0.103828,-0.036572,0.013185,0.110867,0.00614,-0.020941,-0.021615,1015.177155,-1.103038e-07,0.017443,noa,0
1,0.027423,0.016966,0.000734,0.003513,0.017341,0.01423,-0.05123,-0.034906,1015.178909,-8.869381e-06,0.027423,noa,0
2,0.037402,0.052361,-0.00583,0.03783,0.06486,0.011364,-0.050322,-0.027478,1015.182571,-1.116034e-05,0.037402,noa,0
3,0.047381,0.028602,0.017169,0.010993,0.035124,0.002096,-0.024613,-0.019981,1015.182571,-4.920312e-06,0.047381,noa,0
4,0.05736,-0.001354,0.054812,-0.05532,0.077888,-0.002218,0.000642,-0.015214,1015.179596,-4.233477e-05,0.05736,noa,0


In [25]:
# Quality check: per-column missing-value count and length of the combined frame.
for col_name in final_df.columns:
    series = final_df[col_name]
    print(f"Column '{col_name}' in final_df: {series.isna().sum()} NaNs, Length: {len(series)}")
    

Column 'acc_t' in final_df: 0 NaNs, Length: 120000
Column 'acc_x' in final_df: 0 NaNs, Length: 120000
Column 'acc_y' in final_df: 0 NaNs, Length: 120000
Column 'acc_z' in final_df: 0 NaNs, Length: 120000
Column 'acc_abs' in final_df: 0 NaNs, Length: 120000
Column 'gyro_x' in final_df: 0 NaNs, Length: 120000
Column 'gyro_y' in final_df: 0 NaNs, Length: 120000
Column 'gyro_z' in final_df: 0 NaNs, Length: 120000
Column 'pressure' in final_df: 0 NaNs, Length: 120000
Column 'audio' in final_df: 0 NaNs, Length: 120000
Column 'gyro_t' in final_df: 0 NaNs, Length: 120000
Column 'label' in final_df: 0 NaNs, Length: 120000
Column 'recording_id' in final_df: 0 NaNs, Length: 120000


In [26]:
# Inspect the combined frame: first rows, last rows, rows per label, and the
# number of distinct recording IDs.
structure_summaries = (
    final_df.head(),
    final_df.tail(),
    final_df['label'].value_counts(),
    final_df['recording_id'].nunique(),
)
for summary in structure_summaries:
    print(summary)

      acc_t     acc_x     acc_y     acc_z   acc_abs    gyro_x    gyro_y  \
0  0.017443 -0.103828 -0.036572  0.013185  0.110867  0.006140 -0.020941   
1  0.027423  0.016966  0.000734  0.003513  0.017341  0.014230 -0.051230   
2  0.037402  0.052361 -0.005830  0.037830  0.064860  0.011364 -0.050322   
3  0.047381  0.028602  0.017169  0.010993  0.035124  0.002096 -0.024613   
4  0.057360 -0.001354  0.054812 -0.055320  0.077888 -0.002218  0.000642   

     gyro_z     pressure         audio    gyro_t label  recording_id  
0 -0.021615  1015.177155 -1.103038e-07  0.017443   noa             0  
1 -0.034906  1015.178909 -8.869381e-06  0.027423   noa             0  
2 -0.027478  1015.182571 -1.116034e-05  0.037402   noa             0  
3 -0.019981  1015.182571 -4.920312e-06  0.047381   noa             0  
4 -0.015214  1015.179596 -4.233477e-05  0.057360   noa             0  
             acc_t      acc_x      acc_y      acc_z    acc_abs    gyro_x  \
119995  317.512722  10.230064   1.817370  -6.20

In [27]:
# Save the combined dataframe to CSV, without writing the index column.
output_csv = 'final_dataframe.csv'
final_df.to_csv(output_csv, index=False)
