## Audio Signal Extraction and Processing to String Classes

### Libraries

In [1]:
import os
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn.functional as F
import torch.nn as nn

import warnings
warnings.filterwarnings('ignore')

# Audio processing library for analyzing and extracting features from audio signals
import librosa

# Library for reducing noise from audio signals
import noisereduce as nr

import pickle

# Library for working with video files, including reading and editing
from moviepy.editor import VideoFileClip

from numpy import isfinite 

import math


### Required Data Loading and Processing

In [2]:
# Print the kernel's current working directory (relative file names used later in the notebook resolve against it)
print(os.getcwd())

/home/chijuiwu/snap/snapd-desktop-integration/83/Desktop/CapstoneProject


In [3]:
# Path to the CSV that holds the per-video metadata (video name, labels, duration, etc.).
# NOTE(review): the variable name says "Train", but the file name ends in "test_only1st" - confirm which split is intended.
Train_excal_path = '/home/chijuiwu/snap/snapd-desktop-integration/83/Desktop/CapstoneProject/CharadesEgoInfo/CharadesEgo_v1_test_only1st.csv'

df = pd.read_csv(Train_excal_path)


In [4]:
# Preview the loaded metadata table
print(df)

           id subject         scene  quality  relevance verified  \
0    W1XD0EGO    I2IV       Bedroom      7.0        7.0      Yes   
1    NU9BQEGO    V258        Pantry      5.0        7.0      Yes   
2    EY2W7EGO    VQ30       Bedroom      4.0        6.0      Yes   
3    X28BIEGO    VT5W   Dining room      6.0        6.0      Yes   
4    DVSVQEGO    7CY9       Kitchen      7.0        7.0      Yes   
..        ...     ...           ...      ...        ...      ...   
841  39JUOEGO    A66R        Pantry      7.0        7.0      Yes   
842  HKKESEGO    H8N1         Other      5.0        5.0      Yes   
843  F829EEGO    15B9        Pantry      5.0        5.0      Yes   
844  ZLYISEGO    Z241   Living room      6.0        6.0      Yes   
845  7HU7REGO    Z241  Laundry room      6.0        7.0      Yes   

                                                script  \
0    Person is holding picture, then puts picture o...   
1    A person is tidying up the groceries in a pant...   
2    Pers

In [5]:
# Keep just the video identifier and its raw action-annotation string
selected_data = df.loc[:, ['id', 'actions']]

# Show the reduced two-column table
print(selected_data)

           id                                            actions
0    W1XD0EGO  c115 0.00 21.70;c116 13.10 22.50;c123 0.00 22....
1    NU9BQEGO  c098 0.60 23.88;c113 1.60 21.20;c112 16.80 21....
2    EY2W7EGO  c110 24.20 30.75;c059 12.00 30.75;c109 29.70 3...
3    X28BIEGO  c051 0.00 31.25;c046 27.90 31.25;c053 0.00 31....
4    DVSVQEGO  c152 15.00 22.70;c018 20.30 28.10;c016 24.10 3...
..        ...                                                ...
841  39JUOEGO  c149 0.90 6.90;c061 14.70 24.75;c099 2.20 7.90...
842  HKKESEGO  c056 3.80 9.75;c053 4.10 9.75;c082 5.10 9.75;c...
843  F829EEGO  c063 5.10 13.60;c008 4.60 10.70;c062 14.80 22....
844  ZLYISEGO  c097 30.90 31.17;c065 2.70 13.90;c156 2.80 13....
845  7HU7REGO  c102 21.90 29.75;c100 17.60 23.40;c154 27.40 2...

[846 rows x 2 columns]


In [6]:
# Action codes look like "c" followed by exactly three digits (e.g. "c115")
action_pattern = r'c\d{3}'

# Not filled in this cell; kept because it is a notebook-level name
unique_actions = set()

# Class vocabulary: start with the 'no_action' placeholder used for rows without annotations
unique_actions_with_no_action = {'no_action'}

# Rows whose annotation is NaN contribute nothing beyond 'no_action', so drop them up front
for annotation in df['actions'].dropna():
    unique_actions_with_no_action |= set(re.findall(action_pattern, str(annotation)))

print(unique_actions_with_no_action)

{'c117', 'c022', 'c149', 'c002', 'c143', 'c039', 'c127', 'c009', 'c056', 'c045', 'c047', 'c090', 'c050', 'c088', 'c125', 'c130', 'c032', 'c079', 'c028', 'c055', 'c000', 'c145', 'c021', 'c057', 'c067', 'c144', 'c071', 'c007', 'c156', 'c118', 'c122', 'c104', 'c065', 'c141', 'c015', 'c054', 'c097', 'c095', 'c078', 'c151', 'c083', 'c063', 'c116', 'c073', 'c034', 'c111', 'c112', 'c029', 'c148', 'c107', 'c030', 'c006', 'c037', 'c153', 'c060', 'c025', 'c142', 'c110', 'c058', 'c066', 'c109', 'c005', 'c042', 'c100', 'c081', 'c113', 'c135', 'c114', 'c126', 'c020', 'c001', 'c018', 'c076', 'c140', 'c138', 'c137', 'c027', 'c004', 'c121', 'c105', 'c108', 'c064', 'c011', 'c115', 'c129', 'c070', 'c150', 'c085', 'c084', 'c080', 'c023', 'c012', 'c154', 'c016', 'c044', 'c136', 'c123', 'c128', 'c053', 'no_action', 'c059', 'c046', 'c062', 'c061', 'c146', 'c048', 'c155', 'c120', 'c052', 'c077', 'c014', 'c103', 'c134', 'c139', 'c087', 'c101', 'c072', 'c019', 'c074', 'c119', 'c040', 'c068', 'c147', 'c098', 'c

In [7]:
# Order the class vocabulary (action codes plus 'no_action') so every class gets a stable position
sorted_unique_actions = list(unique_actions_with_no_action)
sorted_unique_actions.sort()
print(sorted_unique_actions)


['c000', 'c001', 'c002', 'c003', 'c004', 'c005', 'c006', 'c007', 'c008', 'c009', 'c010', 'c011', 'c012', 'c013', 'c014', 'c015', 'c016', 'c017', 'c018', 'c019', 'c020', 'c021', 'c022', 'c023', 'c024', 'c025', 'c026', 'c027', 'c028', 'c029', 'c030', 'c031', 'c032', 'c033', 'c034', 'c035', 'c036', 'c037', 'c038', 'c039', 'c040', 'c041', 'c042', 'c043', 'c044', 'c045', 'c046', 'c047', 'c048', 'c049', 'c050', 'c051', 'c052', 'c053', 'c054', 'c055', 'c056', 'c057', 'c058', 'c059', 'c060', 'c061', 'c062', 'c063', 'c064', 'c065', 'c066', 'c067', 'c068', 'c069', 'c070', 'c071', 'c072', 'c073', 'c074', 'c075', 'c076', 'c077', 'c078', 'c079', 'c080', 'c081', 'c082', 'c083', 'c084', 'c085', 'c086', 'c087', 'c088', 'c089', 'c090', 'c091', 'c092', 'c093', 'c094', 'c095', 'c096', 'c097', 'c098', 'c099', 'c100', 'c101', 'c102', 'c103', 'c104', 'c105', 'c106', 'c107', 'c108', 'c109', 'c110', 'c111', 'c112', 'c113', 'c114', 'c115', 'c116', 'c117', 'c118', 'c119', 'c120', 'c121', 'c122', 'c123', 'c124',

In [8]:
# Number of classes in the sorted vocabulary (this count includes the 'no_action' placeholder)
number_of_actions = len(sorted_unique_actions)
print(number_of_actions)

158


In [9]:
# Convert the 'id' column to a plain list; each id is later used as the video's file name (without extension)
video_files = selected_data['id'].tolist()
print(video_files)


['W1XD0EGO', 'NU9BQEGO', 'EY2W7EGO', 'X28BIEGO', 'DVSVQEGO', '2UAB5EGO', 'FIDBIEGO', 'P9SOAEGO', 'KOJLPEGO', 'ZDY4KEGO', 'KLI31EGO', 'KHEM0EGO', 'OWPFMEGO', 'PK0LLEGO', '3OHCPEGO', '2MK76EGO', 'GQT75EGO', 'B78XTEGO', '3L274EGO', 'ZBOC2EGO', 'BNCZFEGO', '0FEXMEGO', 'I7BSOEGO', 'CV173EGO', 'RHJ7SEGO', '4VCG4EGO', 'O9PE4EGO', 'GYCTQEGO', 'D6JJ2EGO', 'OZH09EGO', 'W28V1EGO', 'XYHFAEGO', 'GAAOVEGO', '3JKBFEGO', 'RCOHJEGO', 'CANHSEGO', 'V2I7EEGO', 'L5UQJEGO', 'BUITJEGO', 'AOF10EGO', '6XAC5EGO', 'UNB3NEGO', 'J3MN6EGO', 'WVH1DEGO', '2PIAOEGO', '64U9XEGO', 'YCQM7EGO', 'UMY7CEGO', 'R9N7BEGO', 'PHEHKEGO', 'XBHJOEGO', '1GL7YEGO', 'O4A9IEGO', 'GP0I5EGO', '3NJLIEGO', 'LXK5XEGO', '6GETYEGO', 'REMUKEGO', '8JPE1EGO', 'KTHI4EGO', '6D5DHEGO', 'AUQD5EGO', '555W8EGO', '15AKPEGO', 'LS926EGO', '0N16WEGO', 'CBFVTEGO', 'TQINTEGO', 'TXW0EEGO', 'HSLZ2EGO', 'KLIW5EGO', 'SFHKPEGO', '5LWZKEGO', 'QO363EGO', '0HJFXEGO', 'DI2PBEGO', 'LXD11EGO', 'DXASPEGO', 'KRDB3EGO', '14J04EGO', '4QPMTEGO', 'HIX6DEGO', 'F4OV6EGO', 'WJ

In [10]:
# Map each video id to its raw annotation string (NaN when the row has no annotation)
annotations = {
    video_id: action_str
    for video_id, action_str in zip(selected_data['id'], selected_data['actions'])
}
print(annotations)


{'W1XD0EGO': 'c115 0.00 21.70;c116 13.10 22.50;c123 0.00 22.50;c139 21.40 22.50;c077 11.60 22.50;c085 17.10 22.50;c084 0.00 20.50;c086 14.90 21.70;c152 20.50 22.50;c077 20.30 22.50;c085 23.50 22.50', 'NU9BQEGO': 'c098 0.60 23.88;c113 1.60 21.20;c112 16.80 21.80;c099 22.20 23.88;c006 16.20 22.30;c008 1.10 6.50;c141 0.90 6.70;c102 0.20 4.40;c100 0.40 4.60', 'EY2W7EGO': 'c110 24.20 30.75;c059 12.00 30.75;c109 29.70 30.75;c008 0.00 30.75;c152 31.20 30.75;c097 10.30 30.75;c141 8.70 30.75;c118 16.80 23.70;c108 12.80 25.00;c008 0.00 8.20;c152 27.50 30.75;c097 0.00 11.70;c141 0.00 11.20;c151 6.80 16.30;c110 16.20 24.00;c009 29.30 30.75;c059 10.20 19.00;c109 22.00 27.50;c107 16.70 30.75;c011 12.80 30.75;c104 4.80 11.20;c118 12.50 30.75;c106 24.70 30.75', 'X28BIEGO': 'c051 0.00 31.25;c046 27.90 31.25;c053 0.00 31.25;c048 0.00 31.25;c052 0.00 31.25;c151 0.50 14.50;c054 0.20 8.70;c059 1.40 13.90;c011 0.00 13.60;c154 0.00 11.00', 'DVSVQEGO': 'c152 15.00 22.70;c018 20.30 28.10;c016 24.10 30.71;c152 

### Functions for Extracting Audio and Labeling the Clips

In [11]:
# Function to parse annotations
def parse_annotations(annotation_str, video_duration):
    """Parse a raw annotation string into labelled time segments.

    The annotation format is ``"<code> <start> <end>;<code> <start> <end>;..."``.
    Start and end times are rounded to whole seconds with the built-in
    ``round`` (which rounds halves to the nearest even integer).

    Entries may overlap and are not required to be sorted by start time, so
    the ``'no_action'`` segments are computed as the complement of the union
    of all action intervals. A second is therefore never labelled
    ``'no_action'`` while a real action covers it.

    Args:
        annotation_str: The raw annotation string, or NaN/None when the video
            has no annotation.
        video_duration: Length of the video in seconds.

    Returns:
        A list of ``{'action', 'start', 'end'}`` dicts: every parsed action in
        its original order, followed by the ``'no_action'`` gap segments in
        chronological order. A missing or blank annotation yields a single
        ``'no_action'`` segment spanning the whole video.

    Raises:
        ValueError: If a non-blank entry does not have the three expected
            fields or its times are not numeric.
    """
    # Missing annotation: the whole video is one 'no_action' segment
    if annotation_str is None or (isinstance(annotation_str, float) and math.isnan(annotation_str)):
        return [{'action': 'no_action', 'start': 0, 'end': video_duration}]

    actions = []
    for entry in str(annotation_str).split(';'):
        parts = entry.split()
        if not parts:
            # Tolerate blank entries such as an empty string or a trailing ';'
            continue
        if len(parts) < 3:
            raise ValueError(f"Malformed annotation entry: {entry!r}")
        actions.append({
            'action': parts[0],
            'start': round(float(parts[1])),
            'end': round(float(parts[2])),
        })

    # Only intervals with positive length cover any time; entries whose
    # rounded end is not after their start are kept in the output but do
    # not count as coverage.
    covered = sorted((a['start'], a['end']) for a in actions if a['end'] > a['start'])

    gaps = []
    cursor = 0  # everything before `cursor` is already accounted for
    for start, end in covered:
        gap_end = min(start, video_duration)
        if gap_end > cursor:
            gaps.append({'action': 'no_action', 'start': cursor, 'end': gap_end})
        # max() keeps the cursor from moving backwards on overlapping entries
        cursor = max(cursor, end)

    # Action-less tail at the end of the video
    if cursor < video_duration:
        gaps.append({'action': 'no_action', 'start': cursor, 'end': video_duration})

    return actions + gaps

def _resize_spectrogram(spect, n_freq=1024, n_time=128):
    """Crop/zero-pad the first axis to ``n_freq`` rows, then resample to ``(n_freq, n_time)``."""
    if spect.shape[0] > n_freq:
        spect = spect[:n_freq, :]
    elif spect.shape[0] < n_freq:
        # NOTE(review): the input is dB-scaled with ref=np.max, so padding with 0
        # fills the new rows with the maximum level - confirm this is intended.
        spect = np.pad(spect, ((0, n_freq - spect.shape[0]), (0, 0)))

    # F.interpolate expects [batch, channel, H, W], so add two leading axes
    spect_tensor = torch.tensor(spect, dtype=torch.float32).unsqueeze(0).unsqueeze(0)
    spect_tensor = F.interpolate(spect_tensor, size=(n_freq, n_time), mode='bilinear', align_corners=False)
    return spect_tensor.squeeze(0).squeeze(0).numpy()


def process_video(file_name, annotations, file_path):
    """Cut a video's audio into one-second clips and extract labelled features.

    For every whole second of ``{file_path}/{file_name}.mp4`` the audio is
    written to a temporary wav file, loaded at 16 kHz, noise-reduced, and
    turned into an MFCC matrix and a dB-scaled STFT spectrogram resized to
    1024 x 128. Each second also gets a multi-hot label vector over the
    sorted class vocabulary (read from the notebook-level
    ``unique_actions_with_no_action``).

    Args:
        file_name: Video id, i.e. the file name without the ``.mp4`` extension.
        annotations: Parsed segments, a list of dicts with ``'action'``,
            ``'start'`` and ``'end'`` keys (as produced by ``parse_annotations``).
        file_path: Directory containing the video file.

    Returns:
        ``(mfcc_data, spectrogram_data)``: two lists with one record per
        successfully processed second. Seconds whose audio contains
        non-finite values or fails noise reduction are skipped; a video
        without an audio track yields two empty lists.
    """
    sample_rate = 16000

    # Class vocabulary -> column index of the multi-hot label vector
    sorted_unique_actions = sorted(unique_actions_with_no_action)
    action_index_map = {action: idx for idx, action in enumerate(sorted_unique_actions)}

    mfcc_data = []
    spectrogram_data = []

    video = VideoFileClip(f"{file_path}/{file_name}.mp4")
    try:
        if video.audio is None:
            print(f"No audio track found in {file_name}, skipping this video.")
            return mfcc_data, spectrogram_data

        duration = int(video.duration)

        for i in range(duration):
            audio_clip_filename = f"audio_clip_{i}.wav"
            try:
                audio_clip = video.subclip(i, i + 1).audio
                audio_clip.write_audiofile(audio_clip_filename, codec='pcm_s16le')
                audio_clip_input, _ = librosa.load(audio_clip_filename, sr=sample_rate)
            finally:
                # Remove the temporary wav even when writing or loading failed
                if os.path.exists(audio_clip_filename):
                    os.remove(audio_clip_filename)

            # Check for non-finite values
            if not np.isfinite(audio_clip_input).all():
                print(f"Non-finite values found in audio clip {i} of {file_name}, skipping this clip.")
                continue

            # Noise reduction is done once; the guarded result is reused for feature extraction
            try:
                audio_clip_input_reduced = nr.reduce_noise(y=audio_clip_input, sr=sample_rate)
            except Exception as e:
                print(f"Error in noise reduction for clip {i} of {file_name}: {e}, skipping this clip.")
                continue

            # Check again for non-finite values after noise reduction
            if not np.isfinite(audio_clip_input_reduced).all():
                print(f"Non-finite values found after noise reduction in audio clip {i} of {file_name}, skipping this clip.")
                continue

            # Multi-hot labels: set 1 for every segment that covers second i (start inclusive, end exclusive)
            actions_this_second = [0] * len(sorted_unique_actions)
            for action in annotations:
                if action['start'] <= i < action['end']:
                    actions_this_second[action_index_map[action['action']]] = 1

            # Feature extraction
            mfcc = librosa.feature.mfcc(y=audio_clip_input_reduced, sr=sample_rate)
            spect = librosa.amplitude_to_db(np.abs(librosa.stft(audio_clip_input_reduced)), ref=np.max)
            spect = _resize_spectrogram(spect, n_freq=1024, n_time=128)

            # Append to data lists
            mfcc_data.append({
                'file_id': file_name,
                'sample_rate': sample_rate,
                'start_sec': i,
                'end_sec': i + 1,
                'mfcc': mfcc,
                'action': actions_this_second
            })

            spectrogram_data.append({
                'file_id': file_name,
                'sample_rate': sample_rate,
                'start_sec': i,
                'end_sec': i + 1,
                'spectrogram': spect,
                'action': actions_this_second
            })
    finally:
        # Close the clip so its readers are not left open for every processed video
        video.close()

    return mfcc_data, spectrogram_data


### Processing

In [12]:
# Directory that contains the <id>.mp4 video files
video_file_path = "/media/DiskDrive1/CharadesEgo"


# Main processing loop: collect the per-second records of every video
all_mfcc_data = []
all_spectrogram_data = []

for file in video_files:
    # The clip is opened here only to read the duration; close it right away
    # so one open reader per video does not pile up across the whole list.
    video = VideoFileClip(f"{video_file_path}/{file}.mp4")
    try:
        video_duration = int(video.duration)
    finally:
        video.close()

    parsed_anno = parse_annotations(annotations[file], video_duration)

    mfcc_data, spectrogram_data = process_video(file, parsed_anno, video_file_path)
    all_mfcc_data.extend(mfcc_data)
    all_spectrogram_data.extend(spectrogram_data)

# Convert to DataFrame and then to dictionaries
mfcc_dataset = pd.DataFrame(all_mfcc_data).to_dict('records')
spectrogram_dataset = pd.DataFrame(all_spectrogram_data).to_dict('records')

# Save the datasets
with open('mfcc_dataset_list.pkl', 'wb') as mfcc_file_list:
    pickle.dump(mfcc_dataset, mfcc_file_list)

with open('spectrogram_dataset_list.pkl', 'wb') as spectrogram_file_list:
    pickle.dump(spectrogram_dataset, spectrogram_file_list)

# Print dataset information
print(f"MFCC Dataset num_rows: {len(mfcc_dataset)}")
print(f"Spectrogram Dataset num_rows: {len(spectrogram_dataset)}")

MoviePy - Writing audio in audio_clip_0.wav


                                                                                

MoviePy - Done.




MoviePy - Writing audio in audio_clip_1.wav


                                                                                

MoviePy - Done.




MoviePy - Writing audio in audio_clip_2.wav


                                                                                

MoviePy - Done.




MoviePy - Writing audio in audio_clip_3.wav


                                                                                

MoviePy - Done.




MoviePy - Writing audio in audio_clip_4.wav


                                                                                

MoviePy - Done.




MoviePy - Writing audio in audio_clip_5.wav


                                                                                

MoviePy - Done.




MoviePy - Writing audio in audio_clip_6.wav


                                                                                

MoviePy - Done.




MoviePy - Writing audio in audio_clip_7.wav


                                                                                

MoviePy - Done.




MoviePy - Writing audio in audio_clip_8.wav


                                                                                

MoviePy - Done.




MoviePy - Writing audio in audio_clip_9.wav


                                                                                

MoviePy - Done.




MoviePy - Writing audio in audio_clip_10.wav


                                                                                

MoviePy - Done.




MoviePy - Writing audio in audio_clip_11.wav


                                                                                

MoviePy - Done.




MoviePy - Writing audio in audio_clip_12.wav


                                                                                

MoviePy - Done.




MoviePy - Writing audio in audio_clip_13.wav


                                                                                

MoviePy - Done.




MoviePy - Writing audio in audio_clip_14.wav


                                                                                

MoviePy - Done.




KeyboardInterrupt: 

In [None]:
# Preview the first five spectrogram records
print(spectrogram_dataset[:5])


### Normalize

In [None]:
# Normalisation constants applied to both feature types: (x - mean) / (2 * std).
# NOTE(review): the same constants are used for MFCCs and spectrograms - confirm they fit both.
NORM_MEAN = -4.2677393
NORM_STD = 4.5689974

# Normalise every record in place.
# NOTE(review): re-running this cell normalises the already-normalised values again.
for dataset, feature_key in ((mfcc_dataset, 'mfcc'), (spectrogram_dataset, 'spectrogram')):
    for record in dataset:
        record[feature_key] = (record[feature_key] - NORM_MEAN) / (NORM_STD * 2)

# Write the normalised datasets back to the same pickle files
for dataset, pickle_path in (
    (mfcc_dataset, 'mfcc_dataset_list.pkl'),
    (spectrogram_dataset, 'spectrogram_dataset_list.pkl'),
):
    with open(pickle_path, 'wb') as pickle_out:
        pickle.dump(dataset, pickle_out)

In [8]:
import pickle

# Reload the spectrogram records from the pickle file written earlier in the notebook.
# NOTE: pickle.load can execute arbitrary code - only load files you created yourself.
with open('spectrogram_dataset_list.pkl', 'rb') as file:
    spectrogram_dataset = pickle.load(file)

In [9]:
import gzip

# Compress data
def save_zipped_pickle_gz(obj, filename, protocol=-1):
    """Pickle ``obj`` and write it gzip-compressed to ``filename``.

    Args:
        obj: Any picklable object.
        filename: Destination passed to ``gzip.open``.
        protocol: Pickle protocol number; the default ``-1`` selects the
            highest protocol available.
    """
    with gzip.open(filename, mode='wb') as gz_stream:
        pickler = pickle.Pickler(gz_stream, protocol)
        pickler.dump(obj)

# Write a gzip-compressed copy of the spectrogram dataset next to the uncompressed pickle
save_zipped_pickle_gz(spectrogram_dataset, 'spectrogram_dataset_list.pkl.gz')