## Audio Signal Extraction and Processing for Regular Classes

### Libraries

In [1]:
import os
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn.functional as F
import torch.nn as nn

import warnings
warnings.filterwarnings('ignore')

# Audio processing library for analyzing and extracting features from audio signals
import librosa

# Library for reducing noise from audio signals
import noisereduce as nr

import pickle

# Library for working with video files, including reading and editing
from moviepy.editor import VideoFileClip

from numpy import isfinite 

import math


### Required Data Loading and Processing

In [2]:
# Print the current working directory (relative paths used in this notebook resolve against it)
print(os.getcwd())

/home/chijuiwu/snap/snapd-desktop-integration/83/Desktop/CapstoneProject


In [3]:
# Path to the CSV annotation file, which includes the video name, labels, duration, etc.
# NOTE(review): the variable is named "Train" but the file name contains "test" — confirm which split is intended.
Train_excal_path = '/home/chijuiwu/snap/snapd-desktop-integration/83/Desktop/CapstoneProject/CharadesEgoInfo/CharadesEgo_v1_test_only1st.csv'

df = pd.read_csv(Train_excal_path)


In [None]:
# Inspect the contents of the CSV file
print(df)

In [None]:
# Keep only the 'id' and 'actions' columns
selected_data = df[['id', 'actions']]

print(selected_data)

# The 'actions' column contains action codes together with their start and end times in seconds.
# For instance, c115 is the action 'Someone is holding a paper/notebook'.
# In an entry such as "c115 0.00 21.70", 0.00 is the start second and 21.70 is the end second of the action.


In [None]:
# Collect every distinct action code (a 'c' followed by three digits) found in the CSV
action_pattern = r'c\d{3}'
unique_actions = set()

# Seed the label set with 'no_action' so that un-annotated (NaN) rows still have a class
unique_actions_with_no_action = {'no_action'}

# Rows whose 'actions' value is missing contribute no codes, so they are dropped up front
for row_actions in df['actions'].dropna():
    codes_in_row = re.findall(action_pattern, str(row_actions))
    unique_actions_with_no_action |= set(codes_in_row)

# Sort the collected labels so they print in a stable order
sorted_unique_actions = sorted(unique_actions_with_no_action)
print(sorted_unique_actions)

# Report how many distinct labels there are (including 'no_action')
number_of_actions = len(sorted_unique_actions)
print(f"number of actions: {number_of_actions}")


In [None]:
# Convert the 'id' column to a list of video ids.
# Each id is the name of a video file used in the audio extraction process.
video_files = selected_data['id'].tolist()
print(video_files)


In [None]:
# Map each video id to its raw 'actions' annotation value
annotations = {
    video_id: action_str
    for video_id, action_str in zip(selected_data['id'], selected_data['actions'])
}
print(annotations)


## Functions for Extracting Audio and Labeling the Clips

In [None]:
# Function to parse annotations
def parse_annotations(annotation_str, video_duration):
    """Turn a raw annotation string into a timeline of labelled segments.

    Args:
        annotation_str: Semicolon-separated entries of the form
            "<action_id> <start_sec> <end_sec>", e.g.
            "c115 0.00 21.70;c116 3.10 9.40". A NaN float, None, or a
            string without any entry means the video has no annotated action.
        video_duration: Length of the video in seconds.

    Returns:
        A list of dicts with the keys 'action', 'start' and 'end' (times
        rounded to whole seconds), ordered by start time. Every span that is
        not covered by any annotated action is filled with a 'no_action'
        segment.
    """
    whole_video = [{'action': 'no_action', 'start': 0, 'end': video_duration}]

    # Missing annotation: the whole video is a single 'no_action' segment
    if annotation_str is None:
        return whole_video
    if isinstance(annotation_str, float) and math.isnan(annotation_str):
        return whole_video

    annotated = []
    for entry in annotation_str.split(';'):
        parts = entry.split()
        if not parts:
            # Blank entry, e.g. produced by a trailing ';'
            continue
        annotated.append({
            'action': parts[0],
            'start': round(float(parts[1])),
            'end': round(float(parts[2])),
        })

    if not annotated:
        return whole_video

    # Entries are not guaranteed to be chronological, and actions may overlap
    # or be nested. Sort by start time (stable, so ties keep their original
    # order) and remember the furthest end seen so far, so that a gap is only
    # reported where no action at all is active.
    annotated.sort(key=lambda segment: segment['start'])

    actions = []
    covered_until = 0
    for segment in annotated:
        if segment['start'] > covered_until:
            actions.append({'action': 'no_action', 'start': covered_until, 'end': segment['start']})
        actions.append(segment)
        covered_until = max(covered_until, segment['end'])

    # Action-less tail at the end of the video
    if covered_until < video_duration:
        actions.append({'action': 'no_action', 'start': covered_until, 'end': video_duration})

    return actions


def _resize_spectrogram(spect, n_freq=1024, n_time=128):
    # Crop or zero-pad the first (frequency) axis to n_freq rows, then
    # bilinearly resample the second (time) axis to n_time columns.
    if spect.shape[0] > n_freq:
        spect = spect[:n_freq, :]
    elif spect.shape[0] < n_freq:
        spect = np.pad(spect, ((0, n_freq - spect.shape[0]), (0, 0)))

    # F.interpolate works on [batch, channel, height, width] tensors
    spect_tensor = torch.tensor(spect, dtype=torch.float32).unsqueeze(0).unsqueeze(0)
    spect_tensor = F.interpolate(spect_tensor, size=(n_freq, n_time), mode='bilinear', align_corners=False)
    return spect_tensor.squeeze(0).squeeze(0).numpy()


def _actions_at_second(annotations, second):
    # Labels of every segment active at `second`; ['no_action'] when none is.
    labels = [seg['action'] for seg in annotations if seg['start'] <= second < seg['end']]
    return labels or ['no_action']


def process_video(file_name, annotations, file_path):
    """Cut a video into one-second audio clips and extract labelled features.

    For every whole second of the video, the audio is written to a temporary
    WAV file, reloaded at 16 kHz, noise-reduced, and converted to an MFCC
    matrix and a dB-scaled spectrogram resized to 1024 x 128. Clips whose
    noise reduction fails or yields non-finite values are skipped.

    Args:
        file_name: Video id; the file "<file_path>/<file_name>.mp4" is opened.
        annotations: List of segment dicts with the keys 'action', 'start'
            and 'end' (in seconds).
        file_path: Directory that contains the video file.

    Returns:
        A tuple (mfcc_data, spectrogram_data). Each is a list of dicts with
        the keys 'file_id', 'sample_rate', 'start_sec', 'end_sec', 'action'
        and either 'mfcc' or 'spectrogram'. A second in which several actions
        are active yields one entry per action. Both lists are empty when the
        video has no audio track.
    """
    # Initial containers for the MFCC and spectrogram records
    mfcc_data = []
    spectrogram_data = []
    sample_rate = 16000

    video = VideoFileClip(f"{file_path}/{file_name}.mp4")
    try:
        if video.audio is None:
            print(f"No audio track found in {file_name}, skipping this video.")
            return mfcc_data, spectrogram_data

        duration = int(video.duration)

        for i in range(duration):
            audio_clip_filename = f"audio_clip_{i}.wav"
            try:
                audio_clip = video.subclip(i, i + 1).audio
                audio_clip.write_audiofile(audio_clip_filename, codec='pcm_s16le')
                audio_clip_input, sr = librosa.load(audio_clip_filename, sr=sample_rate)
            finally:
                # Remove the temporary audio file even when writing/loading fails
                if os.path.exists(audio_clip_filename):
                    os.remove(audio_clip_filename)

            # Replace NaN and Inf samples with finite values before further processing
            if not np.isfinite(audio_clip_input).all():
                non_finite_indices = np.where(~np.isfinite(audio_clip_input))
                print(f"Non-finite values found in audio clip {i} of {file_name} at indices {non_finite_indices}, handling them.")
                audio_clip_input = np.nan_to_num(audio_clip_input)

            # Noise reduction can fail (e.g. on clips shorter than one second);
            # such clips are skipped instead of aborting the whole video.
            try:
                audio_clip_input_reduced = nr.reduce_noise(y=audio_clip_input, sr=sr)
            except Exception as e:
                print(f"Error in noise reduction for clip {i} of {file_name}: {e}, skipping this clip.")
                continue

            # Check again for non-finite values after noise reduction
            if not np.isfinite(audio_clip_input_reduced).all():
                print(f"Non-finite values found after noise reduction in audio clip {i} of {file_name}, skipping this clip.")
                continue

            # Generate the MFCC and the dB-scaled spectrogram with librosa
            mfcc = librosa.feature.mfcc(y=audio_clip_input_reduced, sr=sample_rate)
            spect = librosa.amplitude_to_db(np.abs(librosa.stft(audio_clip_input_reduced)), ref=np.max)

            # Only the spectrogram is resized to the model input size; the MFCC
            # is stored as produced and would need a similar step before use.
            spect = _resize_spectrogram(spect)

            # Create a separate entry for each action active in this second
            for action in _actions_at_second(annotations, i):
                mfcc_data.append({
                    'file_id': file_name,
                    'sample_rate': sample_rate,
                    'start_sec': i,
                    'end_sec': i + 1,
                    'mfcc': mfcc,
                    'action': action
                })

                spectrogram_data.append({
                    'file_id': file_name,
                    'sample_rate': sample_rate,
                    'start_sec': i,
                    'end_sec': i + 1,
                    'spectrogram': spect,
                    'action': action
                })
    finally:
        # Release the readers held by the clip
        video.close()

    return mfcc_data, spectrogram_data

### Processing

In [None]:
# Directory that contains the video files
video_file_path = "/media/DiskDrive1/CharadesEgo"


# Main processing loop: accumulate the per-second records of every video
all_mfcc_data = []
all_spectrogram_data = []

for file in video_files:
    # The clip is opened here only to read its duration; close it right away
    # so that one open reader per video is not kept for the whole run.
    video = VideoFileClip(f"{video_file_path}/{file}.mp4")
    try:
        video_duration = int(video.duration)
    finally:
        video.close()

    parsed_anno = parse_annotations(annotations[file], video_duration)

    mfcc_data, spectrogram_data = process_video(file, parsed_anno, video_file_path)
    all_mfcc_data.extend(mfcc_data)
    all_spectrogram_data.extend(spectrogram_data)

# Convert to DataFrame and then to a list of record dictionaries
mfcc_dataset = pd.DataFrame(all_mfcc_data).to_dict('records')
spectrogram_dataset = pd.DataFrame(all_spectrogram_data).to_dict('records')

# Save the datasets
with open('mfcc_dataset.pkl', 'wb') as mfcc_file:
    pickle.dump(mfcc_dataset, mfcc_file)

with open('spectrogram_dataset.pkl', 'wb') as spectrogram_file:
    pickle.dump(spectrogram_dataset, spectrogram_file)

# Print dataset information
print(f"MFCC Dataset num_rows: {len(mfcc_dataset)}")
print(f"Spectrogram Dataset num_rows: {len(spectrogram_dataset)}")

In [None]:
# Check the data format by printing the first five records
print(spectrogram_dataset[:5])


### Normalize

In [None]:
# Normalization statistics applied to both feature types.
# NOTE(review): presumably a dataset-level mean/std of the spectrogram values;
# the same numbers are applied to the MFCCs as well — TODO confirm this is intended.
norm_mean = -4.2677393
norm_std = 4.5689974

# Normalize in place: (x - mean) / (2 * std). Because the records are mutated,
# running this cell a second time would normalize already-normalized data.
for feature_key, dataset in (('mfcc', mfcc_dataset), ('spectrogram', spectrogram_dataset)):
    for record in dataset:
        record[feature_key] = (record[feature_key] - norm_mean) / (norm_std * 2)

# Save the normalized datasets (any existing file of the same name is overwritten)
for pkl_name, dataset in (('mfcc_dataset.pkl', mfcc_dataset), ('spectrogram_dataset.pkl', spectrogram_dataset)):
    with open(pkl_name, 'wb') as pkl_out:
        pickle.dump(dataset, pkl_out)

In [6]:
import pickle

# Reload the spectrogram dataset from disk.
# NOTE: pickle.load can execute arbitrary code — only load pickle files you created yourself.
with open('spectrogram_dataset.pkl', 'rb') as file:
    # Load the object from the pickle file
    spectrogram_dataset = pickle.load(file)

In [7]:
import gzip

def save_zipped_pickle_gz(obj, filename, protocol=-1):
    """Pickle *obj* and write it gzip-compressed to *filename*.

    Args:
        obj: The object to serialize.
        filename: Destination path of the gzip file.
        protocol: Pickle protocol passed to pickle.dump; the default -1
            selects the highest protocol available.
    """
    gz_stream = gzip.open(filename, mode='wb')
    # The context manager closes the gzip stream even if pickling fails
    with gz_stream:
        pickle.dump(obj, gz_stream, protocol=protocol)

# Write a gzip-compressed pickle of the spectrogram dataset
save_zipped_pickle_gz(spectrogram_dataset, 'spectrogram_dataset.pkl.gz')