In [1]:
import pyaudio
import numpy as np
import cv2
import mediapipe as mp
import torch
import datetime
import os
from multiscale_mine import Multiscale_MINE
from multiscale_mine import Multiscale_MINE_test
from multiscale_mine import GRUCell
from multiscale_mine import RNNCell
import random
import torch.optim as optim
import torch.nn as nn 
from torch.nn import init
import torch.nn.functional as F
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from torch.utils.tensorboard import SummaryWriter
# Francois Delalande, Glenn Gould study, read

By estimating mutual information between audio and movement across musically relevant features, 
we get an unsupervised learning criterion to train mappings between movement and sound. 
Below is my (very non-standard) implementation of the MINE method (Bengio et al. https://arxiv.org/pdf/1801.04062.pdf) for estimating mutual information between
random variables using neural networks across multiple time- and frequency- resolutions. Also a few recording and data processing utilities to apply it directly on audio and mocap data recorded from the computers input devices. This is intended as a POC to explore in our coming workshopswith music and dance.

In [2]:
# instantiate the neural network (my implementation code in multiscale_mine.py file, which wraps the spiralnet
# mocap encoder and various signal processing on the audio signal) used to estimate mutual information between music audio 
# and movements, creating a joint embedding of movement and audio to compute mutual information scores across a binning of 
# the time-frequency plane given by selected (nr_time_scales, nr_frequency_bands).

mine_network = Multiscale_MINE(GRUCell,nr_time_scales = 9, 
                               nr_frequency_bands = 13, 
                               embedding_dim = 50, #gives dim 100 for the concattenated encoding of audio and mocap input
                               input_dim = 1470, # selected input size for audio sample rate 44100 since blazenet runs at aproximately 30fps
                               nr_layers_per_timescale = 4, # nr of layers for the RNNs encoding the respective timescales
                               nr_spiralnet_layers = 16, 
                               delay_size = 3) 

PolyMeshT::add_face: complex edge
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: patch re-linking failed
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: patch re-linking failed
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: patch re-linking failed
PolyMeshT::add_

current version instantiated


PolyMeshT::add_face: complex edge
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: patch re-linking failed
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: patch re-linking failed
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: complex edge
PolyMeshT::add_face: patch re-linking failed
PolyMeshT::add_

In [3]:
sum([p.numel() for p in mine_network.parameters() if p.requires_grad]) # count number of model parameters with current model settings

32285677

In [4]:
# data processing utilities to obtain joint and marginal samples and construct the training data set for the MINE estimation

def get_marginal_samples(current_session_dir, audio_data_tensors, audioposes_data_dir='audioposes_data'):
    all_sessions = [d for d in os.listdir(audioposes_data_dir) if os.path.isdir(os.path.join(audioposes_data_dir, d))]
    other_sessions = [s for s in all_sessions if s != current_session_dir]

    marginal_poses = None
    while marginal_poses is None or len(marginal_poses) < len(audio_data_tensors):
        selected_session = random.choice(other_sessions)
        poses_path = os.path.join(audioposes_data_dir, selected_session, 'poses_tensors.pt')

        if os.path.exists(poses_path):
            new_poses = torch.load(poses_path)
            if marginal_poses is None:
                marginal_poses = new_poses
            else:
                marginal_poses = torch.cat((marginal_poses, new_poses), dim=0)

    return marginal_poses

def generate_joint_and_marginal_samples(audio_data_tensors, poses_tensors, pose_selected_values, poses_marginal_samples, batch_size=32):
    joint_sample_pairs, marginal_sample_pairs = {}, {}
    
    for i in range(len(pose_selected_values)):
        if 0 < pose_selected_values[i] <= len(poses_tensors):
            pose_ind = pose_selected_values[i] - 1    # converting to 0-indexed
            joint_sample_pairs[i] = [audio_data_tensors[i], poses_tensors[pose_ind]]
    
    joint_keys = list(joint_sample_pairs.keys())
    for i in range(len(joint_keys)):
        idx = joint_keys[i] 
        marginal_sample_pairs[idx] = [joint_sample_pairs[idx][0], poses_marginal_samples[i]]

    joint_batches, marginal_batches = [], []
    joint_batch, marginal_batch = [], []

    for idx in joint_keys:
        joint_batch.append(joint_sample_pairs[idx])
        marginal_batch.append(marginal_sample_pairs[idx])

        if len(joint_batch) == batch_size:
            joint_tensor = [torch.stack(samples) for samples in zip(*joint_batch)]
            marginal_tensor = [torch.stack(samples) for samples in zip(*marginal_batch)]
            joint_batches.append(joint_tensor)
            marginal_batches.append(marginal_tensor)
            joint_batch, marginal_batch = [], []

    if joint_batch:
        joint_tensor = [torch.stack(samples) for samples in zip(*joint_batch)]
        marginal_tensor = [torch.stack(samples) for samples in zip(*marginal_batch)]
        joint_batches.append(joint_tensor)
        marginal_batches.append(marginal_tensor)

    return joint_batches[:-1], marginal_batches[:-1] # remove the last batch since it has an irregular number of data messing with model architecture

In [6]:
# This cell records realtime audio and mocap from the computer mic and camera and upon stopping the cell compiles the training dataset.
 
#Audio Configuration
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
CHUNK = 512
DEVICE_INDEX = 1   # adjust this integer according to your computers available audio input devices

# Initialize PyAudio
p = pyaudio.PyAudio()

# Define the dictionary
pose_counter = {0: 0}

# List to save the audio buffers and dictionary values
saved_audio_data = []

# List to save the poses
saved_poses = []

# Initialize MediaPipe BlazePose
mp_pose = mp.solutions.pose
pose = mp_pose.Pose()

# Define the callback function for audio
def audio_callback(in_data, frame_count, time_info, status):
    audio_data = np.frombuffer(in_data, dtype=np.int16)#float32)
    saved_audio_data.append([audio_data.tolist(), pose_counter[0]])
    return (None, pyaudio.paContinue)

# Open the audio stream
stream = p.open(format=FORMAT,
                channels=CHANNELS,
                rate=RATE,
                input=True,
                input_device_index=DEVICE_INDEX,
                frames_per_buffer=CHUNK,
                stream_callback=audio_callback)

# Start the audio stream
stream.start_stream()

# Open webcam
cap = cv2.VideoCapture(0)

#print("Recording... Press 'q' in the webcam window to stop.") # if uncommenting cv2.imshow('Pose Tracking', frame)
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            print("Failed to grab frame")
            break
        
        # Convert the BGR image to RGB
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        
        # Get pose results
        results = pose.process(rgb_frame)
        
        if results.pose_landmarks:
            # Save the pose landmarks
            pose_landmarks = results.pose_landmarks.landmark
            pose_data = [[landmark.x, landmark.y, landmark.z] for landmark in pose_landmarks]
            saved_poses.append(pose_data)
            
            # Increment the dictionary value
            pose_counter[0] += 1
        
        # Display the frame
        #cv2.imshow('Pose Tracking', frame)
        
        cv2.waitKey(1) 
       
except KeyboardInterrupt:
    print("Recording stopped.")

finally:
    # Release the webcam
    cap.release()
    
    # Close the audio stream
    stream.stop_stream()
    stream.close()
    p.terminate()
    
    # Close the OpenCV windows
    cv2.destroyAllWindows()
    

    audio_data = [audio_data for audio_data, _ in saved_audio_data]
    audio_data_flat = np.concatenate(audio_data)
    
    # Reshape count values and select the most frequent (larger in case of tie) for each buffer
    count_values = [np.full_like(audio_data, count_value) for audio_data, count_value in saved_audio_data]
    count_values_flat = np.concatenate(count_values)
    
    # since we record audio at sample rate 44100 and poses at 30fps and want to try time-sync audio and movements, 
    # reshape the data to audio buffers of size 1470 = 44100/30

    leftover_samples = len(audio_data_flat) % 1470
    if leftover_samples != 0:
        audio_data_flat = audio_data_flat[:-leftover_samples]
        count_values_flat = count_values_flat[:-leftover_samples]

    audio_data_reshaped = audio_data_flat.reshape(-1, 1470)
    count_values_reshaped = count_values_flat.reshape(-1, 1470)
    
    selected_count_values = []
    for buffer in count_values_reshaped:
        # Select the count value with the most occurrences (larger in case of tie)
        unique_values, counts = np.unique(buffer, return_counts=True)
        selected_count = unique_values[np.argmax(counts)]
        selected_count_values.append(selected_count)
    
    # Convert to PyTorch tensors
    audio_data_tensors = torch.tensor(audio_data_reshaped, dtype=torch.float32) #dtype=torch.int16)
    selected_count_values_tensor = torch.tensor(selected_count_values, dtype=torch.int32)
    poses_tensors = torch.tensor(saved_poses, dtype=torch.float32)

        # Save the tensors to disk with the date and time in the filename

    parent_dir = "audioposes_data"

    # Create a session directory inside the parent directory
    current_time = datetime.datetime.now()
    formatted_time = current_time.strftime("%Y%m%d_%H%M%S")
    session_dir = f'session_{formatted_time}_nr_poses='+str(len(poses_tensors))
    session_dir_path = os.path.join(parent_dir, session_dir)
    os.makedirs(session_dir_path, exist_ok=True)

    # Save the tensors to disk in the session directory
    audio_data_file_path = os.path.join(session_dir_path, 'audio_data_tensors.pt')
    dict_values_file_path = os.path.join(session_dir_path, 'dict_values_tensors.pt')
    poses_file_path = os.path.join(session_dir_path, 'poses_tensors.pt')
    
    torch.save(audio_data_tensors, audio_data_file_path)
    torch.save(selected_count_values_tensor, dict_values_file_path)
    torch.save(poses_tensors, poses_file_path)

    audioposes_data_dir='audioposes_data'
    all_sessions = [d for d in os.listdir(audioposes_data_dir) if os.path.isdir(os.path.join(audioposes_data_dir, d))]
    other_sessions = [s for s in all_sessions if s != session_dir]
    
    if not other_sessions:
        print("No previous session recorded to use for marginal samples. Record another session to obtain marginal samples for the mine computation.")
    
    else:
        marginal_poses = get_marginal_samples(session_dir, audio_data_tensors)
        joint_batches, marginal_batches = generate_joint_and_marginal_samples(audio_data_tensors, poses_tensors, selected_count_values_tensor.numpy(), marginal_poses, batch_size=32)
        
        print('Generated a dataset with '+str(len(joint_batches))+' nr of batches, with batch size 32')
        print(f"Audio data tensors saved to {audio_data_file_path}")
        print(f"Dictionary values tensors saved to {dict_values_file_path}")
        print(f"Poses tensors saved to {poses_file_path}")

I0000 00:00:1700158517.617406       1 gl_context.cc:344] GL version: 2.1 (2.1 Metal - 83), renderer: Apple M2 Max


Recording stopped.
Generated a dataset with 12 nr of batches, with batch size 32
Audio data tensors saved to audioposes_data/session_20231116_191533_nr_poses=345/audio_data_tensors.pt
Dictionary values tensors saved to audioposes_data/session_20231116_191533_nr_poses=345/dict_values_tensors.pt
Poses tensors saved to audioposes_data/session_20231116_191533_nr_poses=345/poses_tensors.pt


In [None]:
# if you need to save the current data as backup
#tensor_list = [joint_batches, marginal_batches]
#torch.save(tensor_list, 'tensor_list'+current_time+'.pth')

In [None]:
# load existing training data example
#loaded_tensor_list = torch.load('tensor_list.pth')
#joint_batches, marginal_batches = loaded_tensor_list[0], loaded_tensor_list[0]

In [None]:
#state_dict = torch.load('data/trained_models/epoch_1.pt')
#mine_network.load_state_dict(state_dict=state_dict)

In [3]:
# Train the mine_network on the data recorded in the previous cell to optimize the mutual information estimate across multiple time and frequency resolutions.
# Since RNNs are prone to exploding gradients, we need to chech for nan values often and if found revert to previous model states and varying learning rates.
current_mines = {}
params_without_gradients = {}
def train(new_session=True, session_path=None):
    if new_session:
        parent_dir = "training_sessions"
        current_time = datetime.datetime.now()
        formatted_time = current_time.strftime("%Y%m%d_%H%M%S")
        session_dir = f'training_session_{formatted_time}'
        session_path = os.path.join(parent_dir, session_dir)
        os.makedirs(session_path, exist_ok=True)
        
        models_path = os.path.join(session_path, 'trained_models')
        os.makedirs(models_path, exist_ok=True)
        output_path = os.path.join(session_path, 'mine_matrices')
        os.makedirs(output_path, exist_ok=True)
        print(f'Created new training session {session_dir}, starting training')
    elif session_path:
        print(f'Continuing training session {session_path}, starting training')
    else:
        print('Set new_session = True or enter a path to existing training session')
        return
    
    writer = SummaryWriter(log_dir=session_path)
    

    # Initialize optimizer
    optimizer = optim.AdamW(mine_network.parameters(), lr=0.0001) 
    num_epochs = 10000

    # To keep track of the best model
    best_model_state = None
    best_loss = float('inf')
    training_scores = []

    global_step = 0

    for epoch in range(num_epochs):
        running_neg_mine_estimate = 0.0
        mine_matrices = []

        for i, (joint_batch, marginal_batch) in enumerate(zip(joint_batches, marginal_batches)):
            # Extract audio, joint poses, and marginal poses from batches
            x_joint, y_joint = joint_batch
            _, y_marginal = marginal_batch
            
            # Zero the parameter gradients
            optimizer.zero_grad()
            
            # Forward pass, criterion computation
            mine_score, mine_matrix = mine_network(x_joint, y_joint, y_marginal)
            if torch.isnan(mine_score).any():
                print('Found NaN in mine_score, reverting to state_dict from previous epoch and lowering learning rate by *0.75')
                if epoch > 0:
                    model_path = os.path.join(models_path, f'epoch_{epoch-1}.pt')
                    if os.path.exists(model_path):
                        mine_network.load_state_dict(torch.load(model_path))
                        optimizer = optim.AdamW(mine_network.parameters(), lr=optimizer.param_groups[0]['lr'] * 0.75)
                    else:
                        print(f'Model file not found: {model_path}')
                break
            
            else:
                mine_matrices.append(mine_matrix.detach().cpu().numpy())
                current_mines[i]=mine_matrix.detach().cpu().numpy()
                training_scores.append(mine_score.item())
                criterion = mine_score
                print('MI lower bound = ', -criterion.item())
                criterion.backward()
                optimizer.step()
                running_neg_mine_estimate += criterion.item()             
                
                if i % 100 == 99:
                    print(f'[{epoch + 1}, {i + 1}] MI lower bound running mean: {-running_neg_mine_estimate / 100:.3f}')
                    running_neg_mine_estimate = 0.0

                # Log gradients of all model parameters
                params_no_grad = []
                for name, param in mine_network.named_parameters():
                    if param.grad is not None:
                        writer.add_histogram(f'Gradients/{name}', param.grad, global_step)
                    else:
                        params_no_grad.append(name) #print(f'Gradient not found for parameter: {name}')
                params_without_gradients[global_step] = params_no_grad
                # Log optimizer state
                for j, param_group in enumerate(optimizer.param_groups):
                    writer.add_scalar(f'Learning Rate/param_group_{j}', param_group['lr'], global_step)

                global_step += 1

        torch.save(mine_network.state_dict(), os.path.join(models_path, f'epoch_{epoch}.pt'))
        np.save(os.path.join(output_path, f'epoch_{epoch}.npy'), mine_matrices)

        if mine_matrices and np.isnan(mine_matrices[-1]).any():
            print('Found NaN in mine_matrix, reverting to state_dict from previous epoch and lowering learning rate by *0.75')
            if epoch > 0:
                model_path = os.path.join(models_path, f'epoch_{epoch-1}.pt')
                if os.path.exists(model_path):
                    mine_network.load_state_dict(torch.load(model_path))
                    optimizer = optim.AdamW(mine_network.parameters(), lr=optimizer.param_groups[0]['lr'] * 0.75)
                else:
                    print(f'Model file not found: {model_path}')
        elif optimizer.param_groups[0]['lr'] < 0.0001:
            optimizer.param_groups[0]['lr'] = 0.0001

        mine_matrices = []
    return mine_matrices, training_scores
    print('Finished Training')

# starting the training session:
train()


Created new training session training_session_20231116_212123, starting training


NameError: name 'joint_batches' is not defined

In [14]:
# Visualise the mine matrix across a training epoch as a heat map animation. Each matrix element is the mutual information lower bound estimate between 
# the recorded mocap and audio input signals. 

import matplotlib.pyplot as plt
import matplotlib.animation as animation
from JSAnimation.IPython_display import display_animation
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from IPython.display import HTML

num_tensors = len(list(current_mines.keys()))

fig, ax = plt.subplots()
cax = ax.imshow(current_mines[0], cmap='viridis', aspect='auto', animated=True)

def animate(i):
    cax.set_array(current_mines[i])
    return [cax]

ani = animation.FuncAnimation(fig, animate, frames=num_tensors, interval=5, blit=True)

# To display the animation in Jupyter
HTML(ani.to_jshtml())

ModuleNotFoundError: No module named 'JSAnimation'

In [16]:
params_without_gradients

NameError: name 'params_without_gradients' is not defined

In [None]:
# irrelevant experimentation cells bellow

In [None]:


def train(new_session = True, session_path = None):

    if new_session == True:
        parent_dir = "training_sessions"
        # Create a session directory inside the parent directory
        current_time = datetime.datetime.now()
        formatted_time = current_time.strftime("%Y%m%d_%H%M%S")
        session_dir = f'training_session_{formatted_time}'
        session_path = os.path.join(parent_dir, session_dir)
        os.makedirs(session_path, exist_ok=True)
        
        models_path = os.path.join(session_dir, 'trained_models')
        os.makedirs(models_path, exist_ok=True)
        output_path = os.path.join(session_dir, 'mine_matrices')
        os.makedirs(output_path, exist_ok=True)
        print('created new training session '+session_dir_path+', starting training')
    elif session_path != None:
        session_dir = session_path
        print('continueing training session '+session_path+', starting training')
    else:
        print('set new_session = True or enter a path to existing training session')

    #Initialize optimizer
    optimizer = optim.AdamW(mine_network.parameters(), lr=0.0001) 
    #joint_batches, marginal_batches = joint_batches_, marginal_batches_
    # Number of training epochs
    num_epochs = 10000

    # To keep track of the best model
    best_model_state = None
    best_loss = float('inf')
    training_scores = []


    for epoch in range(num_epochs):
        running_neg_mine_estimate = 0.0
        mine_matrices = []

        for i, (joint_batch, marginal_batch) in enumerate(zip(joint_batches, marginal_batches)):
            # Extract audio, joint poses, and marginal poses from batches
            x_joint, y_joint = joint_batch
            _, y_marginal = marginal_batch
            
            # Zero the parameter gradients
            optimizer.zero_grad()
            
            # Forward pass, criterion computation
            mine_score, mine_matrix = mine_network(x_joint, y_joint, y_marginal)
            if np.isnan(mine_score.detach().numpy()).any():
                print('found nan in mine_score, reverting to state_dict from previous epoch and lowering learning rate by *0.75')
                #if epoch > 0:
                mine_network.load_state_dict(torch.load(session_path+f'/trained_models/epoch_{epoch-1}.pt'))
                optimizer = optim.AdamW(mine_network.parameters(), lr=0.0001) 
                optimizer.param_groups[0]['lr'] *= 0.75
                #else:
                        #print('instantiate a new model and try again')
                break
            
            else:    
                
                mine_matrices.append(mine_matrix.detach().cpu().numpy())
                training_scores.append(mine_score)
                criterion = mine_score
                #training_scores.append(-criterion.item())
                print('MI lower bound =  ', -criterion.item()) #-neg_mine_estimate
                
                # Backward pass and optimization
                criterion.backward()
                optimizer.step()

                # Print statistics
                running_neg_mine_estimate += criterion.item()             
                
                if i % 100 == 99:    # Print every 100 mini-batches
                    print('[%d, %5d] MI lower bound running mean: %.3f' %
                        (epoch + 1, i + 1, -running_neg_mine_estimate / 100))
                    running_neg_mine_estimate = 0.0

        # Save model and mine_matrices after each epoch
        torch.save(mine_network.state_dict(), session_path+f'/trained_models/epoch_{epoch}.pt')
        np.save(session_path+f'/mine_matrices/epoch_{epoch}.npy', mine_matrices)

        # Check for NaN values in the latest mine_matrix
        if np.isnan(mine_matrices[-1]).any():
            # If learning rate is higher than 0.0001, reduce it and load previous state_dict
            print('found nan in mine_matrix, reverting to state_dict from previous epoch and lowering learning rate by *0.75')
            
            if epoch > 0:
                mine_network.load_state_dict(torch.load(session_path+f'/trained_models/epoch_{epoch-1}.pt'))
                optimizer = optim.AdamW(mine_network.parameters(), lr=0.0001) 
                optimizer.param_groups[0]['lr'] *= 0.75
        else:
            # If no NaN values and learning rate is less than 0.0001, reset it back to 0.0001
            if optimizer.param_groups[0]['lr'] < 0.0001:
                optimizer.param_groups[0]['lr'] = 0.0001

        # Clear mine_matrices from running memory
        mine_matrices = []

    print('Finished Training')

# starting the training session:

train()

In [None]:
# train the mine_network instantiated above on the audio and mocap data recorded in the previous cell

# need to experiment with the learning rate and scheduling a lot more, but initial experiments show we need to keep relatively small
# learning rate to avoid exploding gradients in the RNN layers.

optimizer = optim.AdamW(mine_network.parameters(), lr=0.0001) 

mine_matrices = []
# Number of training epochs
num_epochs = 1000

for epoch in range(num_epochs):
    running_neg_mine_estimate  = 0.0
    for i, (joint_batch, marginal_batch) in enumerate(zip(joint_batches, marginal_batches)):
        # Extract audio, joint poses, and marginal poses from batches
        x_joint, y_joint = joint_batch
        _, y_marginal = marginal_batch
        
        # Zero the parameter gradients
        optimizer.zero_grad()
        
        # Forward pass, criterion computation
        mine_score, mine_matrix = mine_network(x_joint, y_joint, y_marginal)
        mine_matrices.append(mine_matrix)
        
        criterion = mine_score
        
        print('MI lower bound =  ', -criterion.item()) #-neg_mine_estimate
        
        # Backward pass and optimization
        criterion.backward()
        #torch.nn.utils.clip_grad_norm_(mine_network.parameters(), max_norm=1.0)
        optimizer.step()

        # Print statistics
        running_neg_mine_estimate += criterion.item()
        #print('MI lower bound running mean ', -running_neg_mine_estimate )
        
            
        if i % 100 == 99:    # Print every 100 mini-batches
            print('[%d, %5d] MI lower bound running mean: %.3f' %
                  (epoch + 1, i + 1, -running_neg_mine_estimate / 100))
            running_neg_mine_estimate = 0.0

        

print('Finished Training')

In [None]:
len(joint_batches), len(marginal_batches), len(joint_batches[0]), len(marginal_batches[0]), 
joint_batches[-1][0].shape, 
marginal_batches[-1][0].shape,  
joint_batches[-1][0].shape, 
marginal_batches[-1][0].shape,  
joint_batches[-1][0].shape, marginal_batches[-1][1].shape

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Assuming output_matrix is the output from the neural network
output_matrix = np.random.rand(8, 12)

# Plotting the heatmap
plt.figure(figsize=(10, 6))
cax = plt.matshow(output_matrix, cmap="viridis")
plt.colorbar(cax)
plt.title("Neural Network Output Heatmap")
plt.xlabel("Columns")
plt.ylabel("Rows")
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation

# Generating a sequence of 1000 matrices (8x12) as an example
sequence_of_matrices = np.random.rand(1000, 8, 12)

# Set up the figure and axis
fig, ax = plt.subplots(figsize=(10, 6))
cax = ax.matshow(sequence_of_matrices[0], cmap="viridis")
plt.colorbar(cax)
plt.title("Neural Network Output Heatmap")
plt.xlabel("Columns")
plt.ylabel("Rows")

def update(frame):
    cax.set_array(sequence_of_matrices[frame].flatten())
    return [cax]

# Create an animation
ani = FuncAnimation(fig, update, frames=range(1000), interval=50, blit=True)

# To save the animation, use ani.save('filename.mp4')
plt.show()


In [None]:
import pyaudio
import numpy as np
import datetime

# Define the audio stream parameters
FORMAT = pyaudio.paInt16  # Audio format (16-bit PCM)
CHANNELS = 1             # Number of audio channels (1 for mono, 2 for stereo)
RATE = 44100             # Sample rate (samples per second)
CHUNK = 512              # Number of frames per buffer
DEVICE_INDEX = 1         # Index of the audio device to use

# Initialize PyAudio
p = pyaudio.PyAudio()

# Define the dictionary
my_dict = {0: 0}

# List to save the audio buffers and dictionary values
saved_data = []

# Define the callback function
def callback(in_data, frame_count, time_info, status):
    # Convert the byte data to numpy array
    audio_data = np.frombuffer(in_data, dtype=np.int16)
    
    # Save the audio buffer and dictionary value to the list
    saved_data.append([audio_data.tolist(), my_dict[0]])
    
    # Return the data and continue
    return (None, pyaudio.paContinue)

# Open the audio stream
stream = p.open(format=FORMAT,
                channels=CHANNELS,
                rate=RATE,
                input=True,
                input_device_index=DEVICE_INDEX,
                frames_per_buffer=CHUNK,
                stream_callback=callback)

# Start the audio stream
stream.start_stream()

# Keep the script running to record audio in real time
print("Recording... Press Ctrl+C to stop.")
try:
    while stream.is_active():
        pass
except KeyboardInterrupt:
    # Stop the audio stream when Ctrl+C is pressed
    stream.stop_stream()
    stream.close()
    p.terminate()
    print("Recording stopped.")

# Print the saved data
#print("Saved data:", saved_data)

In [None]:
import pyaudio
import numpy as np
import cv2
import mediapipe as mp
import torch
import datetime

# Audio Configuration
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
CHUNK = 512
DEVICE_INDEX = 1

# Initialize PyAudio
p = pyaudio.PyAudio()

# Define the dictionary
my_dict = {0: 0}

# List to save the audio buffers and dictionary values
saved_audio_data = []

# List to save the poses
saved_poses = []

# Initialize MediaPipe BlazePose
mp_pose = mp.solutions.pose
pose = mp_pose.Pose()

# Define the callback function for audio
def audio_callback(in_data, frame_count, time_info, status):
    audio_data = np.frombuffer(in_data, dtype=np.int16)
    saved_audio_data.append([audio_data.tolist(), my_dict[0]])
    return (None, pyaudio.paContinue)

# Open the audio stream
stream = p.open(format=FORMAT,
                channels=CHANNELS,
                rate=RATE,
                input=True,
                input_device_index=DEVICE_INDEX,
                frames_per_buffer=CHUNK,
                stream_callback=audio_callback)

# Start the audio stream
stream.start_stream()

# Open webcam
cap = cv2.VideoCapture(0)

#print("Recording... Press 'q' in the webcam window to stop.")
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            print("Failed to grab frame")
            break
        
        # Convert the BGR image to RGB
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        
        # Get pose results
        results = pose.process(rgb_frame)
        
        if results.pose_landmarks:
            # Save the pose landmarks
            pose_landmarks = results.pose_landmarks.landmark
            pose_data = [[landmark.x, landmark.y, landmark.z, landmark.visibility] for landmark in pose_landmarks]
            saved_poses.append(pose_data)
            
            # Increment the dictionary value
            my_dict[0] += 1
        
        # Display the frame
        #cv2.imshow('Pose Tracking', frame)
        
        cv2.waitKey(1) 
       
except KeyboardInterrupt:
    print("Recording stopped.")

finally:
    # Release the webcam
    cap.release()
    
    # Close the audio stream
    stream.stop_stream()
    stream.close()
    p.terminate()
    
    # Close the OpenCV windows
    cv2.destroyAllWindows()

    # Convert saved audio data and poses to PyTorch tensors
    audio_data_tensors = [torch.tensor(audio_data) for audio_data, _ in saved_audio_data]
    dict_values_tensors = torch.tensor([dict_value for _, dict_value in saved_audio_data], dtype=torch.int32)
    poses_tensors = torch.tensor(saved_poses, dtype=torch.float32)

    # Save the tensors to disk with the date and time in the filename
    current_time = datetime.datetime.now()
    formatted_time = current_time.strftime("%Y%m%d_%H%M%S")
    audio_data_file_path = f'audio_data_tensors_{formatted_time}.pt'
    dict_values_file_path = f'dict_values_tensors_{formatted_time}.pt'
    poses_file_path = f'poses_tensors_{formatted_time}.pt'
    torch.save(audio_data_tensors, audio_data_file_path)
    torch.save(dict_values_tensors, dict_values_file_path)
    torch.save(poses_tensors, poses_file_path)

    print("Tensors saved to disk.")
    print(f"Audio data tensors saved to {audio_data_file_path}")
    print(f"Dictionary values tensors saved to {dict_values_file_path}")
    print(f"Poses tensors saved to {poses_file_path}")

In [None]:
import pyaudio
import numpy as np
import cv2
import mediapipe as mp
import torch
import datetime

# Audio Configuration
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
CHUNK = 512
DEVICE_INDEX = 1

# Initialize PyAudio
p = pyaudio.PyAudio()

# Define the dictionary
my_dict = {0: 0}

# List to save the audio buffers and dictionary values
saved_audio_data = []

# List to save the poses
saved_poses = []

# Initialize MediaPipe BlazePose
mp_pose = mp.solutions.pose
pose = mp_pose.Pose()

# Define the callback function for audio
def audio_callback(in_data, frame_count, time_info, status):
    audio_data = np.frombuffer(in_data, dtype=np.int16)
    saved_audio_data.append([audio_data.tolist(), my_dict[0]])
    return (None, pyaudio.paContinue)

# Open the audio stream
stream = p.open(format=FORMAT,
                channels=CHANNELS,
                rate=RATE,
                input=True,
                input_device_index=DEVICE_INDEX,
                frames_per_buffer=CHUNK,
                stream_callback=audio_callback)

# Start the audio stream
stream.start_stream()

# Open webcam
cap = cv2.VideoCapture(0)

print("Recording... Press 'q' in the webcam window to stop.")

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        print("Failed to grab frame")
        break
    
    # Convert the BGR image to RGB
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    
    # Get pose results
    results = pose.process(rgb_frame)
    
    if results.pose_landmarks:
        # Save the pose landmarks
        pose_landmarks = results.pose_landmarks.landmark
        pose_data = [[landmark.x, landmark.y, landmark.z, landmark.visibility] for landmark in pose_landmarks]
        saved_poses.append(pose_data)
        
        # Increment the dictionary value
        my_dict[0] += 1
    
    # Display the frame
    cv2.imshow('Pose Tracking', frame)
    
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release the webcam and close the audio stream
cap.release()
cv2.destroyAllWindows()
stream.stop_stream()
stream.close()
p.terminate()

print("Recording stopped.")

audio_data_tensors = [torch.tensor(audio_data) for audio_data, _ in saved_audio_data]
dict_values_tensors = torch.tensor([dict_value for _, dict_value in saved_audio_data], dtype=torch.int32)
poses_tensors = torch.tensor(saved_poses, dtype=torch.float32)

current_time = datetime.datetime.now()
formatted_time = current_time.strftime("%Y%m%d_%H%M%S")

# Save the tensors to disk with the date and time in the filename
audio_data_file_path = f'audio_data_tensors_{formatted_time}.pt'
dict_values_file_path = f'dict_values_tensors_{formatted_time}.pt'
poses_file_path = f'poses_tensors_{formatted_time}.pt'

torch.save(audio_data_tensors, audio_data_file_path)
torch.save(dict_values_tensors, dict_values_file_path)
torch.save(poses_tensors, poses_file_path)

print(f"Audio data tensors saved to {audio_data_file_path}")
print(f"Dictionary values tensors saved to {dict_values_file_path}")
print(f"Poses tensors saved to {poses_file_path}")

# Print the saved audio data
print("Saved audio data:", saved_audio_data)

# Print the number of recorded poses
print("Number of recorded poses:", len(saved_poses))


In [None]:
import pyaudio
import numpy as np
import cv2
import mediapipe as mp
import torch

# Audio Configuration
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
CHUNK = 512
DEVICE_INDEX = 1

# Initialize PyAudio
p = pyaudio.PyAudio()

# Define the dictionary
my_dict = {0: 0}

# Lists to save the audio buffers, dictionary values, and poses
saved_audio_data = []
saved_poses = []

# Initialize MediaPipe BlazePose
mp_pose = mp.solutions.pose
pose = mp_pose.Pose()

# Define the callback function for audio
def audio_callback(in_data, frame_count, time_info, status):
    audio_data = np.frombuffer(in_data, dtype=np.int16)
    saved_audio_data.append([audio_data.tolist(), my_dict[0]])
    return (None, pyaudio.paContinue)

# Open the audio stream
stream = p.open(format=FORMAT,
                channels=CHANNELS,
                rate=RATE,
                input=True,
                input_device_index=DEVICE_INDEX,
                frames_per_buffer=CHUNK,
                stream_callback=audio_callback)

# Start the audio stream
stream.start_stream()

# Open webcam
cap = cv2.VideoCapture(0)

print("Recording... Press 'q' in the webcam window to stop.")

while True:
    ret, frame = cap.read()
    if not ret:
        print("Failed to grab frame")
        break
    
    # Convert the BGR image to RGB
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    
    # Get pose results
    results = pose.process(rgb_frame)
    
    if results.pose_landmarks:
        # Save the pose landmarks
        pose_landmarks = results.pose_landmarks.landmark
        pose_data = [[landmark.x, landmark.y, landmark.z, landmark.visibility] for landmark in pose_landmarks]
        saved_poses.append(pose_data)
        
        # Increment the dictionary value
        my_dict[0] += 1
    
    # Display the frame
    cv2.imshow('Pose Tracking', frame)
    
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release the webcam
cap.release()

# Close the audio stream
stream.stop_stream()
stream.close()
p.terminate()

# Close the OpenCV windows
cv2.destroyAllWindows()

print("Recording stopped.")

# Convert saved audio data and poses to PyTorch tensors
audio_data_tensors = [torch.tensor(audio_data) for audio_data, _ in saved_audio_data]
dict_values_tensors = torch.tensor([dict_value for _, dict_value in saved_audio_data], dtype=torch.int32)
poses_tensors = torch.tensor(saved_poses, dtype=torch.float32)

print("Audio data tensors:", audio_data_tensors)
print("Dictionary values tensors:", dict_values_tensors)
print("Poses tensors:", poses_tensors)


In [None]:
import torch
import torch.nn as nn
from torch.nn import init

class LSTMCell(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, bidirectional=True, batch_first=True, nonlinearity='tanh'):
        super(LSTMCell, self).__init__()

        self.layer_norm = nn.LayerNorm(input_size)
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, bidirectional=bidirectional, batch_first=batch_first)
        
        for param in self.lstm.parameters():
            if len(param.shape) >= 2:
                init.xavier_uniform_(param)

    def forward(self, x, state):
        x = self.layer_norm(x)
        output, hn = self.lstm(x, state)
        return output, hn
    
class GRUCell(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, bidirectional=True, batch_first=True, nonlinearity='tanh'):
        super(GRUCell, self).__init__()

        self.layer_norm = nn.LayerNorm(input_size)
        self.gru = nn.GRU(input_size, hidden_size, num_layers, bidirectional=bidirectional, batch_first=batch_first)
        
        for param in self.gru.parameters():
            if len(param.shape) >= 2:
                init.xavier_uniform_(param)

    def forward(self, x, state):
        x = self.layer_norm(x)
        output, hn = self.gru(x, state)
        return output, hn
    

class TransformerCell(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, bidirectional=True, batch_first=True, nonlinearity='tanh'):
        super(TransformerCell, self).__init__()

        self.layer_norm = nn.LayerNorm(input_size)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=input_size, nhead=hidden_size // 30),
            num_layers=num_layers
        )

    def forward(self, x, state):
        x = self.layer_norm(x)
        output = self.transformer(x)
        return output, None  # Transformer does not maintain state like RNNs



class FCLayersCell(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, bidirectional=True, batch_first=True, nonlinearity='tanh'):
        # ...
        super(FCLayersCell, self).__init__()

        self.layer_norm = nn.LayerNorm(input_size)
        self.layers = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(hidden_size, hidden_size)
        )

    def forward(self, x, state):
        x = self.layer_norm(x)
        output = self.layers(x)
        return output, None  # Fully connected layers do not maintain state like RNNs


class WaveNetCell(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, bidirectional=True, batch_first=True, nonlinearity='relu'):
        super(WaveNetCell, self).__init__()
        
        # Adding Layer Normalization
        self.layer_norm = nn.LayerNorm(input_size)
        
        # Defining a simple WaveNet-style CNN
        self.cnn = nn.Sequential(
            nn.Conv1d(input_size, hidden_size, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv1d(hidden_size, hidden_size, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv1d(hidden_size, 2*hidden_size, kernel_size=3, padding=1)
        )
        
        # Applying Xavier Uniform Initialization to CNN weights
        for layer in self.cnn:
            if isinstance(layer, nn.Conv1d):
                init.xavier_uniform_(layer.weight)

    def forward(self, x, state=None):
        x = self.layer_norm(x)
        output = self.cnn(x.permute(0, 2, 1)).permute(0, 2, 1)
        return output, state



In [None]:
cells = [FCLayersCell, TransformerCell, GRUCell, LSTMCell, FCLayersCell, TransformerCell]

In [None]:
for cell in cells:

    mine_network = Multiscale_MINE_test(cell, nr_time_scales = 9, 
                                nr_frequency_bands = 13, 
                                embedding_dim = 100, #gives dim 200 for the concattenated encoding of audio and mocap input
                                input_dim = 1470, # selected input size for audio sample rate 44100 since blazenet runs at aproximately 30fps
                                nr_layers_per_timescale = 4, #nr of layers for the RNNs encoding the respective timescales
                                nr_spiralnet_layers = 16, 
                                delay_size = 3) 
    
    optimizer = optim.AdamW(mine_network.parameters(), lr=0.001)


    # Number of training epochs
    num_epochs = 1

    for epoch in range(num_epochs):
        running_loss = 0.0
        for i, (joint_batch, marginal_batch) in enumerate(zip(joint_batches, marginal_batches)):
            # Extract audio, joint poses, and marginal poses from batches
            x_joint, y_joint = joint_batch
            _, y_marginal = marginal_batch
            
            # Zero the parameter gradients
            optimizer.zero_grad()
            #print('x_joint : ', x_joint)
            #print('y_joint : ', y_joint)
            #print('y_marginal : ', y_marginal)
            # Forward pass
            score_joint, score_marginal = mine_network(x_joint, y_joint, y_marginal)
            
            #print(score_joint)
            #print(score_marginal)
            # Loss computation
            loss = mine_score(score_joint, score_marginal)
            print('MI lower bound =  ', loss)
            
            # Backward pass and optimization
            loss.backward()
            #torch.nn.utils.clip_grad_norm_(mine_network.parameters(), max_norm=1.0)
            optimizer.step()

            # Print statistics
            running_loss += loss.item()
            print('loss', -running_loss)
            
                
            if i % 100 == 99:    # Print every 100 mini-batches
                print('[%d, %5d] loss: %.3f' %
                    (epoch + 1, i + 1, running_loss / 100))
                running_loss = 0.0
    
    

In [None]:
import torch
import torch.nn as nn
from torch.nn import init

class DummyModel(nn.Module):
    def __init__(self, layer_type):
        super(DummyModel, self).__init__()
        self.layer_type = layer_type
        self.input_dim = 1024
        self.hidden_dim = 100
        self.num_layers = 4
        self.batch_size = 32
        self.seq_len = 10
        
        if layer_type == 'RNN':
            self.layer = nn.RNN(input_size=self.input_dim, hidden_size=self.hidden_dim, num_layers=self.num_layers, bidirectional=True, batch_first=True)
        elif layer_type == 'LSTM':
            self.layer = nn.LSTM(input_size=self.input_dim, hidden_size=self.hidden_dim, num_layers=self.num_layers, bidirectional=True, batch_first=True)
        elif layer_type == 'GRU':
            self.layer = nn.GRU(input_size=self.input_dim, hidden_size=self.hidden_dim, num_layers=self.num_layers, bidirectional=True, batch_first=True)
        elif layer_type == 'FC':
            self.layer = nn.Sequential(
                nn.Linear(self.input_dim, self.hidden_dim),
                nn.ReLU(),
                nn.Dropout(0.5),
                nn.Linear(self.hidden_dim, self.hidden_dim),
                nn.ReLU(),
                nn.Dropout(0.5),
                nn.Linear(self.hidden_dim, self.hidden_dim),
            )
        elif layer_type == 'Transformer':
            self.layer = nn.Transformer(d_model=self.input_dim, num_encoder_layers=self.num_layers, num_decoder_layers=self.num_layers, batch_first=True)
        
        elif layer_type == 'CNN':
            self.layer = nn.Sequential(
                nn.Conv1d(self.input_dim, self.hidden_dim, kernel_size=3, padding=1),
                nn.ReLU(),
                nn.Conv1d(self.hidden_dim, self.hidden_dim, kernel_size=3, padding=1),
                nn.ReLU(),
                nn.Conv1d(self.hidden_dim, self.hidden_dim, kernel_size=3, padding=1),
            )
        else:
            raise ValueError("Invalid layer type")

        # Initializing weights
        for param in self.layer.parameters():
            if len(param.shape) >= 2:
                init.xavier_uniform_(param)

    def forward(self, x):
        if self.layer_type in ['RNN', 'LSTM', 'GRU']:
            output, _ = self.layer(x)
        elif self.layer_type == 'Transformer':
            output = self.layer(x, x)
        else:  # 'FC' or 'CNN'
            output = self.layer(x)
        return output

# Validation function
def validate_layer(layer_type):
    model = DummyModel(layer_type)
    x = torch.randn(model.batch_size, model.seq_len, model.input_dim)
    output = model(x)
    print(f"Layer Type: {layer_type}")
    print(f"Input Shape: {x.shape}")
    print(f"Output Shape: {output.shape}")
    print("="*30)

# List of layer types to validate
layer_types = ['RNN', 'LSTM', 'GRU', 'Transformer', 'FC', 'CNN']

# Run validation
for layer_type in layer_types:
    validate_layer(layer_type)
