In [2]:
# Mount Google Drive at /content/drive so the paths used by later cells resolve
# (google.colab is only importable inside a Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
from tensorflow.keras.layers import Dense, Concatenate, LSTM, Lambda

In [4]:
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.utils import plot_model

**Model to predict the type of songs - tap dance, ballet etc**

Input - audio and poses of a song
Output - type of song (ballet, tap etc) in the form of one hot encoding.

One-hot encoding - Assume there are 4 types of dances: tap, ballet, hip hop and break. Here's how the outputs would look:

0001 - tap

0010 - ballet

0100 - hip hop

1000 - break

In [5]:
import numpy as np
import librosa
import json

In [6]:
# Build the label list: one dance-form name per pose file, in sorted filename order.

import os

folder_path = "/content/drive/MyDrive/Summer_Internship_Data/Datasets"  # Replace with the actual path to your "Datasets" folder

# For every "<name>_data.json" file, keep the part of the filename before "_data"
# and drop all digits from it; what remains is used as the dance-form label.
output_array = [
    ''.join(ch for ch in fname.split("_data")[0] if not ch.isdigit())
    for fname in sorted(os.listdir(folder_path))
    if fname.endswith("_data.json")
]

In [7]:
import os

# Sorted listing of every entry in the pose "Datasets" folder (no extension filter).
filenames = sorted(os.listdir('/content/drive/MyDrive/Summer_Internship_Data/Datasets'))

In [8]:
# Number of entries currently held in `filenames`.
len(filenames)

57

In [9]:
# Rebinds `filenames` to the sorted listing of the processed-audio folder and shows its size.
# NOTE(review): the recorded outputs show 57 entries in Datasets but 60 here, so
# pose files and audio files do not appear to be one-to-one — confirm before training.
filenames = sorted(os.listdir('/content/drive/MyDrive/Summer_Internship_Data/Processed_audios'))
len(filenames)

60

In [None]:
import os
import warnings


pose_path = '/content/drive/MyDrive/Summer_Internship_Data/Datasets'
# Use the same selection and ordering as the labelling cell (sorted, only
# "*_data.json"). os.listdir() returns entries in arbitrary order, so an
# unsorted, unfiltered listing would not line up sample-for-sample with the
# labels or with the sorted audio list below.
pose_files = sorted(
    name for name in os.listdir(pose_path) if name.endswith("_data.json")
)

# One parsed JSON document per pose file, in `pose_files` order.
pose_data = []
for pose_data_file in pose_files:
    with open(os.path.join(pose_path, pose_data_file), "r") as f:
        pose_data.append(json.load(f))

# Load the audio files and convert each one to a mel spectrogram.
audio_path = '/content/drive/MyDrive/Summer_Internship_Data/Processed_audios'
audio_files = sorted(os.listdir(audio_path))
preprocessed_audio = []
for audio_file in audio_files:
    # sr=None keeps the file's native sampling rate instead of resampling.
    audio, sr = librosa.load(os.path.join(audio_path, audio_file), sr=None)
    spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr)
    preprocessed_audio.append(spectrogram)

# The model is fed pose and audio arrays side by side, so sample i of each must
# describe the same song. Surface a count mismatch here rather than at fit time.
if len(pose_data) != len(preprocessed_audio):
    warnings.warn(
        f"Loaded {len(pose_data)} pose files but {len(preprocessed_audio)} audio files; "
        "pose/audio samples are not one-to-one."
    )

In [None]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import numpy as np

# Initialize the one-hot encoder with its default (sparse) output. The
# `sparse=False` keyword was renamed to `sparse_output` and later removed from
# scikit-learn, so the dense array is obtained with .toarray() instead, which
# behaves the same on old and new releases.
onehot_encoder = OneHotEncoder()

# Fit on the dance-form labels (one column of strings) and densify:
# one row per sample, one column per distinct dance form.
onehot_encoded = onehot_encoder.fit_transform(
    np.array(output_array).reshape(-1, 1)
).toarray()

# Print the one-hot encoded array
print(onehot_encoded)

In [None]:
# Shape of the first clip's mel spectrogram.
np.array(preprocessed_audio[0]).shape

In [None]:
# Shape of the second clip's mel spectrogram, for comparison with the first.
np.array(preprocessed_audio[1]).shape

In [None]:
import numpy as np

# Work on the list of per-clip mel spectrograms.
dataset = preprocessed_audio

# Longest clip, measured along axis 1.
max_length = max(clip.shape[1] for clip in dataset)

# End-of-song marker: a single all-zero column (128 rows assumed for the spectrogram height).
eos_token = np.zeros((128, 1))

dataset_with_eos = []
padded_dataset = []
for clip in dataset:
    # Append the marker column to the clip.
    clip_with_eos = np.concatenate((clip, eos_token), axis=1)
    dataset_with_eos.append(clip_with_eos)

    # Zero-fill on the right so every clip ends up max_length + 1 columns wide.
    fill = max_length - clip_with_eos.shape[1] + 1
    padded_dataset.append(
        np.pad(clip_with_eos, ((0, 0), (0, fill)), mode='constant', constant_values=0)
    )

# Stack the equally sized clips into one array.
padded_array = np.array(padded_dataset)

print(padded_array.shape)


In [None]:
# Length of the shortest pose sequence (stays at infinity when there are no sequences).
min_length = min((len(sequence) for sequence in pose_data), default=float('inf'))

print('Minimum length:', min_length)

In [None]:
# Cut every pose sequence down to the first `min_length` entries along axis 0
# so that all sequences share one length.
truncated_poses = []
for sequence in pose_data:
    truncated_poses.append(np.array(sequence)[:min_length, :, :])
pose_data = truncated_poses

In [None]:
import numpy as np

# preprocessed_audio holds one 2-D spectrogram per clip; the clips differ in width.

# Step 1: target shape = the maximum along EACH axis. (max() over the shape
# tuples themselves compares them lexicographically, which is only correct
# while every clip happens to have the same first dimension.)
max_shape = (
    max(audio.shape[0] for audio in preprocessed_audio),
    max(audio.shape[1] for audio in preprocessed_audio),
)

# Step 2: zero-pad every clip at the bottom/right up to the target shape.
padded_audio = [
    np.pad(
        audio,
        [(0, max_shape[0] - audio.shape[0]), (0, max_shape[1] - audio.shape[1])],
        mode='constant',
    )
    for audio in preprocessed_audio
]

# Step 3: stack into one numeric array of shape (clips, rows, cols). All clips
# now share a shape, so an object array is unnecessary; dtype=object would box
# every value as a separate Python object and multiply memory use.
preprocessed_audio = np.stack(padded_audio)

**Your audio files are of different lengths and so are your video files. FIX THIS BEFORE GOING AHEAD**

In [None]:
# Convert everything to dense float arrays for Keras.
# `np.float` was removed from NumPy (it raises AttributeError on current
# releases). float32 is used explicitly: it halves the memory of the large
# audio array compared with float64.
pose_data = np.array(pose_data, dtype=np.float32)
preprocessed_audio = np.array(preprocessed_audio, dtype=np.float32)
onehot_encoded = np.array(onehot_encoded, dtype=np.float32)

In [None]:
# Shape of the stacked pose array; the video Input of the model below must match shape[1:].
pose_data.shape

In [None]:
# Shape of the stacked audio array; the audio Input of the model below must match shape[1:].
preprocessed_audio.shape

In [None]:
# Shape of the label array; its second dimension is used as the number of classes.
onehot_encoded.shape

In [None]:
from tensorflow.keras.layers import Input, LSTM, Dense, Concatenate, Flatten, Conv2D, MaxPooling2D, Dropout
from tensorflow.keras import regularizers
from tensorflow.keras.models import Model

# Per-sample input shapes are read from the data instead of being hard-coded,
# so the model keeps matching the arrays when the dataset (and therefore the
# truncated pose length / padded audio length) changes.
video_input = Input(shape=pose_data.shape[1:])
audio_input = Input(shape=preprocessed_audio.shape[1:])

num_classes = onehot_encoded.shape[1]  # One output unit per one-hot column

# Pose branch: one 3x3 convolution, 2x2 max-pooling, then flatten to a vector.
x1 = Conv2D(32, kernel_size=(3, 3), activation='relu', kernel_regularizer=regularizers.l2(0.01))(video_input)
x1 = MaxPooling2D(pool_size=(2, 2))(x1)
x1 = Flatten()(x1)

# Audio branch: a single LSTM that returns its final state.
# NOTE(review): the LSTM treats axis 1 of the audio array as time steps. If the
# clips are stored as (mel bins, frames), the mel bins are being iterated as
# time — confirm whether a transpose is intended.
x2 = LSTM(32, kernel_regularizer=regularizers.l2(0.01))(audio_input)

# Fuse both branches and classify.
x = Concatenate()([x1, x2])
x = Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.01))(x)
x = Dropout(0.5)(x)
output = Dense(num_classes, activation='softmax')(x)

model = Model(inputs=[video_input, audio_input], outputs=output)

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model; the last 20% of the samples are held out for validation.
model.fit([pose_data, preprocessed_audio], onehot_encoded, batch_size=8, epochs=100, validation_split=0.2)


In [None]:
# Print the layer-by-layer summary of the classifier.
model.summary()

In [None]:
import math

import tensorflow as tf


def scheduler(epoch, lr):
    """Learning-rate schedule for `LearningRateScheduler`.

    Args:
        epoch: Zero-based index of the epoch about to start.
        lr: Learning rate currently in use.

    Returns:
        `lr` unchanged for the first 10 epochs; afterwards `lr * exp(-0.1)` as
        a plain Python float (the Keras callback expects a float, not a tensor).
    """
    if epoch < 10:
        return lr
    return float(lr * math.exp(-0.1))


# These statements were previously indented inside `scheduler`, after its
# return statements, so they never executed. They belong at cell level.
# Re-compiling creates a fresh Adam optimizer; the model weights are kept.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
callback = tf.keras.callbacks.LearningRateScheduler(scheduler)
# Train the model with the decaying learning rate
model.fit([pose_data, preprocessed_audio], onehot_encoded, batch_size=16, epochs=100, callbacks=[callback], validation_split=0.2)

In [None]:
# Print the values returned by evaluate() for the classifier.
# NOTE(review): these are the same arrays that were passed to fit(), so this is
# not a held-out test score.
print(model.evaluate(x = [pose_data, preprocessed_audio], y = onehot_encoded, batch_size = 16))

In [None]:
# Draw the classifier's layer graph, annotated with tensor shapes.
plot_model(model, show_shapes = True)

In [None]:
# Save the trained classifier to an HDF5 file in the current working directory.
model.save('dance_classifierv1.hdf5')

**ML Model for dance to music recommender**



In [None]:
# Build mismatched (pose, audio) pairs to serve as negative examples.
# Shuffling poses and audios independently can, by chance, put a pose next to
# its own audio, which would then be labelled as a mismatch. Instead, draw one
# random order and shift the audio side by one position: pair k is
# (pose order[k], audio order[k-1]), and those indices always differ whenever
# there is more than one song.
order = np.random.permutation(len(pose_data))

poses = pose_data[order]
audios = preprocessed_audio[np.roll(order, 1)]

In [None]:
# Equal numbers of matching (label 1) and mismatched (label 0) pairs.
num_positive_samples = num_negative_samples = len(pose_data)
labels_positive = np.ones(num_positive_samples)
labels_negative = np.zeros(num_negative_samples)

# Positives first, negatives second, for labels and for both input streams.
labels = np.concatenate([labels_positive, labels_negative], axis=0)
pose_combined = np.concatenate([pose_data, poses], axis=0)
audio_combined = np.concatenate([preprocessed_audio, audios], axis=0)

# Shuffle the (pose, audio, label) triples together so each pair keeps its label.
combined_data = [
    (pose, audio, label)
    for pose, audio, label in zip(pose_combined, audio_combined, labels)
]
np.random.shuffle(combined_data)

# Split the shuffled triples back into three aligned arrays.
pose_combined = np.array([triple[0] for triple in combined_data])
audio_combined = np.array([triple[1] for triple in combined_data])
labels_combined = np.array([triple[2] for triple in combined_data])


In [None]:
# Pair-matching model: takes one pose sequence and one audio clip and outputs
# the probability that they belong together.
# Input shapes are read from the data rather than hard-coded so they stay in
# sync with the truncated pose length and padded audio length.
video_input = Input(shape=pose_data.shape[1:])
audio_input = Input(shape=preprocessed_audio.shape[1:])

# Pose branch: 3x3 convolution, 2x2 max-pooling, flatten.
x1 = Conv2D(32, kernel_size=(3, 3), activation='relu', kernel_regularizer=regularizers.l2(0.01))(video_input)
x1 = MaxPooling2D(pool_size=(2, 2))(x1)
x1 = Flatten()(x1)

# Audio branch: a single LSTM that returns its final state.
x2 = LSTM(32, kernel_regularizer=regularizers.l2(0.01))(audio_input)

# Fuse both branches; a single sigmoid unit gives the match probability.
x = Concatenate()([x1, x2])
x = Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.01))(x)
x = Dense(1, activation= 'sigmoid')(x)

song_model = Model(inputs = [video_input, audio_input], outputs = x)

In [None]:
# def scheduler(epoch, lr):
#     if epoch < 10:
#         return lr
#     else:
#         return lr * tf.math.exp(-0.1)

# callback = tf.keras.callbacks.LearningRateScheduler(scheduler)


In [None]:
import math

import tensorflow as tf


def scheduler(epoch, lr):
    """Learning-rate schedule: constant for 10 epochs, then exponential decay.

    Args:
        epoch: Zero-based index of the epoch about to start.
        lr: Learning rate currently in use.

    Returns:
        `lr` unchanged while `epoch < 10`, otherwise `lr * exp(-0.1)` as a
        plain Python float.
    """
    if epoch < 10:
        return lr
    return float(lr * math.exp(-0.1))


class LRSchedulerCallback(tf.keras.callbacks.LearningRateScheduler):
    """Applies `scheduler` at the start of every epoch.

    Delegates to the built-in Keras callback, which reads and writes the
    optimizer's learning rate itself, instead of touching `optimizer.lr` and
    overriding `set_model` by hand.
    """

    def __init__(self):
        super().__init__(scheduler)


callback = LRSchedulerCallback()

# Train the pair-matching model (`song_model`) on the positive/negative pair
# set with its 0/1 labels. Previously this cell compiled and fitted `model`
# (the dance-type classifier) on the one-hot labels, leaving `song_model`
# untrained when it is saved in the next cell.
song_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

song_model.fit([pose_combined, audio_combined], labels_combined, batch_size=16, epochs=100, callbacks=[callback], validation_split=0.2)


In [None]:
# Save the pair-matching model to an HDF5 file in the current working directory.
song_model.save('song_preds.h5')

In [None]:
# Load the saved pair-matching model
loaded_model = tf.keras.models.load_model('song_preds.h5')

# Score it on the (pose, audio) pair set with its 0/1 match labels.
# NOTE(review): these are the pairs the model was fitted on, so this is a
# sanity check rather than a held-out test score.
predictions = loaded_model.predict([pose_combined, audio_combined])

# The model has a single sigmoid output, so each prediction is one probability.
# argmax over that single column is always 0; threshold at 0.5 instead.
predicted_classes = (predictions.reshape(-1) >= 0.5).astype(int)

# Compare with the pair labels (1 = matching pair, 0 = mismatched pair), not
# with the dance-type one-hot labels, which belong to the other model.
accuracy = np.mean(predicted_classes == labels_combined)

# Print the accuracy
print('Accuracy:', accuracy)



**Model to predict similarity between 2 songs using Siamese Network**<br/>
Input - there are 4 inputs: audio of song 1, poses of song 1, audio of song 2 and poses of song 2. <br/>
Output - A score between 0 & 1 of how similar 2 songs are. <br/>

How to use:
Predict the score for every pair of songs. If you are given a song (audio and video) and asked to recommend another song, return the song that has the highest similarity score with the given song.

How to train: Watch this youtube video https://www.youtube.com/watch?v=6jfw8MuKwpI&pp=ygUZYW5kcmV3IG5nIHNpYW1lc2UgbmV0d29yaw%3D%3D


In [None]:
# import tensorflow as tf
# model = tf.keras.models.load('dance_classifier.hdf5')

In [None]:
# extract_features = Model(inputs = [video_input, audio_input], outputs = model.layers[-2].output)

In [None]:
# from tensorflow.keras import backend as K

# pose_input_0 = Input(shape=(5384, 33, 3))
# audio_input_0 = Input(shape=(128, 17183))
# pose_input_1 = Input(shape=(5384, 33, 3))
# audio_input_1 = Input(shape=(128, 17183))

# features_1 = extract_features([pose_input_0, audio_input_0])
# features_2 = extract_features([pose_input_1, audio_input_1])

# def cosine_similarity(vectors):
#     x, y = vectors
#     x = K.l2_normalize(x, axis=-1)
#     y = K.l2_normalize(y, axis=-1)
#     return K.sum(x * y, axis=-1, keepdims=True)

# similarity = Lambda(cosine_similarity)([features_1, features_2])

# # Create the Siamese network
# siamese_model = Model(inputs=[pose_input_0, audio_input_0, pose_input_1, audio_input_1], outputs=similarity)

In [None]:
# plot_model(siamese_model, show_shapes=True)

In [None]:
# siamese_model.summary()