In [26]:
# Import needed modules
import moviepy.editor as mp
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from pytorch_lightning import LightningModule, Trainer
import numpy as np

# Define data path
video_path = 'data/Yann Zurbrugg/1.mp4'

In [27]:
# Download dataset
# Login: wolftau
# Password: wtal997

In [32]:
# Split the video data in 2 seconds frames and audio
class VideoProcessor:
    def __init__(self, video_path):
        self.video_path = video_path
        self.output_path = 'preprocessed_data/' + video_path.split('/')[1]
    # Split the video into frames and save them in the output folder
    def split_video(self, split_duration=2):
        """Split the video into frames and save them in the output path"""
        video = mp.VideoFileClip(self.video_path)
        # Split the video to 2 second clips
        for i in range(0, int(video.duration), split_duration):
            # If the last clip is shorter than 2 seconds, ignore it
            if i + split_duration > video.duration:
                break
            output = self.output_path + '/clip' + str(i)
            if not os.path.exists(output):
                os.makedirs(output)
            subclip = (video.subclip(i, i + split_duration))
            subclip.write_images_sequence(output + '/frame%04d.jpg')
        print('Video split into', int(split_duration), 'second clips and saved in', self.output_path)

    def split_audio(self, split_duration=2):
        """Split the audio from the video and save it in the output path"""
        video = mp.VideoFileClip(self.video_path)
        # Split the audio to 2 second clips
        for i in range(0, int(video.duration), split_duration):
            # If the last clip is shorter than 2 seconds, ignore it
            if i + split_duration > video.duration:
                break
            output = self.output_path + '/clip' + str(i)
            if not os.path.exists(output):
                os.makedirs(output)
            subclip = video.subclip(i, i + split_duration)
            subclip.audio.write_audiofile(output + '/audio.wav')
        print('Audio split into', int(split_duration), 'second clips and saved in', self.output_path)

# Execute the video processor
video_processor = VideoProcessor(video_path)
video_processor.split_video()
video_processor.split_audio()

Moviepy - Writing frames preprocessed_data/Yann Zurbrugg/clip0/frame%04d.jpg.


                                                             

Moviepy - Done writing frames preprocessed_data/Yann Zurbrugg/clip0/frame%04d.jpg.
Video split into 2 second clips and saved in preprocessed_data/Yann Zurbrugg
MoviePy - Writing audio in preprocessed_data/Yann Zurbrugg/clip0/audio.wav


                                                       

MoviePy - Done.
Audio split into 2 second clips and saved in preprocessed_data/Yann Zurbrugg




In [29]:
# Create an LSTM model in lightning to extract the features from the audio data
class LSTM(LightningModule):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.hidden = self.init_hidden()

    def forward(self, x):
        h0, c0 = self.hidden
        out, (hn, cn) = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

    def init_hidden(self):
        h0 = torch.zeros(self.num_layers, 1, self.hidden_size).to(self.device)
        c0 = torch.zeros(self.num_layers, 1, self.hidden_size).to(self.device)
        return (h0, c0)

    def configure_optimizers(self):
        return optim.Adam(self.parameters(), lr=0.001)

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = F.mse_loss(y_hat, y)
        return loss


In [None]:
# Create the CNN model in lightning to extract the features from the video data
# Use the ResNet18 model from torchvision
class CNN(LightningModule):
    def __init__(self, output_size):
        super(CNN, self).__init__()
        self.resnet = torch.hub.load('pytorch/vision:v0.6.0', 'resnet18', pretrained=True)
        self.resnet.fc = nn.Linear(512, output_size)

    def forward(self, x):
        return self.resnet(x)

    def configure_optimizers(self):
        return optim.Adam(self.parameters(), lr=0.001)

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = F.mse_loss(y_hat, y)
        return loss

TypeError: <genexpr> is not a Module subclass

In [31]:

# Create the data loader
# Load the audio data
class AudioDataset(Dataset):
    def __init__(self, data_path):
        self.data_path = data_path
        self.samples = []
        for root, dirs, files in os.walk(data_path):
            for file in files:
                if file.endswith('.wav'):
                    self.samples.append(os.path.join(root, file))
        self.samples.sort()

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]
        audio = torch.load(sample)
        return audio
    
# Load audio data
audio_dataset = AudioDataset("model/preprocessed_data/Yann Zurbrugg/clip0/audio.wav")
audio_loader = DataLoader(audio_dataset, batch_size=1, shuffle=False)

# Initialize the LSTM model
input_size = 1
hidden_size = 128
num_layers = 2
output_size = 128
lstm = LSTM(input_size, hidden_size, num_layers, output_size)

# Summary of the model architecture
print(lstm)

# Train the model
trainer = Trainer(max_epochs=10)
trainer.fit(lstm, audio_loader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name | Type   | Params | Mode 
----------------------------------------
0 | lstm | LSTM   | 199 K  | train
1 | fc   | Linear | 16.5 K | train
----------------------------------------
215 K     Trainable params
0         Non-trainable params
215 K     Total params
0.863     Total estimated model params size (MB)
2         Modules in train mode
0         Modules in eval mode
`Trainer.fit` stopped: No training batches.


LSTM(
  (lstm): LSTM(1, 128, num_layers=2, batch_first=True)
  (fc): Linear(in_features=128, out_features=128, bias=True)
)
