In [50]:
# !pip install torchsummary
# !pip install torchvision

In [12]:
import os

import torch
from torch import nn
import torchaudio, torchvision
from torchsummary import summary
import numpy as np
from torch.utils.data import DataLoader
from pydub import AudioSegment
from pydub.silence import detect_nonsilent
from moviepy.editor import *

In [52]:
def tensor_to_numpy(names=('data_test', 'data_train', 'label_test', 'label_train'),
                    file_save='features_labels.npy'):
    """Pack several saved ``.pt`` tensors into a single multi-array ``.npy`` file.

    Each ``<name>.pt`` is loaded with ``torch.load`` and appended to
    ``file_save`` with ``np.save``, in the order given by ``names``.
    3-D tensors get a singleton axis inserted at position 1 (so
    ``(N, F, T)`` becomes ``(N, 1, F, T)``); tensors of any other rank are
    written unchanged.

    Args:
        names: Base names (without the ``.pt`` suffix) of the tensors to pack.
        file_save: Path of the output file.

    NOTE(review): the default output name is 'features_labels.npy' while
    ``load_data`` in this notebook reads 'features_labels_500.npy' — confirm
    which file is intended.
    """
    with open(file_save, 'wb') as f:
        for name in names:
            tensor = torch.load(name + '.pt')
            if tensor.dim() == 3:
                # Same result as reshape((N, 1, 40, T)) for 40-feature input,
                # but does not break when the feature count differs.
                tensor = tensor.unsqueeze(1)
            # detach()/cpu() make the conversion safe for tensors that require
            # grad or live on a GPU; both are no-ops for plain CPU tensors.
            array = tensor.detach().cpu().numpy()
            np.save(f, array)

In [53]:
# Pack the saved .pt tensors into 'features_labels.npy' (the function's output file).
# NOTE(review): load_data() in this notebook reads 'features_labels_500.npy',
# not the file written here — confirm which file is intended.
tensor_to_numpy()

In [54]:
def load_data(filename='features_labels_500.npy'):
    """Read the four arrays stored back-to-back in one ``.npy`` file.

    The file is expected to contain four consecutive ``np.save`` records;
    they are returned in the order they are read.

    Args:
        filename: Path of the packed file. Defaults to the file name this
            notebook originally hard-coded.

    Returns:
        Tuple ``(X_test, X_train, Y_test, Y_train)``.
    """
    with open(filename, 'rb') as f:
        # Each np.load call consumes exactly one saved array from the stream.
        X_test, X_train, Y_test, Y_train = (np.load(f) for _ in range(4))
    return X_test, X_train, Y_test, Y_train

In [55]:
# Unpack in the order load_data() returns them: test features, train features,
# test labels, train labels.
X_test, X_train, Y_test, Y_train = load_data()

# Print the array shapes as a quick sanity check of the split.
print(f'X_train:{X_train.shape}, y_train:{Y_train.shape}')
print(f'X_test:{X_test.shape}, y_test:{Y_test.shape}')

X_train:(6023, 1, 40, 500), y_train:(6023, 3)
X_test:(1506, 1, 40, 500), y_test:(1506, 3)


## Definition

In [56]:
# Model flow
class define_model(nn.Module):
    """Two parallel CNN branches plus a Transformer encoder, joined by one linear layer.

    The input is a 4-D batch ``(batch, 1, n_mfcc, n_frames)``. Each CNN branch
    produces a flattened embedding; the Transformer branch max-pools the time
    axis by 4, treats every remaining frame as a token of width ``n_mfcc`` and
    averages the encoder output over time. The three embeddings are
    concatenated and projected to ``num_emotions`` logits.

    Args:
        num_emotions: Number of output classes.
        n_mfcc: Size of the feature axis (height) of the input; also the
            Transformer ``d_model``. Must be divisible by the 4 attention heads.
        n_frames: Size of the time axis (width) of the input; only used to
            size the final linear layer.

    The defaults (40, 500) reproduce the original hard-coded
    ``960*2+40`` input width of the classifier.
    """

    _NUM_HEADS = 4

    @staticmethod
    def _make_conv_block():
        """Build one Conv-BN-ReLU-MaxPool-Dropout stack (1 -> 16 -> 32 -> 64 channels)."""
        layers = []
        in_channels = 1
        # (out_channels, pool size) per stage; pooling shrinks both axes by 2*4*4 = 32.
        for out_channels, pool in ((16, 2), (32, 4), (64, 4)):
            layers += [
                nn.Conv2d(
                    in_channels=in_channels,
                    out_channels=out_channels,
                    kernel_size=3,
                    stride=1,
                    padding=1
                ),
                nn.BatchNorm2d(out_channels),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=pool, stride=pool),
                nn.Dropout(p=0.3),
            ]
            in_channels = out_channels
        return nn.Sequential(*layers)

    # Define layers
    def __init__(self, num_emotions, n_mfcc=40, n_frames=500):
        super().__init__()

        if n_mfcc % self._NUM_HEADS != 0:
            raise ValueError(
                f'n_mfcc ({n_mfcc}) must be divisible by the number of attention heads ({self._NUM_HEADS})'
            )
        # Each MaxPool floors its output size, so mirror that with integer division.
        pooled_height = n_mfcc // 2 // 4 // 4
        pooled_width = n_frames // 2 // 4 // 4
        conv_features = 64 * pooled_height * pooled_width
        if conv_features <= 0:
            raise ValueError(
                f'input of size ({n_mfcc}, {n_frames}) is too small for the three pooling stages'
            )

        transformer_layer = nn.TransformerEncoderLayer(
            d_model=n_mfcc,
            nhead=self._NUM_HEADS,
            dim_feedforward=512,
            dropout=0.4,
            activation='relu'
        )
        # Pool only along the time axis so the token width stays n_mfcc.
        self.transformer_maxpool = nn.MaxPool2d(kernel_size=[1,4], stride=[1,4])
        self.transformer_encoder = nn.TransformerEncoder(transformer_layer, num_layers=4)

        # Two independently-initialised copies of the same architecture.
        self.conv2Dblock1 = self._make_conv_block()
        self.conv2Dblock2 = self._make_conv_block()

        self.fc1_layer = nn.Linear(conv_features * 2 + n_mfcc, num_emotions)
        self.softmax_out = nn.Softmax(dim=1)

    def forward(self, x):
        """Return ``(logits, softmax probabilities)``, each of shape ``(batch, num_emotions)``."""
        conv2d_embedding1 = torch.flatten(self.conv2Dblock1(x), start_dim=1)
        conv2d_embedding2 = torch.flatten(self.conv2Dblock2(x), start_dim=1)

        # (batch, 1, n_mfcc, T) -> (batch, n_mfcc, T/4) -> (T/4, batch, n_mfcc):
        # the encoder is built with the default sequence-first layout.
        x_maxpool_reduced = torch.squeeze(self.transformer_maxpool(x), 1)
        sequence = x_maxpool_reduced.permute(2, 0, 1)
        transformer_output = self.transformer_encoder(sequence)
        # Average over the time axis to get one vector per sample.
        transformer_embedding = torch.mean(transformer_output, dim=0)

        complete_embedding = torch.cat(
            [conv2d_embedding1, conv2d_embedding2, transformer_embedding], dim=1
        )
        output_logits = self.fc1_layer(complete_embedding)
        output_softmax = self.softmax_out(output_logits)
        return output_logits, output_softmax

In [57]:
# Training step
def make_train_step(model, criterion, optimizer):
    """Create a closure that performs one optimisation step on a minibatch.

    The returned ``train_step(X, Y)`` expects ``model(X)`` to yield
    ``(logits, softmax)`` and ``Y`` to hold one row of class scores per
    sample (reduced to class indices with ``argmax``). It returns
    ``(loss as float, accuracy in percent)``.
    """

    def train_step(X, Y):
        logits, probabilities = model(X)
        target_classes = torch.argmax(Y, dim=1)
        predicted_classes = torch.argmax(probabilities, dim=1)

        n_correct = torch.sum(target_classes == predicted_classes)
        accuracy = n_correct / float(len(target_classes))

        loss = criterion(logits, target_classes)
        loss.backward()
        optimizer.step()
        # Gradients are cleared after the update so the next call starts clean.
        optimizer.zero_grad()
        return loss.item(), accuracy * 100

    return train_step

#validation step
def make_validate_fnc(model, criterion):
    """Create a closure that scores the model on a batch without tracking gradients.

    The returned ``validate(X, Y)`` switches the model to eval mode (and
    leaves it there), then returns
    ``(loss as float, accuracy in percent, predicted class indices)``.
    """
    def validate(X, Y):
        model.eval()
        with torch.no_grad():
            logits, probabilities = model(X)
            predictions = torch.argmax(probabilities, dim=1)
            target_classes = torch.argmax(Y, dim=1)
            n_correct = torch.sum(target_classes == predictions)
            accuracy = n_correct / float(len(target_classes))
            loss = criterion(logits, target_classes)
        return loss.item(), accuracy * 100, predictions
    return validate

In [58]:
def make_save_checkpoint():
    """Return a function that writes optimizer state, model state and epoch to disk."""
    def save_checkpoint(optimizer, model, epoch, filename):
        # Keys match what load_checkpoint reads back: 'optimizer', 'model', 'epoch'.
        state = dict(
            optimizer=optimizer.state_dict(),
            model=model.state_dict(),
            epoch=epoch,
        )
        torch.save(state, filename)
    return save_checkpoint

def load_checkpoint(optimizer, model, filename, map_location=None):
    """Restore model (and optionally optimizer) state from a checkpoint file.

    The file must hold a dict with the keys 'epoch', 'model' and
    'optimizer', as written by ``save_checkpoint`` in this notebook.

    Args:
        optimizer: Optimizer to restore, or ``None`` to skip optimizer state.
        model: Module whose ``load_state_dict`` receives the saved weights.
        filename: Path of the checkpoint file.
        map_location: Forwarded to ``torch.load``; pass e.g. ``'cpu'`` to open
            a checkpoint saved on a GPU on a machine without one. ``None``
            keeps ``torch.load``'s default behaviour.

    Returns:
        The epoch number stored in the checkpoint.
    """
    # NOTE: torch.load unpickles the file — only open checkpoints you trust.
    checkpoint_dict = torch.load(filename, map_location=map_location)
    epoch = checkpoint_dict['epoch']
    model.load_state_dict(checkpoint_dict['model'])
    if optimizer is not None:
        optimizer.load_state_dict(checkpoint_dict['optimizer'])
    return epoch

## Training loop

In [59]:
# Class index (as a string) -> emotion name; its length sets the number of model outputs.
emotions_dict = {
    '0': 'positive',
    '1': 'neutral',
    '2': 'negative',
}

# Use the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'{device} selected')

model = define_model(len(emotions_dict)).to(device)
print('Number of trainable params: ', sum(p.numel() for p in model.parameters()))

# Define optimizer
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=0.01,
    weight_decay=1e-3,
    momentum=0.8,
)


# Define loss/ criterion
def criterion(predictions, targets):
    """Cross-entropy between raw logits and integer class targets."""
    loss_fn = nn.CrossEntropyLoss()
    return loss_fn(input=predictions, target=targets)


# Bind the helper closures to this model / optimizer / loss.
save_checkpoint = make_save_checkpoint()
train_step = make_train_step(model, criterion, optimizer)
validate = make_validate_fnc(model, criterion)

# Per-epoch loss histories.
train_losses = []
valid_losses = []

cpu selected


Number of trainable params:  245851


In [60]:
def train(optimizer, model, num_epochs, X_train, Y_train, X_valid, Y_valid, list_acc, list_loss, minibatch=128):
    """Run the minibatch training loop with per-epoch validation and checkpointing.

    Relies on the notebook-level ``train_step``, ``validate``,
    ``save_checkpoint`` and ``device`` defined in the training-setup cell.

    Args:
        optimizer: Optimizer whose state is stored in each checkpoint.
        model: Module being trained; switched to train mode every epoch.
        num_epochs: Number of passes over the training set.
        X_train, Y_train: Training features and labels, indexable along axis 0.
        X_valid, Y_valid: Validation features and labels, scored after each epoch.
        list_acc: List that receives the training accuracy of each epoch.
        list_loss: List that receives the training loss of each epoch.
        minibatch: Batch size; the last batch of an epoch may be smaller.

    Raises:
        ValueError: If the training set is empty.
    """
    train_size = X_train.shape[0]
    if train_size == 0:
        raise ValueError('X_train is empty')

    # The validation set is fixed, so convert it once rather than every epoch.
    # These are the X_valid / Y_valid arguments, not the notebook globals.
    X_valid_tensor = torch.tensor(X_valid, device = device).float()
    Y_valid_tensor = torch.tensor(Y_valid, dtype = torch.long, device = device)

    for epoch in range(num_epochs):
        #set to train phase
        model.train()

        # Shuffle by index so the full arrays are not copied every epoch.
        train_indices = np.random.permutation(train_size)

        epoch_acc = 0
        epoch_loss = 0

        # Stepping through the index range includes the final partial batch, so
        # the per-batch weights (actual_batch_size / train_size) sum to 1.
        for batch_start in range(0, train_size, minibatch):
            batch_indices = train_indices[batch_start:batch_start + minibatch]
            actual_batch_size = len(batch_indices)

            X_tensor = torch.tensor(X_train[batch_indices], device = device).float()
            Y_tensor = torch.tensor(Y_train[batch_indices], dtype = torch.long, device = device)

            loss, acc = train_step(X_tensor, Y_tensor)

            epoch_acc += acc * actual_batch_size / train_size
            epoch_loss += loss * actual_batch_size / train_size

        # as_tensor tolerates an accuracy that is a plain number instead of a tensor.
        list_acc.append(torch.as_tensor(epoch_acc).cpu().numpy())
        list_loss.append(epoch_loss)
        valid_loss, valid_acc, _ = validate(X_valid_tensor,Y_valid_tensor)

        checkpoint_filename = 'cnn_transformerFINAL-{:03d}.pkl'.format(epoch)
        save_checkpoint(optimizer, model, epoch, checkpoint_filename)

        print(f'\nEpoch {epoch} --- loss:{epoch_loss:.3f}, Epoch accuracy:{epoch_acc:.2f}%, Validation loss:{valid_loss:.3f}, Validation accuracy:{valid_acc:.3f}')

In [1]:
# num_epochs = 100
# list_acc = []
# list_loss = []
# train(optimizer, model, num_epochs, X_train, Y_train, X_test, Y_test, list_acc, list_loss)

### Why is the validation loss NaN?

In [None]:
def check_nan(datasets):
    """Report whether any array in ``datasets`` contains a NaN.

    Args:
        datasets: Iterable of array-likes accepted by ``np.isnan``.

    Returns:
        ``True`` as soon as one array with a NaN is found, ``False`` otherwise
        (including when ``datasets`` is empty).
    """
    return any(bool(np.isnan(data).any()) for data in datasets)

In [2]:
# check_nan([X_train, X_test, Y_train, Y_test])

## Prediction

In [28]:
def preprocess_real_data(file):
    """Turn a recording into a silence-stripped WAV file next to the input.

    An ``.mp4`` input (extension matched case-insensitively) first has its
    audio track exported to ``<stem>.wav``. The WAV is then reduced to its
    non-silent segments and written to ``<stem>_denoised.wav``.

    Args:
        file: Path to an ``.mp4`` video or a ``.wav`` audio file.

    Returns:
        Path of the ``*_denoised.wav`` file that was written.

    Raises:
        ValueError: If an ``.mp4`` input has no audio track.
    """
    def mp4_to_wav(file):
        """Export the audio track of ``file`` as ``<stem>.wav`` and return that path."""
        wav_path = os.path.splitext(file)[0] + '.wav'
        # Load the MP4 file
        video = VideoFileClip(file)
        try:
            # Extract the audio from the video
            audio = video.audio
            if audio is None:
                raise ValueError(f'{file} has no audio track')
            # Export the audio as a WAV file
            audio.write_audiofile(wav_path)
        finally:
            # Release the reader processes/handles held by the clip.
            video.close()
        return wav_path

    def remove_noise(file):
        """Drop silent stretches from ``file`` and write ``<stem>_denoised.wav``."""
        # Detect non-silent parts of the audio; the threshold is the clip's own
        # average loudness (dBFS).
        sound_file = AudioSegment.from_wav(file)
        non_sil_times = detect_nonsilent(sound_file, min_silence_len=40, silence_thresh=sound_file.dBFS * 1)

        # Merge neighbouring segments separated by a gap of less than 100
        # (same units as the detect_nonsilent output).
        merged = []
        for start, end in non_sil_times:
            if merged and start - merged[-1][1] < 100:
                merged[-1] = (merged[-1][0], end)
            else:
                merged.append((start, end))

        # Concatenate the non-silent parts of the audio
        if merged:
            new_audio = sound_file[merged[0][0]:merged[0][1]]
            for start, end in merged[1:]:
                new_audio += sound_file[start:end]
        else:
            # Nothing detected as non-silent: keep the audio untouched.
            new_audio = sound_file

        # Export the new audio file
        out_path = os.path.splitext(file)[0] + '_denoised.wav'
        exported = new_audio.export(out_path, format="wav")
        # export() hands back the output file object; close it if so.
        if hasattr(exported, 'close'):
            exported.close()
        return out_path

    if os.path.splitext(file)[1].lower() == '.mp4':
        file = mp4_to_wav(file)
    return remove_noise(file)
    

In [None]:
# NOTE(review): this cell reads like generic image-classification boilerplate and
# does not match the rest of the notebook — confirm it is still needed:
#   * it rebinds the notebook-level `model` created in the training-setup cell;
#   * no visible cell writes 'model.pt' (checkpoints are saved as
#     'cnn_transformerFINAL-XXX.pkl' state dicts);
#   * the sample input has 3 channels, while the network defined above starts
#     with Conv2d(in_channels=1).
import torch
from torchvision import transforms

# Load the saved model
# NOTE(review): torch.load unpickles the file — only load files you trust.
model = torch.load('model.pt')
model.eval()  # Set the model to evaluation mode

# Define a sample input (you will need to replace this with your actual data)
sample_input = torch.randn(1, 3, 224, 224)  # Example input with the shape (batch_size, channels, height, width)

# Preprocess the input data
# NOTE(review): ToTensor is presumably meant for PIL images / ndarrays, but
# sample_input is already a tensor — confirm this pipeline runs as written.
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Example normalization values, adjust as needed
])
# NOTE(review): sample_input already carries a batch axis, so unsqueeze(0)
# would add a second one.
input_data = preprocess(sample_input).unsqueeze(0)  # Preprocess the sample input

# Perform the prediction
with torch.no_grad():
    output = model(input_data)  # Call the model on the preprocessed input data

# Process the output as needed
print(output)

In [29]:
# Extract the audio of 'file.mp4' to 'file.wav', then write the silence-stripped
# 'file_denoised.wav' that the MFCC cell below loads.
preprocess_real_data('file.mp4')

MoviePy - Writing audio in file.wav


                                                                    

MoviePy - Done.




In [34]:
# Compute MFCC features for the denoised recording and save them as a tensor.
waveform, sample_rate = torchaudio.load('file_denoised.wav')
# Only the sample rate is set; every other MFCC option keeps its library default.
# NOTE(review): presumably the default coefficient count matches the 40 features
# the model expects — TODO confirm.
transform = torchaudio.transforms.MFCC(sample_rate=sample_rate)
mfcc = transform(waveform)
# NOTE(review): the training arrays are 4-D (N, 1, 40, 500); this tensor has not
# been padded/cropped or reshaped to that layout — confirm before feeding the model.
torch.save(mfcc,'file_denoised.pt')

In [33]:
# mfcc = torch.load('file_denoised.pt')

# # Convert the audio data to a NumPy array
# audio_array = mfcc.numpy()

# # Create an AudioSegment from the NumPy array
# audio_segment = AudioSegment(audio_array.tobytes(), frame_rate=sample_rate, sample_width=2, channels=2)

# # Export the audio segment as a .wav file
# audio_segment.export('file_denoised2.wav', format='wav')
