In [60]:
import numpy as np
import pandas as pd
import os
import random
from pathlib import Path as pt
import torch
import torchaudio
from torch import nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from torchaudio import transforms
from torchvision.transforms import v2
# from Moduls.MosreDataset import MosreDataset
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

DEVICE = torch.device("cuda")

MAIN = pt(os.getcwd())
DATASET_PATCH = MAIN / 'morse_dataset'
AUDIO_FILES = DATASET_PATCH / 'morse_dataset'

# Поятоянные значения выявленные в процессе анализа
MORSEALP = "АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ 1234567890#"
MAX_TIME = 48
SAMPLE_RATE = 8000
N_MELS = 128
N_FFT = 400
HOP_LENGTH = 180
TOP_DB = 80
FREQ_MASK = 30
TIME_MASK = 40

# Гиперпараметы обучения
SEED = 42
BATCH_SIZE = 32
EPOCHS = 50
LEARNING_RATE = 0.001 #2e-4
WEIGHT_DECAY = 0.0001
# int_to_alph = dict(enumerate(MORSEALP))
# alph_to_int = {char:enum for enum, char in int_to_alph.items()}

#===== Import data =====
train_data = pd.read_csv(pt.joinpath(DATASET_PATCH,'train.csv'))
test_data = pd.read_csv(pt.joinpath(DATASET_PATCH,'test.csv'))
sample_data = pd.read_csv(pt.joinpath(DATASET_PATCH,'sample_submission.csv'))



In [61]:
class MosreDataset(Dataset):
    """
    Класс для обработки 
    """
    def __init__(self, data_patch, train=True, transforms=None, prev_chars = 1):
        self.is_train = train
        self.data_path = data_patch
        self.audio_paths = self.data_path / 'morse_dataset'
        self.transforms = transforms
        self.morse_alp = "АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ 1234567890#"
        self.int_to_alph = dict(enumerate(MORSEALP))
        self.alph_to_int = {char:enum for enum, char in self.int_to_alph.items()}
        self.prev_chars = prev_chars

        if self.is_train:
            self.data =  pd.read_csv(pt.joinpath(self.data_path,'train.csv'))
            self.messeges = self.data.message.values

        else:
            self.data =  pd.read_csv(pt.joinpath(self.data_path,'test.csv'))

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        #Получение аугментрованых спектрограмм
        try:
            audio_file = self.audio_paths / self.data.id.values[index]
            waveform = self.change_time(audio_file)
            augmented_spectrogram = self.transforms(waveform)
            if self.is_train:
                #Получение отдельных one-hot векторов на наждый символ
                message = self.messeges[index]
                
                onehots = torch.eye(len(self.morse_alp))
                indices = [self.alph_to_int[char] for char in message]
                onehot_vectors = onehots[indices]
                return augmented_spectrogram, onehot_vectors
            else:
                return augmented_spectrogram
        except Exception as ex:
            print(str(ex))
        
    def change_time(self, audio_file, max_len = 384000):
        waveform, sample_rate = torchaudio.load(audio_file)
        cahanal, sig_len = waveform.shape

        if sig_len < max_len:
            pad_len = torch.zeros(max_len - sig_len).unsqueeze(0)
            waveform = torch.cat([waveform, pad_len], dim=1)

        return waveform

# Загрузка датасета

In [62]:
audio_transforms = nn.Sequential(
    transforms.MelSpectrogram(sample_rate=SAMPLE_RATE, n_fft=N_FFT, n_mels=N_MELS),
    transforms.AmplitudeToDB(top_db=TOP_DB),
    transforms.FrequencyMasking(freq_mask_param=FREQ_MASK),
    transforms.TimeMasking(time_mask_param=TIME_MASK),
    v2.RandomCrop((N_MELS, 1920)) # Обрезает последний кадр спектрограммы, в идеале надобы считать а не прописывать число
)

d_train = MosreDataset(data_patch=DATASET_PATCH, train=True, transforms=audio_transforms)

train_ds, val_ds = random_split(d_train, [0.8, 0.2])

def my_collate(batch):
    data = torch.stack([item[0] for item in batch])
    target = torch.nn.utils.rnn.pad_sequence(
                                            [item[1] for item in batch], 
                                            batch_first=True, 
                                            padding_value=0) # выравнивает последовательность до макс 
                                                            # длины в датче заполняя пропуски нулем
    return [data, target]

train_dl = torch.utils.data.DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, collate_fn=my_collate)
val_dl = torch.utils.data.DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, collate_fn=my_collate)

In [54]:
next(iter(train_dl))[0].shape

torch.Size([32, 1, 128, 1920])

Вывод - torch.Size([32, 1, 128, 1921]) => нужно обрезать посдлелний кадр в спектрограмме 1921 -> 1920(torchvision.transformes.RandomCrop). Или подобрать Гиперпараметры(не вышло)

# Класс модели

In [125]:
FIRST_FE_COUNT = 16
SECOND_FE_COUNT = 32
THIRD_FE_COUNT = 64
QAD_FE_COUNT = 128
PADDING = 'same'
MAXPOOL_KERNEL = 2

NERON_COUNT = 128
# Start with 4 transforms
class MorseNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.net_conv = nn.Sequential(
            nn.Conv2d(in_channels=1, 
                      out_channels=FIRST_FE_COUNT, 
                      kernel_size=3, stride=1, padding=PADDING),
            nn.BatchNorm2d(FIRST_FE_COUNT),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=MAXPOOL_KERNEL), # [batch, FIRST_FE_COUNT = 32, 64, 960]

            nn.Conv2d(in_channels=FIRST_FE_COUNT, 
                      out_channels=SECOND_FE_COUNT, 
                      kernel_size=3, stride=1, padding=PADDING),
            nn.BatchNorm2d(SECOND_FE_COUNT),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=MAXPOOL_KERNEL), # [batch, FIRST_FE_COUNT = 32, 32, 480]

            nn.Conv2d(in_channels=SECOND_FE_COUNT, 
                      out_channels=THIRD_FE_COUNT, 
                      kernel_size=3, stride=1, padding=PADDING),
            nn.BatchNorm2d(THIRD_FE_COUNT),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=MAXPOOL_KERNEL), # [batch, FIRST_FE_COUNT = 32, 16, 240]

            nn.Conv2d(in_channels=THIRD_FE_COUNT, 
                      out_channels=QAD_FE_COUNT, 
                      kernel_size=3, stride=1, padding=PADDING),
            nn.BatchNorm2d(QAD_FE_COUNT),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 1)), # [batch=32, FIRST_FE_COUNT = 32, 8, 240](что юы сохраниить большще признаков по горизонтали)

            nn.Flatten(), # [batch=32,122880]

            nn.Linear(QAD_FE_COUNT * 8 * 240, NERON_COUNT, bias=False),
            nn.GELU(), # -> тут получается набор ...

            nn.GRU(input_size=NERON_COUNT,hidden_size=NERON_COUNT, num_layers=3, bidirectional=True),
            nn.MultiheadAttention(embed_dim=NERON_COUNT*2, 
                                  num_heads = NERON_COUNT,
                                  batch_first=True),
            nn.Linear(NERON_COUNT*2, len(MORSEALP))                      
            # # nn.GRU()
        )

    def forward(self, x):
        return(self.net_conv(x))

Переменные для обучения

In [126]:
model = MorseNet().to(DEVICE)
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
loss_func = nn.CTCLoss()

x, y = next(iter(train_dl))
a = model(x.to(DEVICE))

OutOfMemoryError: CUDA out of memory. Tried to allocate 480.00 MiB. GPU 0 has a total capacity of 10.00 GiB of which 0 bytes is free. Of the allocated memory 23.87 GiB is allocated by PyTorch, and 297.73 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [122]:
a[0].shape

torch.Size([32, 256])