In [6]:
import torch
from torch import nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import Levenshtein
import time
import torchaudio
import librosa

from morse.models import CNNResidualBlock, TransformerResidualBlock, PoolingTransition, CNNTransformer, CTCHead
from morse.models import MySomething
from morse.models import SimpleCNN
from morse.my_datasets import ListDataset, load_tensors, filenames_to_torch
from morse.samplers import LongCTCSampler
# from morse.augmentations import rotation_transform, volume_signal_transform
from morse.augmentations import make_volume_signal_transform, make_compose_transform, make_noise_signal_transform, make_runtime_rotation_transform, make_runtime_mel_bounded_noise_transform
from morse.text_helpers import Vectorizer, encode_to_morse, decode_from_morse

from morse.my_datasets import generate_dataset, read_dataset_from_files

In [7]:
# Where the label CSVs and the raw audio clips live, relative to this notebook.
labels_dir = '../'
# data_dir = '../data/melspec_nfft512_nc64'
audio_dir = '../morse_dataset'

# Build both CSV paths first, then read them; train.csv carries the transcripts.
train_csv_path = Path(labels_dir) / 'train.csv'
test_csv_path = Path(labels_dir) / 'test.csv'

full_train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)
full_train_df.head()

Unnamed: 0,id,message
0,1.opus,03ЩУЫЛПИГХ
1,2.opus,ЪЛТ0ДС6А3Г
2,3.opus,5ЭКЫБЗХЯН
3,4.opus,ЖЫЦОИ68КФ
4,5.opus,32Ю7МЫ ЗЛ


In [8]:
# The alphabet is every distinct character that occurs in the training
# transcripts, in sorted order; a character's position is its integer code.
all_train_text = ''.join(full_train_df['message'])
index_to_letter = sorted(set(all_train_text))
letter_to_index = {letter: index for index, letter in enumerate(index_to_letter)}
dictionary_size = len(index_to_letter)
pad_value = 0

print(index_to_letter)
print(dictionary_size)
print(letter_to_index)

# Quick sanity check of the text -> index mapping on a sample string.
vectorizer = Vectorizer(letter_to_index, index_to_letter)
print(vectorizer.text_transform('ПРИВЕТ #'))

[' ', '#', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'А', 'Б', 'В', 'Г', 'Д', 'Е', 'Ж', 'З', 'И', 'Й', 'К', 'Л', 'М', 'Н', 'О', 'П', 'Р', 'С', 'Т', 'У', 'Ф', 'Х', 'Ц', 'Ч', 'Ш', 'Щ', 'Ъ', 'Ы', 'Ь', 'Э', 'Ю', 'Я']
44
{' ': 0, '#': 1, '0': 2, '1': 3, '2': 4, '3': 5, '4': 6, '5': 7, '6': 8, '7': 9, '8': 10, '9': 11, 'А': 12, 'Б': 13, 'В': 14, 'Г': 15, 'Д': 16, 'Е': 17, 'Ж': 18, 'З': 19, 'И': 20, 'Й': 21, 'К': 22, 'Л': 23, 'М': 24, 'Н': 25, 'О': 26, 'П': 27, 'Р': 28, 'С': 29, 'Т': 30, 'У': 31, 'Ф': 32, 'Х': 33, 'Ц': 34, 'Ч': 35, 'Ш': 36, 'Щ': 37, 'Ъ': 38, 'Ы': 39, 'Ь': 40, 'Э': 41, 'Ю': 42, 'Я': 43}
tensor([27, 28, 20, 14, 17, 30,  0,  1])


In [9]:
# Device used for the model (`.to(device)`) and for every feature tensor fed to it.
device = 'cpu'

In [10]:
# Training-time settings. Nothing visible in this notebook reads them after this
# cell (it only runs inference); presumably kept to mirror the training run.
batch_size = 256
lr = 1e-4
step_gamma = 0.051
dropout = 0.0838

# Layer widths. These have to match the checkpoint loaded below, otherwise
# load_state_dict raises on missing/unexpected keys or shape mismatches.
d_input = 64
d_model = 128
d_inner = 128
# One class more than the alphabet -- presumably the CTC blank; TODO confirm against CTCHead.
d_output = dictionary_size + 1

n_pools = 4
n_blocks_before_pool = 3
n_transformer_blocks = 5
num_heads = 4       # author's note: might be important

checkpoint_path = Path('../models/CNNTransformer_pretrained_30ep__lr=1e-4_30ep.pt')

model = CNNTransformer(
    d_input=d_input,
    d_model=d_model,
    n_pools=n_pools,
    n_blocks_before_pool=n_blocks_before_pool,
    n_transformer_blocks=n_transformer_blocks,
    dropout=dropout,
    head_block=CTCHead(d_model, d_output),
    # Factories rather than instances: each call builds a fresh block.
    make_cnn_block=lambda: CNNResidualBlock(d_model, d_inner, dropout=dropout),
    make_transformer_block=lambda: TransformerResidualBlock(
        d_model, d_ffn=d_inner, dropout=dropout, num_heads=num_heads),
    pooling_overlap=True,
).to(device)

# Map the stored tensors onto the same `device` the model was moved to, instead of
# a hard-coded CPU, so changing `device` in one place keeps both in sync.
# weights_only=True restricts unpickling to tensors and plain containers.
model.load_state_dict(torch.load(checkpoint_path, weights_only=True, map_location=device))

<All keys matched successfully>

In [11]:
# Hold out 1/6 of the labelled recordings for validation. The fixed random_state
# makes the split reproducible. `train_test_split` comes from the import cell at
# the top of the notebook, so it is not re-imported here.
train_index, val_index = train_test_split(np.arange(full_train_df.shape[0]), test_size=1/6, shuffle=True,
                                           random_state=42)

# Select the validation rows once and reuse them for both filenames and labels.
val_df = full_train_df.iloc[val_index]
real_val_set = read_dataset_from_files(audio_dir,
                                       filenames=val_df['id'],
                                       labels=list(val_df['message']))
print(len(real_val_set))

# The training part of the split (train_index) is not loaded in this notebook.

100%|██████████| 5000/5000 [01:03<00:00, 78.33it/s]

5000





In [12]:
# NOTE: this entire cell is disabled. It is an earlier validation pipeline that
# loaded tensors from `data_dir` with load_tensors / ListDataset. `data_dir` is
# itself commented out in the paths cell, so it must be restored before any of
# this can be re-enabled.
# def batch_text_transform(texts):
#     vecs, lengths = vectorizer.batch_text_transform(texts, pad_value=pad_value)
#     return vecs + 1, lengths

# from sklearn.model_selection import train_test_split

# train_index, val_index = train_test_split(np.arange(full_train_df.shape[0]), test_size=1/6, shuffle=True, 
#                                            random_state=42)
# print(train_index.shape, val_index.shape)
# val_features = list(tqdm(load_tensors(data_dir, filenames_to_torch(list(full_train_df.iloc[val_index]['id'])))))
# val_labels = list(full_train_df.iloc[val_index]['message'])
# valset = ListDataset(val_features, val_labels)
# print(len(valset))

In [13]:
# Score at most 1000 validation recordings. Capping by the dataset length means a
# smaller validation split cannot raise IndexError.
n_val_samples = min(1000, len(real_val_set))

val_ctc_probs = []
val_ctc_labels = []

model.eval()
with torch.no_grad():
    # Iterating a range (rather than a bare generator) gives tqdm a known total,
    # so the progress bar shows percentage and ETA.
    for sample_index in tqdm(range(n_val_samples)):
        features, labels = real_val_set[sample_index]
        features = features.to(device)
        # features[None] adds a leading batch axis of size 1; squeeze() drops it
        # again (together with any other size-1 axes) from the model output.
        outs = model(features[None]).squeeze().to('cpu')
        # Normalise along axis 0 -- presumably the class axis; TODO confirm the
        # model's output layout.
        probs = F.softmax(outs, dim=0)
        val_ctc_probs.append(probs)
        val_ctc_labels.append(labels)

print(len(val_ctc_probs))

1000it [00:10, 92.56it/s]

1000





In [None]:
# Decode every stored probability matrix and measure the edit distance between
# the decoded text and its reference transcript.
distance_buffer = []
val_progress = tqdm(val_ctc_probs)
for prob, label in zip(val_progress, val_ctc_labels):
    seqs, likelihood = LongCTCSampler.sample(prob, beam_size=10)
    # Shift indices down by one before mapping back to characters -- presumably
    # because class 0 of the model output is the CTC blank; TODO confirm.
    shifted_indices = torch.tensor(seqs) - 1
    decoded_message = vectorizer.from_tensor(shifted_indices)
    distance_buffer.append(Levenshtein.distance(decoded_message, label))

val_mean_dist = np.mean(distance_buffer)
print(val_mean_dist)

  0%|          | 0/1000 [00:00<?, ?it/s]

100%|██████████| 1000/1000 [02:06<00:00,  7.90it/s]

0.324





In [16]:
# The test split has no transcripts, so pass one empty placeholder label per
# file. The count is taken from test_df instead of a hard-coded 5000, so it
# always matches the number of filenames.
test_set = read_dataset_from_files(audio_dir,
                                   filenames=test_df['id'],
                                   labels=[''] * len(test_df))
print(len(test_set))

100%|██████████| 5000/5000 [01:00<00:00, 82.55it/s]

5000





In [18]:
# Keep only the first element (the features) of every test sample; the
# placeholder labels are not needed for inference.
test_features = []
for sample_index in range(len(test_set)):
    sample = test_set[sample_index]
    test_features.append(sample[0])
print(len(test_features))

5000


In [19]:
# Run the model over every test recording, one at a time, and keep the
# softmax-normalised outputs on the CPU for the decoding step.
test_ctc_probs = []

model.eval()
with torch.no_grad():
    for features in tqdm(test_features):
        # [None] adds the size-1 batch axis the model is called with;
        # squeeze() removes it (and any other size-1 axes) afterwards.
        batch = features.to(device)[None]
        outs = model(batch).squeeze().to('cpu')
        test_ctc_probs.append(F.softmax(outs, dim=0))

print(len(test_ctc_probs))

100%|██████████| 5000/5000 [00:53<00:00, 93.98it/s] 

5000





In [20]:
# Turn each test probability matrix into text with the same sampler settings
# (beam_size=10) that were used on the validation subset.
test_decoded_list = []
for prob in tqdm(test_ctc_probs):
    seqs, likelihood = LongCTCSampler.sample(prob, beam_size=10)
    # Indices are shifted down by one before the lookup -- presumably to skip the
    # CTC blank class at position 0; TODO confirm.
    shifted_indices = torch.tensor(seqs) - 1
    test_decoded_list.append(vectorizer.from_tensor(shifted_indices))
print(len(test_decoded_list))

100%|██████████| 5000/5000 [05:07<00:00, 16.27it/s]

5000





In [21]:
# Pair every test file id with its decoded message: start from the id column
# and attach the predictions as a second column named 'message'.
submission_df = test_df[['id']].assign(message=test_decoded_list)
submission_df.head()

Unnamed: 0,id,message
0,30001.opus,ЯЮ6ЛИТЖБШ
1,30002.opus,КЩ В9Ю 9
2,30003.opus,Ы65Ф61Я
3,30004.opus,ЖЖНЖ9РЫНЦ3
4,30005.opus,ЕЯФ4ЮЧЬ


In [23]:
# Write the predictions next to the label CSVs; index=False keeps the DataFrame
# index out of the file so only the `id` and `message` columns are written.
submission_df.to_csv('../submission.csv', index=False)