In [None]:
# Run on Kaggle: fetch the project repository and make its `src` directory importable.

import os
import sys

from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()

GITHUB_TOKEN = user_secrets.get_secret("GITHUB_MORSE_TOKEN")
USER = "SwedishSquid"
REPO_NAME = 'KC25_morse'
# The token is embedded in the URL to authenticate the clone; never print or log CLONE_URL.
CLONE_URL = f"https://{USER}:{GITHUB_TOKEN}@github.com/{USER}/{REPO_NAME}.git"

# Clone only when the checkout is missing, so re-running this cell does not
# make git fail on an already existing destination directory.
if not os.path.isdir(REPO_NAME):
    get_ipython().system(f"git clone {CLONE_URL}")

# Avoid stacking duplicate entries on sys.path when the cell is executed again.
SRC_DIR = "/kaggle/working/KC25_morse/src"
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

# Must come after the path tweak above: the package lives inside the cloned repo.
import morse

In [None]:
!pip install Levenshtein

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import Levenshtein
import time

from morse.models import MySomething
from morse.my_datasets import ListDataset, load_tensors, filenames_to_torch
from morse.samplers import LongCTCSampler
from morse.augmentations import rotation_transform
from morse.text_helpers import Vectorizer

In [None]:
import os

import wandb
from kaggle_secrets import UserSecretsClient

# Weights & Biases reads both its credentials and its verbosity from the environment.
secret_value_0 = UserSecretsClient().get_secret('WANDB_API_KEY')
os.environ.update({
    "WANDB_API_KEY": secret_value_0,
    "WANDB_SILENT": "true",  # let there be no noise
})

# Keyword arguments shared by every wandb.init call in this notebook.
common_wandb_kvals = dict(project='KC25', entity='fishwere')

In [None]:
# Development switch: when True, load_data keeps only the first 1000 train and
# the first 1000 validation samples to save memory.
dev_flag = False

# Kaggle input directory with the preprocessed features; a sub-directory of it is passed to load_data.
all_data_dir = '/kaggle/input/kc25-preprocessed-data'
# Kaggle input directory from which train.csv and test.csv are read.
labels_dir = '/kaggle/input/kc25-dataset-copy'

In [None]:
# Read the train / test tables from the labels directory and preview the training one.
full_train_df, test_df = (
    pd.read_csv(Path(labels_dir) / csv_name) for csv_name in ('train.csv', 'test.csv')
)
full_train_df.head()

In [None]:
# Alphabet: every distinct character that occurs in the training messages, sorted.
index_to_letter = sorted(set(''.join(full_train_df['message'])))
# Inverse lookup: character -> its position in the alphabet.
letter_to_index = {letter: position for position, letter in enumerate(index_to_letter)}
dictionary_size = len(index_to_letter)
# Value handed to the vectorizer as padding when texts are batched.
pad_value = 0

print(index_to_letter)
print(dictionary_size)
print(letter_to_index)

vectorizer = Vectorizer(letter_to_index, index_to_letter)
# Smoke check of the text -> indices transform on a sample string.
print(vectorizer.text_transform('ПРИВЕТ #'))

In [None]:
def load_data(data_dir):
    """Build the train and validation datasets from the features stored in `data_dir`.

    The rows of the module-level `full_train_df` are split 5:1 into train and
    validation parts (shuffled, fixed random_state=42). When the module-level
    `dev_flag` is set, each part is cut to its first 1000 rows.

    Args:
        data_dir: directory handed to `load_tensors` together with the file
            names derived from the `id` column.

    Returns:
        (trainset, valset): two `ListDataset` objects; only the training one
        gets `rotation_transform`.
    """
    n_rows = full_train_df.shape[0]
    train_index, val_index = train_test_split(
        np.arange(n_rows), test_size=1/6, shuffle=True, random_state=42,
    )
    if dev_flag:
        # to save memory
        train_index, val_index = train_index[:1000], val_index[:1000]

    print(train_index.shape, val_index.shape)

    train_rows = full_train_df.iloc[train_index]
    val_rows = full_train_df.iloc[val_index]

    def read_features(rows):
        # Map the ids to tensor file names, then load them all with a progress bar.
        tensor_files = filenames_to_torch(list(rows['id']))
        return list(tqdm(load_tensors(data_dir, tensor_files)))

    train_features = read_features(train_rows)
    val_features = read_features(val_rows)
    train_labels = list(train_rows['message'])
    val_labels = list(val_rows['message'])

    # Every loaded feature tensor must have a matching label.
    assert len(train_features) == len(train_labels)
    assert len(val_features) == len(val_labels)

    trainset = ListDataset(train_features, train_labels, transform=rotation_transform)
    valset = ListDataset(val_features, val_labels)
    return trainset, valset

In [None]:
# Load the train / validation datasets from the 'melspec_nfft512_nc64' feature directory
# (the name suggests mel spectrograms with n_fft=512 and 64 channels — TODO confirm).
trainset, valset = load_data(Path(all_data_dir, 'melspec_nfft512_nc64'))

# model

In [None]:
class TransformerBlock(nn.Module):
    """Single post-norm transformer encoder block.

    Self-attention and a two-layer feed-forward network, each wrapped in a
    residual connection followed by LayerNorm.

    Args:
        dim: feature size of the input and of the output.
        num_heads: number of attention heads.
        ff_dim: hidden width of the feed-forward network.
        dropout: dropout probability used in attention and in the feed-forward network.
    """

    def __init__(self, dim, num_heads=4, ff_dim=256, dropout=0.1):
        super().__init__()
        self.attn = nn.MultiheadAttention(dim, num_heads, dropout=dropout)
        self.norm1 = nn.LayerNorm(dim)
        feed_forward_layers = [
            nn.Linear(dim, ff_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(ff_dim, dim),
            nn.Dropout(dropout),
        ]
        self.ff = nn.Sequential(*feed_forward_layers)
        self.norm2 = nn.LayerNorm(dim)

    def forward(self, x):
        """Transform `x` of shape (seq_len, batch, features); the output keeps that shape."""
        # Self-attention: the sequence attends to itself; attention weights are discarded.
        attended = self.attn(x, x, x)[0]
        residual = self.norm1(x + attended)
        return self.norm2(residual + self.ff(residual))

class CNNTransformerModel(nn.Module):
    """Convolutional front-end, one transformer block and a per-time-step linear classifier.

    Args:
        input_size: number of channels of the input (size of dim 1).
        inner_size: channel width used by the CNN and the transformer block.
        output_size: number of scores produced for every time step.

    Shapes:
        input:  (batch, input_size, time)
        output: (batch, output_size, time // 4) — each of the two max-pools halves the time axis.
    """

    def __init__(self, input_size=64, inner_size=64, output_size=5):
        super().__init__()
        # CNN feature extractor (kernel 3 with padding 1 keeps the length; pooling shrinks it)
        self.cnn = nn.Sequential(
            nn.Conv1d(input_size, inner_size, 3, padding=1),
            nn.ReLU(),
            nn.BatchNorm1d(inner_size),
            nn.MaxPool1d(2, stride=2),
            nn.Conv1d(inner_size, inner_size, 3, padding=1),
            nn.ReLU(),
            nn.BatchNorm1d(inner_size),
            nn.MaxPool1d(2, stride=2)
        )

        # Transformer
        self.transformer = TransformerBlock(inner_size)

        # Classifier: maps the channel vector of each time step to output_size scores
        self.classifier = nn.Sequential(
            nn.Linear(inner_size, output_size)
        )

    def forward(self, x):
        """Return per-time-step scores of shape (batch, output_size, seq_len)."""
        # CNN feature extraction
        x = self.cnn(x)  # (batch, channels, seq_len)

        # Prepare for transformer
        x = x.permute(2, 0, 1)  # (seq_len, batch, channels)

        # Transformer
        x = self.transformer(x)

        # nn.Linear acts on the LAST dimension, so channels must be last here.
        # Applying it to (batch, channels, seq_len) would mix time steps instead
        # of channels and fail whenever seq_len != inner_size.
        x = x.permute(1, 0, 2)  # (batch, seq_len, channels)
        x = self.classifier(x)  # (batch, seq_len, output_size)

        # Back to the channel-first layout the callers consume.
        return x.permute(0, 2, 1)  # (batch, output_size, seq_len)

# train

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = 'cpu'
device

In [None]:
def batch_text_transform(texts):
    """Vectorize a batch of texts and shift every index up by one.

    Delegates to the module-level `vectorizer` (padding with `pad_value`) and
    adds 1 to the whole result, so the returned indices start at 1
    (presumably to keep index 0 free for the CTC blank — confirm).

    Returns:
        (shifted_indices, lengths) — lengths are returned exactly as the vectorizer gives them.
    """
    padded_indices, text_lengths = vectorizer.batch_text_transform(texts, pad_value=pad_value)
    shifted_indices = padded_indices + 1
    return shifted_indices, text_lengths

In [None]:
def calculate_target_metric(valset, model, n_samples=250, beam_size=10):
    """Mean Levenshtein distance between decoded and reference messages.

    The model is switched to eval mode, and the first `n_samples` items of
    `valset` are decoded one by one with `LongCTCSampler`.

    Args:
        valset: indexable dataset with `__len__`, yielding (features, message) pairs.
        model: network returning scores laid out as (batch, classes, time).
        n_samples: maximum number of leading samples to evaluate; clamped to
            `len(valset)` so smaller datasets do not raise IndexError.
        beam_size: beam width handed to the sampler.

    Returns:
        The mean edit distance over the evaluated samples.
    """
    n_eval = min(n_samples, len(valset))
    model.eval()
    distance_buffer = []
    with torch.no_grad():
        for sample_idx in tqdm(range(n_eval)):
            features, labels = valset[sample_idx]
            features = features.to(device)
            # Drop only the batch axis: a bare squeeze() would also remove a
            # time axis of length 1 and break the softmax over classes below.
            outs = model(features[None])[0].to('cpu')
            probs = F.softmax(outs, dim=0)
            seqs, _ = LongCTCSampler.sample(probs, beam_size=beam_size)
            # Undo the +1 shift that batch_text_transform applies to targets.
            decoded_message = vectorizer.from_tensor(torch.tensor(seqs) - 1)
            distance_buffer.append(Levenshtein.distance(decoded_message, labels))
    return np.mean(distance_buffer)

In [None]:
# ---- Hyperparameters ----
n_epochs = 30
batch_size = 128
lr = 5e-3
inner_size = 64
step_gamma = 0.33
# NOTE(review): p_dropout is logged to wandb but is not passed to the model
# built below, which uses TransformerBlock's default dropout.
p_dropout = 0.15
input_size = 64

group = 'CNNTransformer'
run_name = 'proof_of_concept'

config = {
    'n_epochs': n_epochs,
    'batch_size': batch_size,
    'lr': lr,
    'inner_size': inner_size,
    'step_gamma': step_gamma,
    'p_dropout': p_dropout,
}


# ---- Model, optimizer, loss ----
# One extra output on top of the dictionary (targets are shifted by +1 in batch_text_transform).
# The model has to live on the same device the batches are moved to, otherwise
# the forward pass fails with a device mismatch when CUDA is available.
model = CNNTransformerModel(input_size=input_size, inner_size=inner_size, output_size=dictionary_size + 1).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
# Learning rate is multiplied by step_gamma after epochs 10 and 20.
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[10, 20], gamma=step_gamma)
ctc_loss = nn.CTCLoss()


train_loader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
val_loader = torch.utils.data.DataLoader(valset, batch_size=batch_size, shuffle=False)


def _ctc_batch_loss(features, labels):
    """Run the model on one batch and return its CTC loss (shared by train and validation)."""
    features = features.to(device)
    targets, target_lengths = batch_text_transform(labels)
    targets = targets.to(device)
    target_lengths = target_lengths.to(torch.int32).to(device)
    # The model output is treated as (batch, classes, time); CTCLoss wants (time, batch, classes).
    outs = model(features).permute(2, 0, 1)
    inputs = F.log_softmax(outs, dim=2)
    n_frames, n_items = inputs.shape[0], inputs.shape[1]
    # Every item in the batch is given the full output length.
    input_lengths = torch.full(size=(n_items,), fill_value=n_frames, dtype=torch.int32).to(device)
    return ctc_loss(inputs, targets, input_lengths, target_lengths)


# ---- Training loop ----
final_loss = 100
with wandb.init(
        **common_wandb_kvals,
        group=group,
        config=config,
        name=run_name,
        ) as run:
    for epoch in range(n_epochs):
        model.train()
        train_loss_buffer = []
        epoch_start_time = time.perf_counter()
        for features, labels in tqdm(train_loader):
            loss = _ctc_batch_loss(features, labels)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            train_loss_buffer.append(loss.detach())
        scheduler.step()

        model.eval()
        test_loss_buffer = []
        with torch.no_grad():
            for features, labels in tqdm(val_loader):
                loss = _ctc_batch_loss(features, labels)
                test_loss_buffer.append(loss.detach())

        # Epoch-level numbers are the plain mean of the per-batch losses.
        train_loss_value = torch.mean(torch.stack(train_loss_buffer)).item()
        test_loss_value = torch.mean(torch.stack(test_loss_buffer)).item()
        final_loss = test_loss_value
        wandb.log({
            'train_loss': train_loss_value,
            'test_loss': test_loss_value,
            # Read after scheduler.step(), so this is the rate for the next epoch.
            'lr': scheduler.get_last_lr()[0],
            # Covers both the training and the validation pass.
            'epoch_duration': (time.perf_counter() - epoch_start_time),
        })
    print('calculating target metric')
    target_metric = calculate_target_metric(valset, model)
    wandb.log({
        'Levenshtein_distance': target_metric,
    })