<a href="https://colab.research.google.com/github/Pai-Ya-Ting/Deep-Learning/blob/main/Seq2Seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Read the three listed columns as text so none of them is coerced to a number.
df = pd.read_csv('train.csv', dtype={'file': str, 'scenario': str, 'sentence': str})

In [None]:
import librosa
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext.legacy import data   
import os

In [None]:
def read_audio(file_id, mode):
    """Load one clip and return its MFCC matrix.

    Args:
        file_id: In ``'train'`` mode, an object exposing a ``file`` attribute
            (callers in this file pass ``df.itertuples()`` rows). In ``'test'``
            mode, a value whose ``str()`` is zero-padded to six characters.
        mode: ``'train'`` or ``'test'``; selects the folder and naming scheme.

    Returns:
        The result of ``librosa.feature.mfcc`` with ``n_mfcc=40`` for the clip
        after silence trimming and padding/cropping to five seconds of samples.

    Raises:
        ValueError: If ``mode`` is neither ``'train'`` nor ``'test'``.
    """
    if mode == 'train':
        filename = os.path.join(os.path.abspath('train'), str(file_id.file) + '.wav')
    elif mode == 'test':
        filename = os.path.join(os.path.abspath('test'), str(file_id).zfill(6) + '.wav')
    else:
        # Previously an unknown mode surfaced later as an unbound ``filename``.
        raise ValueError(f"mode must be 'train' or 'test', got {mode!r}")
    print(filename)

    y, sr = librosa.load(filename)
    if len(y) > 0:
        # Trimming is skipped for empty clips; they are padded with zeros below
        # so every clip still yields a feature matrix of the same size.
        y, _ = librosa.effects.trim(y)
    # ``size`` is passed by keyword: newer librosa releases reject it positionally.
    y = librosa.util.fix_length(y, size=int(5 * sr))
    return librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)

# Extract MFCC features for both splits and cache them on disk.
for mode in ['train', 'test']:
    if mode == 'train':
        # One itertuples() row per training clip; read_audio reads its ``file`` field.
        items = df.itertuples()
        out_name = 'MFCC_40_x_train'
    else:
        # Test clips are addressed by their running index 0..4720.
        items = range(4721)
        out_name = 'MFCC_40_x_test'

    X = [read_audio(item, mode) for item in items]
    np.save(out_name, X)  # np.save appends the ``.npy`` suffix itself
    print(np.array(X).shape)

In [None]:
def build_dict():
    """Build the vocabularies of the ``category`` and ``title`` fields.

    Both vocabularies are built from the module-level ``train_data``.

    Returns:
        tuple: ``(vocab_size, vocab)`` - the number of entries in the title
        vocabulary and the title vocabulary object itself.
    """
    category.build_vocab(train_data)
    title.build_vocab(train_data)

    vocab = title.vocab
    return len(vocab), vocab

In [None]:
# Words removed from every title before the vocabulary is built.
stop_words = [
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you",
    "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself",
    "she", "her", "hers", "herself", "it", "its", "itself", "they", "them",
    "their", "theirs", "themselves", "what", "which", "who", "whom", "this",
    "that", "these", "those", "am", "is", "are", "was", "were", "be", "been",
    "being", "have", "has", "had", "having", "do", "does", "did", "doing",
    "a", "an", "the", "and", "but", "if", "or", "because", "as", "until",
    "while", "of", "at", "by", "for", "with", "about", "against", "between",
    "into", "through", "during", "before", "after", "above", "below", "to",
    "from", "up", "down", "in", "out", "on", "off", "over", "under", "again",
    "further", "then", "once", "here", "there", "when", "where", "why", "how",
    "all", "any", "both", "each", "few", "more", "most", "other", "some",
    "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too",
    "very", "s", "t", "can", "will", "just", "don", "should", "now",
]

# Label field: no padding or unknown token.
category = data.Field(batch_first=True, pad_token=None, unk_token=None)

# Text field: lower-cased, stop words dropped, wrapped in <sos>/<eos>,
# fixed at 30 positions, and batched together with the sequence lengths.
title = data.Field(
    init_token='<sos>',
    eos_token='<eos>',
    lower=True,
    stop_words=stop_words,
    fix_length=30,
    include_lengths=True,
    batch_first=True,
)

# The first CSV column is ignored; the next two feed the fields above.
train_data = data.TabularDataset(
    path='./train.csv',
    format='csv',
    skip_header=True,
    fields=[(None, None), ('category', category), ('title', title)],
)

vocab_size, vocab = build_dict()

In [None]:
# df = pd.read_csv('./train.csv')
# Tokenise each transcript on whitespace and frame it as
# <sos> tokens <eos> <pad>..., so that every row has exactly 32 entries.
df_sentence = []
for sentence in df['sentence']:
    # Cap at 30 tokens: a longer sentence would otherwise produce a longer row
    # and np.array() could no longer build a rectangular array.
    s = sentence.split()[:30]
    s = ["<sos>"] + s + ["<eos>"] + ["<pad>"] * (30 - len(s))
    df_sentence.append(s)
df_sentence = np.array(df_sentence)

In [None]:
# Translate every token of the framed transcripts into its vocabulary index.
y = []
max_len = 30
# The pad token is spelled "<pad>" (the spelling used when the rows were
# framed); "<PAD>" is a different key.
pad_index = vocab.stoi["<pad>"]
for tokens in df_sentence:
    s = [vocab.stoi[token] for token in tokens]
    # Only rows shorter than max_len are extended; longer rows are left as is.
    s = s + [pad_index] * (max_len - len(s))
    y.append(s)
y = np.array(y)

In [None]:
# Cached training features and the per-sample labels used for stratification.
X = np.load('MFCC_40_x_train.npy', allow_pickle=True)
label = np.load('label.npy', allow_pickle=True)
label = label.astype(np.float32)

In [None]:
# Display the shapes of the feature array and the target index array.
X.shape, y.shape

((18052, 40, 216), (18052, 32))

In [None]:
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader

# Hold out 33% of the samples for validation, stratified on ``label``.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=label,
    random_state=42,
    test_size=0.33,
)

In [None]:
# Convert the NumPy splits to tensors, then pair features with targets.
X_train, X_val = torch.from_numpy(X_train), torch.from_numpy(X_val)
y_train, y_val = torch.from_numpy(y_train), torch.from_numpy(y_val)
train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)

In [None]:
# Both loaders serve the datasets in their stored order, 128 samples at a time.
train_loader, val_loader = (
    DataLoader(dataset, batch_size=128)
    for dataset in (train_dataset, val_dataset)
)

In [None]:
class Encoder(nn.Module):
    """Five-layer bidirectional LSTM over 40-dimensional feature frames."""

    def __init__(self):
        super(Encoder, self).__init__()

        self.hidden_size = 40
        self.dim = 40
        self.rnn = nn.LSTM(self.dim, self.hidden_size, 5, batch_first=True, bidirectional=True)

        self.dropout = nn.Dropout(0.1)

    def forward(self, outputs, hidden=None):
        """Encode a batch of frames.

        Args:
            outputs: Batch-first float tensor whose last dimension is
                ``self.dim``.
            hidden: Optional LSTM state ``(h, c)`` to continue from; ``None``
                starts from the LSTM's default zero state.

        Returns:
            tuple: ``(encoded, hidden)`` where ``encoded`` is the LSTM output
            after dropout (last dimension ``2 * hidden_size``) and ``hidden``
            is the updated LSTM state.
        """
        # Follow the module's own parameters rather than a module-level global,
        # so the input always lands on the device the weights are on.
        outputs = outputs.to(next(self.parameters()).device)
        outputs = self.dropout(outputs)
        outputs, hidden = self.rnn(outputs, hidden)
        # NOTE: a LayerNorm used to be constructed here on every call and its
        # result thrown away. It never affected the returned value, but being
        # created on the CPU it failed as soon as the model ran on a GPU.
        return self.dropout(outputs), hidden

In [None]:
class Decoder(nn.Module):
    """Single-step LSTM decoder with dot-product attention over encoder outputs."""

    def __init__(self, num_embeddings=None):
        """Create the decoder layers.

        Args:
            num_embeddings: Number of rows of the token embedding table.
                Defaults to the module-level ``vocab_size``.
        """
        super(Decoder, self).__init__()
        if num_embeddings is None:
            num_embeddings = vocab_size

        self.max_length = 20
        self.hidden_size = 40
        self.dim = 40

        self.rnn = nn.LSTM(self.dim, self.hidden_size, 5, batch_first=True, bidirectional=True)
        self.embedding = nn.Embedding(num_embeddings, self.hidden_size)
        # ``attn`` and ``softmax`` are not used by forward(); they are kept so
        # the set of registered parameters (and any saved state) is unchanged.
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2 * 2, self.hidden_size)

        self.dropout = nn.Dropout(0.1)
        self.softmax = nn.LogSoftmax(dim=1)
        self.out = nn.Linear(self.hidden_size, self.hidden_size)

    def forward(self, outputs, hidden, enc_outputs):
        """Run one decoding step.

        Args:
            outputs: Token indices for this step; only ``outputs[0]`` of the
                embedded tensor is fed to the LSTM. Seq2Seq passes a tensor of
                shape ``(1, batch, 1)``.
            hidden: LSTM state ``(h, c)`` from the previous step.
            enc_outputs: Encoder outputs, ``(batch, time, 2 * hidden_size)``.

        Returns:
            tuple: ``(pred, hidden)`` where ``pred`` is the layer-normalised
            prediction in embedding space, ``(batch, 1, hidden_size)``, and
            ``hidden`` is the updated LSTM state.
        """
        outputs = self.embedding(outputs.long())
        outputs = self.dropout(outputs)
        outputs, hidden = self.rnn(outputs[0], hidden)

        # Unnormalised dot-product scores of the step output against every
        # encoder position, then the score-weighted sum of encoder outputs.
        attn_weights = torch.sum(outputs * enc_outputs, dim=2)
        context = torch.bmm(attn_weights.unsqueeze(1), enc_outputs)
        cats = self.attn_combine(torch.cat((outputs, context), dim=2))
        # No squeeze(0) here: it only had an effect for a batch of one, where
        # it removed the batch axis and changed the output rank.
        pred = self.out(cats.tanh())
        # Parameter-free layer norm over all non-batch dims. Equivalent to the
        # freshly built nn.LayerNorm used before, but it runs on whatever
        # device ``pred`` is on and allocates no module per call.
        pred = F.layer_norm(pred, pred.size()[1:])

        return pred, hidden

In [None]:
import random
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that owns its optimisers and training loops.

    ``forward`` is a complete training step (zero grads, loss, backward,
    optimiser step) and returns a float, so it must not be called while
    gradients are disabled. ``evaluate`` and ``predict`` use the internal
    helpers instead.
    """

    # Decoding stops once every sequence in the batch has produced this index.
    # NOTE(review): presumably the padding index of the title vocabulary - confirm.
    _STOP_INDEX = 1

    def __init__(self, MAX_LENGTH=10, learning_rate=0.001):
        """Build encoder, decoder, their AdamW optimisers and the MSE criterion.

        Args:
            MAX_LENGTH: Unused; kept so existing calls keep working.
            learning_rate: Learning rate of both optimisers.
        """
        super(Seq2Seq, self).__init__()
        self.encoder = Encoder().to(device)
        self.decoder = Decoder().to(device)

        self.encoder_optimizer = optim.AdamW(self.encoder.parameters(), lr=learning_rate, weight_decay=1e-3)
        self.decoder_optimizer = optim.AdamW(self.decoder.parameters(), lr=learning_rate, weight_decay=1e-3)
        self.criterion = nn.MSELoss()

    def _model_device(self):
        """Return the device the model's parameters live on."""
        return next(self.parameters()).device

    def _encode(self, source, min_steps):
        """Feed ``source`` to the encoder one time step at a time.

        Args:
            source: Batch-first features, ``(batch, time, feature)``.
            min_steps: Minimum number of time slots in the returned tensor;
                slots beyond the input length stay zero.

        Returns:
            tuple: ``(encoder_outputs, encoder_hidden)``.
        """
        batch_size, input_length = source.size(0), source.size(1)
        encoder_outputs = torch.zeros(
            batch_size,
            max(input_length, min_steps),
            2 * self.encoder.hidden_size,
            device=self._model_device(),
        )
        encoder_hidden = None  # the first step starts from the default state
        for step in range(input_length):
            encoder_output, encoder_hidden = self.encoder(source[:, [step], :], encoder_hidden)
            encoder_outputs[:, step] = encoder_output[:, 0]
        return encoder_outputs, encoder_hidden

    def _decode(self, encoder_outputs, decoder_hidden, n_steps, target=None, teacher_forcing_ratio=0.0):
        """Decode up to ``n_steps`` tokens, optionally accumulating the loss.

        Args:
            encoder_outputs: Output of ``_encode``.
            decoder_hidden: Initial decoder state (the final encoder state).
            n_steps: Maximum number of decoding steps.
            target: Optional ``(batch, n_steps)`` long tensor on the model's
                device. When given, the MSE between each step's output and
                the embedding of the target token is summed, and teacher
                forcing may feed the target token back in.
            teacher_forcing_ratio: Per-step probability of teacher forcing.

        Returns:
            tuple: ``(tokens, loss)``. ``tokens`` is a float ``(batch,
            n_steps)`` tensor of predicted indices, with steps that were never
            decoded holding ``_STOP_INDEX``. ``loss`` is the summed loss
            (``0`` when ``target`` is ``None``).
        """
        batch_size = encoder_outputs.size(0)
        dev = encoder_outputs.device
        tokens = torch.full((batch_size, n_steps), float(self._STOP_INDEX), device=dev)
        # The first decoder input is index 0 for every sequence.
        decoder_input = torch.zeros(1, batch_size, 1, device=dev)
        # Sized from the actual batch, so a short final batch can also finish.
        finished = torch.zeros(batch_size, dtype=torch.bool, device=dev)
        loss = 0

        for step in range(n_steps):
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_outputs)
            predicted = self.find_word(decoder_output).float()
            tokens[:, [step]] = predicted
            next_input = predicted.detach()

            if target is not None:
                gold = target[:, [step]]
                if random.random() < teacher_forcing_ratio:
                    next_input = gold
                loss = loss + self.criterion(decoder_output, self.decoder.embedding(gold))

            finished |= next_input.reshape(-1) == self._STOP_INDEX
            if finished.all():
                break
            decoder_input = next_input.reshape(1, -1, 1)

        return tokens, loss

    def _batch_loss(self, source, target, teacher_forcing_ratio, min_encoder_steps):
        """Return the summed decoding loss of one batch without optimising."""
        target = target.to(self._model_device()).long()
        encoder_outputs, encoder_hidden = self._encode(source, min_encoder_steps)
        _, loss = self._decode(
            encoder_outputs,
            encoder_hidden,
            target.size(1),
            target=target,
            teacher_forcing_ratio=teacher_forcing_ratio,
        )
        return loss

    def forward(self, source, target, teacher_forcing_ratio=0.5, MAX_LENGTH=20):
        """Run one full training step on a batch.

        Args:
            source: Batch-first features, ``(batch, time, feature)``.
            target: Target token indices, ``(batch, target_length)``.
            teacher_forcing_ratio: Per-step probability of feeding the target
                token (instead of the prediction) to the next step.
            MAX_LENGTH: Minimum number of encoder time slots.

        Returns:
            float: The summed step loss divided by ``target_length``.
        """
        target_length = target.size(1)
        if target_length == 0:
            return 0.0  # nothing to decode, hence nothing to optimise

        self.encoder_optimizer.zero_grad()
        self.decoder_optimizer.zero_grad()

        loss = self._batch_loss(source, target, teacher_forcing_ratio, MAX_LENGTH)
        loss.backward()

        self.encoder_optimizer.step()
        self.decoder_optimizer.step()

        return loss.item() / target_length

    def find_word(self, sample):
        """Return the index of the embedding row nearest to each prediction.

        Args:
            sample: Predictions in embedding space; any shape whose last
                dimension is the embedding size (e.g. ``(batch, 1, dim)``).

        Returns:
            Long tensor ``(batch, 1)`` of vocabulary indices.
        """
        weight = self.decoder.embedding.weight.detach()
        flat = sample.reshape(-1, weight.size(1))
        # Distance is taken over the embedding axis, giving one value per
        # (sample, vocabulary entry) pair; the arg-min over the vocabulary
        # axis is then the nearest word.
        distance = torch.norm(flat.unsqueeze(1) - weight.unsqueeze(0), dim=2)
        return torch.argmin(distance, dim=1).unsqueeze(1)

    def train(self, n_epoch=True, loader=None):
        """Train for ``n_epoch`` epochs, or switch the training flag.

        ``nn.Module.eval()`` is implemented as ``self.train(False)``. Because
        this method shares that name, a boolean argument is forwarded to
        ``nn.Module.train`` so ``eval()`` / ``train(True)`` keep working.

        Args:
            n_epoch: Number of epochs (int), or a bool to set the module mode.
            loader: Iterable of ``(features, targets)`` batches; defaults to
                the module-level ``train_loader``.

        Returns:
            ``self`` when called with a bool, otherwise ``None``.
        """
        if isinstance(n_epoch, bool):
            return super().train(n_epoch)

        if loader is None:
            loader = train_loader
        super().train(True)
        for epoch in range(1, n_epoch + 1):
            epoch_loss = 0.0
            for data, label in loader:
                data = torch.transpose(data, 1, 2)
                epoch_loss += self(data, label)
            # Mean over all batches of the epoch, not just the last batch.
            print(f'Epoch {epoch} loss: {epoch_loss / max(len(loader), 1)}')

    def evaluate(self, n_epoch, loader=None):
        """Report the loss on held-out data without updating any weights.

        Args:
            n_epoch: Number of passes over the loader.
            loader: Iterable of ``(features, targets)`` batches; defaults to
                the module-level ``val_loader``.

        Returns:
            float: Mean per-batch loss of the last pass.
        """
        if loader is None:
            loader = val_loader
        self.eval()
        mean_loss = 0.0

        with torch.no_grad():
            for epoch in range(1, n_epoch + 1):
                epoch_loss = 0.0
                for data, label in loader:
                    data = torch.transpose(data, 1, 2)
                    # No teacher forcing: the decoder sees its own predictions.
                    loss = self._batch_loss(data, label, 0.0, 20)
                    epoch_loss += float(loss) / max(label.size(1), 1)
                mean_loss = epoch_loss / max(len(loader), 1)
                print(f'Evaluate {epoch} loss: {mean_loss}')
        return mean_loss

    def predict(self, source):
        """Greedily decode up to 30 tokens for each sequence in ``source``.

        Args:
            source: Batch-first features, ``(batch, time, feature)``.

        Returns:
            numpy.ndarray: Float array ``(batch, 30)`` of predicted indices.
            Positions after decoding stopped hold ``_STOP_INDEX``, so the
            width is the same for every batch.
        """
        target_length = 30
        self.eval()

        with torch.no_grad():
            encoder_outputs, encoder_hidden = self._encode(source, 20)
            tokens, _ = self._decode(encoder_outputs, encoder_hidden, target_length)

        # Move to the CPU first; .numpy() is not available on GPU tensors.
        return tokens.cpu().numpy()

In [None]:
# Number of passes over the training loader for the seq2seq model.
n_epoch = 20

# Build the model, then move all of its parameters to the selected device.
model = Seq2Seq()
model = model.to(device)

In [None]:
# Run the seq2seq training loop; it prints one loss line per epoch.
model.train(n_epoch)

Epoch 1 loss: 0.004541007154866269
Epoch 2 loss: 0.004026145370383012
Epoch 3 loss: 0.0033144442658675343
Epoch 4 loss: 0.002636147800244783
Epoch 5 loss: 0.00223097236532914
Epoch 6 loss: 0.0019250065088272095
Epoch 7 loss: 0.00175166318291112
Epoch 8 loss: 0.001670705801562259
Epoch 9 loss: 0.0014947892803894847
Epoch 10 loss: 0.0014445830332605462
Epoch 11 loss: 0.0013085918206917613
Epoch 12 loss: 0.0012229103006814656
Epoch 13 loss: 0.001137005106398934
Epoch 14 loss: 0.0011573891106404757
Epoch 15 loss: 0.0010197523393129047
Epoch 16 loss: 0.0009744279478725635
Epoch 17 loss: 0.000914025698837481
Epoch 18 loss: 0.0008442620697774385
Epoch 19 loss: 0.0008205631061604148


In [None]:
# Predict the validation set batch by batch, stacking the results row-wise.
dd = [[]]
for batch_idx, (features, _) in enumerate(val_loader):
    batch_pred = model.predict(torch.transpose(features, 1, 2))
    dd = batch_pred if batch_idx == 0 else np.concatenate((dd, batch_pred), axis=0)

    # After each batch, list every distinct token predicted so far.
    for token_id in set(dd.flatten()):
        print(batch_idx, vocab.itos[int(token_id)])

In [None]:
# Decode the validation predictions back into text, one row per sample.
# The column is built up front instead of writing cell by cell through
# ``df_test['title'][i] = ...`` (chained assignment), which pandas does not
# guarantee to write through to the frame.
titles = [
    ' '.join(vocab.itos[int(token_id)] for token_id in row[:30])
    for row in dd[:len(X_val)]
]
df_test = pd.DataFrame({'title': titles}, index=range(len(X_val)), columns=['title'])
df_test.head(30)

In [None]:
# Load the cached test-set MFCCs and serve them as one un-shuffled batch.
X_test = np.load('MFCC_40_x_test.npy', allow_pickle=True)
tryloader = DataLoader(X_test, shuffle=False, batch_size=len(X_test))

In [None]:
# Predict the test set; ``dd`` ends up holding the last batch's predictions.
for features in tryloader:
    dd = model.predict(torch.transpose(features, 1, 2))

In [None]:
# List every distinct token index in the test predictions with its word.
# (The set used to be computed twice, with the first result discarded.)
for token_id in set(dd.flatten()):
    print(token_id, vocab.itos[int(token_id)])

In [None]:
# Display the shape of the test-set prediction array.
dd.shape

In [None]:
# Decode the test predictions into text, leaving out the special tokens.
# The column is built up front instead of writing cell by cell through
# ``df_test['title'][i] = ...`` (chained assignment), which pandas does not
# guarantee to write through to the frame.
special_tokens = {'<sos>', '<pad>', '<eos>'}
titles = []
for row in dd[:len(X_test)]:
    words = (vocab.itos[int(token_id)] for token_id in row[:30])
    titles.append(' '.join(word for word in words if word not in special_tokens))
df_test = pd.DataFrame({'title': titles}, index=range(len(X_test)), columns=['title'])
df_test.head(100)

In [None]:
# Write the decoded titles to disk without the index column.
df_test.to_csv('df_test.csv', index=False)

# Classifier

In [None]:
import torch   
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext.legacy import data
import numpy as np
import matplotlib.pyplot as plt
import random

# Seed every random number generator in use so runs are repeatable.
SEED = 2021
for seed_fn in (torch.manual_seed, np.random.seed, random.seed, torch.cuda.manual_seed):
    seed_fn(SEED)

# Ask cuDNN for deterministic kernels and disable its auto-tuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

def accuracy(y_pred, y_test):
    """Return the fraction of rows whose top-scoring class equals the label.

    Args:
        y_pred: Score tensor; the predicted class is the arg-max along dim 1.
        y_test: Tensor of true class indices, one per row of ``y_pred``.

    Returns:
        A 0-dim tensor holding ``correct / len(y_test)``.
    """
    predicted = y_pred.argmax(dim=1)
    n_correct = torch.eq(predicted, y_test).sum()
    return n_correct / len(y_test)

def train_cls(data, optimizer_cls, criterion_cls):
    """Run one optimisation pass over ``data`` with the global ``model_cls``.

    Args:
        data: Iterable of batches exposing ``title`` (a ``(text, lengths)``
            pair) and ``category``; must support ``len()``.
        optimizer_cls: Optimiser stepping the parameters of ``model_cls``.
        criterion_cls: Loss called as ``criterion_cls(prediction, target)``.

    Returns:
        tuple: ``(mean batch loss, mean batch accuracy)`` over the pass.
    """
    model_cls.train()

    loss_sum = 0
    acc_sum = 0
    for batch in data:
        optimizer_cls.zero_grad()

        text, _ = batch.title
        targets = batch.category.squeeze(1)
        prediction = model_cls(text)

        batch_loss = criterion_cls(prediction, targets)
        batch_acc = accuracy(prediction, targets)

        batch_loss.backward()
        # Clip the total gradient norm to 0.01 before the optimiser step.
        nn.utils.clip_grad_norm_(model_cls.parameters(), 0.01)
        optimizer_cls.step()

        loss_sum += batch_loss.item()
        acc_sum += batch_acc.item()

    n_batches = len(data)
    return loss_sum / n_batches, acc_sum / n_batches

def predict_cls(data, model_cls):
    """Predict a category name for every example in ``data``.

    Args:
        data: Iterable of batches exposing ``title`` as a ``(text, lengths)``
            pair.
        model_cls: Classifier called as ``model_cls(text)``; the class is the
            arg-max of its output along dim 1.

    Returns:
        pandas.DataFrame: One ``'Category'`` column with the predicted names,
        looked up in ``category.vocab.itos``, in iteration order.
    """
    model_cls.eval()
    ans = []

    with torch.no_grad():
        for batch in data:
            text, _ = batch.title
            pred = torch.argmax(model_cls(text), dim=1)
            # tolist() yields plain ints, so the lookup does not depend on
            # tensors being usable as list indices.
            ans.extend(category.vocab.itos[index] for index in pred.tolist())

    return pd.DataFrame(ans, columns=['Category'])

def evaluate_cls(data):
    """Compute loss and accuracy of the global ``model_cls`` on ``data``.

    Uses the module-level ``model_cls`` and ``criterion_cls``; no gradients
    are computed and no weights are updated.

    Args:
        data: Iterable of batches exposing ``title`` (a ``(text, lengths)``
            pair) and ``category``; must support ``len()``.

    Returns:
        tuple: ``(mean batch loss, mean batch accuracy)``.
    """
    epoch_loss = 0
    epoch_acc = 0

    # Evaluate with dropout disabled. train_cls() switches the model back to
    # training mode at the start of each pass, so this does not affect training.
    model_cls.eval()

    with torch.no_grad():
        for batch in data:
            text, _ = batch.title
            targets = batch.category.squeeze(1)
            prediction = model_cls(text)

            epoch_loss += criterion_cls(prediction, targets).item()
            epoch_acc += accuracy(prediction, targets).item()

    return epoch_loss / len(data), epoch_acc / len(data)
    
def prepare_data(train, test):
    """Wrap two datasets in ``BucketIterator``s with a batch size of 128.

    Args:
        train: Dataset for training; batches are sorted by title length.
        test: Dataset for evaluation; served unsorted and unshuffled.

    Returns:
        tuple: ``(train_iterator, test_iterator)``, both yielding batches on
        the module-level ``device``.
    """
    shared = dict(batch_size=128, device=device)

    train_iterator = data.BucketIterator(
        train,
        sort_key=lambda example: len(example.title),
        sort=True,
        sort_within_batch=True,
        **shared,
    )
    test_iterator = data.BucketIterator(
        test,
        sort=False,
        sort_within_batch=False,
        shuffle=False,
        **shared,
    )

    return train_iterator, test_iterator

In [None]:
class Transformerlayer(nn.Module):
    """One post-norm encoder layer: self-attention, then a feed-forward block."""

    def __init__(self):
        super().__init__()
        drop_p = 0.1
        n_heads = 4
        ff_width = 128

        self.embedding_dim = 64
        self.attn = nn.MultiheadAttention(embed_dim=self.embedding_dim, num_heads=n_heads, dropout=drop_p)
        self.linear1 = nn.Linear(self.embedding_dim, ff_width)
        self.dropout1 = nn.Dropout(drop_p)
        self.linear2 = nn.Linear(ff_width, self.embedding_dim)
        self.norm1 = nn.LayerNorm(self.embedding_dim)
        self.norm2 = nn.LayerNorm(self.embedding_dim)
        self.dropout2 = nn.Dropout(drop_p)
        self.dropout3 = nn.Dropout(drop_p)

    def forward(self, x, src_mask, padding_mask):
        """Apply the layer.

        Args:
            x: Input passed to the attention module as query, key and value.
            src_mask: Forwarded as ``attn_mask`` (may be ``None``).
            padding_mask: Forwarded as ``key_padding_mask`` (may be ``None``).

        Returns:
            tuple: ``(output, attention_weights)``.
        """
        # Self-attention sub-layer with residual connection and layer norm.
        attended, weights = self.attn(x, x, x, attn_mask=src_mask, key_padding_mask=padding_mask)
        x = self.norm1(x + self.dropout2(attended))

        # Position-wise feed-forward sub-layer, again residual + layer norm.
        hidden = F.relu(self.linear1(x))
        projected = self.linear2(self.dropout1(hidden))
        x = self.norm2(x + self.dropout3(projected))

        return x, weights

class Transformer(nn.Module):
    """Title classifier: embedding, positional encoding, two encoder layers,
    mean pooling over positions and a linear output layer."""

    def __init__(self):
        super(Transformer, self).__init__()

        self.embedding_dim = 64
        self.embedding = nn.Embedding(vocab_size, self.embedding_dim)
        # Build a separate layer per position in the stack. Repeating one
        # instance in the list would make both entries share the same weights.
        self.Encoder = nn.ModuleList([Transformerlayer() for _ in range(2)])

        self.Decoder = nn.Linear(self.embedding_dim, len(category.vocab))
        self.pos = PositionalEncoding(self.embedding_dim)

    def forward(self, text):
        """Classify a batch of titles.

        Args:
            text: Token indices, ``(batch_size, title_len)``.

        Returns:
            Unnormalised class scores (logits), ``(batch_size, n_classes)``.
            They are meant to be passed to ``nn.CrossEntropyLoss`` as is,
            since that loss applies log-softmax itself; the arg-max class is
            the same as for softmax probabilities.
        """
        # True where the token is padding, so attention ignores those keys.
        padding_mask = text == vocab.stoi['<pad>']  # (batch_size, title_len)

        outputs = self.embedding(text) * np.sqrt(self.embedding_dim)  # (batch_size, title_len, embedding_dim)
        outputs = self.pos(outputs.transpose(0, 1)).to(device)  # (title_len, batch_size, embedding_dim)

        for layer in self.Encoder:
            outputs, _ = layer(outputs, None, padding_mask)

        outputs = outputs.transpose(0, 1)  # (batch_size, title_len, embedding_dim)
        outputs = torch.mean(outputs, dim=1)
        return self.Decoder(outputs)

class PositionalEncoding(nn.Module):
    """Adds a fixed sine/cosine position table to a sequence-first input."""

    def __init__(self, d_model, dropout=0.1, max_len=30):
        """Precompute the position table.

        Args:
            d_model: Size of the last input dimension.
            dropout: Dropout probability applied after the addition.
            max_len: Number of positions in the table.
        """
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        positions = torch.arange(max_len, dtype=torch.float).unsqueeze(1)
        even_dims = torch.arange(0, d_model, 2).float()
        scale = - np.log(10000.0) / d_model
        angles = positions * torch.exp(even_dims * scale)

        # Even columns carry the sine, odd columns the cosine.
        table = torch.zeros(max_len, d_model)
        table[:, 0::2] = torch.sin(angles)
        table[:, 1::2] = torch.cos(angles)

        # Stored as (max_len, 1, d_model) so it broadcasts over the batch axis.
        self.register_buffer('pe', table.unsqueeze(1))

    def forward(self, x):
        """Return ``dropout(x + pe[:x.size(0)])`` on the module-level ``device``."""
        x = x.to(device)
        encoded = x + self.pe[:x.size(0), :].to(device)
        return self.dropout(encoded)

In [None]:
import copy

N_EPOCHS = 25
best_loss = float('inf')
train_loss = train_acc = 0
# Per-epoch history, used for the plots that follow.
l_train, acc_train = [], []
l_valid, acc_valid = [], []

vocab_size, vocab = build_dict()
model_cls = Transformer().to(device)

optimizer_cls = optim.AdamW(model_cls.parameters(), lr=1e-3, betas=(0.9, 0.98), eps=1e-9, weight_decay=1e-3)
criterion_cls = nn.CrossEntropyLoss()
learning_rate = []

# 80/20 split of the labelled data into training and validation parts.
train_set, valid_set = train_data.split(split_ratio=0.8, random_state=random.getstate())
training_data, testing_data = prepare_data(train_set, valid_set)

for epoch in range(N_EPOCHS):

    train_loss, train_acc = train_cls(training_data, optimizer_cls, criterion_cls)
    valid_loss, valid_acc = evaluate_cls(testing_data)

    if valid_loss < best_loss:
        best_loss = valid_loss
        # Snapshot the classifier being trained here (``model_cls``, not the
        # seq2seq ``model``). A deep copy is needed because a plain reference
        # would keep changing with later epochs.
        best_model = copy.deepcopy(model_cls)

    acc_train.append(train_acc)
    l_train.append(train_loss)

    acc_valid.append(valid_acc)
    l_valid.append(valid_loss)

    learning_rate.append(optimizer_cls.param_groups[0]['lr'])

    print(f'Epoch: {epoch+1}')
    print('learning rate: ', optimizer_cls.param_groups[0]['lr'])
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\tValid Loss: {valid_loss:.3f} | Valid Acc: {valid_acc*100:.2f}%')

In [None]:
# Training and validation loss per epoch, drawn on the same axes.
for curve in (l_train, l_valid):
    plt.plot(curve)
plt.show()

In [None]:
# Training and validation accuracy per epoch, drawn on the same axes.
for curve in (acc_train, acc_valid):
    plt.plot(curve)
plt.show()

In [None]:
# Read the decoded test titles back in through the same ``title`` field
# that was used for the training data.
test_data = data.TabularDataset(
    path='./df_test.csv',
    format='csv',
    skip_header=True,
    fields=[('title', title)],
)

training_data, testing_data = prepare_data(train_data, test_data)

# Classify every test title and put the row number in front as the file id.
ans = predict_cls(testing_data, model_cls)
ans.insert(0, column="File", value=ans.index.values)
ans.head()

In [None]:
# Write the predictions (File, Category) to disk without the index column.
ans.to_csv('ans1.csv', index=False)