In [1]:
import os
from os.path import join
import numpy as np

from utils.main_utils import get_readers, collect_paths_with_meta
from utils.scoring import calc_scores
from utils.generate_mcc_images import generate_mcc_images

import torch
import torchvision.transforms as transforms

import torch.nn as nn
from torch.nn import Sequential, Conv2d, ReLU, BatchNorm2d, MaxPool2d, Linear, Dropout

import torch.optim as optim

from PIL import Image
from torch.utils.data import Dataset, DataLoader


from tqdm import tqdm

In [3]:
class MCCDataset(Dataset):
    '''
    Dataset of spectrogram images that all live in one shared directory.

    The image storage is the same for every split; the train, validation and
    test sets differ only in the list of file names handed to the constructor.

    The class label is read from the file name: the part before the first
    underscore is the gender tag, and the target is 1 when that tag is 'F'
    and 0 otherwise.

    Args:
        path: directory that contains the image files.
        transforms: callable applied to the grayscale PIL image before it is
            returned.
        file_names: iterable of image file names (relative to ``path``) that
            make up this split; it is materialised into a list.
    '''
    def __init__(self, path, transforms, file_names):
        self.path = path
        self.transforms = transforms
        self.imgs = list(file_names)

    def __len__(self):
        '''Return the number of images in this split.'''
        return len(self.imgs)

    def __getitem__(self, idx):
        '''Load image ``idx`` and return ``(transformed_image, target)``.'''
        img_name = self.imgs[idx]
        target = int(img_name.split('_')[0] == 'F')
        img_path = join(self.path, img_name)

        # Close the underlying file as soon as the pixels have been converted
        # instead of relying on garbage collection to release the handle.
        with Image.open(img_path) as raw:
            img = raw.convert("L")
        img = self.transforms(img)

        return img, target


In [4]:
class Model(nn.Module):
    '''
    Convolutional neural network for single-channel image classification.

    Batch normalisation made training unstable, so it is not used.

    Args:
        input_size: ``(height, width)`` of the images fed to the network.
            The size of the first fully connected layer is derived from it;
            the default (225, 900) yields the 38016 features that were
            previously hard-coded.
        n_classes: number of output logits (2 by default).
    '''
    def __init__(self, input_size=(225, 900), n_classes=2):
        super().__init__()
        self.feature_extractor = Sequential(
            Conv2d(1, 6, 5), ReLU(), MaxPool2d(2,2),
            Conv2d(6, 16, 5), ReLU(), MaxPool2d(2,2),
            Conv2d(16, 32, 3), ReLU(), MaxPool2d(2,2),
            Conv2d(32, 64, 3), ReLU(), MaxPool2d(2,2)
        )

        # Derive the flattened feature count from the actual input size rather
        # than hard-coding it, so other image sizes work without editing the class.
        flat_features = self._flat_features(input_size)

        self.classifier = Sequential(
            Linear(flat_features, 2048), ReLU(), Dropout(0.5),
            Linear(2048, 256), ReLU(), Dropout(0.5),
            Linear(256, 64), ReLU(), Dropout(0.5),
            Linear(64, n_classes)
        )

    def _flat_features(self, input_size):
        '''Return the flattened feature-map length for one image of ``input_size``.'''
        device = next(self.feature_extractor.parameters()).device
        # A zero-filled probe image is pushed through the convolutional stack;
        # no gradients are tracked and no random numbers are consumed.
        with torch.no_grad():
            probe = torch.zeros(1, 1, *input_size, device=device)
            return int(torch.flatten(self.feature_extractor(probe), 1).shape[1])

    def forward(self, x):
        '''Map a batch of shape (N, 1, H, W) to logits of shape (N, n_classes).'''
        x = torch.flatten(self.feature_extractor(x), 1)
        x = self.classifier(x)
        return x

In [5]:
def evaluate(model, valloader):
    '''
    Score ``model`` on every batch of ``valloader``.

    The model is switched to evaluation mode for the duration of the pass and
    is put back into whatever mode (train or eval) it was in before the call,
    even if an exception is raised while iterating.

    Args:
        model: a ``torch.nn.Module`` with at least one parameter; its first
            parameter determines the device the inputs are moved to.
        valloader: iterable yielding ``(inputs, labels)`` batches, where the
            model output for ``inputs`` has the classes along dimension 1.

    Returns:
        Whatever ``calc_scores(y_true, y_pred)`` returns for the collected
        labels and arg-max predictions (in this notebook a dict with the keys
        'accuracy', 'f1' and 'matthews').
    '''
    preds, y = [], []
    device = next(model.parameters()).device
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for inputs, labels in valloader:
                outputs = model(inputs.to(device))
                # arg-max on the tensor itself; only the class indices are
                # brought back to the host
                preds.extend(outputs.argmax(dim=1).cpu().tolist())
                # labels are only collected, so they never need to visit the device
                y.extend(labels.tolist())
    finally:
        # restore the caller's mode instead of unconditionally enabling training
        model.train(was_training)
    return calc_scores(np.array(y), np.array(preds))

In [6]:
# --- Paths ---
audio_path = 'data/dev-clean/'     # root directory of the source audio recordings
mcc_images_path = 'mcc_images/'    # directory holding the precomputed spectrogram images

# --- Reproducibility and data split ---
SEED = 42
split_ratio = 0.6                  # fraction of readers assigned to the training set

# Use the first GPU when one is available and fall back to the CPU otherwise,
# so the notebook also runs on machines without CUDA.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [7]:
# Seed the global PyTorch and NumPy random number generators with SEED.
# The reader split further down draws from np.random, so it is repeatable
# across runs only because of this call.
torch.manual_seed(SEED)
np.random.seed(SEED)

In [8]:
# Get the list of readers (speakers) together with the paths to their recordings.
# The displayed frame has one row per recording with the columns
# `reader`, `gender` and `path`.
readers = get_readers('data/speakers.tsv', audio_path)
meta_paths = collect_paths_with_meta(audio_path, readers)
meta_paths.head()

Unnamed: 0,reader,gender,path
0,1919,F,data/dev-clean/1919/142785/1919_142785_000005_...
1,1919,F,data/dev-clean/1919/142785/1919_142785_000118_...
2,1919,F,data/dev-clean/1919/142785/1919_142785_000035_...
3,1919,F,data/dev-clean/1919/142785/1919_142785_000064_...
4,1919,F,data/dev-clean/1919/142785/1919_142785_000071_...


In [9]:
# Precompute and save the spectrograms if they have not been computed yet.
# Only the existence of the output directory is checked, so a directory that
# exists but is empty or incomplete also skips the generation step.
# NOTE(review): HOP_LENGTH is not defined in any cell visible in this notebook
# (execution count 2 is missing). On a fresh kernel without the image
# directory this call raises NameError — define HOP_LENGTH in the config cell
# (TODO confirm the intended value).
if not os.path.exists(mcc_images_path):
    generate_mcc_images(mcc_images_path, meta_paths, n_jobs=12, batch_size=100, hop_length=HOP_LENGTH)

In [10]:
# Instantiate the network on the configured device, then create the loss
# function and the optimiser that updates the network's parameters.
model = Model()
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.0003)

In [11]:
# Work out how many readers (speakers) go into each data subset:
# `split_ratio` of them are used for training, and the remainder is divided
# between validation and test (test receives the extra reader when the
# remainder is odd).
all_readers = np.unique(meta_paths['reader'])
readers_number = len(all_readers)
train_size = int(split_ratio * readers_number)
val_size = (readers_number - train_size) // 2
test_size = readers_number - (train_size + val_size)
(train_size, val_size, test_size)

(24, 8, 8)

In [12]:
# Draw disjoint sets of readers for each data subset, then collect the image
# file names that belong to every subset. Splitting by reader keeps each
# speaker in exactly one subset.
train_readers = set(np.random.choice(all_readers, train_size, replace=False).tolist())
val_readers = set(np.random.choice(list(set(all_readers)-train_readers), val_size, replace=False).tolist())
test_readers = set(all_readers)-train_readers-val_readers


def image_names_for(meta, readers):
    '''
    Return the image file names of all recordings made by ``readers``.

    Args:
        meta: DataFrame with the columns ``reader``, ``gender`` and ``path``.
        readers: collection of reader ids to keep.

    Returns:
        List of names of the form ``"<gender>_<audio file stem>.png"``, in the
        row order of ``meta``.
    '''
    subset = meta[meta.reader.isin(readers)]
    return [
        f'{gender}_{os.path.basename(path).split(".")[0]}.png'
        for gender, path in zip(subset.gender, subset.path)
    ]


train_paths = image_names_for(meta_paths, train_readers)
val_paths = image_names_for(meta_paths, val_readers)
test_paths = image_names_for(meta_paths, test_readers)

len(train_paths), len(val_paths), len(test_paths)

(3312, 1322, 1102)

In [13]:
# Preprocessing applied to every spectrogram image: resize to a fixed
# 225 x 900 canvas, convert to a tensor, then normalise the single channel
# with mean 0.5 and std 0.5. The one-element tuples make the per-channel
# intent explicit; `(0.5)` without a comma is just the float 0.5.
trans = transforms.Compose([
    transforms.Resize(size=(225, 225*4)),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
])

batch_size = 64

# Only the training loader shuffles. The scores computed on the validation
# and test loaders do not depend on sample order, so shuffling them adds
# nothing and merely makes the batch composition vary between passes.
trainset = MCCDataset(mcc_images_path, trans, train_paths)
trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=2)

valset = MCCDataset(mcc_images_path, trans, val_paths)
valloader = DataLoader(valset, batch_size=batch_size, shuffle=False, num_workers=2)

testset = MCCDataset(mcc_images_path, trans, test_paths)
testloader = DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=2)

In [14]:
# Report and validate roughly five times per epoch. The value is clamped to
# at least 1 because the training loop computes `batch_ind % verbose_index`,
# which would raise ZeroDivisionError for loaders with fewer than 5 batches.
verbose_index = max(1, len(trainloader) // 5)
verbose_index

10

In [15]:
# Training loop with validation-based early stopping.
#
# Early-stopping state is kept in a three-element list:
#   early_stop[0] - number of consecutive validation checks without improvement
#   early_stop[1] - best validation 'matthews' score seen so far
#   early_stop[2] - flag that tells the outer (epoch) loop to stop
#
# NOTE(review): no checkpoint of the best-scoring weights is kept here, so
# after the loop `model` holds the weights of the last optimisation step,
# not those of the best validation check.
early_stop = [0, -1, False]
for epoch in range(5):
    # The loss accumulator restarts every epoch; loss from batches after the
    # last reporting point of the previous epoch is therefore never printed.
    running_loss = 0.0
    for batch_ind, (inputs, labels) in enumerate(trainloader, 1):
        inputs = inputs.to(device)
        labels = labels.to(device)
        
        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        # Every `verbose_index` batches: print the mean training loss over
        # that window and score the model on the validation loader.
        if not (batch_ind % verbose_index):   
            print(f'epoch/batch: {epoch}/{batch_ind} \n\t train loss: {running_loss/verbose_index}')
            running_loss = 0.0
            
            scores = evaluate(model, valloader)
            print('\t', scores)
            tracking_score = scores['matthews']
            # A score that only ties the best one counts as "no improvement";
            # a strictly better score resets the counter and becomes the new best.
            if early_stop[1] >= tracking_score:
                early_stop[0] += 1
            else:
                early_stop = [0, tracking_score, False]
            
            # Stopping is only allowed from the third epoch on (epoch index > 1)
            # and requires at least three checks in a row without improvement.
            if epoch > 1 and early_stop[0] >= 3:
                early_stop[2] = True
                break
            
    # Propagate the stop request out of the inner batch loop.
    if early_stop[2]:
        break

epoch/batch: 0/10 
	 train loss: 0.6980705440044404


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


	 {'accuracy': 0.716, 'f1': 0.834, 'matthews': 0.0}
epoch/batch: 0/20 
	 train loss: 0.6885713100433349


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


	 {'accuracy': 0.716, 'f1': 0.834, 'matthews': 0.0}
epoch/batch: 0/30 
	 train loss: 0.6978034198284149


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


	 {'accuracy': 0.716, 'f1': 0.834, 'matthews': 0.0}
epoch/batch: 0/40 
	 train loss: 0.6899414002895355


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


	 {'accuracy': 0.716, 'f1': 0.834, 'matthews': 0.0}
epoch/batch: 0/50 
	 train loss: 0.6719423770904541


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


	 {'accuracy': 0.716, 'f1': 0.834, 'matthews': 0.0}
epoch/batch: 1/10 
	 train loss: 0.6257011234760285
	 {'accuracy': 0.844, 'f1': 0.888, 'matthews': 0.636}
epoch/batch: 1/20 
	 train loss: 0.5208333224058151
	 {'accuracy': 0.89, 'f1': 0.923, 'matthews': 0.73}
epoch/batch: 1/30 
	 train loss: 0.3992552518844604
	 {'accuracy': 0.743, 'f1': 0.781, 'matthews': 0.58}
epoch/batch: 1/40 
	 train loss: 0.32886877954006194
	 {'accuracy': 0.782, 'f1': 0.82, 'matthews': 0.628}
epoch/batch: 1/50 
	 train loss: 0.35269259810447695
	 {'accuracy': 0.87, 'f1': 0.9, 'matthews': 0.748}
epoch/batch: 2/10 
	 train loss: 0.35853367000818254
	 {'accuracy': 0.87, 'f1': 0.9, 'matthews': 0.749}
epoch/batch: 2/20 
	 train loss: 0.2792656645178795
	 {'accuracy': 0.852, 'f1': 0.885, 'matthews': 0.723}
epoch/batch: 2/30 
	 train loss: 0.2512901671230793
	 {'accuracy': 0.677, 'f1': 0.709, 'matthews': 0.507}
epoch/batch: 2/40 
	 train loss: 0.3354091927409172
	 {'accuracy': 0.844, 'f1': 0.878, 'matthews': 0.711}


In [16]:
# Metrics on the test set
evaluate(model, testloader)

{'accuracy': 0.968, 'f1': 0.93, 'matthews': 0.909}

In [17]:
# Metrics on the training set
evaluate(model, trainloader)

{'accuracy': 0.91, 'f1': 0.919, 'matthews': 0.82}

In [18]:
# Metrics on the validation set
evaluate(model, valloader)

{'accuracy': 0.844, 'f1': 0.878, 'matthews': 0.711}

In [None]:
# (Removed a leftover, never-executed `!Z` shell escape — presumably an accidental keystroke.)