In [1]:
import pandas as pd
import warnings
import os
import torch
import torchaudio
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from rich.progress import track
from sklearn.metrics import accuracy_score
import time
import wandb

warnings.filterwarnings('ignore')

In [2]:
from datetime import datetime

parameters = {
    "num_class": 5,
    "time": str(datetime.now()).replace(" ", "_"),
    "model_name": 'CNN',
    "learning_rate": 1e-3,
    "epochs": 1000,
    "batch_size": 16,
    "dropout": 0.1,
    "wandb": False
}

In [3]:
def init_wandb():
    # start a new wandb run to track this script
    parameters["wandb"] = True
    wandb.init(
        # set the wandb project where this run will be logged
        project="CNN1D",

        # track hyperparameters and run metadata
        config={
            "learning_rate": 0.001,
            "architecture": "CNN",
            "dataset": "TBRAIN",
            "epochs": parameters['epochs'],
        }
    )

[34m[1mwandb[0m: Currently logged in as: [33mxunhaoz[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
def gen_new_data(df_source_path="./Training Dataset/training datalist.csv", df_dest_path="./NewGenData/NewGenData.csv",
                 dest="./NewGenData/voice"):
    """
    資料擴增，將所有音檔切成每段0.5秒
    """
    all_df = pd.read_csv(df_source_path)
    source_path = "./Training Dataset/training_voice_data"
    dest_path = dest

    expand_data = []
    count = 5000

    for row in all_df.iterrows():
        waveform, sample_rate = torchaudio.load(os.path.join(source_path, row[1]["ID"] + ".wav"))

        chunks = None
        if waveform.size(1) / sample_rate == 1:
            chunks = torch.chunk(waveform, 2, dim=1)
        elif waveform.size(1) / sample_rate == 1.5:
            chunks = torch.chunk(waveform, 3, dim=1)
        elif waveform.size(1) / sample_rate == 2:
            chunks = torch.chunk(waveform, 4, dim=1)
        elif waveform.size(1) / sample_rate == 3:
            chunks = torch.chunk(waveform, 6, dim=1)

        for chunk in chunks:
            count += 1
            row[1]["ID"] = str(count) + ".wav"
            torchaudio.save(os.path.join(dest_path, row[1]["ID"]), chunk, sample_rate)
            expand_data.append(row[1].copy())

    expand_df = pd.concat(expand_data, axis=1, ignore_index=True)
    expand_df.T.to_csv(df_dest_path)


In [5]:
def split_new_data(source="NewGenData/NewGenData.csv", train_dest="NewGenData/train_df.csv",
                   val_dest="NewGenData/val_df.csv"):
    """
    資料分割，確保所有類別資料被分為 0.8 0.2
    """
    all_df = pd.read_csv(source)
    cat1_df = all_df[all_df["Disease category"] == 1]
    cat2_df = all_df[all_df["Disease category"] == 2]
    cat3_df = all_df[all_df["Disease category"] == 3]
    cat4_df = all_df[all_df["Disease category"] == 4]
    cat5_df = all_df[all_df["Disease category"] == 5]
    train_cat1_df, val_cat1_df = train_test_split(cat1_df, train_size=0.8)
    train_cat2_df, val_cat2_df = train_test_split(cat2_df, train_size=0.8)
    train_cat3_df, val_cat3_df = train_test_split(cat3_df, train_size=0.8)
    train_cat4_df, val_cat4_df = train_test_split(cat4_df, train_size=0.8)
    train_cat5_df, val_cat5_df = train_test_split(cat5_df, train_size=0.8)

    train_df = pd.concat([train_cat1_df, train_cat2_df, train_cat3_df, train_cat4_df, train_cat5_df],
                         ignore_index=True).fillna(0)
    val_df = pd.concat([val_cat1_df, val_cat2_df, val_cat3_df, val_cat4_df, val_cat5_df], ignore_index=True).fillna(0)

    train_df.to_csv(train_dest)
    val_df.to_csv(val_dest)


In [6]:
class AudioDataset(Dataset):
    def __init__(self, df, args, voice_path="NewGenData/voice"):
        self.df = df
        self.voice_path = voice_path
        self.num_class = args["num_class"]

    def __len__(self):
        return len(self.df)

    def one_hot_label(self, label):
        return nn.functional.one_hot(torch.tensor(label - 1), num_classes=self.num_class)

    def __getitem__(self, index):
        pop_list = ['Unnamed: 0.1', 'Unnamed: 0', 'ID', 'Disease category']
        pow_2_15 = 32768

        waveform, sample_rate = torchaudio.load(
            os.path.join(self.voice_path, self.df["ID"][index])
        )
        label = self.df["Disease category"][index]

        table_info = self.df.iloc[index].copy().to_dict()
        for pop_item in pop_list:
            table_info.pop(pop_item)

        """擴展16倍"""
        extend_list_info = []
        for item in list(table_info.values()):
            extend_list_info.extend([item] * 16)

        waveform = torch.cat((waveform.view(-1), torch.tensor(extend_list_info)))
        padding = (pow_2_15 - waveform.size(0))
        waveform = nn.functional.pad(waveform, (padding // 2, padding - (padding // 2)), mode='constant', value=0)
        waveform = waveform.unsqueeze(0)

        return waveform, self.one_hot_label(label).float()

In [7]:
"""記得接softmax"""
"""one hot encoding 距離權重"""


class SoundClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.cnn = nn.Sequential(
            nn.Conv1d(in_channels=1, out_channels=16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(2),

            nn.Conv1d(in_channels=16, out_channels=32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(2),

            nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(2),

            nn.Conv1d(in_channels=64, out_channels=64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(2),

            nn.Conv1d(in_channels=64, out_channels=64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(2),

            nn.Conv1d(in_channels=64, out_channels=64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(2),

            nn.Conv1d(in_channels=64, out_channels=64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(2),
        )
        self.dropout = nn.Dropout(parameters["dropout"])
        self.flatten = nn.Flatten()
        self.fc1 = nn.Sequential(
            nn.Linear(64 * 256, 16 * 256),
            nn.ReLU(),
            nn.Linear(16 * 256, 2 * 256),
            nn.ReLU(),
            nn.Linear(2 * 256, 64),
            nn.ReLU(),
            nn.Linear(64, parameters["num_class"]),
            nn.Softmax()
        )

    def forward(self, x):
        x = self.cnn(x)
        x = self.dropout(x)
        x = self.flatten(x)
        x = self.fc1(x)
        return x


class SoundClassifierSmaller(nn.Module):
    def __init__(self):
        super().__init__()
        self.cnn = nn.Sequential(
            nn.Conv1d(in_channels=1, out_channels=16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(2),

            nn.Conv1d(in_channels=16, out_channels=32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(2),

            nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(2),
        )
        self.dropout = nn.Dropout(parameters["dropout"])
        self.flatten = nn.Flatten()
        self.fc1 = nn.Sequential(
            nn.Linear(64 * 4096, 2 ** 9),
            nn.ReLU(),
            nn.Linear(2 ** 9, 2 ** 4),
            nn.ReLU(),
            nn.Linear(2 ** 4, parameters["num_class"])
        )

    def forward(self, x):
        x = self.cnn(x)
        x = self.dropout(x)
        x = self.flatten(x)
        x = self.fc1(x)
        return x


class SoundClassifierFewer(nn.Module):
    def __init__(self):
        super().__init__()
        self.cnn = nn.Sequential(
            nn.Conv1d(in_channels=1, out_channels=4, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(2),

            nn.Conv1d(in_channels=4, out_channels=16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(2),

            nn.Conv1d(in_channels=16, out_channels=16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(2),

            nn.Conv1d(in_channels=16, out_channels=16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(2),

            nn.Conv1d(in_channels=16, out_channels=16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(2),

            nn.Conv1d(in_channels=16, out_channels=16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(2),

            nn.Conv1d(in_channels=16, out_channels=16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(2),
        )
        self.dropout = nn.Dropout(parameters["dropout"])
        self.flatten = nn.Flatten()
        self.fc1 = nn.Sequential(
            nn.Linear(16 * 256, 4 * 256),
            nn.ReLU(),
            nn.Linear(4 * 256, 2 * 256),
            nn.ReLU(),
            nn.Linear(2 * 256, 64),
            nn.ReLU(),
            nn.Linear(64, parameters["num_class"])
        )

    def forward(self, x):
        x = self.cnn(x)
        x = self.dropout(x)
        x = self.flatten(x)
        x = self.fc1(x)
        return x


class FromKaggle1(nn.Module):
    def __init__(self):
        super().__init__()
        self.cnn = nn.Sequential(
            nn.Conv1d(in_channels=1, out_channels=64, kernel_size=3, padding=1),
            nn.Tanh(),
            nn.MaxPool1d(2),

            nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3, padding=1),
            nn.Tanh(),
            nn.MaxPool1d(2),
        )
        self.dropout = nn.Dropout(parameters["dropout"])
        self.flatten = nn.Flatten()
        self.fc1 = nn.Sequential(
            nn.Linear(128 * 8192, 1024),
            nn.Tanh(),
            nn.Linear(1024, parameters["num_class"]),
            nn.Softmax(),
        )

    def forward(self, x):
        x = self.cnn(x)
        x = self.dropout(x)
        x = self.flatten(x)
        x = self.fc1(x)
        return x


In [8]:
def get_pred(output):
    return torch.argmax(output, dim=1)


# calculate confusion metrics
def cal_acc(pred, ans):
    pred = get_pred(pred).cpu().numpy()
    ans = get_pred(ans).cpu().numpy()
    accuracy = accuracy_score(ans, pred)
    return accuracy

In [9]:
def evaluate(model, data_loader, device, loss_fct):
    step_count = val_loss = val_acc = 0
    model.eval()
    with torch.no_grad():
        for batch_idx, (inputs, labels) in enumerate(data_loader):
            inputs = inputs.to(device)
            labels = labels.to(device)
            logits = model(inputs)
            loss = loss_fct(logits, labels)
            val_loss += loss.item()
            val_acc += cal_acc(logits, labels)
            step_count += 1

        val_loss = val_loss / step_count
        val_acc = val_acc / step_count

    return val_loss, val_acc

In [10]:
def train(model, data_loader, device, loss_fct, optimizer):
    step_count = train_loss = train_acc = 0

    model.train()
    for batch_idx, (inputs, labels) in enumerate(data_loader):
        optimizer.zero_grad()
        inputs = inputs.to(device)
        labels = labels.to(device)
        logits = model(inputs)

        loss = loss_fct(logits, labels)

        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        train_acc += cal_acc(logits, labels)
        step_count += 1

    train_loss = train_loss / step_count
    train_acc = train_acc / step_count

    return train_loss, train_acc

In [11]:
train_df = pd.read_csv("NewGenData1/train_df.csv")
vel_df = pd.read_csv("NewGenData1/val_df.csv")

train_dataset = AudioDataset(train_df, parameters, voice_path="NewGenData1/voice")
dev_dataset = AudioDataset(vel_df, parameters, voice_path="NewGenData1/voice")

train_dataLoader = DataLoader(train_dataset, batch_size=parameters["batch_size"], shuffle=True)
dev_dataLoader = DataLoader(dev_dataset, batch_size=parameters["batch_size"], shuffle=True)

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SoundClassifier().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=parameters['learning_rate'], momentum=0.9)
# optimizer = torch.optim.Adam(model.parameters(), lr=parameters['learning_rate'], betas=(0.9, 0.999), eps=1e-9)
loss_fct = torch.nn.CrossEntropyLoss()

In [None]:
for epoch in range(parameters["epochs"]):
    st_time = time.time()
    train_loss, train_acc = train(model, train_dataLoader, device, loss_fct, optimizer)
    val_loss, val_acc = evaluate(model, dev_dataLoader, device, loss_fct)

    print('[epoch %d] cost time: %.4f s' % (epoch + 1, time.time() - st_time))
    print('         loss        acc')
    print(f'train | {train_loss: .4f},\t{train_acc: .4f}')
    print(f'val   | {val_loss: .4f},\t{val_acc: .4f}\n')
    if parameters["wandb"]:
        wandb.log({"train_acc": train_acc, "train_loss": train_loss, "test_acc": val_acc, "test_loss": val_loss})

[epoch 1] cost time: 21.9150 s
         loss        acc
train |  1.6126,	 0.1946
val   |  1.6074,	 0.2245

[epoch 2] cost time: 13.3580 s
         loss        acc
train |  1.6012,	 0.2313
val   |  1.5954,	 0.2245

[epoch 3] cost time: 13.3749 s
         loss        acc
train |  1.5891,	 0.5138
val   |  1.5822,	 0.5724

[epoch 4] cost time: 13.5617 s
         loss        acc
train |  1.5753,	 0.5583
val   |  1.5672,	 0.5700

[epoch 5] cost time: 13.5553 s
         loss        acc
train |  1.5594,	 0.5565
val   |  1.5488,	 0.5700

[epoch 6] cost time: 13.5354 s
         loss        acc
train |  1.5390,	 0.5586
val   |  1.5261,	 0.5651

[epoch 7] cost time: 13.7013 s
         loss        acc
train |  1.5107,	 0.5576
val   |  1.4893,	 0.5627

[epoch 8] cost time: 13.5785 s
         loss        acc
train |  1.4561,	 0.5576
val   |  1.4088,	 0.5627

[epoch 9] cost time: 14.0470 s
         loss        acc
train |  1.3761,	 0.5569
val   |  1.3498,	 0.5651

[epoch 10] cost time: 13.7627 s
     