In [1]:
import torch, torchvision
import sys # Python system library needed to load custom functions
import math # module with access to mathematical functions
import os # for changing the directory

import numpy as np  # for performing calculations on numerical arrays
import pandas as pd  # home of the DataFrame construct, _the_ most important object for Data Science
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt  # allows creation of insightful plots

import librosa

sys.path.append('../../audio_preprocessing')
sys.path.append('../../src')
sys.path.append('../../model_training_utils')


import preprocessing_func_3
from generator_to_dataset_3 import NormalisedDataSet
from gdsc_utils import PROJECT_DIR
import model_training
import model_eval

os.chdir(PROJECT_DIR) # changing our directory to root

In [2]:
# Read both CSV files, then stack them into one frame with a fresh 0..n-1 index.
df_big_data, df_big_argumented_data = (
    pd.read_csv(csv_path)
    for csv_path in ('data/big_processed_data.csv', 'data/big_argumentation_data.csv')
)
df = pd.concat([df_big_data, df_big_argumented_data], ignore_index=True)
# Show the last rows as a quick sanity check of the combined frame.
df.tail()

Unnamed: 0.1,Unnamed: 0,file_path,label
66835,44959,data/big_data_upsample/44959.wav,65
66836,44960,data/big_data_upsample/44960.wav,65
66837,44961,data/big_data_upsample/44961.wav,65
66838,44962,data/big_data_upsample/44962.wav,65
66839,44963,data/big_data_upsample/44963.wav,65


In [3]:
# import json

# with open('audio_preprocessing/saved_data/new_data.json') as f:
#     my_info = json.load(f)

# mean, std, class_weights = my_info["mean"], my_info["std"], my_info["weights"]

In [4]:
# def resize_function(output_shape=(40,256)):
#     return torchvision.transforms.Resize(size=output_shape, antialias=False)

# def calculate_melsp(x, n_fft=1024, hop_length=128):
#     stft = np.abs(librosa.stft(x, n_fft=n_fft, hop_length=hop_length))**2
#     log_stft = librosa.power_to_db(stft)
#     melsp = librosa.feature.melspectrogram(S=log_stft,n_mels=40)
#     return melsp

In [5]:
# Per-class 80/20 split so that every label appears in both subsets.
# The seed is fixed so that Restart & Run All reproduces the same split;
# with random_state left unset, train_test_split shuffles differently on
# every run, which changes the derived statistics and validation metrics.
# NOTE(review): `df` concatenates two CSV files; if rows of one file are
# derived from recordings listed in the other, a random row-level split can
# place related clips on both sides of the split — confirm this is acceptable.
SPLIT_SEED = 42

train_df_list = []
val_df_list = []

for i in range(66):  # one iteration per label value 0..65
    my_df = df[df["label"] == i]
    current_train_df, current_val_df = train_test_split(
        my_df, test_size=0.2, random_state=SPLIT_SEED)
    train_df_list.append(current_train_df)
    val_df_list.append(current_val_df)

# Re-assemble the per-class pieces into single frames with a fresh index.
df_train = pd.concat(train_df_list, ignore_index=True)
df_val = pd.concat(val_df_list, ignore_index=True)

In [6]:
# Row/column counts of the two subsets, as a sanity check on the 80/20 split.
df_train.shape, df_val.shape

((53441, 3), (13399, 3))

In [7]:
# Materialise the training file paths and labels as plain lists and build the
# non-normalised data generator from them.
paths = list(df_train["file_path"])
labels = list(df_train["label"])

# Disabled optional arguments, kept for reference:
#     image_preprocess_fn=resize_function(output_shape=(40,256)),
#     mel_transform_fn=calculate_melsp
non_normal_gen = preprocessing_func_3.non_normalised_data_generator(paths=paths, labels=labels)

In [8]:
# Derive the normalisation statistics and per-class weights from the
# non-normalised training generator.
# NOTE(review): (128, 512) is presumably the image shape (the description
# string saved to JSON further down says "image shape (128,512)") — confirm
# against preprocessing_func_3.
# NOTE(review): `class_wights` is a misspelling of "class_weights"; other cells
# reference it under this name, so any rename must be applied everywhere at once.
mean, std, class_wights = preprocessing_func_3.get_stats_and_class_weights_of_non_normalised_data_gen(
    non_normal_gen, (128, 512))

In [9]:
# Display the computed mean, std and class weights for a visual check.
mean, std, class_wights

(tensor([-1.0922]),
 tensor([0.7937]),
 array([1.01087656, 1.01214015, 1.00961611, 1.00961611, 1.00961611,
        1.01087656, 1.01087656, 1.01087656, 1.01214015, 1.01214015,
        1.01214015, 1.00961611, 1.01214015, 1.01087656, 1.00961611,
        1.01214015, 1.01087656, 1.01214015, 1.00961611, 0.81954668,
        1.01214015, 1.01087656, 1.01214015, 1.01087656, 1.01214015,
        1.00710463, 1.00961611, 1.00961611, 1.01087656, 0.6747601 ,
        1.01214015, 1.00961611, 1.01214015, 1.01214015, 1.00961611,
        1.01087656, 1.01214015, 1.00961611, 1.01214015, 1.01087656,
        1.01214015, 1.01214015, 1.01214015, 1.01087656, 1.00961611,
        1.01214015, 1.01087656, 1.01214015, 1.01087656, 1.01214015,
        1.00961611, 1.01214015, 1.01087656, 1.01214015, 1.01214015,
        1.01087656, 1.00961611, 1.01087656, 1.01087656, 1.01214015,
        1.01214015, 1.01214015, 1.01087656, 1.01087656, 1.01214015,
        1.01087656]))

In [10]:
import json

def save_as_json(path, description, mean, std, weights):
    """Write normalisation statistics and class weights to a JSON file.

    Args:
        path: Destination file path; the file is opened in write mode, so an
            existing file is overwritten.
        description: Free-text note stored under the "description" key.
        mean: Scalar-like value (anything ``float()`` accepts, e.g. a
            one-element tensor) stored under "mean".
        std: Scalar-like value stored under "std".
        weights: Sequence or array of per-class weights, stored under
            "weights" as a list of floats.
    """
    my_dict = {
        "description": description,
        "mean": float(mean),
        "std": float(std),
        # Use the `weights` argument rather than the notebook-level
        # `class_wights` global, so the function saves what the caller passed.
        "weights": np.asarray(weights, dtype=float).tolist(),
    }
    with open(path, 'w') as f:
        json.dump(my_dict, f)

# Persist the mean, std and class weights computed above to a JSON file
# (path is relative to the working directory set at the top of the notebook).
save_as_json(
    "audio_preprocessing/saved_data/upsampled_data_size_128_512.json", 
    "seconds 1.5, image shape (128,512)", mean, std, class_wights)

In [11]:
# Both datasets share the generator functions and the mean/std computed from
# the training split; only the source frame and the shuffle flag differ.
# Disabled optional arguments, kept for reference:
#     image_preprocess_fn=resize_function(output_shape=(40,256)),
#     mel_transform_fn=calculate_melsp,
shared_dataset_kwargs = dict(
    non_normalised_data_generator_fn=preprocessing_func_3.non_normalised_data_generator,
    normalised_data_generator_fn=preprocessing_func_3.normalised_data_generator,
    mean=mean,
    std=std,
)

train_dataset = NormalisedDataSet(df=df_train, shuffle=True, **shared_dataset_kwargs)
val_dataset = NormalisedDataSet(df=df_val, shuffle=False, **shared_dataset_kwargs)

# Same batch size (28) for the training and validation loaders.
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=28)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=28)

In [12]:
# Device selection is delegated to the project helper in model_training.
device = model_training.get_device()

In [13]:
from torchvision.models import resnet34, ResNet34_Weights
import torch.nn as nn
import torch.optim as optim

# ResNet-34 constructed without a `weights` argument.
# Disabled alternative, kept for reference:
#     resnet_model = resnet34(weights=ResNet34_Weights.DEFAULT)
resnet_model = resnet34()

# Replace the classification head with a 66-output linear layer.
resnet_model.fc = nn.Linear(in_features=512, out_features=66)

# Replace the first convolution with one that takes single-channel input.
resnet_model.conv1 = nn.Conv2d(
    in_channels=1,
    out_channels=64,
    kernel_size=(7, 7),
    stride=(2, 2),
    padding=(3, 3),
    bias=False,
)

# Move the finished model to the selected device.
resnet_model = resnet_model.to(device)

In [14]:
# Cross-entropy objective constructed with no arguments.
# NOTE(review): class weights are computed earlier in the notebook
# (`class_wights`) but are not passed to the loss here — confirm that is intended.
loss = nn.CrossEntropyLoss()

# Adam with the AMSGrad variant enabled; every other hyper-parameter is left
# at the library default.
optimizer = optim.Adam(params=resnet_model.parameters(), amsgrad=True)

In [15]:
# Training configuration gathered in one place, then passed to the project
# training loop.
# NOTE(review): start_epoch=5 is passed although `resnet_model` is constructed
# from scratch in this notebook and the recorded output begins at epoch 0 —
# confirm how model_training.training interprets start_epoch (e.g. whether it
# resumes from a checkpoint under model_path).
training_kwargs = dict(
    model=resnet_model,
    optimizer=optimizer,
    loss_fn=loss,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    model_path="models/resnet34",
    start_epoch=5,
    epochs=500,
    early_stop_thresh=50,
)
model_training.training(**training_kwargs)

End of epoch 0: training accuracy = 69.80%, training loss = 1.0375653201252644, training time taken = 566.14 seconds
End of epoch 0: validation accuracy = 85.80%, validation loss = 0.4922554972780249, validation time taken = 204.67 seconds
End of epoch 1: training accuracy = 91.09%, training loss = 0.29849428516671633, training time taken = 849.23 seconds
End of epoch 1: validation accuracy = 94.50%, validation loss = 0.18138490816456476, validation time taken = 181.40 seconds
End of epoch 2: training accuracy = 94.96%, training loss = 0.16582944459895832, training time taken = 849.01 seconds
End of epoch 2: validation accuracy = 95.96%, validation loss = 0.1317654311385667, validation time taken = 173.81 seconds
End of epoch 3: training accuracy = 96.51%, training loss = 0.11546299124990077, training time taken = 773.79 seconds
End of epoch 3: validation accuracy = 97.39%, validation loss = 0.08546473813751072, validation time taken = 176.42 seconds
End of epoch 4: training accuracy =

KeyboardInterrupt: 

In [None]:
#torch.save(resnet_model, 'models/resnet34/resnet34_model_input_40_256_epoch_14.pth')

In [None]:
import preprocessing_func_2

# Read the metadata CSV and keep only the rows whose `subset` column equals
# "validation".
df_big_long_wav = pd.read_csv('data/metadata.csv')
df_val_long_wav = df_big_long_wav.loc[df_big_long_wav["subset"] == "validation"]

def get_df_from_class(class_num, df):
    """Return the rows of ``df`` whose "label" column equals ``class_num``.

    The input frame is not modified; the original index values of the
    selected rows are kept.
    """
    label_matches = df["label"].eq(class_num)
    return df.loc[label_matches]

# Evaluate the model on the validation rows of each label (0..65) separately.
# NOTE(review): `mean`/`std` were computed through the preprocessing_func_3
# pipeline but are applied here to preprocessing_func_2 generators — confirm
# the two pipelines produce comparably scaled inputs.
for i in range(66):
    df_val_new = get_df_from_class(i, df_val_long_wav)
    paths = list(df_val_new["path"])
    labels = list(df_val_new["label"])

    # Disabled optional arguments, kept for reference:
    #     image_preprocess_fn=resize_function(output_shape=(40,256)),
    #     mel_transform_fn=calculate_melsp,
    non_normalised_generator = preprocessing_func_2.non_normalised_data_generator(
        paths=paths, labels=labels)
    normalised_generator = preprocessing_func_2.normalised_data_generator(
        non_normalised_generator,
        mean,
        std,
    )

    print(f"we are now in class {i}")
    final_pred = model_eval.evaluation(resnet_model, normalised_generator)
    print()

In [None]:
# Evaluate the model once over the whole validation subset of the metadata CSV.
df_big_long_wav = pd.read_csv('data/metadata.csv')
df_val_long_wav = df_big_long_wav.loc[df_big_long_wav["subset"] == "validation"]

paths = list(df_val_long_wav["path"])
labels = list(df_val_long_wav["label"])

# Disabled optional arguments, kept for reference:
#     image_preprocess_fn=resize_function(output_shape=(40,256)),
#     mel_transform_fn=calculate_melsp,
non_normalised_generator = preprocessing_func_2.non_normalised_data_generator(
    paths=paths, labels=labels)
normalised_generator = preprocessing_func_2.normalised_data_generator(
    non_normalised_generator,
    mean,
    std,
)
model_eval.evaluation(resnet_model, normalised_generator)