In [1]:
import tensorflow  as tf
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import audio_utils
import my_utils
# List the GPU devices TensorFlow can see; an empty list means none is visible to TensorFlow.
print(tf.config.list_physical_devices('GPU'))


[]


In [2]:
# Constants
DATA_FOLDER = 'data'
FLAC_FOLDER = DATA_FOLDER+'/flac' # trainY: reference FLAC files
ENCODED_FOLDER = DATA_FOLDER+'/encoded' # trainX: MP3 encodes of the same tracks
CACHED_FOLDER = 'cached'
POINTS_PER_SAMPLE = 16000 # audio samples per training slice

QUANTIZE_SAMPLE_RATE = 44100


def _match_flac_stem(encoded_file, known_stems):
    """Return the FLAC stem that `encoded_file` belongs to, or None.

    Strips one dotted suffix at a time ('song.128.mp3' -> 'song.128' -> 'song')
    and returns the first, i.e. longest, candidate found in `known_stems`.
    """
    stem, ext = os.path.splitext(encoded_file)
    while ext:
        if stem in known_stems:
            return stem
        stem, ext = os.path.splitext(stem)
    return None


# FLAC stem -> list of encoded (MP3) file names derived from it
training_dir = dict()
# sorted(): os.listdir returns entries in arbitrary order; sorting keeps the
# sample order identical between runs and machines.
for file in sorted(os.listdir(FLAC_FOLDER)):
    if file.endswith('.flac'):
        # splitext only removes the final extension, so a name with inner dots
        # ('a.b.flac' -> 'a.b') still maps back to the real file when the path
        # is rebuilt as <stem> + '.flac'. split('.')[0] would truncate it to 'a'.
        training_dir[os.path.splitext(file)[0]] = []

count_values = 0
for file in sorted(os.listdir(ENCODED_FOLDER)):
    if file.endswith('.mp3'):
        filename = _match_flac_stem(file, training_dir)
        if filename is not None:
            count_values += 1
            training_dir[filename].append(file)
        else:
            print("File %s not found in training_dir" % file)


# <number of FLAC references> -> <number of matched MP3 encodes>
print(len(training_dir.keys()),"->",count_values)

1 -> 3


In [3]:
DO_CACHING = False  # set True to reuse the slices stored in CACHED_FOLDER

# Only read the cache when caching is enabled; with DO_CACHING False its
# contents were always discarded, so loading them was wasted work.
trainX, trainY = None, None
if DO_CACHING:
    trainX, trainY = my_utils.load_from_cache(CACHED_FOLDER)

if trainX is not None and trainY is not None:
    print("Loaded from cache")
else:
    # Cache disabled or missing: rebuild the paired slices from the audio files.
    trainX = []
    trainY = []
    for flac_name in tqdm(training_dir, desc='Loading FLAC files', unit='files'):
        flac_path = FLAC_FOLDER+'/'+flac_name+'.flac'
        flac_data, flac_rate = audio_utils.load_flac_to_numpy(flac_path)
        flac_slices = audio_utils.slice_audio(flac_data, POINTS_PER_SAMPLE)

        for mp3_name in training_dir[flac_name]:
            mp3_path = ENCODED_FOLDER+'/'+mp3_name
            mp3_data, mp3_rate = audio_utils.load_mp3_to_numpy(mp3_path)
            # NOTE(review): the second return value is treated as the sample
            # rate, following the original variable name — confirm against
            # audio_utils. Slices cut at different rates would not line up in time.
            if mp3_rate != flac_rate:
                print("Sample rate mismatch:",mp3_name,mp3_rate,flac_name,flac_rate)
            mp3_slices = audio_utils.slice_audio(mp3_data, POINTS_PER_SAMPLE)
            # X and Y are paired by position, so an encode whose slice count
            # differs from its reference cannot be used.
            if len(mp3_slices) != len(flac_slices):
                print("Slices not equal:",mp3_name,flac_name)
                continue
            trainX.extend(mp3_slices)
            # The reference slices are repeated once per encode of this track.
            trainY.extend(flac_slices)
    # The cache is rewritten after every rebuild, even when DO_CACHING is False.
    my_utils.save_to_cache(CACHED_FOLDER, trainX, trainY)

    print(len(trainX), len(trainY))

Loading FLAC files: 100%|██████████| 1/1 [00:04<00:00,  4.03s/files]


2556 2556


In [4]:
# Stack the slice lists into arrays, e.g. (2556, 16000, 2) in the recorded run.
trainX = np.array(trainX)
trainY = np.array(trainY)
print(trainX.shape,trainY.shape)
# Flatten every slice into one feature vector, e.g. (2556, 16000, 2) -> (2556, 32000).
# reshape(n, -1) rather than shape[1] * shape[2]: re-running this cell on
# already-flattened arrays is then a no-op instead of an IndexError.
trainX = trainX.reshape(trainX.shape[0], -1)
trainY = trainY.reshape(trainY.shape[0], -1)
print(trainX.shape,trainY.shape)

(2556, 16000, 2) (2556, 16000, 2)
(2556, 32000) (2556, 32000)


In [5]:
# Peak-normalize to [-1, 1] (the sign is kept; values are NOT mapped to [0, 1]),
# then stretch to [-AMPLITUDE_SCALE, AMPLITUDE_SCALE].
AMPLITUDE_SCALE = 5  # "make bigger numbers!"


def _peak_normalize(samples, scale):
    """Divide `samples` by its largest absolute value, then multiply by `scale`.

    An all-zero array is returned as zeros instead of dividing by zero
    (which would fill the array with NaN).
    """
    peak = np.max(np.abs(samples))
    if peak == 0:
        peak = 1
    return samples / peak * scale


# NOTE(review): X and Y are each divided by their own peak, so any level
# difference between an encode and its reference is changed by this step —
# confirm that is intended.
trainX = _peak_normalize(trainX, AMPLITUDE_SCALE)
trainY = _peak_normalize(trainY, AMPLITUDE_SCALE)

# avg value of trainX and trainY
avg_trainX = np.mean(trainX)
print("avg_trainX:",avg_trainX)
avg_trainY = np.mean(trainY)
print("avg_trainY:",avg_trainY)

avg_trainX: -1.79e-05
avg_trainY: -5.6e-06


In [7]:
# make keras model: fully connected network from a flattened trainX slice
# to the flattened trainY slice of the same length
model = tf.keras.Sequential()
# input layer: one feature per flattened audio value
model.add(tf.keras.layers.Input(shape=(trainX.shape[1],)))
# hidden layers
model.add(tf.keras.layers.Dense(2048, activation='relu'))
model.add(tf.keras.layers.Dense(1024, activation='relu'))
# output layer: linear, not relu. The targets are signed (scaled to [-5, 5],
# mean close to 0), and a relu output can never produce the negative values.
model.add(tf.keras.layers.Dense(trainY.shape[1], activation='linear'))
# plain regression setup: mean squared error between predicted and target values
model.compile(optimizer='adam', loss='mse')

In [8]:
# train: 3 passes over the data in mini-batches of 8, with a validation_split
# of 0.1 and shuffling enabled
model.fit(
    x=trainX,
    y=trainY,
    batch_size=8,
    epochs=3,
    shuffle=True,
    validation_split=0.1,
)


Epoch 1/3
 19/288 [>.............................] - ETA: 2:30 - loss: 0.1760

KeyboardInterrupt: 