In [1]:
## Writing to s3
from io import BytesIO
import numpy as np
from urllib.parse import urlparse
import boto3
client = boto3.client("s3")

def to_s3_npy(data: np.array, s3_uri: str):
    # s3_uri looks like f"s3://{BUCKET_NAME}/{KEY}"
    bytes_ = BytesIO()
    np.save(bytes_, data, allow_pickle=True)
    bytes_.seek(0)
    parsed_s3 = urlparse(s3_uri)
    client.upload_fileobj(
        Fileobj=bytes_, Bucket=parsed_s3.netloc, Key=parsed_s3.path[1:]
    )
    return True

def from_s3_npy(s3_uri: str):
    bytes_ = BytesIO()
    parsed_s3 = urlparse(s3_uri)
    client.download_fileobj(
        Fileobj=bytes_, Bucket=parsed_s3.netloc, Key=parsed_s3.path[1:]
    )
    bytes_.seek(0)
    return np.load(bytes_, allow_pickle=True)



In [2]:
from glob import glob

import numpy as np
import pandas as pd

In [3]:
df = pd.DataFrame(glob("../data/LibriSpeech/dev-clean/*/*/*") + glob("../data/LibriSpeech/train-clean-100/*/*/*"))
df["split"] = df[0].apply(lambda x:x.split("/")[3])
df["id"] = df[0].apply(lambda x:x.split("/")[-1].split(".")[0])
df.drop(0, axis=1, inplace=True)


base_dir = "../data/SV2TTS"

dirs = {
    "audio_dir": base_dir + "/synthesizer/audio",
    "mels_dir": base_dir + "/vocoder/mels_gta"
}

# with open(f"{base_dir}/vocoder/synthesized.txt", "r") as fh:
#     data = [line.split("|")[: 2] for line in fh.readlines()]

data = []
with open(f"{base_dir}/vocoder/synthesized.txt", "r") as fh:
    for line in fh.readlines():
        line = line.split("|")
        line.insert(0, "-".join(line[0].split("-")[1:]).split(".")[0])
        data.append(line[:3])    

df = df.merge(pd.DataFrame(data, columns=["id", "audio_path", "mels_path"]))
    
df.head()

Unnamed: 0,split,id,audio_path,mels_path
0,dev-clean,5536-43363-0010,audio-5536-43363-0010.npy,mel-5536-43363-0010.npy
1,dev-clean,5536-43363-0007,audio-5536-43363-0007.npy,mel-5536-43363-0007.npy
2,dev-clean,5536-43363-0014,audio-5536-43363-0014.npy,mel-5536-43363-0014.npy
3,dev-clean,5536-43363-0001,audio-5536-43363-0001.npy,mel-5536-43363-0001.npy
4,dev-clean,5536-43363-0017,audio-5536-43363-0017.npy,mel-5536-43363-0017.npy


In [3]:
df.split.value_counts()

train-clean-100    10280
dev-clean           2333
Name: split, dtype: int64

In [4]:
tmp = df[df.split=="dev-clean"]

for name in ["audio", "mels"]:
    tmp[name] = tmp[f"{name}_path"].apply(lambda x: np.load(f"{dirs[f'{name}_dir']}/{x}"))\
    
tmp = tmp[["audio", "mels"]]
    
tmp.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,audio,mels
0,"[-0.0007731959, -0.00014497421, 9.6649484e-05,...","[[-3.8337, -3.8927107, -3.9315298, -3.8083508,..."
1,"[-0.002602506, -0.0030028915, -0.0036034698, -...","[[-3.6453233, -3.7025533, -3.7461853, -3.60947..."
2,"[0.0038086134, 0.0033171796, 0.0042386185, 0.0...","[[-3.5119064, -3.6034284, -3.6512876, -3.47596..."
3,"[-0.0020899854, -0.0024557328, -0.0019854861, ...","[[-3.624129, -3.715977, -3.7884045, -3.6129699..."
4,"[0.00037834997, 0.0006148187, 0.00070940616, 0...","[[-3.6661463, -3.747033, -3.805007, -3.6739264..."


In [7]:
to_s3_npy(tmp.to_numpy(), "s3://rtvc-data/preprocessed/vocoder_librispeech_valid.npy")

True

In [8]:
del tmp

In [9]:
import gc
gc.collect()

3692

In [4]:
tmp = df[df.split!="dev-clean"]

for name in ["audio", "mels"]:
    tmp[name] = tmp[f"{name}_path"].apply(lambda x: np.load(f"{dirs[f'{name}_dir']}/{x}"))
    
tmp.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,split,id,audio_path,mels_path,audio,mels
2333,train-clean-100,1898-145720-0010,audio-1898-145720-0010.npy,mel-1898-145720-0010.npy,"[-0.0014204666, -0.0014588577, -0.0013052936, ...","[[-4.01692, -4.0515966, -4.0459785, -3.9945538..."
2334,train-clean-100,1898-145720-0011,audio-1898-145720-0011.npy,mel-1898-145720-0011.npy,"[0.008353918, 0.007141253, 0.006332809, 0.0061...","[[-3.5591335, -3.626139, -3.6655426, -3.614958..."
2335,train-clean-100,1898-145720-0000,audio-1898-145720-0000.npy,mel-1898-145720-0000.npy,"[0.0044325194, 0.0037047928, 0.0019847103, 0.0...","[[-3.8294525, -3.913415, -3.9581838, -3.890682..."
2336,train-clean-100,1898-145720-0007,audio-1898-145720-0007.npy,mel-1898-145720-0007.npy,"[-0.0013790461, -0.0035625359, -0.0052863434, ...","[[-3.2182734, -3.3049483, -3.3779144, -3.32378..."
2337,train-clean-100,1898-145720-0018,audio-1898-145720-0018.npy,mel-1898-145720-0018.npy,"[-0.008375032, -0.0154069, -0.030260732, -0.03...","[[-3.0336277, -3.0452774, -3.0331461, -2.99663..."


In [None]:
to_s3_npy(tmp.to_numpy(), "s3://rtvc-data/preprocessed/vocoder_librispeech_train.npy")

In [6]:
mapper = pd.DataFrame(glob("../data/LibriSpeech/dev-clean/*/*/*") + glob("../data/LibriSpeech/train-clean-100/*/*/*"))

mapper["split"] = mapper[0].apply(lambda x:x.split("/")[3])

mapper["id"] = mapper[0].apply(lambda x:x.split("/")[-1].split(".")[0])

mapper.drop(0, axis=1, inplace=True)

mapper

Unnamed: 0,split,id
0,dev-clean,5536-43363-0006
1,dev-clean,5536-43363-0010
2,dev-clean,5536-43363-0007
3,dev-clean,5536-43363-0014
4,dev-clean,5536-43363-0001
...,...,...
31919,train-clean-100,839-130898-0069
31920,train-clean-100,839-130898-0046
31921,train-clean-100,839-130898-0049
31922,train-clean-100,839-130898-0097


In [8]:
tmp = df.merge(mapper)

In [9]:
tmp.split.value_counts()

train-clean-100    10280
dev-clean           2333
Name: split, dtype: int64

In [12]:
tmp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12613 entries, 0 to 12612
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   audio_path  12613 non-null  object
 1   mels_path   12613 non-null  object
 2   id          12613 non-null  object
 3   audio       12613 non-null  object
 4   mels        12613 non-null  object
 5   split       12613 non-null  object
dtypes: object(6)
memory usage: 689.8+ KB


In [16]:
tmp[tmp.split!="dev-clean"]

train-clean-100    10280
Name: split, dtype: int64

In [10]:
np.savez("data/vocoder_librispeech_valid.npz", tmp[tmp.split=="dev-clean"])

In [17]:
np.savez("data/vocoder_librispeech_train.npz", tmp[tmp.split!="dev-clean"])

MemoryError: 