<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#uberduck_ml_dev.exec.select_speakers" data-toc-modified-id="uberduck_ml_dev.exec.select_speakers-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>uberduck_ml_dev.exec.select_speakers</a></span></li></ul></div>

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#uberduck_ml_dev.exec.select_speakers" data-toc-modified-id="uberduck_ml_dev.exec.select_speakers-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>uberduck_ml_dev.exec.select_speakers</a></span></li></ul></div>

In [None]:
# default_exp exec.select_speakers

# uberduck_ml_dev.exec.select_speakers

In [None]:
# export
import argparse
from collections import namedtuple
from dataclasses import dataclass
import json
import os
from pathlib import Path
from shutil import copyfile, copytree
import sys
from typing import List, Optional, Set

import sqlite3
from tqdm import tqdm

from uberduck_ml_dev.data.cache import ensure_filelist_in_cache, ensure_speaker_table
from uberduck_ml_dev.utils.audio import convert_to_wav
from uberduck_ml_dev.utils.utils import parse_vctk

# Location of the SQLite cache database that the speaker-id queries below connect to.
CACHE_LOCATION = Path.home().joinpath(".cache", "uberduck", "uberduck-ml-dev.db")


@dataclass
class Filelist:
    """One input filelist plus the rule used to pick which of its speakers to keep.

    ``_get_speaker_ids`` checks the selectors in this order: ``speaker_ids``,
    then ``speakers``, then ``sql``. When none of them is set it returns
    ``None`` and ``select_speakers`` keeps every line of the filelist.
    """

    # Path to the filelist on disk; "~" is expanded by the functions that consume it.
    path: str
    # SQL statement executed verbatim against the cache database; the first
    # column of every returned row is used as a speaker id.
    sql: Optional[str] = None
    # Explicit speaker ids to keep.
    speaker_ids: Optional[List[int]] = None
    # Speaker names, resolved to ids through the `speakers` table of the cache.
    speakers: Optional[List[str]] = None
    # Forwarded unchanged to `ensure_filelist_in_cache`.
    # Annotated Optional because the default is None.
    speaker_idx_in_path: Optional[int] = None


def _query_speaker_ids(query: str, params=()) -> Set[int]:
    """Run ``query`` on the cache database and return the first column of each row as a set."""
    conn = sqlite3.connect(str(CACHE_LOCATION))
    try:
        rows = conn.execute(query, params).fetchall()
    finally:
        # Always release the database handle, even if the query raises.
        conn.close()
    return {row[0] for row in rows}


def _get_speaker_ids(filelist: Filelist) -> Optional[Set[int]]:
    """Resolve the set of original speaker ids selected by ``filelist``.

    Selectors are checked in this order:

    1. ``filelist.speaker_ids`` -- returned directly as a set (no cache access).
    2. ``filelist.speakers`` -- names looked up in the ``speakers`` table of the
       cache, restricted to rows whose ``filepath`` equals the expanded path.
    3. ``filelist.sql`` -- executed verbatim; the first column of each row is
       taken as a speaker id.

    Returns:
        The selected ids, or ``None`` when no selector is set (callers treat
        ``None`` as "no filtering").

    Raises:
        Exception: if ``filelist.sql`` is used and the cache database file does
            not exist.
    """
    if filelist.speaker_ids:
        return set(filelist.speaker_ids)

    path = os.path.expanduser(filelist.path)
    # Presumably these populate the cache database for `path` -- their
    # implementation lives in uberduck_ml_dev.data.cache.
    ensure_speaker_table()
    ensure_filelist_in_cache(path, speaker_idx_in_path=filelist.speaker_idx_in_path)

    if filelist.speakers:
        # One "?" placeholder per name so the names are bound, never interpolated.
        params = ",".join("?" for _ in filelist.speakers)
        return _query_speaker_ids(
            f"SELECT speaker_id FROM speakers where filepath = ? AND name in ({params})",
            [path, *filelist.speakers],
        )

    if filelist.sql:
        if not CACHE_LOCATION.exists():
            msg = "Filelist cache does not exist! You must generate it."
            print(msg)
            raise Exception(msg)
        # NOTE: the SQL comes straight from the config and is executed as-is,
        # so the config file must be trusted.
        return _query_speaker_ids(filelist.sql)

    # No selector given: signal "keep every speaker" to the caller.
    return None


def select_speakers(filelists: List[Filelist], output_filelist: str):
    """Merge the selected speakers of several filelists into one output filelist.

    Every input line is expected to hold three ``|``-separated fields; the third
    is the original speaker id. Lines whose speaker is selected (see
    ``_get_speaker_ids``) are written to ``output_filelist`` with the first two
    fields unchanged and the speaker id replaced by a new id. New ids start at 0
    and are assigned in order of first appearance, keyed on
    ``(filelist path, original id)`` so equal ids from different filelists stay
    distinct.

    Args:
        filelists: inputs to merge. Each ``path`` is overwritten in place with
            its ``~``-expanded form (behaviour kept from the original code).
        output_filelist: path of the file to write; it is truncated first.
    """
    # (expanded filelist path, original id string) -> new contiguous id
    seen_speaker_ids = dict()
    with open(output_filelist, "w") as f_out:
        for filelist in tqdm(filelists):
            speaker_ids = _get_speaker_ids(filelist)
            if filelist.path:
                filelist.path = os.path.expanduser(filelist.path)
            with open(filelist.path, "r") as f_in:
                # Stream the file instead of materialising it with readlines().
                for line in f_in:
                    line = line.strip()
                    if not line:
                        # Blank lines (e.g. a trailing newline at EOF) used to
                        # crash the three-way unpack below; skip them.
                        continue
                    path, txn, original_speaker_id = line.split("|")
                    if (
                        speaker_ids is not None
                        and int(original_speaker_id) not in speaker_ids
                    ):
                        continue
                    # len() of the map is always the next unused id, so
                    # setdefault both looks up and allocates in one step.
                    current_speaker_id = seen_speaker_ids.setdefault(
                        (filelist.path, original_speaker_id), len(seen_speaker_ids)
                    )
                    f_out.write(f"{path}|{txn}|{current_speaker_id}\n")


def parse_args(args):
    """Parse the command-line arguments of the speaker-selection script.

    Args:
        args: list of argument strings, e.g. ``sys.argv[1:]``.

    Returns:
        An ``argparse.Namespace`` whose ``config`` attribute is the value given
        to ``--config`` (``None`` when the flag is omitted).
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--config", help="path to JSON config")
    namespace = cli.parse_args(args)
    return namespace


# IN_NOTEBOOK comes from nbdev; fall back to False when nbdev (or that name) is
# unavailable. Only ImportError is caught so that KeyboardInterrupt / SystemExit
# are no longer swallowed by a bare `except:`.
try:
    from nbdev.imports import IN_NOTEBOOK
except ImportError:
    IN_NOTEBOOK = False

# Script entry point: read the JSON config named by --config and build the
# merged filelist. Skipped when the code runs inside a notebook.
if __name__ == "__main__" and not IN_NOTEBOOK:
    args = parse_args(sys.argv[1:])
    if not args.config:
        raise Exception("You must pass a config file!")
    with open(args.config) as config_file:
        config = json.load(config_file)
    # Each entry of config["filelists"] supplies the keyword arguments of one Filelist.
    filelists = [Filelist(**entry) for entry in config["filelists"]]
    output_filelist = config["output"]
    select_speakers(filelists, output_filelist)

In [None]:
# skip
# Manual check against a dataset on the author's machine: resolve one speaker
# name to its id (the recorded output of this cell is {41}).
single_speaker_filelist = Filelist(
    path="/Users/zwf/data/voice/dvc-managed/uberduck-multispeaker/list.txt",
    speakers=["Carl_Wheezer"],
    speaker_idx_in_path=-1,
)
_get_speaker_ids(single_speaker_filelist)



{41}

In [None]:
# Smoke test: the value passed to --config must come back unchanged.
cli_argv = ["--config", "foo.json"]
args = parse_args(cli_argv)
assert args.config == cli_argv[1]

In [None]:
# sam stuff
import os
import sqlite3

import numpy as np
import pandas as pd


def get_filelist(database, speakerjson):
    """Placeholder: take a list of speakers and create a filelist.

    Not implemented yet -- both arguments are ignored and ``None`` is returned.
    """
    return None


def subset_speakers(database, seed, speakerlist=None, nspeakers=None):
    """Placeholder: take a filelist and save another filelist restricted to a subset of speakers.

    The subset is meant to be either a random sample of speakers or the speakers
    named in ``speakerlist``. Not implemented yet -- all arguments are ignored
    and ``None`` is returned.
    """
    return None

In [None]:
# NOTE(review): neither `parse_vctk2` nor `vctk_folder` is defined or imported in
# the visible notebook (only `parse_vctk` is imported at the top), so this cell
# raises NameError on a fresh kernel -- see the recorded output below.
# TODO: confirm which helper and folder were intended.
vctk_filelist2, namelist = parse_vctk2(vctk_folder)
print(namelist)

NameError: name 'parse_vctk2' is not defined

In [None]:
# Scratch prototype of a speaker-metadata table stored in a local SQLite file.
conn = sqlite3.connect("test.db")
try:
    # `with conn` commits on success and rolls back on error (it does not close
    # the connection). IF NOT EXISTS / INSERT OR REPLACE keep the cell
    # re-runnable: test.db persists between runs, so a plain CREATE TABLE or a
    # second insert of ID 1 would fail.
    with conn:
        conn.execute(
            """CREATE TABLE IF NOT EXISTS DATAINFO
         (ID INT PRIMARY KEY     NOT NULL,
         NAME           TEXT     NOT NULL,
         SOURCE         TEXT     NOT NULL,
         FILELIST       TEXT,
         SPEAKERID      INT);"""
        )
        conn.execute(
            "INSERT OR REPLACE INTO DATAINFO (ID,NAME,SOURCE,FILELIST,SPEAKERID) "
            "VALUES (?, ?, ?, ?, ?)",
            (
                1,
                "eminem",
                "uberduck",
                "/mnt/disks/uberduck-experiments-v0/uberduck-ml-dev/experiments/processed_metadata/eminem_all_processed.txt",
                0,
            ),
        )

    # ID does not necessarily need to be included, but it makes sense to.
    cursor = conn.execute("SELECT ID, NAME, SOURCE, FILELIST from DATAINFO")
    for row in cursor:
        print("ID = ", row[0])
        print("NAME = ", row[1])
        # Labels now match the selected columns (they previously read
        # "ADDRESS" and "SALARY").
        print("SOURCE = ", row[2])
        print("FILELIST = ", row[3])
finally:
    conn.close()

In [None]:
# Example of a speaker list as it would be loaded from a JSON file
# (presumably [source, speaker name] pairs -- confirm).
# Such a JSON could also be generated by querying the database, e.g.
# subset_speakers(database, seed, speakerlist = None, nspeakers = None)
speaker_pairs = [["vctk", "p302"], ["uberduck", "eminem"]]
speakerlist = np.asarray(speaker_pairs)
print(speakerlist)

In [None]:
# now call get_filelist(database, speakerjson)
# this list (filelist locations) would be got from database
metalist_dir = "/mnt/disks/uberduck-experiments-v0/uberduck-ml-dev/experiments/metadata_collections"
# NOTE(review): os.listdir returns entries in arbitrary order, so positional
# indexing into metalist_files is not guaranteed to be stable across machines.
metalist_files = os.listdir(metalist_dir)
# print(metalist_files)
train_ratios = np.ones(4) * 1.0
# Every path is derived from metalist_dir so the directory is configured in one
# place (the last load previously repeated the full absolute path).
print(np.load(os.path.join(metalist_dir, metalist_files[0])))
print(np.load(os.path.join(metalist_dir, "libritts_processed_file.npy")))
# NOTE: allow_pickle=True unpickles object arrays, which can execute arbitrary
# code -- only load files from a trusted source.
print(
    np.load(
        os.path.join(metalist_dir, "uberduck_processed_files.npy"),
        allow_pickle=True,
    )
)
print(np.load(os.path.join(metalist_dir, "vctk_processed_file.npy")))

In [None]:
# Flatten the first four metadata collections into one array, pass it to
# synthesize_speakerids2, then keep only selected results (e.g. [2, 4, 5]).
# Intended follow-up (not implemented here): take only the rows of multispeaker
# datasets that correspond to particular speakers.
# NOTE(review): `synthesize_speakerids2` is not defined or imported in the visible
# notebook -- confirm where it comes from.
filelists = np.asarray([])
# files = np.asarray([])
# NOTE(review): the 4 is hard-coded; presumably the number of collections in
# metalist_dir -- confirm.
for r in range(4):
    files = np.load(metalist_dir + "/" + metalist_files[r], allow_pickle=True)
    filelist = np.asarray([])

    if files.ndim > 0:
        nfiles = files.shape[0]
        for s in range(nfiles):
            # np.append (no axis) flattens both arguments and returns a new array.
            filelist = np.append(filelist, files[s])
    else:
        # 0-d array: there is nothing to iterate over, so use it as-is.
        filelist = files
    filelists = np.append(filelists, filelist)

print(filelists)
dd = synthesize_speakerids2(filelists, 1)
ad = list(dd.values())
# Keep only the results at positions 2, 4 and 5.
ad2 = [ad[i] for i in [2, 4, 5]]
# Presumably the values are pandas objects, since they are passed to pd.concat -- confirm.
alldata = pd.concat(ad2)

In [None]:
# Show the concatenated result; relies on `alldata` from the previous cell.
print(alldata)