In [9]:
import os
import os.path

import numpy as np
from matchms.importing import load_from_msp
from matchms.exporting import save_as_msp

from raims.split import random_split


def save_split(splits, filenames, prefix):
    """Write each split to its own MSP file inside the ``prefix`` directory.

    Parameters
    ----------
    splits
        Sequence of splits; each element is passed as-is to ``save_as_msp``.
    filenames
        Sequence of file names, one per split, paired with ``splits`` by position.
    prefix
        Output directory. It is created (including parents) when missing.

    Raises
    ------
    ValueError
        If ``splits`` and ``filenames`` differ in length. The check runs before
        anything is created on disk.
    """
    if len(splits) != len(filenames):
        raise ValueError('The number of splits does not match the number of provided filenames')

    # exist_ok=True avoids the check-then-create race of a separate exists() test.
    os.makedirs(prefix, exist_ok=True)

    # NOTE(review): save_as_msp may append to an already existing file depending
    # on the matchms version -- confirm before re-running into the same prefix.
    for split, filename in zip(splits, filenames):
        save_as_msp(split, os.path.join(prefix, filename))

def random_split_and_save(data, frac, names, prefix):
    """Split ``data`` with ``random_split`` and store the parts via ``save_split``.

    ``data`` and ``frac`` are forwarded unchanged to ``random_split``; the
    resulting parts are written under ``prefix`` using ``names`` as file names.
    """
    parts = random_split(data, frac)
    save_split(parts, names, prefix)

## Load dataset

In [7]:
# Read the MoNA GC-MS export (MSP file) and materialise everything that
# load_from_msp yields into a list, so later cells can reuse the records.
mona = list(load_from_msp('../data/src/2022-03-04_MoNA-export-GC-MS_Spectra.msp'))



## Random split

In [10]:
# Split `mona` with fractions 0.8 / 0.1 / 0.1 and write the parts, in that
# order, to train.msp, test.msp and val.msp under ../data/splits/mona-random.
# NOTE(review): no random seed is set in the visible cells, so the split is
# presumably not reproducible between runs -- confirm how random_split draws.
random_split_and_save(data=mona, frac=[.8, .1, .1], names=['train.msp', 'test.msp', 'val.msp'],
                      prefix='../data/splits/mona-random')

## Split A

In [None]:
from collections import defaultdict
from typing import Dict, List

from matchms import Spectrum

def organize_dataset_by_key(data: "List[Spectrum]", key: str):
    """Group records by the value stored under ``key`` in their metadata.

    Records whose metadata lacks ``key``, or holds the string ``'nan'`` for it,
    are dropped; the number of dropped records is printed.

    Parameters
    ----------
    data
        Records exposing a ``metadata`` mapping.
    key
        Metadata field whose value identifies the group a record belongs to.

    Returns
    -------
    defaultdict
        Maps each distinct metadata value to the list of records carrying it,
        in input order.
    """
    unique = defaultdict(list)
    dropped = 0

    for record in data:
        # Read the metadata once per record instead of on every access.
        metadata = record.metadata
        if key not in metadata or metadata[key] == 'nan':
            dropped += 1
        else:
            # Bucket by the record's value for `key`; indexing by `key` itself
            # would put every record into one single bucket.
            unique[metadata[key]].append(record)
    print(f'Dropped {dropped} entries out of {len(data)} records due to missing key {key}')

    return unique

def dict_head_tail(data: "Dict[str, List[Spectrum]]"):
    """Split every value sequence of ``data`` into its first item and the rest.

    Parameters
    ----------
    data
        Mapping from a key to a non-empty sequence (values are indexed with
        ``[0]`` and sliced with ``[1:]``).

    Returns
    -------
    tuple
        ``(head, tail)``: ``head[key]`` is the first item of ``data[key]`` and
        ``tail[key]`` holds the remaining items (possibly empty).

    Raises
    ------
    IndexError
        If any value in ``data`` is empty.
    """
    # The annotation is a string so defining the function does not require the
    # typing names to be resolvable at definition time.
    head, tail = {}, {}

    for key, val in data.items():
        head[key] = val[0]
        tail[key] = val[1:]
    return head, tail


def split_a(data):
    """Randomly split the keys of the "tail" part of ``data`` into two halves.

    NOTE(review): assumes ``data`` is a mapping from key to a list of records,
    such as the one returned by ``organize_dataset_by_key`` -- TODO confirm.
    Without deriving ``tail`` from ``data`` the name ``tail`` is undefined inside
    this function, so it is computed here with ``dict_head_tail``.

    Returns
    -------
    Whatever ``random_split`` returns for the list of tail keys with
    ``frac=[.5, .5]``.
    """
    _, tail = dict_head_tail(data)
    # Return the split so callers can actually use it.
    return random_split(list(tail.keys()), frac=[.5, .5])

In [70]:
# Scratch check: handing a dict_keys view straight to np.asarray does not give
# an array of the keys -- the output below shows a single object-dtype array
# wrapping the whole view. Wrap the view in list(...) first when an array with
# one element per key is needed.
np.asarray({'a': 0, 'b': 1, 'c': 2}.keys())

array(dict_keys(['a', 'b', 'c']), dtype=object)