In [2]:
import lmdb
from pathlib import Path

In [3]:
def find_leaf_directories(root_dir):
    """Collect every directory under ``root_dir`` that has no sub-directory.

    Args:
        root_dir: ``pathlib.Path`` of an existing directory to walk.

    Returns:
        A list of ``Path`` objects, one per leaf directory, in the order the
        tree is traversed via ``Path.iterdir``. Plain files are ignored when
        deciding whether a directory is a leaf, so ``root_dir`` itself is
        returned (as a one-element list) when it contains no directories.
    """
    child_dirs = [entry for entry in root_dir.iterdir() if entry.is_dir()]

    # No nested directories: this directory is itself a leaf.
    if not child_dirs:
        return [root_dir]

    leaves = []
    for child in child_dirs:
        leaves += find_leaf_directories(child)
    return leaves

In [4]:
# Root folder that is searched for training-split LMDB directories.
train_data_home = Path('/nlm/SFMData/pretrain/20240724/train_split_lmdb/')

# Find all leaf directories (directories without sub-directories) below the
# root; each one is later passed to check() as a candidate LMDB folder.
data_folders = find_leaf_directories(train_data_home)
print(data_folders)

[PosixPath('/nlm/SFMData/pretrain/20240724/train_split_lmdb/DNA/mammals_binary_mammals0_30.lmdb'), PosixPath('/nlm/SFMData/pretrain/20240724/train_split_lmdb/DNA/mammals_binary_mammals120_150.lmdb'), PosixPath('/nlm/SFMData/pretrain/20240724/train_split_lmdb/DNA/mammals_binary_mammals150_180.lmdb'), PosixPath('/nlm/SFMData/pretrain/20240724/train_split_lmdb/DNA/mammals_binary_mammals180_-1.lmdb'), PosixPath('/nlm/SFMData/pretrain/20240724/train_split_lmdb/DNA/mammals_binary_mammals30_60.lmdb'), PosixPath('/nlm/SFMData/pretrain/20240724/train_split_lmdb/DNA/mammals_binary_mammals60_90.lmdb'), PosixPath('/nlm/SFMData/pretrain/20240724/train_split_lmdb/DNA/mammals_binary_mammals90_120.lmdb'), PosixPath('/nlm/SFMData/pretrain/20240724/train_split_lmdb/DNA/vertebrate_others_binary_vertebrate_others0_60.lmdb'), PosixPath('/nlm/SFMData/pretrain/20240724/train_split_lmdb/DNA/vertebrate_others_binary_vertebrate_others120_180.lmdb'), PosixPath('/nlm/SFMData/pretrain/20240724/train_split_lmdb/DNA

In [3]:
def check(pth):
    """Report whether ``pth`` is an LMDB directory that can be opened and read.

    Args:
        pth: ``pathlib.Path`` of the candidate LMDB directory, i.e. the folder
            expected to contain ``data.mdb``.

    Returns:
        ``True`` when ``data.mdb`` exists and a read-only transaction can look
        up the ``metadata`` key without raising; ``False`` otherwise. The
        looked-up value itself is not inspected. A diagnostic line is printed
        for every ``False`` result.
    """
    # check if the path contains 'data.mdb'
    if not (pth / 'data.mdb').exists():
        print(f'{pth} does not contain data.mdb')
        return False

    try:
        # Both the environment and the transaction are used as context
        # managers so they are released after every probe; without this each
        # call would leave an open environment behind, which adds up when the
        # function is run over many folders.
        with lmdb.open(str(pth), subdir=True, readonly=True, lock=False, readahead=False) as env:
            with env.begin(write=False) as txn:
                # The read is only a probe that the database is accessible.
                txn.get("metadata".encode())
    except Exception as e:
        # Deliberately broad: any failure to open/read marks the folder as bad
        # instead of aborting the whole scan.
        print(f'Error in {pth}: {e}')
        return False

    return True

In [5]:


# Probe every discovered folder and print a one-word verdict for each.
for pth in data_folders:
    print("Checking", pth)
    if "antibody_full_seq_rmdup" not in str(pth):
        print("Success" if check(pth) else 'Failed')
        print("================")
    else:
        # These folders are not opened at all; per the message below they are
        # skipped because of a kernel error. No separator line is printed.
        print("Skip due to kernel error")

Checking /nlm/SFMData/pretrain/20240724/train_split_lmdb/DNA/mammals_binary_mammals0_30.lmdb
Success
Checking /nlm/SFMData/pretrain/20240724/train_split_lmdb/DNA/mammals_binary_mammals120_150.lmdb
Success
Checking /nlm/SFMData/pretrain/20240724/train_split_lmdb/DNA/mammals_binary_mammals150_180.lmdb
Success
Checking /nlm/SFMData/pretrain/20240724/train_split_lmdb/DNA/mammals_binary_mammals180_-1.lmdb


Success
Checking /nlm/SFMData/pretrain/20240724/train_split_lmdb/DNA/mammals_binary_mammals30_60.lmdb
Success
Checking /nlm/SFMData/pretrain/20240724/train_split_lmdb/DNA/mammals_binary_mammals60_90.lmdb
Success
Checking /nlm/SFMData/pretrain/20240724/train_split_lmdb/DNA/mammals_binary_mammals90_120.lmdb
Success
Checking /nlm/SFMData/pretrain/20240724/train_split_lmdb/DNA/vertebrate_others_binary_vertebrate_others0_60.lmdb
Success
Checking /nlm/SFMData/pretrain/20240724/train_split_lmdb/DNA/vertebrate_others_binary_vertebrate_others120_180.lmdb
Success
Checking /nlm/SFMData/pretrain/20240724/train_split_lmdb/DNA/vertebrate_others_binary_vertebrate_others180_240.lmdb
Success
Checking /nlm/SFMData/pretrain/20240724/train_split_lmdb/DNA/vertebrate_others_binary_vertebrate_others240_300.lmdb
Success
Checking /nlm/SFMData/pretrain/20240724/train_split_lmdb/DNA/vertebrate_others_binary_vertebrate_others300_-1.lmdb
Success
Checking /nlm/SFMData/pretrain/20240724/train_split_lmdb/DNA/vertebra

In [6]:
# Spot-check a single LMDB folder directly instead of going through the loop.
check(Path('/nlm/SFMData/pretrain/20240724/train_split_lmdb/material.lmdb'))

True

In [8]:
# Directly probe two antibody_full_seq_rmdup folders; paths containing that
# substring are the ones the batch loop skips.
check(Path('/nlm/SFMData/pretrain/20240724/train_split_lmdb/antibody_full_seq_rmdup.sample30m.train.pended.20240731.ab.npy.lmdb'))

check(Path('/nlm/SFMData/pretrain/20240724/train_split_lmdb/antibody_full_seq_rmdup.sample30m.train.pended.20240731.ab.train.npy.lmdb'))


/nlm/SFMData/pretrain/20240724/train_split_lmdb/antibody_full_seq_rmdup.sample30m.train.pended.20240731.ab.npy.lmdb does not contain data.mdb


: 

In [8]:
# Spot-check the antibody.lmdb folder on its own.
check(Path('/nlm/SFMData/pretrain/20240724/train_split_lmdb/antibody.lmdb'))

True