In [1]:
from scripts.generate_pseudomultimer_dataset import (
    locate_cif_file,
    load_structure,
    get_sequence,
    slice_sequence_by_residue_range,
    build_dataset,
    get_ca_atoms,
)
from pathlib import Path
import json

# Path to features JSON
features_path = Path.cwd().parent / "DATA/pdb_features.json"
project_root = Path.cwd()  # base_dir for locate_cif_file

# Load JSON into a separate variable (avoid shadowing the path)
with open(features_path, "r") as f:
    features_data = json.load(f)

# Resolve CIF path and load structure
cif_path = locate_cif_file(pdb_id="1A2P", base_dir=project_root, pdb_features=features_data)
print("CIF path:", cif_path)

struct = load_structure(cif_path)
print("Loaded struct:", type(struct).__name__ if struct is not None else None)

# Option A: Get entire chain sequences as a dict {chain_id: seq}
all_chain_seqs = get_sequence(struct)
print("Chain A (full) length:", len(all_chain_seqs.get("A", "")))

# Option B: Get a sliced subsequence by residue numbers (inclusive)
subseq = slice_sequence_by_residue_range(struct, chain_id="A", start_res=3, end_res=5)
print("Sliced A[3..5]:", subseq)

get_ca_atoms(load_structure(cif_path), chain_id="A", start_res=3, end_res=5)

# build_dataset(pdb_features= features_data, base_dir= project_root)

CIF path: /home/jovyan/workspace/DATA/cif_unzipped/000/1A2P.cif
Loaded struct: AtomArray
Chain A (full) length: 108
Sliced A[3..5]: VIN


[[17.996999740600586, 38.57699966430664, 42.79499816894531],
 [19.56800079345703, 41.80500030517578, 44.10900115966797],
 [20.308000564575195, 43.99599838256836, 41.10200119018555]]

In [2]:
from scripts.generate_pseudomultimer_dataset import main

output_path_links = "/home/jovyan/workspace/DATA/training.links.txt.gz"
output_path_seqs = "/home/jovyan/workspace/DATA/training.seqs.txt.gz"
output_path_contact_masks = "/home/jovyan/workspace/DATA/training.contact_masks.pkl.gz"
main(
    links_output_path=output_path_links,
    seqs_output_path=output_path_seqs,
    contact_masks_ouput_path=output_path_contact_masks,
)

Links output : /home/jovyan/workspace/DATA/training.links.txt.gz
Seqs  output : /home/jovyan/workspace/DATA/training.seqs.txt.gz
Contact masks output: /home/jovyan/workspace/DATA/training.contact_masks.pkl.gz
Loaded 154410 PDB entries from pdb_features.json


Processing CATH entries:   0%|          | 136/493969 [00:02<2:30:39, 54.63it/s]
Processing CATH entries:   0%|          | 136/493969 [00:02<2:30:39, 54.63it/s]


Wrote 22 sequences, 11 links, and 11 contact masks.
Skipped 0 items.
Skipped items saved to: /home/jovyan/workspace/DATA/skipped_items.json


In [None]:
name = "1A2P_A_1"
truncated = name.rsplit("_", 1)[0]
print(name)
print(truncated)

In [3]:
from mint.data.mint import PseudoMMDataset
from mint.data.mint import PseudoMMDataModule

test = PseudoMMDataset(
    links_path="/home/jovyan/workspace/DATA/training.links.txt.gz",
    seqs_path="/home/jovyan/workspace/DATA/training.seqs.txt.gz",
    contact_masks_path="/home/jovyan/workspace/DATA/training.contact_masks.pkl.gz",
)

In [None]:
from mint.model.mint import MINT
from omegaconf import OmegaConf

cfg = OmegaConf.load(
    "/home/jovyan/workspace/mint/src/mint/configs/all_configs.yaml"
)  # -> DictConfig
print(type(cfg))
cfg.meta.code_base_dir = "/home/jovyan/workspace/mint"
cfg.meta.data_dir = "/home/jovyan/workspace/DATA"
cfg.meta.experiment_name = "contact_head_test"
cfg.data.links_path = "/home/jovyan/workspace/DATA/training.links.txt.gz"
cfg.data.seqs_path = "/home/jovyan/workspace/DATA/training.seqs.txt.gz"
cfg.data.contact_masks_path = "/home/jovyan/workspace/DATA/training.contact_masks.txt.gz"

print(cfg)

model = MINT(cfg)
test2 = PseudoMMDataModule(cfg)
dl = test2.train_dataloader()
batch = next(iter(dl))


model.forward(batch)

<class 'omegaconf.dictconfig.DictConfig'>
{'meta': {'code_base_dir': '/home/jovyan/workspace/mint', 'data_dir': '/home/jovyan/workspace/DATA', 'experiment_name': 'contact_head_test', 'experiment_dir': '${meta.code_base_dir}/outputs/${meta.experiment_name}', 'matmul_precision': 'medium', 'pretrained': None, 'resume': None, 'disable_checkpoint': False, 'save_top_k': -1}, 'mint': {'esm2': {'encoder_layers': 33, 'encoder_embed_dim': 1280, 'encoder_attention_heads': 16, 'token_dropout': 0.1, 'use_multimer': True}}, 'training_args': {'freeze_self_attn': False, 'lr': 0.0001, 'adam_betas': [0.9, 0.98], 'adam_eps': 1e-08, 'weight_decay': 0.01, 'warmup_updates': 2000, 'end_learning_rate': 1e-05, 'total_num_update': 100000}, 'trainer': {'accelerator': 'gpu', 'devices': 1, 'precision': 32, 'gradient_clip_val': 10.0, 'max_epochs': -1, 'log_every_n_steps': 5, 'accumulate_grad_batches': 1, 'val_check_interval': 0.0, 'limit_val_batches': 0}, 'data': {'train': {'links_path': '/home/jovyan/workspace/DAT

ConfigAttributeError: Missing key train
    full_key: train
    object_type=dict