In [1]:
from spectralis.spectralis_master import Spectralis

In [2]:
# Path of the Spectralis YAML configuration file. It gets its own name so that
# `config` only ever refers to the settings mapping taken from
# `spectralis.config` further down, never to this path string.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
config_path = "/home/samva/Doctorate/DeNovo_Benchmark/configs/spectralis/spectralis_config.yaml"
spectralis = Spectralis(config_path)

[INFO] Loading config file: /home/samva/Doctorate/DeNovo_Benchmark/configs/spectralis/spectralis_config.yaml
[INFO] Loaded bin reclassification model
[INFO] Initiated bin reclassifier

===


In [3]:
# Expose the components held by the `spectralis` instance as notebook-level
# names so the following cells can call them directly.
config = spectralis.config
verbose = config['verbose']

# Bin reclassification model
binreclass_model = spectralis.binreclass_model
print('[INFO] Loaded bin reclassification model')

# Peptide <-> profile converters and the reclassifier wrapper
peptide2profiler, profile2peptider = spectralis.peptide2profiler, spectralis.profile2peptider
bin_reclassifier = spectralis.bin_reclassifier
print('[INFO] Initiated bin reclassifier')

[INFO] Loaded bin reclassification model
[INFO] Initiated bin reclassifier


In [4]:
from pyteomics import mgf
import tqdm
import numpy as np
from spectralis.denovo_utils import __utils__ as U
from spectralis.denovo_utils import __constants__ as C

def _process_mgf(mgf_path):
    """Read an annotated MGF file and assemble the arrays used for rescoring.

    Every spectrum is expected to carry `charge`, `pepmass`, `seq` and `scans`
    entries in its params. Peptide strings are rewritten to UNIMOD notation
    (L -> I, the oxidised-methionine spellings -> M[UNIMOD:35] and, when the
    notebook-level `config['interpret_c_as_fix']` is truthy, C -> C[UNIMOD:4]).
    Spectra whose charge exceeds `C.MAX_CHARGE` or whose encoded peptide is
    longer than `C.SEQ_LEN` are dropped and their scan ids reported separately.

    Parameters
    ----------
    mgf_path : str
        Path of the MGF file to read.

    Returns
    -------
    tuple
        `(padded_seqs, precursor_z, precursor_m, scans_valid, exp_mzs,
        exp_ints, alpha_seqs, scans_invalid)`. All entries except
        `scans_invalid` are row-aligned and contain the valid spectra only.
        `padded_seqs` is zero-padded to `C.SEQ_LEN`; `exp_mzs` / `exp_ints`
        are zero-padded to the longest spectrum in the file.

    Raises
    ------
    AssertionError
        If no spectrum passes the charge / peptide-length filter.
    """
    charges, prec_mz, alpha_seqs = [], [], []
    exp_ints, exp_mzs, scans = [], [], []

    ## Read MGF file
    with mgf.MGF(mgf_path) as reader:
        for spectrum in tqdm.tqdm(reader):
            params = spectrum['params']

            charges.append(params['charge'][0])
            prec_mz.append(params['pepmass'][0])
            alpha_seqs.append(params['seq'])
            scans.append(params['scans'])

            exp_mzs.append(spectrum['m/z array'])
            exp_ints.append(spectrum['intensity array'])
    n_spectra = len(scans)
    print(f'-- Finished reading {n_spectra} PSMs')

    precursor_z = np.array(charges)
    precursor_m = np.array(prec_mz)
    scans = np.array(scans)

    ## Unimod encoding for peptide sequences
    alpha_seqs = np.array([p.replace('L', 'I')
                            .replace('OxM', "M[UNIMOD:35]")
                            .replace('M(O)', "M[UNIMOD:35]")
                            .replace('M(ox)', "M[UNIMOD:35]")
                            .replace('Z', "M[UNIMOD:35]") for p in alpha_seqs]
                        )
    if config['interpret_c_as_fix']:
        alpha_seqs = np.array([p.replace('C', 'C[UNIMOD:4]') for p in alpha_seqs])

    ## Numeric encoding of the peptides; lengths are needed for the filter below
    sequences = [U.map_peptide_to_numbers(p) for p in alpha_seqs]
    seq_lens = np.array([len(s) for s in sequences])

    ## Filter invalid spectra: charge>max_charge or pep length>seq_len (Default 6 and 30)
    valid_mask = (precursor_z <= C.MAX_CHARGE) & (seq_lens <= C.SEQ_LEN)
    idx_valid = np.where(valid_mask)[0]
    idx_invalid = np.where(~valid_mask)[0]

    assert idx_valid.shape[0] > 0, 'no spectrum passed the charge / peptide-length filter'

    ## peptides padded to SEQ_LEN (Default 30)
    # Pad only the valid peptides: an over-long peptide would hand np.pad a
    # negative width and crash before the filter could drop it.
    padded_seqs = np.array([np.pad(sequences[i], (0, C.SEQ_LEN - len(sequences[i])),
                                   'constant', constant_values=(0, 0))
                            for i in idx_valid]).astype(int)

    ## experimental spectra padded to max len of spectra (over the whole file)
    len_padded = max(len(el) for el in exp_mzs)
    exp_mzs = np.array([np.pad(exp_mzs[i], (0, len_padded - len(exp_mzs[i])),
                               'constant', constant_values=(0, 0)) for i in idx_valid])
    exp_ints = np.array([np.pad(exp_ints[i], (0, len_padded - len(exp_ints[i])),
                                'constant', constant_values=(0, 0)) for i in idx_valid])

    ## Filter the remaining per-spectrum data to valid spectra
    scans_valid = scans[idx_valid]
    scans_invalid = scans[idx_invalid]  # empty (same dtype as scans) when nothing was dropped

    alpha_seqs = alpha_seqs[idx_valid]
    precursor_z = precursor_z[idx_valid]
    precursor_m = precursor_m[idx_valid]

    print(f'-- Input shapes\n\tseqs: {alpha_seqs.shape}, charges: {precursor_z.shape}, ints: {exp_ints.shape}, mzs: {exp_mzs.shape}, precursor mzs: {precursor_m.shape}')

    return padded_seqs, precursor_z, precursor_m, scans_valid, exp_mzs, exp_ints, alpha_seqs, scans_invalid

In [None]:
# rescoring
#
# NOTE(review): this cell reads like the body of a `rescoring` method pasted in
# for reference: it uses `self` and ends with a top-level `return`, so it cannot
# be executed as a notebook cell as written. The names `alpha_peps`, `charges`,
# `exp_mzs`, `exp_ints`, `precursor_mzs`, `return_features` and
# `original_scores` are presumably the method's parameters — confirm against
# the Spectralis source.

# Lazily create the scorer the first time it is needed.
if self.scorer is None:
    ## Init scorer
    self.scorer = self._init_scorer()  
    print(f'[INFO] Initiated lev scorer')

## Compute prosit preds for peptides, prosit collision energy from config
prosit_out = U.get_prosit_output(alpha_peps, charges, self.config['prosit_ce'])
prosit_mzs, prosit_ints =  prosit_out['mz'],prosit_out['intensities']

## Compute peptide masses and collect bin reclass predictions to compute features
peptide_masses = np.array([U._compute_peptide_mass_from_seq(alpha_peps[j]) for j in range(len(alpha_peps)) ])
binreclass_out = self.bin_reclassifier.get_binreclass_preds(prosit_mzs=prosit_mzs,
                                                    prosit_ints=prosit_ints,
                                                    pepmass=peptide_masses,
                                                    exp_mzs=exp_mzs,
                                                    exp_int=exp_ints,
                                                    precursor_mz=precursor_mzs,
                                                )
# Seven values are unpacked, but only `y_changes` is passed on to the scorer below.
y_probs, y_mz_probs, b_probs, b_mz_probs, y_changes, y_mz_inputs, b_mz_inputs = binreclass_out

## Compute features and scores
return self.scorer.get_scores(exp_mzs, exp_ints, prosit_ints, prosit_mzs, y_changes, 
                                return_features=return_features, original_scores=original_scores)

In [5]:
## process mgf file
# NOTE(review): absolute, machine-specific path into a Nextflow work directory —
# it has to be adjusted before the notebook can be re-run elsewhere.
mgf_path = "/home/samva/Doctorate/nextflow_workdirs/nextflow_denovo_refinement/c4/c3ba675550c9a82594da1b6c8d89b5/S08_annotated.mgf"

# `_out` holds the 8-tuple returned by `_process_mgf`.
_out = _process_mgf(mgf_path)

75228it [00:10, 7465.96it/s]


-- Finished reading 75228 PSMs
-- Input shapes
	seqs: (75228,), charges: (75228,), ints: (75228, 799), mzs: (75228, 799), precursor mzs: (75228,)


In [6]:
# Unpack the `_process_mgf` result; the first entry (the padded peptide
# encodings) is discarded.
_, precursor_z, precursor_m, scans_valid, exp_mzs, exp_ints, alpha_seqs, scans_invalid = _out

In [7]:
# Build the scorer through the (private) Spectralis helper; the recorded output
# shows a RandomForestRegressor being loaded.
scorer = spectralis._init_scorer()  

[INFO] Loaded scorer:
	RandomForestRegressor(criterion='mse', max_depth=420, max_features=88,
                      min_samples_leaf=77, min_samples_split=114,
                      n_estimators=182, n_jobs=-1, random_state=13, verbose=1)


In [8]:
# Prosit predictions for every valid peptide / charge pair, using the collision
# energy from the config.
prosit_out = U.get_prosit_output(alpha_seqs, precursor_z, config['prosit_ce'])
prosit_mzs = prosit_out['mz']
prosit_ints = prosit_out['intensities']

# Peptide mass for each sequence.
peptide_masses = np.array([U._compute_peptide_mass_from_seq(seq) for seq in alpha_seqs])

  0%|          | 0/76 [00:00<?, ?it/s]

100%|██████████| 76/76 [00:55<00:00,  1.38it/s]


In [9]:
# Run bin reclassification through the library entry point. Note the keyword
# spelling: the experimental intensities go in as `exp_int`, the precursor
# m/z values as `precursor_mz`.
bin_reclass_out = bin_reclassifier.get_binreclass_preds(prosit_mzs=prosit_mzs,
    prosit_ints=prosit_ints,
    pepmass=peptide_masses,
    exp_mzs=exp_mzs,
    exp_int=exp_ints,
    precursor_mz=precursor_m,
)

MZs for channel <y+>: [[175.11896   0.        0.      ...   0.        0.        0.     ]
 [147.11281   0.        0.      ...   0.        0.        0.     ]
 [147.11281   0.        0.      ...   0.        0.        0.     ]
 ...
 [147.11281   0.        0.      ...   0.        0.        0.     ]
 [147.11281   0.        0.      ...   0.        0.        0.     ]
 [147.11281   0.        0.      ...   0.        0.        0.     ]]
MZs for channel <b+>: [[ 0.  0.  0. ... -1.  0.  0.]
 [ 0.  0.  0. ... -1.  0.  0.]
 [ 0.  0.  0. ... -1.  0.  0.]
 ...
 [ 0.  0.  0. ... -1.  0.  0.]
 [ 0.  0.  0. ... -1.  0.  0.]
 [ 0.  0.  0. ... -1.  0.  0.]]


100%|██████████| 15046/15046 [08:22<00:00, 29.94it/s]


In [None]:
# NOTE(review): this looks like a parameter list pasted in as a reminder
# (presumably of `get_binreclass_preds` — confirm). Read as a statement it tries
# to unpack `False` into seven names, so executing the cell raises a TypeError.
prosit_mzs, prosit_ints, pepmass, exp_mzs, exp_int, precursor_mz, return_mz_changes=False

In [13]:
import torch
from torch.utils.data import DataLoader

# Manual re-implementation of the bin-reclassification inference so that the
# per-batch results can be inspected before they are concatenated.
_dataset = bin_reclassifier.get_binreclass_dataset(
    prosit_mzs, prosit_ints, peptide_masses, exp_mzs, exp_ints, precursor_m
)
dataloader = DataLoader(
    dataset=_dataset,
    batch_size=bin_reclassifier.batch_size,
    shuffle=False,
    num_workers=8,
)

# One list per quantity; each list receives one entry per batch.
all_y_probs, all_y_mz_probs, all_b_probs, all_b_mz_probs = [], [], [], []
all_y_changes, all_y_mz_inputs, all_b_mz_inputs = [], [], []
_collectors = (all_y_probs, all_y_mz_probs, all_b_probs, all_b_mz_probs,
               all_y_changes, all_y_mz_inputs, all_b_mz_inputs)

temp = 0
bin_reclassifier.binreclass_model.eval()
with torch.no_grad():
    for local_batch in tqdm.tqdm(dataloader):
        X = local_batch.to(bin_reclassifier.device)

        # Keep the first two channels of the model output and squash them
        # with a sigmoid.
        logits = bin_reclassifier.binreclass_model(X)[:, :2, :].detach().cpu().numpy()
        outputs = 1 / (1 + np.exp(-logits))

        ## input and store change probs
        # Where the input bin is exactly 1 the stored value is 1 - p,
        # everywhere else it is p itself.
        inputs = X[:, :2, :].detach().cpu().numpy()
        changes = np.where(inputs == 1, 1 - outputs, outputs)

        ## store only nonzero
        batch_preds = bin_reclassifier.adapt_binreclass_preds(outputs, changes, inputs)
        for collector, piece in zip(_collectors, batch_preds):
            collector.append(piece)

MZs for channel <y+>: [[175.11896   0.        0.      ...   0.        0.        0.     ]
 [147.11281   0.        0.      ...   0.        0.        0.     ]
 [147.11281   0.        0.      ...   0.        0.        0.     ]
 ...
 [147.11281   0.        0.      ...   0.        0.        0.     ]
 [147.11281   0.        0.      ...   0.        0.        0.     ]
 [147.11281   0.        0.      ...   0.        0.        0.     ]]
MZs for channel <b+>: [[ 0.  0.  0. ... -1.  0.  0.]
 [ 0.  0.  0. ... -1.  0.  0.]
 [ 0.  0.  0. ... -1.  0.  0.]
 ...
 [ 0.  0.  0. ... -1.  0.  0.]
 [ 0.  0.  0. ... -1.  0.  0.]
 [ 0.  0.  0. ... -1.  0.  0.]]


 does not have profile information (Triggered internally at ../torch/csrc/jit/codegen/cuda/graph_fuser.cpp:105.)
  return forward_call(*input, **kwargs)
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
100%|██████████| 15046/15046 [08:26<00:00, 29.68it/s]


In [21]:
# Shape of the first collected b-probability batch (recorded: 1-D, five entries).
all_b_probs[0].shape

(5,)

In [23]:
# Number of list entries; the inference loop appends one per DataLoader batch.
len(all_b_probs)

15046

In [29]:
# First batch: the recorded output is an object array whose elements are
# float16 arrays of different lengths.
all_b_probs[0]

array([array([0.8906, 0.4275, 0.3564, 0.3804, 0.3901, 0.512 , 0.4775, 0.385 ,
              0.3599, 0.4036, 0.4316, 0.4385, 0.4346, 0.384 , 0.355 , 0.3555,
              0.4006, 0.4116, 0.4136, 0.4414, 0.4922, 0.5347], dtype=float16),
       array([0.847 , 0.355 , 0.4724, 0.5625, 0.4922, 0.4731, 0.4148, 0.4492,
              0.4617, 0.5596, 0.6   , 0.6865, 0.8716], dtype=float16)        ,
       array([0.862 , 0.377 , 0.351 , 0.5176, 0.618 , 0.5635, 0.568 , 0.4531,
              0.63  , 0.544 , 0.728 , 0.7344, 0.768 , 0.8945], dtype=float16),
       array([0.8574, 0.3591, 0.352 , 0.554 , 0.5767, 0.468 , 0.407 , 0.356 ,
              0.4016, 0.4038, 0.4905, 0.546 , 0.649 , 0.8833], dtype=float16),
       array([0.875 , 0.3914, 0.4263, 0.441 , 0.4495, 0.4753, 0.3606, 0.4097,
              0.3877, 0.4067, 0.4155, 0.4895, 0.5884, 0.422 , 0.859 ],
             dtype=float16)                                                  ],
      dtype=object)

In [43]:
# Second batch, same ragged layout as the first according to the recorded output.
all_b_probs[1]

array([array([0.839 , 0.352 , 0.3743, 0.4724, 0.519 , 0.428 , 0.448 , 0.3796,
              0.5107, 0.588 , 0.707 , 0.7173, 0.7446, 0.8984], dtype=float16),
       array([0.8447, 0.361 , 0.501 , 0.5493, 0.4287, 0.3584, 0.3743, 0.4077,
              0.355 , 0.4436, 0.4702, 0.653 , 0.6685, 0.7256, 0.888 ],
             dtype=float16)                                                  ,
       array([0.8784, 0.3882, 0.3796, 0.4604, 0.382 , 0.377 , 0.4268, 0.371 ,
              0.3782, 0.3645, 0.4062, 0.4463, 0.468 , 0.499 , 0.5073, 0.5273,
              0.535 , 0.5493, 0.3647, 0.6177, 0.7085], dtype=float16)        ,
       array([0.877 , 0.3645, 0.3684, 0.498 , 0.4922, 0.46  , 0.4514, 0.4194,
              0.3586, 0.355 , 0.3945, 0.3691, 0.4302, 0.404 , 0.4087, 0.53  ,
              0.5264, 0.5776, 0.574 , 0.586 , 0.633 , 0.7173], dtype=float16),
       array([0.8716, 0.3672, 0.3982, 0.495 , 0.39  , 0.3926, 0.4656, 0.4514,
              0.397 , 0.3723, 0.354 , 0.3564, 0.4058, 0.446 , 0.394

In [37]:
# Keep a handle on the last batch (index 15045 of 15046 entries); its recorded
# shape is 2-D, (3, 17), unlike the 1-D object arrays of the earlier batches.
different_formatted_el = all_b_probs[15045]

In [41]:
# Re-wrapping each row does not change the layout: rows of equal length are
# stacked into a regular 2-D float16 array again (see the recorded output).
np.array([np.array(i, dtype=np.float16) for i in different_formatted_el])

array([[0.9233, 0.614 , 0.6714, 0.7173, 0.675 , 0.671 , 0.663 , 0.671 ,
        0.673 , 0.624 , 0.627 , 0.584 , 0.6074, 0.6074, 0.3823, 0.649 ,
        0.86  ],
       [0.9277, 0.626 , 0.6704, 0.753 , 0.7246, 0.712 , 0.691 , 0.64  ,
        0.63  , 0.615 , 0.6504, 0.6313, 0.65  , 0.644 , 0.3525, 0.667 ,
        0.8735],
       [0.9126, 0.637 , 0.6455, 0.7256, 0.6343, 0.6187, 0.5703, 0.5396,
        0.555 , 0.5645, 0.58  , 0.5454, 0.5615, 0.571 , 0.3809, 0.6343,
        0.8584]], dtype=float16)

In [26]:
# Shape of the last batch — recorded as (3, 17), i.e. a regular 2-D array.
all_b_probs[15045].shape

(3, 17)

In [48]:
# Concatenate the per-batch y-ion results; no error was recorded for this cell.
all_y_probs_concat = np.concatenate(all_y_probs)
all_y_mz_probs_concat = np.concatenate(all_y_mz_probs)

In [103]:
# Dimensionality of the first batch in the patched list (recorded: 1).
# `new_all_b_probs` is created in a cell that appears further down in this export.
new_all_b_probs[0].ndim

1

In [102]:
# Dimensionality of the offending last batch (recorded: 2).
different_formatted_el.ndim

2

In [105]:
# Number of batches when only the last one is dropped (recorded: 15045).
len(all_b_probs[:-1])

15045

In [112]:
def _as_ragged_batch(batch, dtype=np.float16):
    """Return `batch` as a 1-D object array holding one `dtype` array per row.

    Works for batches that are already 1-D object arrays of arrays as well as
    for batches that NumPy collapsed into a regular 2-D array because every
    row happened to have the same length.
    """
    ragged = np.empty(len(batch), dtype=object)
    # Fill element by element: handing NumPy the whole list of equal-length rows
    # would make it build / broadcast a 2-D block instead of storing each row.
    for row_idx, row in enumerate(batch):
        ragged[row_idx] = np.asarray(row, dtype=dtype)
    return ragged


# Normalise every batch to the same 1-D ragged layout so the list can be
# concatenated. 0-d entries are passed through untouched, as before.
new_b_probs_concat = [
    _as_ragged_batch(batch) if batch.ndim else batch
    for batch in all_b_probs
]

  # Remove the CWD from sys.path while we load stuff.


In [113]:
# Concatenate the normalised batches; the recorded output is a single object
# array of float16 arrays.
np.concatenate(new_b_probs_concat)

array([array([0.8906, 0.4275, 0.3564, 0.3804, 0.3901, 0.512 , 0.4775, 0.385 ,
              0.3599, 0.4036, 0.4316, 0.4385, 0.4346, 0.384 , 0.355 , 0.3555,
              0.4006, 0.4116, 0.4136, 0.4414, 0.4922, 0.5347], dtype=float16),
       array([0.847 , 0.355 , 0.4724, 0.5625, 0.4922, 0.4731, 0.4148, 0.4492,
              0.4617, 0.5596, 0.6   , 0.6865, 0.8716], dtype=float16)        ,
       array([0.862 , 0.377 , 0.351 , 0.5176, 0.618 , 0.5635, 0.568 , 0.4531,
              0.63  , 0.544 , 0.728 , 0.7344, 0.768 , 0.8945], dtype=float16),
       ...,
       array([0.9233, 0.614 , 0.6714, 0.7173, 0.675 , 0.671 , 0.663 , 0.671 ,
              0.673 , 0.624 , 0.627 , 0.584 , 0.6074, 0.6074, 0.3823, 0.649 ,
              0.86  ], dtype=float16)                                        ,
       array([0.9277, 0.626 , 0.6704, 0.753 , 0.7246, 0.712 , 0.691 , 0.64  ,
              0.63  , 0.615 , 0.6504, 0.6313, 0.65  , 0.644 , 0.3525, 0.667 ,
              0.8735], dtype=float16)           

In [52]:
# Copy of every batch except the last one. The previous hard-coded stop index
# (15044) also dropped the second-to-last batch, because the list has 15046
# entries and the offending batch sits at index 15045.
new_all_b_probs = all_b_probs[:-1]

In [55]:
# Append the last batch as a 1-D object array with one float16 array per row.
# Wrapping the rows in `np.array([...])` would stack the equal-length rows into
# a 2-D array again, which is exactly the layout np.concatenate rejects.
# NOTE: re-running this cell appends the batch a second time.
_last_batch = np.empty(len(different_formatted_el), dtype=object)
for _row_idx, _row in enumerate(different_formatted_el):
    _last_batch[_row_idx] = np.array(_row, dtype=np.float16)
new_all_b_probs.append(_last_batch)

In [51]:
# Reproduces the failure: the batch at index 15045 is 2-D while the batch at
# index 0 is 1-D, so np.concatenate raises the recorded ValueError.
np.concatenate(all_b_probs)

ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 1 dimension(s) and the array at index 15045 has 2 dimension(s)

In [10]:
def _obj(values):
    """Build an object-dtype array from `values`."""
    return np.array(values, dtype=object)


# Two batches whose inner arrays differ in length *within* each batch, so both
# batches stay 1-D object arrays.
test = [
    _obj([_obj([0.1, 0.2]), _obj([0.3, 0.4, 0.5])]),
    _obj([_obj([0.6, 0.7, .1]), _obj([0.8, 0.9])]),
]
# No issue!
el = np.concatenate(test)

In [11]:
test_2 = [
    np.array([np.array([0.1, 0.2], dtype=object), np.array([0.3, 0.4, 0.5], dtype=object)], dtype=object),  # A batch with arrays of different sizes
    np.array([np.array([0.6, 0.7], dtype=object), np.array([0.8, 0.9], dtype=object)], dtype=object)         # A batch where all arrays are the same size
]
# Issue! The second batch collapses into a regular 2-D array because its rows
# have equal length, whereas it should be concatenable with the first one.
# The error is the point of this cell, so it is caught and displayed instead of
# aborting a full notebook run.
try:
    np.concatenate(test_2)
except ValueError as err:
    print(f'ValueError: {err}')

ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 1 dimension(s) and the array at index 1 has 2 dimension(s)

In [96]:
test = [
    np.array([np.array([0.1, 0.2], dtype=object), np.array([0.3, 0.4, 0.5], dtype=object)], dtype=object),  # A batch with arrays of different sizes
    np.array([np.array([0.6, 0.7, .1], dtype=object), np.array([0.8, 0.9], dtype=object)], dtype=object)         # Also a batch with arrays of different sizes
]
# No issue!
el = np.concatenate(test)

test_2 = [
    np.array([np.array([0.1, 0.2], dtype=object), np.array([0.3, 0.4, 0.5], dtype=object)], dtype=object),  # A batch with arrays of different sizes
    np.array([np.array([0.6, 0.7], dtype=object), np.array([0.8, 0.9], dtype=object)], dtype=object)         # A batch where all arrays are the same size
]
# Issue! The second batch collapses into a regular 2-D array because its rows
# have equal length, whereas it should be concatenable with the first one.
# The error is caught and displayed so the rest of the notebook can still run.
try:
    np.concatenate(test_2)
except ValueError as err:
    print(f'ValueError: {err}')

ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 1 dimension(s) and the array at index 1 has 2 dimension(s)

In [97]:
test_2 = [
    np.array([np.array([0.1, 0.2], dtype=object), np.array([0.3, 0.4, 0.5], dtype=object)], dtype=object),  # A batch with arrays of different sizes
    np.array([np.array([0.6, 0.7], dtype=object), np.array([0.8, 0.9], dtype=object)], dtype=object)         # A batch where all arrays are the same size
]
# The 1-D / 2-D mismatch makes the concatenation fail; show the error without
# stopping a full notebook run. `el` is only assigned if the call succeeds.
try:
    el = np.concatenate(test_2)
except ValueError as err:
    print(f'ValueError: {err}')

ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 1 dimension(s) and the array at index 1 has 2 dimension(s)

In [93]:
# Rows of different length: the outer array stays 1-D. `dtype=object` is passed
# explicitly because building a ragged array without it is deprecated and
# raises a ValueError on NumPy >= 1.24.
np.array([np.array([0.6, 0.7, .1], dtype=object), np.array([0.6, 0.7], dtype=object), np.array([0.8, 0.9], dtype=object)], dtype=object).shape

  """Entry point for launching an IPython kernel.


(3,)

In [85]:
# Rows of equal length: even with dtype=object the result is a regular 2-D
# array (recorded shape (2, 2)) — the root of the concatenation problem.
np.array([np.array([0.6, 0.7]), np.array([0.8, 0.9])], dtype=object).shape

(2, 2)

In [79]:
# NOTE(review): the recorded ValueError presumably stems from an earlier value
# of `test` whose second batch was 2-D; the definitions of `test` visible in
# this notebook contain only ragged batches.
np.concatenate(test)

ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 1 dimension(s) and the array at index 1 has 2 dimension(s)

In [74]:
# Flatten every inner array of every batch; the result is a list of lists of
# 1-D arrays (one inner list per batch).
new_test = [[np.ravel(inner) for inner in batch] for batch in test]

In [75]:
# NOTE(review): `np.array(batch)` is called on a list of arrays without
# `dtype=object`. If the arrays in a batch differ in length this is a ragged
# construction, which newer NumPy versions reject — confirm before re-running.
new_test_concat = np.concatenate([np.array(batch) for batch in new_test])


  """Entry point for launching an IPython kernel.


In [70]:
# Display the concatenated result (recorded: one flat array of nine floats).
new_test_concat

array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])

In [66]:
# Display `test`. The recorded output shows a 2-D second batch, i.e. a value of
# `test` that differs from the definitions visible in this notebook.
test

[array([array([0.1, 0.2]), array([0.3, 0.4, 0.5])], dtype=object),
 array([[0.6, 0.7],
        [0.8, 0.9]])]

In [68]:
# The entries of `new_test` are plain Python lists (built with a list
# comprehension), so `.shape` is not available — hence the AttributeError.
new_test[1].shape

AttributeError: 'list' object has no attribute 'shape'

In [63]:
# Same failing call as recorded elsewhere in this notebook: a 1-D object batch
# cannot be concatenated with a 2-D batch.
np.concatenate(test)

ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 1 dimension(s) and the array at index 1 has 2 dimension(s)

In [None]:
def _concat_ragged(batches):
    """Merge per-batch results into one 1-D object array with one entry per spectrum.

    For batches that are 1-D object arrays this matches `np.concatenate`. It
    also accepts batches that NumPy stored as a regular 2-D array because all
    of their rows had the same length, which `np.concatenate` rejects when
    mixed with 1-D batches.
    """
    rows = [row for batch in batches for row in batch]
    merged = np.empty(len(rows), dtype=object)
    # Assign one row at a time; a bulk assignment of equal-length rows would be
    # interpreted as a 2-D block and fail to broadcast.
    for row_idx, row in enumerate(rows):
        merged[row_idx] = row
    return merged


# NOTE(review): assumes every quantity holds one variable-length array per
# spectrum (as the recorded b-probability batches do) — confirm for the others.
# NOTE: the names are overwritten in place, so this cell must run only once per
# inference pass.
all_y_probs = _concat_ragged(all_y_probs)
all_y_mz_probs = _concat_ragged(all_y_mz_probs)
all_b_probs = _concat_ragged(all_b_probs)  # np.concatenate raised here when one batch was 2-D
all_b_mz_probs = _concat_ragged(all_b_mz_probs)
all_y_changes = _concat_ragged(all_y_changes)

In [None]:
# rescoring_from_mgf deconstructed

print('== Spectralis rescoring from MGF file ==')




print(f'-- Getting scores for {len(alpha_seqs)} PSMs')

## Get Spectralis-scores
rescoring_out = rescoring(alpha_seqs, precursor_z,  
                                exp_ints, exp_mzs, precursor_m, return_features=return_features)

if return_features:
    scores, features = rescoring_out
else: 
    scores = rescoring_out

## Assign lowest possible score to invalid spectra
if scans_invalid.shape[0]>0:
    scans_valid = np.concatenate([scans_valid, scans_invalid])
    scores = np.concatenate([scores, np.zeros(scans_invalid.shape[0])+np.NINF])

## Write output to csv file
df = pd.DataFrame({'Spectralis_score':scores, 'scans':scans_valid})
if out_path is not None:
    df.to_csv(out_path, index=None)
    print(f'-- Writing scores to file <{out_path}>\nfor {len(df)} PSMs')

return df if not return_features else (df, features)