In [None]:
import numpy as np
import multiprocessing as mp
import os

from pyimzml.ImzMLParser import ImzMLParser

from src.psalign.alignment import Alignment
from src.psalign.imzml import convert_pyimzml, read_imzml, save_to_imzml

# --- Conversion settings (passed to convert_pyimzml) ---
peaks = 1000      # forwarded as `nb_of_peaks` when converting the imzML file to .npz
start_mz = 200    # lower m/z bound; also used for Alignment.limit_mz_range
end_mz = 1000     # upper m/z bound; also used for Alignment.limit_mz_range

# Use half of the reported CPUs, but never fewer than one worker:
# `cpu_count() // 2` evaluates to 0 on a single-CPU machine.
nb_cores = max(1, mp.cpu_count() // 2)

# --- Alignment settings (passed positionally to the align* methods) ---
nb_segments = 8
window = 4
factor = 1.25
outlier_detection = True

# --- Alignment constructor settings ---
instrument = 'orbitrap'
reference = None  # NOTE(review): presumably lets Alignment pick its own reference spectrum - confirm

# Download file from https://www.omicsdi.org/dataset/metabolights_dataset/MTBLS289
# `sample` is the base name of the profile-mode file; the centroid file name is
# derived from it later by replacing "profile" with "centroid".
sample = 'A52 CT S3-profile'
# Placeholder: set this to the directory that contains the downloaded files.
path = '<path_to_file>'


In [2]:
# The profile and centroid files do not list their spectra for the same pixels in
# the same order, so we build an index mapping between the two files based on the
# physical coordinates of each spectrum.
p_centroid = ImzMLParser(f'{path}/{sample.replace("profile", "centroid")}.imzML')
p_profile = ImzMLParser(f'{path}/{sample}.imzML')

p_centroid_coordinates = [p_centroid.get_physical_coordinates(i) for i in range(len(p_centroid.coordinates))]
p_profile_coordinates = [p_profile.get_physical_coordinates(i) for i in range(len(p_profile.coordinates))]

# Coordinate -> first profile index holding that coordinate. A dict lookup replaces
# the repeated linear scans over the coordinate lists (quadratic in the number of
# spectra). `setdefault` keeps the FIRST occurrence, matching a left-to-right scan.
# NOTE(review): relies on get_physical_coordinates returning hashable tuples - confirm.
profile_idx_by_coor = {}
for j, coor in enumerate(p_profile_coordinates):
    profile_idx_by_coor.setdefault(coor, j)

# centroid index -> profile index, only for coordinates present in both files.
mapping = {
    i: profile_idx_by_coor[coor]
    for i, coor in enumerate(p_centroid_coordinates)
    if coor in profile_idx_by_coor
}

# Both index lists are ordered by centroid index, so position k in one list
# refers to the same pixel as position k in the other.
idxs_p_centroid = list(mapping)
idxs_p_profile = [mapping[i] for i in idxs_p_centroid]

# Sanity check: re-read the coordinates from the parsers and count mismatches.
count = 0
for i, j in zip(idxs_p_centroid, idxs_p_profile):
    coor, coor2 = p_centroid.get_physical_coordinates(i), p_profile.get_physical_coordinates(j)
    if coor != coor2:
        print(coor, coor2)
        count += 1

# This should be 0.
print(count)

# Persist the matched indices so later cells can select the same pixels from each file.
np.save(f'{path}/idxs_profile.npy', np.array(idxs_p_profile))
np.save(f'{path}/idxs_centroid.npy', np.array(idxs_p_centroid))

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


0


In [3]:
# Convert the profile imzML file to a .npz archive once; skip the conversion
# when the archive is already present next to the source file.
if not os.path.exists(f'{path}/{sample}.npz'):
    convert_pyimzml(
        f'{path}/{sample}',
        low_mz=start_mz,
        high_mz=end_mz,
        dtype=np.float32,
        nb_of_peaks=peaks,
    )

In [4]:
# Load the converted profile data and keep only the spectra whose coordinates
# also occur in the centroid file (indices saved by the mapping cell).
# The context manager closes the .npz archive as soon as the arrays are extracted
# instead of leaving the handle open for as long as `file` stays bound.
with np.load(f'{path}/{sample}.npz') as file:
    idxs = np.load(f'{path}/idxs_profile.npy')
    data = file['data'][idxs]
    mz = file['axis']

# Keyword arguments forwarded to Alignment.get_mass_dispersion in the cells below.
width = None
distance = 10
nb_peaks = 100

In [5]:
# Baseline run: plain alignment without the optimization step.
# A copy of `data` is handed over so the array loaded above can be reused by a
# later run (presumably Alignment modifies its input in place - TODO confirm).
alignment = Alignment(np.copy(data), mz, reference, nb_cores, instrument)
alignment.limit_mz_range(start_mz, end_mz)

# Same dispersion settings for the "before" and "after" measurement.
dispersion_kwargs = dict(distance=distance, width=width, nb_of_peaks=nb_peaks)

print('Mass dispersion before alignment:')
alignment.get_mass_dispersion(**dispersion_kwargs)

alignment.align(nb_segments, window, factor, outlier_detection)

print('Mass dispersion after alignment without optimization:')
alignment.get_mass_dispersion(**dispersion_kwargs)

# Drop the aligner and the helper dict before the next run.
del alignment, dispersion_kwargs

Mass dispersion before alignment:

Mass dispersion [ppm]:
	Average:		0.85
	Median:			0.65
Cosine similarity:		0.9103

Compiling Numba functions: finished in 1.62 seconds!        


100%|██████████| 18270/18270 [00:08<00:00, 2277.04it/s]


The data was warped in 8.39 seconds.
Mass dispersion after alignment without optimization:

Mass dispersion [ppm]:
	Average:		0.87
	Median:			0.67
Cosine similarity:		0.9103



In [6]:
# Second run: alignment with the optimization step.
alignment = Alignment(data, mz, reference, nb_cores, instrument)
alignment.limit_mz_range(start_mz, end_mz)

# only_opt=True performs only the optimization: after segmenting the spectra, the
# nodes are placed where the largest peaks are. This mode is chosen because the
# initial alignment (previous cell) did not improve the mass dispersion or the
# cosine similarity.
alignment.align_optimization(
    nb_segments,
    window,
    factor,
    outlier_detection,
    delta=0.05,
    only_opt=True,
)

print('Mass dispersion after alignment with optimization:')
# Left as the last expression so the returned values are also shown as cell output.
alignment.get_mass_dispersion(distance=distance, width=width, nb_of_peaks=nb_peaks)

Compiling Numba functions: finished in 1.79 seconds!        


100%|██████████| 18270/18270 [04:50<00:00, 62.84it/s]


The data was warped in 291.09 seconds.
Mass dispersion after alignment with optimization:

Mass dispersion [ppm]:
	Average:		0.60
	Median:			0.33
Cosine similarity:		0.913



(0.6049153460213311, 0.330727510565264, 0.9129888093236909)

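The cell below shows how to combine alignment with .imzML/.ibd files: the warping functions are computed on the profile data, applied to the m/z values read from the centroid .imzML file, and the aligned spectra are written to a new .imzML file.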

In [7]:
# Reload the profile data restricted to the pixels shared with the centroid file.
# The context manager closes the .npz archive deterministically once the arrays
# have been extracted.
with np.load(f'{path}/{sample}.npz') as file:
    idxs = np.load(f'{path}/idxs_profile.npy')
    data = file['data'][idxs]
    mz = file['axis']

alignment = Alignment(data, mz, reference, nb_cores, instrument)
alignment.limit_mz_range(start_mz, end_mz)
# Drop the notebook's references to the profile arrays; they are not used again below.
del file, data, mz

# Compute the warping functions on the profile data (optimization only, see only_opt).
x, y = alignment.compute_warping_functions_optimization(nb_segments, window, factor, outlier_detection, delta=0.05, only_opt=True)

# Apply the warping functions to the data with matching coordinates between centroid and profile file.
centroid_file = f'{path}/{sample.replace("profile", "centroid")}.imzML'
data, mzs, locations = read_imzml(centroid_file)
idxs_p_centroid = np.load(f'{path}/idxs_centroid.npy')
# Select the matched spectra; the order follows idxs_centroid.npy, which pairs
# position-by-position with idxs_profile.npy used for the profile data above.
data = [data[i] for i in idxs_p_centroid]
mzs = [mzs[i] for i in idxs_p_centroid]
locations = [locations[i] for i in idxs_p_centroid]

mzs = alignment.apply_warping_functions_to_mz(mzs, x, y)

# Write the intensities with the warped m/z values to a new "aligned_centroid" file.
save_to_imzml(f'{path}/{sample.replace("profile", "aligned_centroid")}.imzML', data, mzs, locations, dtype=np.float64)

Compiling Numba functions: finished in 2.11 seconds!        


100%|██████████| 18270/18270 [04:48<00:00, 63.32it/s]


The warping functions were computed in 288.92 seconds.


100%|██████████| 18270/18270 [00:00<00:00, 70917.46it/s]


The m/z vectors were warped in 0.26 seconds.
