# Load and examine a dataset
- sample by HW, mesurment by CO
- high-pressure torsion sample, Ni65Cu35

# Set environment: Load the packages
- for details on how to connect to the cluster and set up environments visit paullobpreis.com (pw:4DSTEM)

In [1]:
# timestr used to create simple timestamps for easier version controll
import time
timestr = time.strftime("%Y%m%d")

# py4dstem as main tool, the follwing command also prints the currently used version
import py4DSTEM
py4DSTEM.__version__

'0.14.16'

# Load and examine a dataset

In [37]:
# Load the .dm4 file from a workspace on OMNI or XNAS, set filepath with 'filepath_data = "FILEPATH"'
# even better, use dirpath'' and 'filepath_data', this makes it easier to add more paths later

dirpath = "/Users/paullobpreis/GitHub/Paullo9.github.io/data/"

filepath_data = dirpath + 'COPL_Ni65Cu35_C_ROI3_240827_aper_50_conv_1.5_spot_6_CL_47_stepsize_10_r_x_178_r_y_186_GIF_512x512_preprocessed_unfiltered_bin_4_20240828.h5'

In [38]:
# Load the datacube from the .dm4 file specified above

# py4DSTEM uses the import_file function to load non-native file formats, while the read function is used for files originally written by py4DSTEM.

datacube = py4DSTEM.import_file(
filepath_data,
)

Exception: EMD file or py4DSTEM detected - use py4DSTEM.read, not py4DSTEM.import_file!

In [17]:
# datacube directly passed reveiles the 4-dimensional array of Real Space and Diffaction Space for the sample

datacube

DataCube( A 4-dimensional array of shape (186, 178, 128, 128) called 'dm_dataset',
          with dimensions:

              Rx = [0.0,5.1,10.2,...] nm
              Ry = [0.0,5.1,10.2,...] nm
              Qx = [0.0,0.029196688532829286,0.05839337706565857,...] A^-1
              Qy = [0.0,0.029196688532829286,0.05839337706565857,...] A^-1
)

In [18]:
# The data itself can be observed by the following command:

datacube.data

array([[[[-3.19374451e+02, -9.44956299e+02, -1.77866669e+02, ...,
          -6.23862488e+02, -6.52326660e+02, -8.65313873e+01],
         [-3.75145508e+02,  2.62312927e+02,  1.34725540e+02, ...,
          -1.54536819e+02, -6.74444153e+02, -4.50006775e+02],
         [-4.00875305e+02, -6.70706665e+02, -2.69619110e+02, ...,
          -1.36987518e+02, -4.64479706e+02, -2.73969574e+02],
         ...,
         [-3.08361420e+02, -1.15247917e+01,  2.37341156e+02, ...,
           5.42015381e+01, -8.23706665e+02, -3.07181885e+02],
         [-8.05369415e+01,  8.27563477e+00, -7.19529724e+00, ...,
           3.49320953e+02, -5.31080704e+01, -6.64651428e+02],
         [ 8.28704071e+01, -1.89570724e+02, -1.51927139e+02, ...,
          -8.89043732e+01, -3.42993286e+02, -2.61407227e+02]],

        [[-7.27098389e+01, -3.83761292e+02,  1.74478149e+02, ...,
          -1.88317245e+02, -2.88742859e+02, -5.10008362e+02],
         [-1.94731689e+02, -6.48190063e+02, -3.72453583e+02, ...,
          -4.45353302e

# Bin-on-load
Binnig is an option for large datasets that might not fit the computers RAM, hopefully we can skip this part as we are using the OMNI cluster.

In [21]:
# this would turn a 128x128 dataset into a 64x64 dataset, hence 8 times smaller

datacube_binned = py4DSTEM.read(
   filepath_data,
   binfactor = 2
)

datacube_binned

DataCube( A 4-dimensional array of shape (186, 178, 128, 128) called 'dm_dataset',
          with dimensions:

              Rx = [0.0,5.1,10.2,...] nm
              Ry = [0.0,5.1,10.2,...] nm
              Qx = [0.0,0.029196688532829286,0.05839337706565857,...] A^-1
              Qy = [0.0,0.029196688532829286,0.05839337706565857,...] A^-1
)

# Calibrate pixel size and unit

In [22]:
# several properties of our datacube can be shown by:
print(datacube.data.shape)
print(datacube.shape)
print(datacube.Rshape)
print(datacube.Qshape)

(186, 178, 128, 128)
(186, 178, 128, 128)
(186, 178)
(128, 128)


In [23]:
# Vectors calibrating each dimension of the dataset are included within the datacube, utilizing any available calibrations retrieved from the file

# dimension vectors -
print('The first dimension:')
print(f'  - dimension name: {datacube.dim_names[0]}')
print(f'  - dimension units: {datacube.dim_units[0]}')
print(f'  - dim vector: {datacube.dims[0][:10]}') # note the `[:10]` - we're only displaying the first 10 entries
print()
print('The third dimension:')
print(f'  - dimension name: {datacube.dim_names[2]}')
print(f'  - dimension units: {datacube.dim_units[2]}')
print(f'  - dim vector: {datacube.dims[2][:10]}')

print()

# pixel sizes -
qpix = datacube.calibration.get_Q_pixel_size()
qpixunit = datacube.calibration.get_Q_pixel_units()
rpix = datacube.calibration.get_R_pixel_size()
rpixunit = datacube.calibration.get_R_pixel_units()
print()
print(f"The diffraction space pixels are each {qpix:.4f} {qpixunit}")
print(f"The real space pixels are each {rpix:.4f} {rpixunit}")

The first dimension:
  - dimension name: Rx
  - dimension units: nm
  - dim vector: [ 0.   5.1 10.2 15.3 20.4 25.5 30.6 35.7 40.8 45.9]

The third dimension:
  - dimension name: Qx
  - dimension units: A^-1
  - dim vector: [0.         0.02919669 0.05839338 0.08759007 0.11678675 0.14598344
 0.17518013 0.20437682 0.23357351 0.2627702 ]


The diffraction space pixels are each 0.0292 A^-1
The real space pixels are each 5.1000 nm


In [24]:
# complete list of calibrations is located here; the above vectors are derived from these values

datacube.calibration

Calibration( A Metadata instance called 'calibration', containing the following fields:

             Q_pixel_size:     0.029196688532829286
             R_pixel_size:     5.1
             Q_pixel_units:    A^-1
             R_pixel_units:    nm
             QR_flip:          False
             _root_treepath:   
)

In [25]:
# Currently, the real space pixel size is shown as 1 pixel, indicating that this information was either unavailable or not extracted from the .dm4 file.
# Assuming we know the real space pixel size between beam positions is 5 nanometers, we can update the value using:

datacube.calibration.set_R_pixel_size(5.1)
datacube.calibration.set_R_pixel_units('nm')

# and print the newly calibrated values with:
datacube.calibration

# the values will be automatically updated in the datacube

Calibration( A Metadata instance called 'calibration', containing the following fields:

             Q_pixel_size:     0.029196688532829286
             R_pixel_size:     5.1
             Q_pixel_units:    A^-1
             R_pixel_units:    nm
             QR_flip:          False
             _root_treepath:   
)

# Filterung and removing hot pixels, probably not needed

In [27]:
# The function works by finding pixels in the mean diffraction image that are thresh times brighter than any other pixel in their local neighborhood.
# It then replaces these hot pixels in each diffraction pattern with the local median intensity.

datacube.filter_hot_pixels(
    thresh = 8
)


Cleaning pixels: 100%|██████████| 33108/33108 [00:42<00:00, 786.66 images/s]


DataCube( A 4-dimensional array of shape (186, 178, 128, 128) called 'dm_dataset',
          with dimensions:

              Rx = [0.0,5.1,10.2,...] nm
              Ry = [0.0,5.1,10.2,...] nm
              Qx = [0.0,0.029196688532829286,0.05839337706565857,...] A^-1
              Qy = [0.0,0.029196688532829286,0.05839337706565857,...] A^-1
)

# Binning in diffraction space
Loading unbinned data and bin it later is prefered, here is a function to do so, the binning factor is given in parentheses

In [28]:
datacube.bin_Q(4)

DataCube( A 4-dimensional array of shape (186, 178, 32, 32) called 'dm_dataset',
          with dimensions:

              Rx = [0.0,5.1,10.2,...] nm
              Ry = [0.0,5.1,10.2,...] nm
              Qx = [0.0,0.11678675413131714,0.2335735082626343,...] A^-1
              Qy = [0.0,0.11678675413131714,0.2335735082626343,...] A^-1
)

# Save the modified data
- Now we set a new filepath consisting of the old path with additions indicating our processing
- from the name alone should be clear if we preprosessed/filtered/binned
- additionalls I implemented timestr to include the date

In [29]:
# this data is saved as a .h5 file format 

from os.path import splitext
filepath_save = splitext(filepath_data)[0] + '_preprocessed_unfiltered_bin_4_' + timestr + '.h5'


# print the new filepath 

print(filepath_save)

/Users/paullobpreis/GitHub/Paullo9.github.io/data/COPL_Ni65Cu35_C_ROI3_240827_aper_50_conv_1.5_spot_6_CL_47_stepsize_10_r_x_178_r_y_186_GIF_512x512_preprocessed_unfiltered_bin_4_20240828_preprocessed_unfiltered_bin_4_20240925.h5


In [30]:
# Save

py4DSTEM.save(
    filepath_save,
    datacube,
    mode = 'o'    # 'overwrite' mode
)

# Inspect the resulting HDF5 file
- first we want to see where the data lives without opening it
- secondly we check which default name was assigned

In [31]:
# 'dm_dataset' and 'dm_dataset_root' are placeholders we could re-assigning a different name later

py4DSTEM.print_h5_tree(filepath_save)

datacube.name

/
|---dm_dataset_root
    |---dm_dataset




'dm_dataset'

# Note
- When deciding whether or not to save a datacube to storage, it's important to be mindful of the necessity. Since datacubes are typically large and accessible from the original microscope file, avoid creating new files unless there is a compelling reason.