# Machine metadata in pandas DataFrame

In [None]:
import json
import pathlib
import pprint

import pandas as pd
pd.options.display.max_columns = 32

import vaep.utils
import vaep.pandas

Configuration of the input and output paths:

In [None]:
# Project data directory, given relative to the current working directory
DATA_PROJECT = pathlib.Path('../../project/data')

# Input: JSON file with the raw file metadata (relative path, opened below)
rawfile_metadata_in = 'rawfile_metadata.json'
# Output: CSV file written into the project data directory
rawfile_metadata_out = DATA_PROJECT / 'rawfile_metadata.csv'
rawfile_metadata_in

Read the raw file metadata (JSON) created by the snakemake workflow.

In [None]:
# JSON is UTF-8 by specification; pass the encoding explicitly so that
# decoding does not depend on the platform's default locale encoding.
with open(rawfile_metadata_in, encoding='utf-8') as f:
    data = json.load(f)

# Inspect a single entry: take the first element returned by
# vaep.utils.sample_iterable(data, 1) as key and pretty-print its metadata.
key_sampled = vaep.utils.sample_iterable(data, 1)[0]
sample = data[key_sampled]
pprint.pprint(sample)

In [None]:
# Top-level keys of the sampled metadata entry
sample.keys()

- essentially, every entry is a dictionary with four keys: `['accession', 'cvLabel', 'name', 'value']`
- for each type of entry in `['FileProperties', 'InstrumentProperties', 'MsData', 'ScanSettings', 'SampleData']`
    - pick the `name` and `value` of every single entry in its list of entries

In [None]:
# Flatten the nested metadata of every sample into one dictionary whose
# keys are (category, name) tuples and whose values are the entry values.
parsed = {}
for sample_id, meta_json in data.items():
    parsed[sample_id] = {
        (category, entry['name']): entry['value']
        for category, entries in meta_json.items()
        for entry in entries
    }
# `sample_id` still refers to the last sample processed by the loop
pprint.pprint(parsed[sample_id])

In [None]:
# The values are simple strings (not as shown by pprint)
sample_parsed = parsed[sample_id]
(
    sample_parsed['InstrumentProperties', 'instrument serial number'],
    sample_parsed['InstrumentProperties', 'Thermo Scientific instrument model'],
)

Explicitly specifying the default dtypes here once. These are set when the data is read from the csv file created in this script (`rawfile_metadata_out`).

In [None]:
# # df.dtypes.to_dict() needed parsing
# from numpy import dtype
# types = {
#     ('FileProperties', 'Pathname'): dtype('O'),
#     ('FileProperties', 'Version'): dtype('int64'),
#     ('FileProperties', 'Content Creation Date'): dtype('O'),
#     ('InstrumentProperties', 'Thermo Scientific instrument model'): dtype('O'),
#     ('InstrumentProperties', 'instrument attribute'): dtype('O'),
#     ('InstrumentProperties', 'instrument serial number'): dtype('O'),
#     ('InstrumentProperties', 'Software Version'): dtype('O'),
#     ('InstrumentProperties', 'firmware version'): dtype('O'),
#     ('MsData', 'Number of MS1 spectra'): dtype('int64'),
#     ('MsData', 'Number of MS2 spectra'): dtype('float64'),
#     ('MsData', 'MS min charge'): dtype('int64'),
#     ('MsData', 'MS max charge'): dtype('int64'),
#     ('MsData', 'MS min RT'): dtype('float64'),
#     ('MsData', 'MS max RT'): dtype('float64'),
#     ('MsData', 'MS min MZ'): dtype('float64'),
#     ('MsData', 'MS max MZ'): dtype('float64'),
#     ('ScanSettings', 'scan start time'): dtype('float64'),
#     ('ScanSettings', 'mass resolution'): dtype('float64'),
#     ('ScanSettings', 'mass unit'): dtype('O'),
#     ('ScanSettings', 'Number of scans'): dtype('int64'),
#     ('ScanSettings', 'MS scan range'): dtype('O'),
#     ('ScanSettings', 'Retention time range'): dtype('O'),
#     ('ScanSettings', 'Mz range'): dtype('O'),
#     ('ScanSettings', 'beam-type collision-induced dissociation'): dtype('O'),
#     ('SampleData', 'sample number'): dtype('O'),
#     ('SampleData', 'Type'): dtype('O'),
#     ('SampleData', 'Vial'): dtype('O'),
#     ('SampleData', 'injection volume setting'): dtype('float64'),
#     ('SampleData', 'Row'): dtype('float64'),
#     ('SampleData', 'dilution factor'): dtype('int64'), # fails with NA
#     ('SampleData', 'sample name'): dtype('O'),
#     ('SampleData', 'Comment'): dtype('O'),
# }

In [None]:
# Samples become rows; the (category, item) tuple keys become two-level columns
df_parsed = (
    pd.DataFrame.from_dict(parsed, orient='index')
    .rename_axis(index='Sample ID', columns=['category', 'item'])
)

# df_parsed = df_parsed.astype(types)

# Write and read back to check that the round trip works and to let pandas
# convert the dtypes directly. The json format cannot preserve multiindex columns.
df_parsed.to_csv(rawfile_metadata_out)
# This is how to read the data elsewhere: dtypes are set automatically and
# the two header rows are read as multiindex columns.
df = pd.read_csv(rawfile_metadata_out, header=[0, 1], index_col=0)
df.describe(include='all')

In [None]:
# Display the metadata as reloaded from the csv file
df

In [None]:
# dtypes set by pd.read_csv for each (category, item) column
df.dtypes

In [None]:
# Items to select, grouped by category. Commented-out names are the items
# left out of the selection; all 'FileProperties' items
# ('Pathname', 'Version', 'Content Creation Date') are left out as well.
items_by_category = {
    'InstrumentProperties': [
        'Thermo Scientific instrument model',
        'instrument attribute',
        'instrument serial number',
        # 'Software Version',
        # 'firmware version',
    ],
    'MsData': [
        # 'Number of MS1 spectra',
        # 'Number of MS2 spectra',
        'MS min charge',
        'MS max charge',
        'MS min RT',
        'MS max RT',
        'MS min MZ',
        'MS max MZ',
    ],
    'ScanSettings': [
        # 'scan start time',
        'mass resolution',
        # 'mass unit',
        # 'Number of scans',
        # 'MS scan range',
        'Retention time range',
        'Mz range',
        'beam-type collision-induced dissociation',
    ],
    'SampleData': [
        'sample number',
        'Type',
        'Vial',
        'injection volume setting',
        'Row',
        'dilution factor',
        # 'sample name',
        # 'Comment',
    ],
}

# Expand the mapping into (category, item) column keys, keeping the order above
columns_selected = [
    (category, item)
    for category, items in items_by_category.items()
    for item in items
]

df[columns_selected]

In [None]:
# Summary of the selected columns; include='all' also covers non-numeric columns
df[columns_selected].describe(include='all')

Test that the dtypes of the reloaded data are the same. This also documents how to load the metadata.

In [None]:
# import numpy.testing as npt

# desired = df.dtypes.to_dict()
# # read data elsewhere, set dtypes automatically
# df_new = pd.read_csv(rawfile_metadata_out, header=[0, 1], index_col=0)
# actual = df_new.dtypes.to_dict()
# npt.assert_equal(actual, desired)