# Rawfile metadata

- generated using `workflows/metadata`
- currently: all raw files processed by MQ so far (11,000 out of 50,000 raw files)

In [None]:
from collections import namedtuple
from collections import defaultdict

import yaml
import numpy as np
import pandas as pd

import vaep.pandas

## Arguments

In [None]:
# Notebook arguments. The two commented-out entries are not used by this notebook.
# FN_PEPTIDE_INTENSITIES: str = 'data/df_intensities_N07813_M10000.csv'  # Samples metadata extracted from erda
# FN_PEPTIDE_FREQ: str = 'data/processed/count_all_peptides.json' # Peptide counts for all parsed files on erda (for data selection)
fn_rawfile_metadata: str = 'data/rawfile_metadata.csv' # Machine parsed metadata from rawfile workflow
out_folder: str = 'data'  # presumably the folder for generated outputs - TODO confirm

### Machine metadata

- read from file using [ThermoRawFileParser](https://github.com/compomics/ThermoRawFileParser)

In [None]:
# Read the machine-parsed metadata: the CSV has a two-level column header and
# its first column is used as the row index.
df_meta_rawfiles = pd.read_csv(fn_rawfile_metadata, header=[0, 1], index_col=0)
date_col = ('FileProperties', 'Content Creation Date')
df_meta_rawfiles[date_col] = pd.to_datetime(df_meta_rawfiles[date_col])
# Order the files chronologically by their creation date.
df_meta_rawfiles = df_meta_rawfiles.sort_values(date_col)
# First part of the summary message; a later cell appends to it and the last cell prints it.
msg = f"A total of {len(df_meta_rawfiles)} raw files could be read using the ThermoRawFileParser."
# The frame has to be the last expression of the cell, otherwise it is not displayed.
df_meta_rawfiles

In [None]:
# Summary statistics over all columns (numeric, datetime and object).
# `datetime_is_numeric` only exists in pandas 1.x: pandas >= 2.0 removed the
# keyword (datetimes are always summarised numerically there) and raises a
# TypeError when it is passed, so fall back to the plain call in that case.
try:
    meta_stats = df_meta_rawfiles.describe(include='all', datetime_is_numeric=True)
except TypeError:
    meta_stats = df_meta_rawfiles.describe(include='all')
meta_stats

Subset of columns with variation (more than one unique value, or a standard deviation above 0.1):

In [None]:
# Show only the columns that vary: either more than one distinct value
# (row `unique`) or a standard deviation above 0.1 (row `std`).
meta_stats.loc[
    :,
    (meta_stats.loc['unique'] > 1) | (meta_stats.loc['std'] > 0.1),
]

In [None]:
# Flatten the two-level column header by dropping its first level.
# The guard keeps the cell re-runnable: once the header is flat there is no
# level left to drop and `droplevel()` would raise.
if isinstance(df_meta_rawfiles.columns, pd.MultiIndex):
    # needs to go to Config which is not overwriteable by attribute selection
    df_meta_rawfiles_columns = df_meta_rawfiles.columns
    meta_raw_names = df_meta_rawfiles.columns.droplevel()
    assert meta_raw_names.is_unique, "Column names are not unique after dropping the first header level."
    df_meta_rawfiles.columns = meta_raw_names
df_meta_rawfiles

In [None]:
# Columns picked for a first overview of the raw file metadata.
meta_raw_selected = [
    'Content Creation Date',
    'Thermo Scientific instrument model',
    'instrument serial number',
    'Software Version',
    'Number of MS1 spectra',
    'Number of MS2 spectra',
    'Number of scans',
    'MS max charge',
    'MS max RT',
    'MS min MZ',
    'MS max MZ',
    'MS scan range',
    'mass resolution',
    'Retention time range',
    'Mz range',
    'beam-type collision-induced dissociation',
    'injection volume setting',
    'dilution factor',
]
# Ten evenly spaced percentiles from 5% to 95% (5%, 15%, ..., 95%).
df_meta_rawfiles.loc[:, meta_raw_selected].describe(
    percentiles=np.linspace(0.05, 0.95, 10)
)

- `MS min MZ`: outlier clearly shifts means
- `mass resolution` is unique (can this be?)
- `dilution factor` is unique (can this be?)

## Instrument type and settings

Check some columns describing the settings:
  - quite a lot of variation is due to `MS max charge`: is it a parameter?

In [None]:
# Short attribute-style aliases for the metadata columns that describe the
# instrument and its settings; the field order matters because later cells
# slice the tuple by position.
MetaRawSettings = namedtuple(
    'MetaRawSettings',
    [
        'ms_model',
        'ms_attr',
        'ms_sn',
        'ms_firmware',
        'max_charge',
        'mass_res',
        'cid_type',
        'inject_volume',
        'dill_factor',
    ],
)
meta_raw_settings = MetaRawSettings(
    ms_model='Thermo Scientific instrument model',
    ms_attr='instrument attribute',
    ms_sn='instrument serial number',
    ms_firmware='Software Version',
    max_charge='MS max charge',
    mass_res='mass resolution',
    cid_type='beam-type collision-induced dissociation',
    inject_volume='injection volume setting',
    dill_factor='dilution factor',
)
meta_raw_settings

In [None]:
# Distinct combinations of all setting columns, renumbered from 0.
# Without `ignore_index=True` the index would instead show the first file
# that has each combination.
(
    df_meta_rawfiles
    .loc[:, list(meta_raw_settings)]
    .drop_duplicates(ignore_index=True)
)

view without `MS max charge`:
  - software can be updated
  - variation by `injection volume setting` and instrument over time
  - missing `dilution factor`
  

In [None]:
# Same overview of distinct setting combinations, this time leaving out `MS max charge`.
# Without `ignore_index=True` the index would instead show the first file
# that has each combination.
to_drop = ['MS max charge']
(
    df_meta_rawfiles[list(meta_raw_settings)]
    .drop(columns=to_drop)
    .drop_duplicates(ignore_index=True)
)

Relatively big samples for different machines of the same kind running with the same firmware:

In [None]:
# Number of raw files per (instrument model, software version) pair;
# show the ten largest groups, in ascending order.
(
    df_meta_rawfiles
    .groupby([meta_raw_settings.ms_model, meta_raw_settings.ms_firmware])[meta_raw_settings.ms_model]
    .count()
    .sort_values()
    .tail(10)
)

Ignoring instrument software

In [None]:
# Group by the first three settings (instrument model, attribute and serial number)
# and count the raw files per instrument, smallest group first.
grouping = df_meta_rawfiles.groupby(list(meta_raw_settings[:3]))
instrument_counts = grouping[meta_raw_settings.ms_model].count().sort_values()
# The threshold in the text has to match the `>= 1000` comparison, hence "at least".
msg += (f" There are a total of {len(instrument_counts)} unique instruments in the entire dataset (based on the instrument name, attributes and serial number)"
        f", of which {(instrument_counts >= 1000).sum()} have at least 1,000 rawfiles assigned to them. Note that the entire dataset contains fractionated measurements."
       )
instrument_counts

In [None]:
# Turn the flat mapping of (model, attribute, serial number) -> files into nested
# dictionaries; presumably `transform_values=list` converts each group's index
# into a plain list - confirm against `vaep.pandas.create_dict_of_dicts`.
ms_groups = vaep.pandas.create_dict_of_dicts(
    grouping.groups,
    verbose=True,
    transform_values=list,
)

In [None]:
# Reference implementation of the nesting, kept to cross-check `ms_groups`:
# d = dict()
# for (k1, k2, k3), v in grouping.groups.items():
#     print(f"{str((k1,k2,k3)):90}: {len(v):>5}")
#     if not k1 in d:
#         d[k1] = dict()
#     if not k2 in d[k1]:
#         d[k1][k2] = dict()
#     d[k1][k2][k3] = list(v)
# assert ms_groups == d

Save selection yaml

In [None]:
# Write the nested instrument -> files mapping into the configured output folder
# instead of a hard-coded one; with the default `out_folder = 'data'` this
# resolves to 'data/files_per_instrument_nested.yaml'.
with open(f'{out_folder}/files_per_instrument_nested.yaml', 'w') as f:
    yaml.dump(ms_groups, f)

In [None]:
# Print the summary sentence that was assembled in the cells above.
print(msg)