# ALD Study

In [None]:
from pathlib import Path
import pandas as pd
import vaep

# Raise the pandas display limits so wide and long frames are shown more fully.
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 100)

In [None]:
# Input folder with the application data, plus the two output folders.
folder_data = Path('data/applications/')
folder_data_out = Path('data/single_datasets/')
folder_run = Path('runs/ald_study')
# Create both output folders up front: the pickles written further down go
# into `folder_data_out`, which would otherwise be missing on a fresh checkout.
folder_run.mkdir(parents=True, exist_ok=True)
folder_data_out.mkdir(parents=True, exist_ok=True)

# List the available input files, one per line, as a quick visual check.
print(*(folder_data.iterdir()), sep='\n')

# Input files used in this notebook.
f_proteome = folder_data / 'ald_proteome_spectronaut.tsv'
f_annotations = folder_data / 'ald_experiment_annotations.csv'
f_clinic = folder_data / 'ald_cli_164.csv'
f_raw_meta = folder_data / 'ald_metadata_rawfiles.csv'

In [None]:
# Read the tab-separated proteome export in one pass (no chunked dtype inference).
data = pd.read_csv(f_proteome, sep='\t', low_memory=False)
data.shape

In [None]:
# Display the loaded proteome table.
data

In [None]:
# Summary statistics (all dtypes) for the first eight columns only.
data.iloc[:, :8].describe(include='all')

In [None]:
# For every column after the first eight, keep the last two dot-separated
# tokens of its label as a tuple.
column_types = data.columns[8:].to_series().apply(
    lambda label: tuple(label.rsplit('.', 2)[-2:])
)
column_types.describe()

In [None]:
# Collapse each distinct token tuple back into a single dotted label.
column_types = ['.'.join(tokens) for tokens in column_types.unique()]
column_types

In [None]:
# Move the eight leading columns into the row index ...
data = data.set_index(data.columns[:8].tolist())
# ... and order the remaining columns by label.
data = data.sort_index(axis=1)

In [None]:
# Preview all columns belonging to the first variable type. The label was
# built by joining tokens with '.', which is a regex metacharacter, so match
# it as a literal substring instead of as a regular expression.
data.loc[:, data.columns.str.contains(column_types[0], regex=False)]

In [None]:
# Peek at the first 20 rows and the first 6 columns.
data.iloc[:20, :6]

Create a new column MultiIndex (`Sample ID`, `vars`) from the column names.

In [None]:
# Rebuild the column labels as a two-level index: split each label on
# whitespace, keep the second token, and split that token on '.raw.' into a
# tuple whose parts become the levels 'Sample ID' and 'vars'.
# NOTE(review): assumes every label looks like '<prefix> <sample>.raw.<variable>' - confirm.
data.columns = pd.MultiIndex.from_tuples(data.columns.str.split().str[1].str.split(
    '.raw.').to_series().apply(tuple), names=['Sample ID', 'vars'])
# Move the first column level ('Sample ID') into the row index.
data = data.stack(0)
data

## Meta data

- sample annotation (to select correct samples)
- meta data from Spectronaut output
- clinical data
- meta data from raw files (MS machine recorded meta data)

### From Spectronaut file

In [None]:
# Turn the row-index levels of `data` into plain columns with a fresh default index.
meta = data.index.to_frame(index=False)
meta

In [None]:
# Summary statistics of the metadata frame derived from the data index.
meta.describe()

### Sample annotations

- `Groups`: more detailed (contains sub-batch information)
- `Group2`: used to separate samples into cohorts for study
- `Sample type`: There are liver biopsy samples measured -> select only Plasma samples

In [None]:
# Load the sample annotations, indexed by their 'Sample ID' column.
annotations = pd.read_csv(f_annotations, index_col='Sample ID')
annotations

Select ALD subcohort

In [None]:
# Number of samples per `Group2` value (`Groups` would give the finer split).
annotations['Group2'].value_counts()

In [None]:
# `Group2` values to keep; alternatives: 'ALD-validation', 'HP'
groups = ['ALD']
# Filter on `groups` (not a second hard-coded list) so that editing the
# selection above actually takes effect. Only plasma samples are kept.
selected = (annotations.Group2.isin(groups)) & (annotations['Sample type'] == 'Plasma')
# Reduce the boolean mask to the index labels of the selected samples.
selected = selected.loc[selected].index
annotations.loc[selected].describe(include=['object', 'string'])

### Clinical data

In [None]:
# Load the clinical data, using the first column of the file as index.
clinic = pd.read_csv(f_clinic, index_col=0)
clinic

- `idx_overlap`: used to select the samples for which data is available across all datasets

In [None]:
# Samples that are both selected and present in the clinical data.
idx_overlap = clinic.index.intersection(selected)
# Report the selected samples that have no clinical record.
print('Missing labels: ', selected.difference(clinic.index))

In [None]:
# Clinical records restricted to the overlapping samples.
clinic.loc[idx_overlap]

### Rawfile information

In [None]:
# The raw-file metadata has a two-row header; the first column holds the row labels.
raw_meta = pd.read_csv(f_raw_meta, index_col=0, header=[0, 1])
raw_meta.index = raw_meta.index.rename("Sample ID (long)")
raw_meta

Measurements are very homogeneous

In [None]:
# Summary statistics of the raw-file metadata.
raw_meta.describe()

In [None]:
# Derive the short sample ID (plate number, underscore, well) from the long
# row labels.
idx = raw_meta.index.to_series()
# `expand=False` always yields a Series for a single capture group. Extracting
# into a one-column frame and squeezing it would collapse to a scalar when
# there is only one row, breaking the name assignment below.
idx = idx.str.extract(r'(Plate[\d]_[A-H]\d*)', expand=False)
idx.name = 'Sample ID'
idx.describe()

In [None]:
# Move the long sample label from the index into a regular column and use the
# short `Sample ID` values in `idx` as the new index.
# NOTE(review): relies on `idx` having the same row order as `raw_meta`
# (it was derived from `raw_meta.index`) - confirm if cells are re-run out of order.
raw_meta = raw_meta.reset_index().set_index(idx)
raw_meta

In [None]:
# Keep the original two-level column index around.
# (needs to go to Config which is not overwriteable by attribute selection)
df_meta_rawfiles_columns = raw_meta.columns

# Flatten the columns by dropping the first header level; this is only safe
# when the remaining labels are unique.
meta_raw_names = raw_meta.columns.droplevel(0)
assert meta_raw_names.is_unique
meta_raw_names = meta_raw_names.rename(None)
raw_meta.columns = meta_raw_names

In [None]:
# Show the raw-file metadata row(s) for sample 'Plate6_F2'.
raw_meta.loc[['Plate6_F2']]

In [None]:
# Report overlapping samples without raw-file metadata (must run before the
# reassignment below), then restrict the overlap to samples that have it.
print("Missing metadata in set of selected labels: ", idx_overlap.difference(raw_meta.index))
idx_overlap = idx_overlap.intersection(raw_meta.index)  # proteomics data has to be part of metadata

Still save all metadata which is there, but subselect data samples accordingly

In [None]:
# Persist the complete (not subselected) raw-file metadata.
raw_meta.to_pickle(folder_data_out.joinpath('raw_meta.pkl'))

## Missing samples

From the above we can note that there is
- no clinical data for `Plate6_F2`
- no metadata for `Plate2_C1`

> see section below

## Select Proteomics data

Taken from the [Spectronaut manual](https://biognosys.com/resources/spectronaut-manual/)

feature | description 
--- | ---
PEP.IsProteinGroupSpecific | True or False. Tells you whether the peptide only belongs to one Protein Group.
PEP.StrippedSequence | -
PEP.IsProteotypic |  -
PEP.PeptidePosition | -
PG.Cscore | - 
PG.ProteinAccessions | -
PG.Genes | - 
PEP.Quantity | The quantitative value for that peptide as defined in the settings.
EG.PrecursorId | Unique Id for the precursor: [modified sequence] plus [charge] 
EG.Qvalue | The q-value (FDR) of the EG.
EG.TotalQuantity (Settings) | The quantitative value for that EG as defined in the settings. 

> Headers related to Peptides (PEP) as defined in the settings. Many headers related to Peptides are self-explanatory. Here are the most relevant and some which are not too obvious. 

> Headers related to Peptides (PEP) as defined in the settings. Many headers related to Peptides are self-explanatory. Here are the most relevant and some which are not too obvious. 

After discussing with Lili, `PEP.Quantity` is the fitting entity for each unique aggregated peptide. Duplicated entries can simply be dropped.

In [None]:
# Keep one quantity per (sample, stripped peptide sequence): flatten the index,
# take the three relevant columns, drop exact duplicates and index by the
# first two of them.
sel_cols = ['Sample ID', 'PEP.StrippedSequence', 'PEP.Quantity']
sel_data = (
    data.reset_index()[sel_cols]
    .drop_duplicates()
    .set_index(sel_cols[:2])
)
sel_data

In [None]:
# Single-column frame -> Series of quantities.
sel_data = sel_data.squeeze()
# Drop missing quantities and make sure the remaining values are floats.
sel_data = sel_data.dropna().astype(float)
# Pivot the innermost index level into columns (wide format).
sel_data = sel_data.unstack()
sel_data

In [None]:
# Derive the short sample ID (plate number, underscore, well) from the row
# labels of the selected data.
idx = sel_data.index.to_series()
# `expand=False` always yields a Series for a single capture group. Extracting
# into a one-column frame and squeezing it would collapse to a scalar when
# there is only one row, breaking the name assignment below.
idx = idx.str.extract(r'(Plate[\d]_[A-H]\d*)', expand=False)
idx.name = 'Sample ID'
idx.describe()

- rawfile metadata -> keep 

In [None]:
# Index by the short sample ID and keep only the overlapping samples.
sel_data = sel_data.set_index(idx).loc[idx_overlap]
sel_data

In [None]:
# Column-wise summary statistics of the selected data; the 'count' row is
# used for the completeness plot further down.
des_data = sel_data.describe()
des_data

### Check for metadata from rawfile overlap

For one raw file no metadata could be extracted (`ERROR: Unable to access the RAW file using the native Thermo library.`)

In [None]:
# Selected samples that have no raw-file metadata, shown with their annotations.
idx_diff = sel_data.index.difference(raw_meta.index)
annotations.loc[idx_diff]

In [None]:
# Axis labels and title forwarded to the plotting helper.
kwargs = dict(
    xlabel='peptide number ordered by completeness',
    ylabel='peptide was found in # samples',
    title='peptide measurement distribution',
)

# Order the columns of `sel_data` by their 'count' statistic (descending)
# and plot the result.
ax = vaep.plotting.plot_counts(
    des_data.T.sort_values(by='count', ascending=False).reset_index(),
    feat_col_name='count',
    feature_name='Aggregated peptides',
    n_samples=len(sel_data),
    ax=None,
    **kwargs,
)

Dump selected data

In [None]:
# Persist the selected aggregated-peptide table.
sel_data.to_pickle(folder_data_out.joinpath('ald_aggPeptides_spectronaut.pkl'))