# ALD Study

In [None]:
from pathlib import Path
import pandas as pd
import vaep

# Raise the display limits so wide and long DataFrames are shown in full.
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 100)

In [None]:
# Input folder, output folder for the processed datasets, and run folder for figures.
folder_data = Path('data/applications/')
folder_data_out = Path('data/single_datasets/')
folder_run = Path('runs/appl_ald_data')
folder_run.mkdir(parents=True, exist_ok=True)
# The output folder is written to later (csv and pickle dumps), so make sure
# it exists as well instead of failing at the first dump.
folder_data_out.mkdir(parents=True, exist_ok=True)

# List the available input files, one per line.
print(*(folder_data.iterdir()), sep='\n')

# Individual input files used below.
f_proteinGroups = folder_data / '20190620_210717_20190620_P0000005_Lili2Klibrary_Report.csv'
f_proteome = folder_data / 'ald_proteome_spectronaut.tsv'
f_annotations = folder_data / 'ald_experiment_annotations.csv'
f_clinic = folder_data / 'ald_cli_164.csv'
f_raw_meta = folder_data / 'ald_metadata_rawfiles.csv'

## (Aggregated) Peptide Data 

In [None]:
# Load the tab-separated peptide export in a single pass and report its shape.
peptides = pd.read_csv(f_proteome, sep='\t', low_memory=False)
peptides.shape

In [None]:
# Display the peptide table as loaded.
peptides

In [None]:
# Summarise the first eight columns (used as index further below).
leading_columns = peptides.iloc[:, :8]
leading_columns.describe(include='all')

In [None]:
# Characterise every remaining column by the last two dot-separated parts of its name.
value_columns = peptides.columns[8:].to_series()
column_types = value_columns.apply(lambda name: tuple(name.split('.')[-2:]))
column_types.describe()

In [None]:
# Collapse the unique (prefix, name) tuples back into dotted strings.
column_types = ['.'.join(parts) for parts in column_types.unique()]
column_types

In [None]:
# Move the first eight columns into the index, then order the remaining columns by name.
index_columns = peptides.columns[:8].tolist()
peptides = peptides.set_index(index_columns)
peptides = peptides.sort_index(axis=1)

In [None]:
# Show all columns belonging to the first column type.
# `regex=False` matches the type name literally; by default `str.contains`
# interprets the pattern as a regular expression, so the dots in the type
# name would match any character.
peptides.loc[:, peptides.columns.str.contains(column_types[0], regex=False)]

In [None]:
# Preview the first 20 rows and the first 6 columns.
peptides.iloc[:20, :6]

Create a new `MultiIndex` (sample ID, variable) from the column names

In [None]:
# Each column name is split into (sample, variable): take the second
# whitespace-separated token and split it at the '.raw.' separator.
# Plain `str.split` is used because pandas' `.str.split` treats a
# multi-character pattern as a regular expression, in which the dots of
# '.raw.' would match any character rather than a literal dot.
sample_var_pairs = [
    tuple(column.split()[1].split('.raw.'))
    for column in peptides.columns
]
peptides.columns = pd.MultiIndex.from_tuples(sample_var_pairs, names=['Sample ID', 'vars'])
# Move the sample level from the columns into the row index (long format).
peptides = peptides.stack(0)
peptides

## Protein Group data

In [None]:
# Number of leading columns that are used as index (meta) columns further below.
N_FRIST_META = 2
# Load the protein group report in a single pass.
pg = pd.read_csv(f_proteinGroups, low_memory=False)
pg

In [None]:
# Summarise the leading meta columns of the protein group table.
pg_meta_columns = pg.iloc[:, :N_FRIST_META]
pg_meta_columns.describe(include='all')

In [None]:
# Characterise every non-meta column by the last two dot-separated parts of its name.
value_columns = pg.columns[N_FRIST_META:].to_series()
column_types = value_columns.apply(lambda name: tuple(name.split('.')[-2:]))
column_types.describe()

In [None]:
# Collapse the unique (prefix, name) tuples back into dotted strings.
column_types = ['.'.join(parts) for parts in column_types.unique()]
column_types

In [None]:
# Move the leading meta columns into the index and order the remaining columns by name.
pg = pg.set_index(list(pg.columns[:N_FRIST_META])).sort_index(axis=1)
# Show all columns of the second column type. `regex=False` matches the type
# name literally; by default the dots in it would act as regex wildcards.
pg.loc[:, pg.columns.str.contains(column_types[1], regex=False)]

In [None]:
# Each column name is split into (sample, variable): take the second
# whitespace-separated token and split it at the '.htrms.' separator.
# Plain `str.split` is used because pandas' `.str.split` treats a
# multi-character pattern as a regular expression, in which the dots of
# '.htrms.' would match any character rather than a literal dot.
sample_var_pairs = [
    tuple(column.split()[1].split('.htrms.'))
    for column in pg.columns
]
pg.columns = pd.MultiIndex.from_tuples(sample_var_pairs, names=['Sample ID', 'vars'])
# Move the sample level from the columns into the row index (long format).
pg = pg.stack(0)
pg

## Meta data

- sample annotation (to select correct samples)
- meta data from Spectronaut output
- clinical data
- meta data from raw files (MS machine recorded meta data)

### From Spectronaut file

In [None]:
# Turn the index levels of the peptide table into a plain frame with a fresh integer index.
meta = peptides.index.to_frame(index=False)
meta

In [None]:
# Summary statistics of the meta data derived from the peptide index.
meta.describe()

### Sample annotations

- `Groups`: more detailed (contains sub-batch information)
- `Group2`: used to separate samples into cohorts for study
- `Sample type`: There are liver biopsy samples measured -> select only Plasma samples

In [None]:
# Load the sample annotations and key them by their sample ID.
annotations = pd.read_csv(f_annotations)
annotations = annotations.set_index('Sample ID')
annotations

Select ALD subcohort

In [None]:
# Number of samples per cohort (`Group2`); `Groups` holds the finer-grained labels.
annotations['Group2'].value_counts()

In [None]:
# Cohorts to keep; other available labels: 'ALD-validation', 'HP'
groups = ['ALD']
# Use `groups` for the selection instead of repeating the cohort list, so
# changing the list above actually changes the selected samples.
is_selected = annotations['Group2'].isin(groups) & (annotations['Sample type'] == 'Plasma')
# Sample IDs of all plasma samples in the chosen cohorts.
selected = annotations.index[is_selected]
annotations.loc[selected].describe(include=['object', 'string'])

### Clinical data

In [None]:
# Load the clinical data; the first column is used as index.
clinic = pd.read_csv(f_clinic, index_col=0)
clinic

- `idx_overlap`:  Will be used to select samples with data across datasets available

In [None]:
# Selected samples without an entry in the clinical data.
missing_in_clinic = selected.difference(clinic.index)
print('Missing labels: ', missing_in_clinic)
# Keep only the samples that are both selected and present in the clinical data.
idx_overlap = clinic.index.intersection(selected)

In [None]:
# Clinical data restricted to the overlapping samples.
clinic.loc[idx_overlap]

### Rawfile information

In [None]:
# Load the raw file meta data: two header rows form the column levels,
# the first column is the (long) sample identifier.
raw_meta = pd.read_csv(f_raw_meta, header=[0, 1], index_col=0)
raw_meta = raw_meta.rename_axis("Sample ID (long)")
raw_meta

Measurements are very homogeneous

In [None]:
# Summary statistics of the raw file meta data.
raw_meta.describe()

In [None]:
# Extract the short sample ID (plate number and well) from the long identifier.
idx = raw_meta.index.to_series()
# `expand=False` returns a Series for the single capture group directly.
# Extracting into a one-column frame and squeezing it would collapse to a
# scalar whenever there is only a single row.
idx = idx.str.extract(r'(Plate[\d]_[A-H]\d*)', expand=False)
idx.name = 'Sample ID'
idx.describe()

In [None]:
# Replace the long identifiers in the index by the extracted short sample IDs.
raw_meta = raw_meta.set_index(idx)
raw_meta

In [None]:
# Keep a reference to the original two-level columns.
df_meta_rawfiles_columns = raw_meta.columns  # needs to go to Config which is not overwriteable by attribute selection
# Drop the first column level; the remaining names must stay unambiguous.
meta_raw_names = raw_meta.columns.droplevel()
# Explicit check instead of `assert`, which is skipped when Python runs with -O.
if not meta_raw_names.is_unique:
    raise ValueError("Column names of raw file meta data are not unique after dropping the first level.")
meta_raw_names.name = None
raw_meta.columns = meta_raw_names

In [None]:
# Look up the raw file meta data of a single sample (the list indexer keeps a DataFrame).
raw_meta.loc[['Plate6_F2']]

In [None]:
# Selected samples for which no raw file meta data is available.
missing_in_raw_meta = idx_overlap.difference(raw_meta.index)
print("Missing metadata in set of selected labels: ", missing_in_raw_meta)
# Proteomics data has to be part of the meta data, so restrict the overlap further.
idx_overlap = idx_overlap.intersection(raw_meta.index)

Still save all metadata which is there, but subselect data samples accordingly

In [None]:
# Persist the complete raw file meta data (not restricted to `idx_overlap`).
raw_meta.to_csv(folder_data_out / 'raw_meta.csv')

## Missing samples

From the above we can note that there is
- no clinical data for `Plate6_F2`
- no metadata for `Plate2_C1`: re-measured sample which looks fine, but fails with error `"Unable to access the RAW file using the native Thermo library"`

> see section below

## Select aggregated peptide level data

Taken from the [Spectronaut manual](https://biognosys.com/resources/spectronaut-manual/)

feature | description 
--- | ---
PEP.IsProteinGroupSpecific | True or False. Tells you whether the peptide only belongs to one Protein Group.
PEP.StrippedSequence | -
PEP.IsProteotypic |  -
PEP.PeptidePosition | -
PG.Cscore | - 
PG.ProteinAccessions | -
PG.Genes | - 
PEP.Quantity | The quantitative value for that peptide as defined in the settings.
EG.PrecursorId | Unique Id for the precursor: [modified sequence] plus [charge] 
EG.Qvalue | The q-value (FDR) of the EG.
EG.TotalQuantity (Settings) | The quantitative value for that EG as defined in the settings. 

> Headers related to Peptides (PEP) as defined in the settings. Many headers related to Peptides are self-explanatory. Here are the most relevant and some which are not too obvious. 

> Headers related to Peptides (PEP) as defined in the settings. Many headers related to Peptides are self-explanatory. Here are the most relevant and some which are not too obvious. 

After discussing with Lili, `PEP.Quantity` is the fitting entity for each unique aggregated peptide. Duplicated entries can simply be dropped.

In [None]:
# One quantity per (sample, stripped peptide sequence): keep the three
# relevant columns, drop repeated rows and index by the first two.
sel_cols = ['Sample ID', 'PEP.StrippedSequence', 'PEP.Quantity']
sel_data = peptides.reset_index().loc[:, sel_cols]
sel_data = sel_data.drop_duplicates()
sel_data = sel_data.set_index(sel_cols[:2])
sel_data

In [None]:
# Reduce the single-column frame to a Series, drop missing values and
# convert to float before pivoting the innermost index level into columns.
quantities = sel_data.squeeze()
quantities = quantities.dropna().astype(float)
sel_data = quantities.unstack()
sel_data

In [None]:
# Extract the short sample ID (plate number and well) from the sample labels.
idx = sel_data.index.to_series()
# `expand=False` returns a Series for the single capture group directly.
# Extracting into a one-column frame and squeezing it would collapse to a
# scalar whenever there is only a single row.
idx = idx.str.extract(r'(Plate[\d]_[A-H]\d*)', expand=False)
idx.name = 'Sample ID'
idx.describe()

- rawfile metadata -> keep 

In [None]:
# Index by the short sample ID and keep only the overlapping samples.
sel_data = sel_data.set_index(idx).loc[idx_overlap]
sel_data

In [None]:
# Per-column summary statistics; the 'count' row is used below to assess completeness.
des_data = sel_data.describe()
des_data

### Check for metadata from rawfile overlap

For one raw file no metadata could be extracted (`ERROR: Unable to access the RAW file using the native Thermo library.`)

In [None]:
# Samples in the selected data that have no raw file meta data, with their annotations.
idx_diff = sel_data.index.difference(raw_meta.index)
annotations.loc[idx_diff]

In [None]:
# Axis labels and title for the completeness plot.
kwargs = dict(
    xlabel='peptide number ordered by completeness',
    ylabel='peptide was found in # samples',
    title='peptide measurement distribution',
)

# One row per peptide, ordered from most to least frequently observed.
counts_by_completeness = des_data.T.sort_values(by='count', ascending=False).reset_index()
ax = vaep.plotting.plot_counts(
    counts_by_completeness,
    feat_col_name='count',
    feature_name='Aggregated peptides',
    n_samples=len(sel_data),
    ax=None,
    **kwargs,
)

# Save the figure into the run folder.
fig = ax.get_figure()
fig.tight_layout()
vaep.savefig(fig, name='data_aggPeptides_completness', folder=folder_run)

### Select features which are present in at least 25% of the samples

In [None]:
# Minimum share of samples in which a feature has to be observed.
PROP_FEAT_OVER_SAMPLES = .25
# Share of samples with a value, per feature.
prop = des_data.loc['count'].div(len(sel_data))
# Boolean mask of the features reaching the threshold.
selected = prop.ge(PROP_FEAT_OVER_SAMPLES)
selected.value_counts()

In [None]:
# Keep only the columns flagged in the boolean mask `selected`.
sel_data = sel_data.loc[:, selected]
sel_data

Dump selected data

In [None]:
# Persist the selected aggregated peptide data as pickle.
sel_data.to_pickle(folder_data_out / 'ald_aggPeptides_spectronaut.pkl')

## Select Protein Group data

In [None]:
# Remove the column with the number of stripped sequences used for quantification.
sel_data = pg.drop(columns='PG.NrOfStrippedSequencesUsedForQuantification')
sel_data

In [None]:
# Entries whose quantity is the literal string 'Filtered' carry no value: count and drop them.
mask = sel_data['PG.Quantity'].eq('Filtered')
n_filtered = mask.sum()
print("No. of Filtered entries: ", n_filtered)
sel_data = sel_data.loc[~mask]
sel_data

In [None]:
# Reduce the single-column frame to a Series, drop missing values and
# convert to float before pivoting the innermost index level into columns.
quantities = sel_data.squeeze()
quantities = quantities.dropna().astype(float)
sel_data = quantities.unstack()
sel_data

In [None]:
# Gene entries that occur in more than one row of the index.
gene_counts = sel_data.index.to_frame()["PG.Genes"].value_counts()
gene_non_unique = gene_counts[gene_counts > 1].index
gene_non_unique

In [None]:
# Summary statistics for the rows whose gene entry is not unique.
non_unique_rows = sel_data.loc[pd.IndexSlice[:, gene_non_unique], :]
non_unique_rows.T.describe()

In [None]:
# Transpose so that samples are the rows.
sel_data = sel_data.T

# Extract the short sample ID (plate number and well) from the sample labels.
idx = sel_data.index.to_series()
# `expand=False` returns a Series for the single capture group directly.
# Extracting into a one-column frame and squeezing it would collapse to a
# scalar whenever there is only a single row.
idx = idx.str.extract(r'(Plate[\d]_[A-H]\d*)', expand=False)
idx.name = 'Sample ID'
idx.describe()

In [None]:
# Index by the short sample ID and keep only the overlapping samples.
sel_data = sel_data.set_index(idx).loc[idx_overlap]
sel_data

In [None]:
# Per-column summary statistics; the 'count' row is used below to assess completeness.
des_data = sel_data.describe()
des_data

### Check for metadata from rawfile overlap

In [None]:
# Samples in the selected data that have no raw file meta data, with their annotations.
idx_diff = sel_data.index.difference(raw_meta.index)
annotations.loc[idx_diff]

In [None]:
# Axis labels and title for the completeness plot. The y-label names protein
# groups, matching the x-label and title of this figure.
kwargs = {'xlabel': 'protein group number ordered by completeness',
          'ylabel': 'protein group was found in # samples',
          'title': 'protein group measurement distribution'}

# One row per protein group, ordered from most to least frequently observed.
counts_by_completeness = des_data.T.sort_values(by='count', ascending=False).reset_index()
ax = vaep.plotting.plot_counts(
    counts_by_completeness,
    feat_col_name='count',
    n_samples=len(sel_data),
    ax=None,
    **kwargs,
)

# Save the figure into the run folder.
fig = ax.get_figure()
fig.tight_layout()
vaep.savefig(fig, name='data_proteinGroups_completness', folder=folder_run)

### Select features which are present in at least 25% of the samples

In [None]:
# Minimum share of samples in which a feature has to be observed.
PROP_FEAT_OVER_SAMPLES = .25
# Share of samples with a value, per feature.
prop = des_data.loc['count'].div(len(sel_data))
# Boolean mask of the features reaching the threshold.
selected = prop.ge(PROP_FEAT_OVER_SAMPLES)
selected.value_counts()

In [None]:
# Keep only the columns flagged in the boolean mask `selected`.
sel_data = sel_data.loc[:, selected]
sel_data

Check for non unique genes after dropping uncommon protein groups.

In [None]:
# Gene entries that still occur in more than one column after the selection.
gene_counts = sel_data.columns.to_frame()["PG.Genes"].value_counts()
gene_non_unique = gene_counts[gene_counts > 1].index
gene_non_unique

- less often found -> less intensity on average and on maximum

- [ ] decide if protein groups should be subselected
- alternative selection: per sample, select protein group with highest intensity per sample

In [None]:
# Summary statistics for the protein groups whose gene entry is not unique.
non_unique_groups = sel_data.T.loc[pd.IndexSlice[:, gene_non_unique], :]
non_unique_groups.T.describe()

In [None]:
# Drop the second column level (presumably `PG.Genes`, as it is the level
# sliced with `gene_non_unique` above - confirm) so columns have a single level.
sel_data = sel_data.droplevel(1, axis=1)

In [None]:
# Persist the selected protein group data as pickle.
sel_data.to_pickle(folder_data_out / 'ald_proteinGroups_spectronaut.pkl')