# Analyse peptides

## Specification
- access different levels of peptides easily
- select training data per gene easily


In [None]:
import json

import pandas as pd
from config import FN_FASTA_DB, FN_ID_MAP, FN_PEPTIDE_INTENSITIES

# Mapping table stored as JSON in pandas' "split" orientation.
id_map = pd.read_json(FN_ID_MAP, orient="split")

# Replace missing gene names with the placeholder "-".
mask_no_gene = id_map["gene"].isna()
id_map.loc[mask_no_gene, "gene"] = "-"


# Per-protein FASTA data, stored as JSON.
with open(FN_FASTA_DB) as fasta_file:
    data_fasta = json.load(fasta_file)

In [None]:
# Show the id_map row(s) whose index label is this example protein ID.
id_map.loc['A0A024R1R8']

In [None]:
# Load the pickled object stored at FN_PEPTIDE_INTENSITIES.
# NOTE(review): unpickling can execute arbitrary code - only load files from a
# trusted source.
data_peptides = pd.read_pickle(FN_PEPTIDE_INTENSITIES)

In [None]:
# Column labels of data_peptides as a set, so membership tests are cheap.
set_peptides = set(data_peptides.columns)

In [None]:
import ipywidgets as w
from config import KEY_FASTA_HEADER, KEY_FASTA_SEQ, KEY_PEPTIDES, KEY_GENE_NAME, KEY_GENE_NAME_FASTA

# ANSI escape sequences used to colour printed text.
RESET = "\033[0;0m"
TGREEN = "\033[32m"  # Green Text

# First character of every entry in the KEY_GENE_NAME_FASTA column; it feeds
# both the letter dropdown and the initial gene selection.
_first_letters = id_map[KEY_GENE_NAME_FASTA].str[0]

w_first_letter = w.Dropdown(
    options=_first_letters.unique(),
    description="First letter of gene",
)

# Genes matching the letter dropdown's initial value.
_initial_genes = id_map.loc[_first_letters == w_first_letter.value, "gene"]
w_genes = w.Dropdown(options=_initial_genes.unique())

# Proteins mapped to the initially selected gene.
mask = id_map["gene"] == w_genes.value
selected = id_map.loc[mask, "protein"]


# One dropdown for the index labels (protein IDs) and one for the distinct
# values of the "protein" column.
w_proteins_ids = w.Dropdown(options=selected.index)
w_protein = w.Dropdown(options=selected.unique())


def update_gene_list(first_letter):
    """Restrict the gene dropdown to genes starting with ``first_letter``.

    Parameters
    ----------
    first_letter : str
        Letter to filter on. It is compared with the first character of each
        entry in the ``KEY_GENE_NAME_FASTA`` column of ``id_map``.

    Notes
    -----
    Updates ``w_genes.options`` in place and returns nothing.
    """
    # Use the value handed in by the caller instead of reading the widget
    # global, so the function honours its own argument.
    mask_selected_genes = id_map[KEY_GENE_NAME_FASTA].str[0] == first_letter
    w_genes.options = id_map.gene.loc[mask_selected_genes].unique()


# Re-run the update whenever the first-letter dropdown changes.
_ = w.interactive_output(update_gene_list, {"first_letter": w_first_letter})


def update_protein_list(gene):
    """Refresh both protein dropdowns for the selected ``gene``.

    Parameters
    ----------
    gene : str
        Value compared against the ``KEY_GENE_NAME_FASTA`` column of
        ``id_map``.

    Notes
    -----
    Sets ``w_proteins_ids.options`` to the index labels of the matching rows
    and ``w_protein.options`` to the distinct values of their "protein" column.
    """
    proteins_of_gene = id_map.loc[id_map[KEY_GENE_NAME_FASTA].eq(gene), "protein"]
    w_proteins_ids.options = proteins_of_gene.index
    w_protein.options = proteins_of_gene.unique()


# Re-run the update whenever another gene is selected.
_ = w.interactive_output(update_protein_list, {"gene": w_genes})


def show_sequences(prot_id):
    """Print header, sequence and peptide lists of one protein.

    Peptides that are also present in ``set_peptides`` are printed in green;
    all others are printed in the default colour.

    Parameters
    ----------
    prot_id
        Key into ``data_fasta``. If there is no entry for it (for example the
        dropdown is empty and passes ``None``), a short notice is printed
        instead of raising ``KeyError``.
    """
    _data = data_fasta.get(prot_id)
    if _data is None:
        print(f"No FASTA entry found for protein ID: {prot_id!r}")
        return

    print(f"Protein_ID on Uniport: {prot_id}")
    print(f"HEADER: {_data[KEY_FASTA_HEADER]}")
    print(f"Seq: {_data[KEY_FASTA_SEQ]}")

    # Collect the pieces and join once instead of growing a string with `+=`.
    parts = ["Peptides: \n"]
    for n_missed, peptides in enumerate(_data[KEY_PEPTIDES]):
        # The position in the peptide list is reported as the number of
        # missed K or R.
        parts.append(f"No. of missed K or R: {n_missed}\n ")
        for peptide in peptides:
            line = f"\t{peptide},\n"
            if peptide in set_peptides:
                line = TGREEN + line + RESET
            parts.append(line)

    print("".join(parts))


#     display(_data)


# Re-render the protein details whenever another protein ID is selected.
w_out = w.interactive_output(show_sequences, {"prot_id": w_proteins_ids})

# Selector row on top, rendered output underneath.
_selector_row = w.HBox([w_first_letter, w_genes, w_protein, w_proteins_ids])
panel_levels = w.VBox([_selector_row, w_out])
panel_levels

- `gene` `->` `Protein_ID` (contains information of `gene` `->` `protein_isotopes`)
- `Protein_ID` `->` `sequences` (`FN_FASTA_DB`)

In [None]:
# from pprint import pprint
# from vaep.utils import sample_iterable
# for _protein_ID in sample_iterable(data_fasta.keys(), n=1):
#     pprint({_protein_ID: data_fasta[_protein_ID]})

## Connect to experimental peptide data

In [None]:
from tqdm.notebook import tqdm

# For every protein, count per peptide list (indexed by its position, i.e. the
# number of missed cleavages) how many peptides are also in set_peptides.
counts_observed_by_missed_cleavages = {}
for _protein_id, _data in tqdm(data_fasta.items()):
    _peptides = _data[KEY_PEPTIDES]
    _counts = {
        i: sum(_pep in set_peptides for _pep in _l)
        for i, _l in enumerate(_peptides)
    }
    counts_observed_by_missed_cleavages[_protein_id] = _counts

In [None]:
# A dict of dicts puts the outer keys (protein IDs) into the columns; the
# transpose gives one row per protein and one column per inner key
# (missed-cleavage index).
df_counts_observed_by_missed_cleavages = pd.DataFrame(
    counts_observed_by_missed_cleavages
).T

In [None]:
import matplotlib.pyplot as plt
from matplotlib import table

# Two panels side by side: a wide bar chart and a narrow table of the same
# totals.
fig, axes = plt.subplots(
    ncols=2,
    figsize=(10, 4),
    gridspec_kw={"width_ratios": [5, 1], "wspace": 0.2},
)

# Column-wise totals over all proteins, labelled "frequency".
_counts_summed = df_counts_observed_by_missed_cleavages.sum().rename("frequency")

# Left panel: bar chart.
ax = axes[0]
_ = _counts_summed.plot(kind="bar", ax=ax)
_ = ax.set(xlabel="peptides from n miscleavages", ylabel="frequency")

# Right panel: the same numbers as a table, without axis decorations.
ax = axes[1]
ax.axis("off")
_ = pd.plotting.table(
    ax=ax,
    data=_counts_summed,
    loc="best",
    colWidths=[1],
    edges='open',
)
_ = fig.suptitle('Peptides frequencies')

In [None]:
# Boolean frame marking the non-zero counts.
# NOTE(review): `mask` is not referenced again in the visible part of this
# notebook - confirm whether it is still needed.
mask = df_counts_observed_by_missed_cleavages != 0
# Turn zero counts into missing values (pd.NA).
df_prot_observed = df_counts_observed_by_missed_cleavages.replace(0, pd.NA)

In [None]:
# Drop rows that consist only of missing values, set the remaining missing
# entries to 0 and let pandas infer the best-fitting dtypes.
df_prot_observed = (
    df_prot_observed
    .dropna(axis=0, how="all")
    .fillna(0)
    .convert_dtypes()
)

In [None]:
from vaep.pandas import combine_value_counts

# NOTE(review): combine_value_counts is a helper from vaep.pandas; presumably it
# tabulates the value counts of each column side by side - confirm against the
# helper's documentation.
combine_value_counts(df_prot_observed)

In [None]:
# Sum the counts of each row, then tabulate how often each row total occurs,
# ordered by the total.
freq_pep_mapped_to_protID = (
    df_prot_observed
    .sum(axis=1)
    .value_counts()
    .sort_index()
)