# Protein sequence aggregation by gene

In [None]:
from collections import defaultdict
import json
from tqdm.notebook import tqdm

import numpy as np
import pandas as pd

from Bio import Align

from config import FN_FASTA_DB
from config import fasta_entry as fasta_keys

In [None]:
# Read the JSON file at FN_FASTA_DB into memory and show how many
# top-level entries it contains.
with open(FN_FASTA_DB) as fasta_json:
    data_fasta = json.load(fasta_json)
len(data_fasta)

In [None]:
# Group the keys of `data_fasta` by the gene stored in each entry.
# Entries whose gene field is empty/falsy are collected separately.
gene_isotopes = defaultdict(list)
protein_wo_gene = []
for accession, record in tqdm(data_fasta.items()):
    gene_name = record[fasta_keys.gene]
    # Pick the target list first, then append once.
    target = gene_isotopes[gene_name] if gene_name else protein_wo_gene
    target.append(accession)

# Report how many entries lack a gene, previewing at most ten of them.
preview = ', '.join(protein_wo_gene[:10])
print(f"#{len(protein_wo_gene)} proteins have not gene associated: {preview}, ...")

In [None]:
# Example gene whose entries are inspected in the following cells.
gene = 'ACTG1' # Actin as a contaminant protein
# Show the keys grouped under this gene. `gene_isotopes` is a defaultdict(list),
# so looking up a gene that is not present yields (and stores) an empty list.
gene_isotopes[gene]

In [None]:
from pprint import pprint

# Pretty-print the full database entry of every key grouped under `gene`.
for accession in gene_isotopes[gene]:
    entry = data_fasta[accession]
    pprint(entry)

## Sequences

In [None]:
# Map each key grouped under `gene` to the sequence stored in its entry.
sequences = {
    accession: data_fasta[accession][fasta_keys.seq]
    for accession in gene_isotopes[gene]
}
sequences

In [None]:
# List the sequences ordered from shortest to longest.
sorted(sequences.values(), key=len)

In [None]:
# Convert the dict into a pandas Series (index: dict keys) so that the
# label-based `.loc` access and the `.str` accessor can be used below.
sequences = pd.Series(sequences)
# Length of each sequence.
sequences.str.len()

In [None]:
# Pairwise aligner from Biopython, created with its default settings.
aligner = Align.PairwiseAligner()

In [None]:
# Are I3L1U9 and I3L3I0 identical?
# TODO: check whether identical sequences occur more than once.
seq_first = sequences.loc['I3L1U9']
seq_second = sequences.loc['I3L3I0']
alignments = aligner.align(seq_first, seq_second)
# Print every alignment the aligner yields for this pair.
for aln in alignments:
    print(aln)

In [None]:
# Direct equality check of the two stored sequences.
seq_i3l1u9 = data_fasta['I3L1U9'][fasta_keys.seq]
seq_i3l3i0 = data_fasta['I3L3I0'][fasta_keys.seq]
seq_i3l1u9 == seq_i3l3i0

In [None]:
# Are I3L1U9 and I3L3R2 identical? Only the first alignment is printed.
seq_first = sequences.loc['I3L1U9']
seq_second = sequences.loc['I3L3R2']
alignments = aligner.align(seq_first, seq_second)
alignment = next(iter(alignments), None)
if alignment is not None:
    print(alignment)

In [None]:
# Are P63261 and K7EM38 identical? Only the first alignment is printed.
seq_first = sequences.loc['P63261']
seq_second = sequences.loc['K7EM38']
alignments = aligner.align(seq_first, seq_second)
alignment = next(iter(alignments), None)
if alignment is not None:
    print(alignment)

## Unique Peptides

In [None]:
import itertools
peptides = {}
for isotope in gene_isotopes[gene]:
    sequences[isotope] = data_fasta[isotope][fasta_keys.peptides][0]

for peptides in itertools.zip_longest(*sequences.values, fillvalue=''):
    if len(set(peptides)) == 1: 
        print(f'all identical: {peptides[0]}')
    else:
        print('\t'.join(peptides))

In [None]:
for j, peptides in enumerate(sequences.values):
    if j==0:
        set_overlap = set(peptides)
    else:
        set_overlap = set_overlap.intersection(peptides)
set_overlap