<a href="https://colab.research.google.com/github/Palaeoprot/pFind/blob/main/pFind_codes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# pFind3 Routines
[Github](https://github.com/pFindStudio/pFind3)


Open-pFind enables precise, comprehensive and rapid peptide identification in shotgun proteomics. Hao Chi, Chao Liu, Hao Yang, Wen-Feng Zeng, Long Wu, Wen-Jing Zhou, Xiu-Nan Niu, Yue-He Ding, Yao Zhang, Rui-Min Wang, Zhao-Wei Wang, Zhen-Lin Chen, Rui-Xiang Sun, Tao Liu, Guang-Ming Tan, Meng-Qiu Dong, Ping Xu, Pei-Heng Zhang, Si-Min He. BioRxiv, Mar. 20, 2018.


## Analytical Scripts


These scripts are used to compare two or more identification results from pFind 3.

Modified from
[https://github.com/daheitu/scripts_for_pFind3_protocol.io](https://github.com/daheitu/scripts_for_pFind3_protocol.io)


Create a new folder in a desired location and give it a name.

To compare across samples either the identified proteins or PTMs, respectively, copy either
###“pFind_protein_contrast_script.py”
or
###“pFind_PTM_contrast_script.py”

and the different pFind.protein files to be compared into this new folder. Note that each pFind.protein file should be renamed before it comes to this folder.

The file name should be a ready reminder of the sample and the purpose of the search.

Open “pFind_protein_contrast_script.py” or “pFind_PTM_contrast_script.py” using ‘Notepad++’ or another editor, specify the path of the new folder and save.

You may change the name of the output file if you dislike the default one.


Header annotations for pFind_protein_contrast_result.txt and pFind_PTM_contrast_result.txt can be found in Table S9 and Table S10 of the SI, respectively.

# Global Variables

## Import Packages

In [None]:
# Data manipulation:
import pandas as pd
from itertools import islice
#import sketch
from collections import Counter

# Debugging:
import traceback

# File I/O and path handling:
import os
import copy, os
import re
import requests
import json
from typing import Dict, Any


# Numerical analysis and statistics:
import numpy as np
from scipy import stats
from statistics import mode, multimode  # Consider removing if unused

# Data visualization:
import matplotlib.pyplot as plt
from matplotlib.colors import to_rgba

# Third-party modules (Commented out modules can be imported as needed):
# from Bio import SeqIO  # Only import if used
# from icecream import ic  # Import on demand if needed

# Google Colab specific for mounting Google Drive:
# The mount point '/content/drive' is the prefix that BASE_PATH (set in a
# later cell) is built on, so this cell must run before any file access.
from google.colab import drive
drive.mount('/content/drive')

## Text Path and Amino Acid Colours

In [None]:
# Set the base path as per your Google Drive structure.
# BASE_PATH lives under the '/content/drive' mount point, so Google Drive must
# be mounted before any file below this path is read or written.
STUDY_NAME = 'Dinosaur'
PFIND_FOLDER = f'pFind_{STUDY_NAME}'
BASE_PATH = f'/content/drive/MyDrive/Colab_Notebooks/NovorCloud/{STUDY_NAME}/{PFIND_FOLDER}/'



# Amino acid colours, keyed by one-letter code, as "#rrggbb" hex strings.
# "B" previously lacked its leading "#", which made it the only hex entry that
# is not a valid hex colour specification.
# NOTE(review): "Z" maps to the literal string "average", which is not a colour
# value; presumably a placeholder for a computed colour - confirm before
# passing this entry to any plotting call.
amino_acids_colors = {
    "I": "#009688", "V": "#8bc34a", "B": "#009688", "L": "#009688",
    "F": "#507351", "C": "#ffeb3b", "M": "#ffeb3b", "A": "#bdd54e",
    "G": "#9e9e9e", "T": "#ffc75e", "W": "#f49272", "S": "#ffc107",
    "Y": "#30802f", "P": "#607d8b", "H": "#673ab7", "Z": "average",
    "Q": "#f44336", "E": "#f44336", "N": "#e81e63", "D": "#f44336",
    "X": "#9d9e9e", "K": "#701637", "R": "#bd3e04"
}

#GNC Matting (next - hidden due to size)

# Ignore all of this - this was stuff I was messing around with but do not want to delete

In [None]:
input_data = """
IPR000225 (94%);IPR003613 (93%);IPR011989 (93%)
IPR000515 (9%);IPR035906 (9%);IPR027417 (7%)
IPR000742 (50%);IPR000885 (50%);IPR002369 (50%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001007 (100%);IPR008160 (100%)
IPR000885 (100%);IPR001791 (100%);IPR008160 (100%)
IPR000885 (100%);IPR008160 (100%)
IPR000885 (100%);IPR008160 (100%)
IPR000885 (100%);IPR008160 (100%)
IPR000885 (100%);IPR008160 (100%)
IPR000885 (100%);IPR008160 (100%)
IPR000885 (100%);IPR008160 (100%)
IPR000885 (100%);IPR008160 (100%)
IPR000885 (100%);IPR008160 (100%)
IPR000885 (100%);IPR008160 (100%)
IPR000885 (100%);IPR008160 (100%)
IPR000885 (100%);IPR008160 (100%)
IPR000885 (100%);IPR008160 (100%)
IPR000885 (100%);IPR008160 (100%)
IPR000885 (100%);IPR008160 (100%)
IPR000885 (100%);IPR008160 (100%)
IPR000885 (100%);IPR008160 (100%)
IPR000885 (100%);IPR008160 (100%)
IPR000885 (100%);IPR008160 (100%)
IPR000885 (100%);IPR008160 (100%)
IPR000885 (100%);IPR008160 (100%)
IPR000885 (100%);IPR008160 (100%)
IPR000885 (100%);IPR008160 (100%)
IPR000885 (100%);IPR008160 (100%)
IPR000885 (100%);IPR008160 (100%)
IPR000885 (100%);IPR008160 (100%)
IPR000885 (100%);IPR008160 (100%)
IPR000885 (100%);IPR008160 (100%)
IPR000885 (100%);IPR008160 (100%)
IPR000885 (100%);IPR008160 (100%)
IPR000885 (100%);IPR008160 (100%)
IPR000885 (100%);IPR008160 (100%)
IPR000885 (100%);IPR008160 (100%)
IPR000885 (100%);IPR008160 (100%)
IPR000885 (100%);IPR008160 (100%)
IPR000885 (100%);IPR008160 (100%)
IPR000885 (100%);IPR008160 (100%)
IPR000885 (100%);IPR008160 (100%)
IPR000885 (100%);IPR008160 (100%);IPR001007 (23%)
IPR000885 (100%);IPR008160 (100%);IPR001007 (25%)
IPR000885 (100%);IPR008160 (100%);IPR001007 (33%)
IPR000885 (100%);IPR008160 (100%);IPR001007 (50%)
IPR000885 (100%);IPR008160 (100%);IPR001007 (50%)
IPR000885 (100%);IPR008160 (100%);IPR001007 (52%)
IPR000885 (100%);IPR008160 (100%);IPR001007 (57%)
IPR000885 (100%);IPR008160 (100%);IPR001007 (58%)
IPR000885 (100%);IPR008160 (100%);IPR001007 (58%)
IPR000885 (100%);IPR008160 (100%);IPR001007 (58%)
IPR000885 (100%);IPR008160 (100%);IPR001007 (62%)
IPR000885 (100%);IPR008160 (100%);IPR001007 (67%)
IPR000885 (100%);IPR008160 (100%);IPR001007 (73%)
IPR000885 (100%);IPR008160 (100%);IPR001007 (73%)
IPR000885 (100%);IPR008160 (100%);IPR001007 (75%)
IPR000885 (100%);IPR008160 (100%);IPR001007 (75%)
IPR000885 (100%);IPR008160 (100%);IPR001007 (75%)
IPR000885 (100%);IPR008160 (100%);IPR001007 (75%)
IPR000885 (100%);IPR008160 (100%);IPR001007 (80%)
IPR000885 (100%);IPR008160 (100%);IPR001007 (80%)
IPR000885 (100%);IPR008160 (100%);IPR001007 (80%)
IPR000885 (100%);IPR008160 (100%);IPR001007 (81%)
IPR000885 (100%);IPR008160 (100%);IPR001007 (83%)
IPR000885 (100%);IPR008160 (100%);IPR001007 (83%)
IPR000885 (100%);IPR008160 (100%);IPR001007 (83%)
IPR000885 (100%);IPR008160 (100%);IPR001007 (83%)
IPR000885 (100%);IPR008160 (100%);IPR001007 (86%)
IPR000885 (100%);IPR008160 (100%);IPR001007 (87%)
IPR000885 (100%);IPR008160 (100%);IPR001007 (88%)
IPR000885 (100%);IPR008160 (100%);IPR001007 (88%)
IPR000885 (100%);IPR008160 (100%);IPR001007 (89%)
IPR000885 (100%);IPR008160 (100%);IPR001007 (90%)
IPR000885 (100%);IPR008160 (100%);IPR001007 (91%)
IPR000885 (100%);IPR008160 (100%);IPR001007 (91%)
IPR000885 (100%);IPR008160 (100%);IPR001007 (91%)
IPR000885 (100%);IPR008160 (100%);IPR001007 (92%)
IPR000885 (100%);IPR008160 (100%);IPR001007 (92%)
IPR000885 (100%);IPR008160 (100%);IPR001007 (93%)
IPR000885 (100%);IPR008160 (100%);IPR001007 (93%)
IPR000885 (100%);IPR008160 (100%);IPR001007 (93%)
IPR000885 (100%);IPR008160 (100%);IPR001007 (93%)
IPR000885 (100%);IPR008160 (100%);IPR001007 (98%)
IPR000885 (100%);IPR008160 (100%);IPR001791 (80%)
IPR000885 (100%);IPR008160 (100%);IPR012419 (11%)
IPR000885 (100%);IPR008160 (100%);IPR013320 (100%)
IPR000885 (77%);IPR008160 (77%);IPR003848 (19%)
IPR000885 (95%);IPR001007 (95%);IPR008160 (95%)
IPR001007 (100%);IPR008160 (100%);IPR000885 (67%)
IPR001073 (100%);IPR008160 (100%);IPR008983 (100%)
IPR001098 (33%);IPR002298 (33%);IPR002421 (33%)
IPR001916 (81%);IPR023346 (81%);IPR000974 (80%)
IPR002035 (100%);IPR002223 (100%);IPR003961 (100%)
IPR002035 (100%);IPR003961 (100%);IPR013320 (100%)
IPR002035 (100%);IPR003961 (100%);IPR013783 (100%)
IPR002035 (100%);IPR008160 (100%);IPR013320 (100%)
IPR002035 (100%);IPR008160 (100%);IPR036465 (100%)
IPR002035 (100%);IPR008160 (100%);IPR036465 (100%)
IPR002035 (100%);IPR008160 (100%);IPR036465 (100%)
IPR002035 (100%);IPR008160 (100%);IPR036465 (100%)
IPR002035 (100%);IPR008160 (100%);IPR036465 (100%)
IPR002035 (100%);IPR008160 (100%);IPR036465 (100%)
IPR002035 (100%);IPR036465 (100%);IPR041900 (99%)
IPR003054 (100%);IPR018039 (100%);IPR032444 (100%)
IPR008160 (100%)
IPR008160 (100%)
IPR008160 (100%)
IPR008160 (100%)
IPR008160 (100%)
IPR008160 (100%)
IPR008160 (100%)
IPR008160 (100%);IPR000885 (100%);IPR001007 (58%)
IPR008160 (100%);IPR000885 (14%);IPR001007 (12%)
IPR008160 (100%);IPR000885 (35%);IPR013320 (19%)
IPR008160 (100%);IPR000885 (50%)
IPR008160 (100%);IPR000885 (50%)
IPR008160 (100%);IPR000885 (50%);IPR001007 (50%)
IPR008160 (100%);IPR000885 (50%);IPR001007 (50%)
IPR008160 (100%);IPR000885 (55%);IPR001007 (52%)
IPR008160 (100%);IPR000885 (62%);IPR001007 (45%)
IPR008160 (100%);IPR000885 (67%);IPR001007 (67%)
IPR008160 (100%);IPR000885 (67%);IPR001007 (67%)
IPR008160 (100%);IPR000885 (67%);IPR001007 (67%)
IPR008160 (100%);IPR000885 (67%);IPR001007 (67%)
IPR008160 (100%);IPR000885 (68%);IPR001007 (52%)
IPR008160 (100%);IPR000885 (69%);IPR001007 (69%)
IPR008160 (100%);IPR000885 (71%);IPR001007 (71%)
IPR008160 (100%);IPR000885 (75%);IPR001007 (75%)
IPR008160 (100%);IPR000885 (76%);IPR001007 (73%)
IPR008160 (100%);IPR000885 (76%);IPR001007 (73%)
IPR008160 (100%);IPR000885 (80%);IPR001007 (60%)
IPR008160 (100%);IPR000885 (80%);IPR001007 (64%)
IPR008160 (100%);IPR000885 (81%);IPR001007 (79%)
IPR008160 (100%);IPR000885 (83%)
IPR008160 (100%);IPR000885 (83%);IPR001007 (67%)
IPR008160 (100%);IPR000885 (84%);IPR001007 (83%)
IPR008160 (100%);IPR000885 (85%);IPR001007 (81%)
IPR008160 (100%);IPR000885 (86%);IPR001007 (47%)
IPR008160 (100%);IPR000885 (86%);IPR001007 (83%)
IPR008160 (100%);IPR000885 (86%);IPR001007 (84%)
IPR008160 (100%);IPR000885 (86%);IPR001007 (86%)
IPR008160 (100%);IPR000885 (87%);IPR001007 (85%)
IPR008160 (100%);IPR000885 (88%)
IPR008160 (100%);IPR000885 (88%);IPR001007 (63%)
IPR008160 (100%);IPR000885 (88%);IPR001007 (83%)
IPR008160 (100%);IPR000885 (88%);IPR001007 (85%)
IPR008160 (100%);IPR000885 (88%);IPR001007 (85%)
IPR008160 (100%);IPR000885 (89%)
IPR008160 (100%);IPR000885 (89%);IPR001007 (77%)
IPR008160 (100%);IPR000885 (89%);IPR001007 (83%)
IPR008160 (100%);IPR000885 (89%);IPR001007 (83%)
IPR008160 (100%);IPR000885 (89%);IPR001007 (84%)
IPR008160 (100%);IPR000885 (90%);IPR001007 (44%)
IPR008160 (100%);IPR000885 (90%);IPR001007 (84%)
IPR008160 (100%);IPR000885 (90%);IPR001007 (86%)
IPR008160 (100%);IPR000885 (90%);IPR001007 (90%)
IPR008160 (100%);IPR000885 (90%);IPR001791 (80%)
IPR008160 (100%);IPR000885 (91%);IPR001007 (79%)
IPR008160 (100%);IPR000885 (91%);IPR001007 (82%)
IPR008160 (100%);IPR000885 (91%);IPR001007 (83%)
IPR008160 (100%);IPR000885 (91%);IPR001007 (83%)
IPR008160 (100%);IPR000885 (91%);IPR001007 (86%)
IPR008160 (100%);IPR000885 (91%);IPR001007 (88%)
IPR008160 (100%);IPR000885 (92%)
IPR008160 (100%);IPR000885 (92%)
IPR008160 (100%);IPR000885 (92%);IPR001007 (24%)
IPR008160 (100%);IPR000885 (92%);IPR001007 (35%)
IPR008160 (100%);IPR000885 (92%);IPR001007 (82%)
IPR008160 (100%);IPR000885 (92%);IPR001007 (92%)
IPR008160 (100%);IPR000885 (92%);IPR001007 (92%)
IPR008160 (100%);IPR000885 (92%);IPR012419 (1%)
IPR008160 (100%);IPR000885 (92%);IPR012419 (1%)
IPR008160 (100%);IPR000885 (93%)
IPR008160 (100%);IPR000885 (93%)
IPR008160 (100%);IPR000885 (93%);IPR001007 (36%)
IPR008160 (100%);IPR000885 (93%);IPR001007 (89%)
IPR008160 (100%);IPR000885 (93%);IPR001007 (93%)
IPR008160 (100%);IPR000885 (94%);IPR001007 (29%)
IPR008160 (100%);IPR000885 (94%);IPR001007 (48%)
IPR008160 (100%);IPR000885 (94%);IPR001007 (49%)
IPR008160 (100%);IPR000885 (94%);IPR001007 (62%)
IPR008160 (100%);IPR000885 (94%);IPR001007 (79%)
IPR008160 (100%);IPR000885 (94%);IPR001007 (89%)
IPR008160 (100%);IPR000885 (94%);IPR001007 (90%)
IPR008160 (100%);IPR000885 (94%);IPR012419 (0%)
IPR008160 (100%);IPR000885 (94%);IPR012419 (0%)
IPR008160 (100%);IPR000885 (94%);IPR012419 (0%)
IPR008160 (100%);IPR000885 (94%);IPR012419 (2%)
IPR008160 (100%);IPR000885 (94%);IPR013320 (87%)
IPR008160 (100%);IPR000885 (94%);IPR013320 (88%)
IPR008160 (100%);IPR000885 (95%)
IPR008160 (100%);IPR000885 (95%)
IPR008160 (100%);IPR000885 (95%)
IPR008160 (100%);IPR000885 (95%)
IPR008160 (100%);IPR000885 (95%)
IPR008160 (100%);IPR000885 (95%)
IPR008160 (100%);IPR000885 (95%);IPR001007 (82%)
IPR008160 (100%);IPR000885 (95%);IPR001007 (88%)
IPR008160 (100%);IPR000885 (95%);IPR013320 (91%)
IPR008160 (100%);IPR000885 (96%)
IPR008160 (100%);IPR000885 (96%)
IPR008160 (100%);IPR000885 (96%)
IPR008160 (100%);IPR000885 (96%);IPR001007 (22%)
IPR008160 (100%);IPR000885 (96%);IPR001007 (52%)
IPR008160 (100%);IPR000885 (96%);IPR001007 (52%)
IPR008160 (100%);IPR000885 (96%);IPR001007 (68%)
IPR008160 (100%);IPR000885 (96%);IPR001007 (74%)
IPR008160 (100%);IPR000885 (96%);IPR001007 (77%)
IPR008160 (100%);IPR000885 (96%);IPR001007 (78%)
IPR008160 (100%);IPR000885 (96%);IPR001007 (89%)
IPR008160 (100%);IPR000885 (96%);IPR001007 (90%)
IPR008160 (100%);IPR000885 (96%);IPR001007 (91%)
IPR008160 (100%);IPR000885 (96%);IPR001007 (95%)
IPR008160 (100%);IPR000885 (96%);IPR012419 (3%)
IPR008160 (100%);IPR000885 (97%)
IPR008160 (100%);IPR000885 (97%)
IPR008160 (100%);IPR000885 (97%);IPR001007 (36%)
IPR008160 (100%);IPR000885 (97%);IPR001007 (87%)
IPR008160 (100%);IPR000885 (97%);IPR001007 (91%)
IPR008160 (100%);IPR000885 (97%);IPR001007 (91%)
IPR008160 (100%);IPR000885 (97%);IPR001007 (93%)
IPR008160 (100%);IPR000885 (97%);IPR001007 (96%)
IPR008160 (100%);IPR000885 (97%);IPR001007 (97%)
IPR008160 (100%);IPR000885 (97%);IPR048287 (93%)
IPR008160 (100%);IPR000885 (98%)
IPR008160 (100%);IPR000885 (98%)
IPR008160 (100%);IPR000885 (98%)
IPR008160 (100%);IPR000885 (98%);IPR001007 (17%)
IPR008160 (100%);IPR000885 (98%);IPR001007 (45%)
IPR008160 (100%);IPR000885 (98%);IPR001007 (61%)
IPR008160 (100%);IPR000885 (98%);IPR001007 (63%)
IPR008160 (100%);IPR000885 (98%);IPR001007 (66%)
IPR008160 (100%);IPR000885 (98%);IPR001007 (73%)
IPR008160 (100%);IPR000885 (98%);IPR001007 (75%)
IPR008160 (100%);IPR000885 (98%);IPR001007 (78%)
IPR008160 (100%);IPR000885 (98%);IPR001007 (91%)
IPR008160 (100%);IPR000885 (98%);IPR001007 (93%)
IPR008160 (100%);IPR000885 (98%);IPR001007 (93%)
IPR008160 (100%);IPR000885 (98%);IPR001007 (93%)
IPR008160 (100%);IPR000885 (98%);IPR001007 (94%)
IPR008160 (100%);IPR000885 (98%);IPR001007 (97%)
IPR008160 (100%);IPR000885 (98%);IPR012419 (1%)
IPR008160 (100%);IPR000885 (98%);IPR013320 (89%)
IPR008160 (100%);IPR000885 (98%);IPR013320 (90%)
IPR008160 (100%);IPR000885 (99%)
IPR008160 (100%);IPR000885 (99%)
IPR008160 (100%);IPR000885 (99%)
IPR008160 (100%);IPR000885 (99%);IPR001007 (56%)
IPR008160 (100%);IPR000885 (99%);IPR001007 (58%)
IPR008160 (100%);IPR000885 (99%);IPR001007 (59%)
IPR008160 (100%);IPR000885 (99%);IPR001007 (60%)
IPR008160 (100%);IPR000885 (99%);IPR001007 (79%)
IPR008160 (100%);IPR000885 (99%);IPR001007 (81%)
IPR008160 (100%);IPR000885 (99%);IPR001007 (81%)
IPR008160 (100%);IPR000885 (99%);IPR001007 (90%)
IPR008160 (100%);IPR000885 (99%);IPR001007 (91%)
IPR008160 (100%);IPR000885 (99%);IPR001007 (92%)
IPR008160 (100%);IPR000885 (99%);IPR012419 (2%)
IPR008160 (100%);IPR000885 (99%);IPR013320 (91%)
IPR008160 (100%);IPR001007 (66%);IPR000885 (63%)
IPR008160 (100%);IPR001007 (92%);IPR000885 (90%)
IPR008160 (100%);IPR001007 (98%);IPR000885 (95%)
IPR008160 (100%);IPR002035 (60%);IPR036465 (60%)
IPR008160 (100%);IPR002035 (85%);IPR003961 (85%)
IPR008160 (100%);IPR002035 (94%);IPR002223 (94%)
IPR008160 (100%);IPR013320 (95%);IPR048287 (94%)
IPR008160 (100%);IPR013320 (96%);IPR048287 (96%)
IPR008160 (100%);IPR013320 (96%);IPR048287 (96%)
IPR008160 (100%);IPR013320 (96%);IPR048287 (96%)
IPR008160 (100%);IPR016187 (80%);IPR001304 (40%)
IPR008160 (17%);IPR029044 (12%);IPR003329 (11%)
IPR008160 (19%);IPR019018 (13%);IPR037789 (13%)
IPR008160 (20%);IPR027417 (7%);IPR001734 (5%)
IPR008160 (22%);IPR002125 (10%);IPR016193 (10%)
IPR008160 (27%);IPR002035 (17%);IPR036465 (17%)
IPR008160 (29%);IPR000885 (16%);IPR009075 (13%)
IPR008160 (32%);IPR000885 (26%);IPR001007 (22%)
IPR008160 (32%);IPR000885 (30%);IPR001007 (24%)
IPR008160 (34%);IPR000885 (11%);IPR036465 (11%)
IPR008160 (37%);IPR000885 (33%);IPR001245 (20%)
IPR008160 (37%);IPR013783 (29%);IPR002035 (29%)
IPR008160 (38%);IPR001036 (30%);IPR004764 (30%)
IPR008160 (38%);IPR016187 (15%);IPR002035 (14%)
IPR008160 (44%);IPR036465 (32%);IPR002035 (32%)
IPR008160 (52%);IPR000885 (32%);IPR001007 (25%)
IPR008160 (53%);IPR048287 (36%);IPR013320 (36%)
IPR008160 (6%);IPR027417 (5%);IPR013783 (3%)
IPR008160 (67%);IPR000885 (56%);IPR001298 (11%)
IPR008160 (71%);IPR000885 (55%);IPR001007 (39%)
IPR008160 (74%);IPR000885 (66%);IPR008610 (3%)
IPR008160 (76%);IPR000885 (71%);IPR001007 (51%)
IPR008160 (78%);IPR000885 (73%);IPR001007 (69%)
IPR008160 (81%);IPR000885 (70%);IPR013320 (58%)
IPR008160 (82%);IPR000885 (61%);IPR001007 (34%)
IPR008160 (83%);IPR000885 (65%);IPR013320 (57%)
IPR008160 (86%);IPR000885 (38%);IPR001007 (32%)
IPR008160 (87%);IPR000885 (28%);IPR001007 (22%)
IPR008160 (87%);IPR000885 (57%);IPR001007 (41%)
IPR008160 (89%);IPR000885 (39%);IPR013320 (33%)
IPR008160 (89%);IPR002035 (25%);IPR036465 (25%)
IPR008160 (94%);IPR000885 (63%);IPR001007 (38%)
IPR008160 (94%);IPR000885 (71%);IPR001007 (65%)
IPR008160 (94%);IPR000885 (81%);IPR005546 (6%)
IPR008160 (95%);IPR000885 (90%);IPR001007 (80%)
IPR008160 (95%);IPR000885 (91%);IPR001007 (72%)
IPR008160 (95%);IPR001442 (23%);IPR016187 (23%)
IPR008160 (96%);IPR000885 (53%);IPR001007 (45%)
IPR008160 (96%);IPR019931 (42%);IPR019950 (42%)
IPR008160 (98%);IPR000885 (89%);IPR048287 (41%)
IPR008160 (98%);IPR000885 (96%);IPR001007 (89%)
IPR008160 (99%);IPR000885 (85%);IPR001007 (61%)
IPR008160 (99%);IPR000885 (87%);IPR001007 (81%)
IPR008160 (99%);IPR000885 (90%);IPR013320 (64%)
IPR008160 (99%);IPR000885 (94%);IPR001007 (84%)
IPR008160 (99%);IPR000885 (94%);IPR001007 (84%)
IPR008160 (99%);IPR000885 (95%);IPR001007 (89%)
IPR008160 (99%);IPR000885 (99%);IPR013320 (86%)
IPR008160 (99%);IPR001007 (96%);IPR000885 (95%)
IPR008160 (99%);IPR002035 (96%);IPR036465 (96%)
IPR013783 (39%);IPR008902 (38%);IPR013737 (38%)
IPR027417 (7%);IPR017946 (5%);IPR030395 (5%)
"""

# Split the input data into lines; each line holds ';'-separated entries of
# the form "IPRxxxxxx (NN%)".
lines = input_data.strip().split('\n')

# Collect the unique InterPro IDs that appear with a 100% match on any line.
# The ID is the text before the first space of the entry.
interpro_ids_100 = {
    part.split(' ')[0]
    for line in lines
    for part in line.split(';')
    if '(100%)' in part
}

# Sort rather than list(set): set iteration order for strings can differ
# between interpreter runs, which made the printed list (and the order of any
# queries driven by it) non-reproducible.
interpro_ids_100_list = sorted(interpro_ids_100)
print(interpro_ids_100_list)


In [None]:
# Query Interpro
# For every 100%-match InterPro ID, fetch its record and print one line per
# protein: id, sequence value and scientific name (None where a key is absent).
for interpro_id in interpro_ids_100_list:
    data = query_interpro(interpro_id)
    if not data:
        # Nothing usable came back for this ID; move on to the next one.
        continue
    # Example of how you might extract protein details.
    # The actual keys and structure will depend on the API response.
    proteins = (
        protein
        for entry in data.get('entries', [])
        for protein in entry.get('proteins', [])
    )
    for protein in proteins:
        print(
            protein.get('id'),
            protein.get('sequence', {}).get('value'),
            protein.get('taxonomy', {}).get('scientificName'),
        )

# Functions

## Protein Contrast Functions

### Function: `get_complement_proteins`
This function calculates the complementary proteins for a given protein within a list of proteins, based on a specified group of proteins.
### Function: `get_info_dic`
This function parses a file to extract protein, peptide, and modification information, organizing it into a nested dictionary structure.

### Function: `reformat_dic`
This function iterates over a dictionary, presumably structured with proteins as keys and another dictionary as values, where the inner dictionary has tuples of peptide and modification as keys. It restructures this into a dictionary where proteins are keys, leading to peptides, which map to a list of tuples containing modification and the original details.

### Function: `get_all_pros_sites_peps`
This function appears to collect all proteins, sites, and peptides from a given dictionary, likely for further analysis or reporting. The function's specifics and its intended output format are deduced from the provided code snippet.

### Function: `compare_results`
This function is designed to compare and analyze the differences between two datasets or experimental results. The specifics of the comparison criteria and the expected format of the inputs and outputs are not provided in the snippet.

### Function: `write_dic2_file`
This function writes the content of a dictionary to a file, organizing the data in a specific format that would presumably be useful for downstream analysis or reporting.


In [None]:
def get_complement_proteins(protein_list, pro, group_pro):
    """
    List the other proteins of a group that share a peptide with a target protein.

    The result contains every entry of `protein_list` that is not `pro` and
    that also occurs in `group_pro`. Entries keep the order in which they first
    appear in `protein_list`, so the returned string is the same on every run
    (joining a set of strings, as was done before, gives an order that can
    change between interpreter runs).

    Parameters:
    - protein_list (list): Proteins the peptide maps to.
    - pro (str): The target protein to exclude from the result.
    - group_pro (list): Proteins of the group; only members of it are kept.

    Returns:
    - str: The complementary proteins joined by "/", or "" when there are none.
    """
    group_set = set(group_pro)  # set for O(1) membership tests
    # dict.fromkeys drops duplicates while preserving first-seen order.
    complement_pros = [
        candidate
        for candidate in dict.fromkeys(protein_list)
        if candidate != pro and candidate in group_set
    ]
    return "/".join(complement_pros)


def get_info_dic(fl_path):
    """
    Parse a tab-separated protein report into a per-protein dictionary of peptide hits.

    The file is scanned line by line. A line whose first field is all digits
    opens a protein group; the lines after it contribute further group members
    until a line whose third field is all digits is reached; the lines after
    that are read as peptide rows until a line containing "-----".

    NOTE(review): the field indices used below are presumably those of a
    pFind.protein file - confirm against a real file. Another function named
    `get_info_dic` (taking `file_path, modify`) and a two-argument
    `get_complement_proteins` are defined further down in this file; this
    function calls `get_complement_proteins` with three arguments, so it
    depends on which definitions are bound when it runs.

    Parameters:
    - fl_path (str): Path to the file to parse.

    Returns:
    - dict: {protein: {(peptide, modification): [spec, best_score, is_unique,
      comple_pros]}}, where `spec` is the last field of the peptide row,
      `best_score` is field 7, `is_unique` is "is unique" or "not unique", and
      `comple_pros` is the string returned by `get_complement_proteins`.
    """
    rep_dic = {}
    with open(fl_path, 'r') as f:
        lines = f.readlines()

    i = 0
    # Every loop below stops 5 lines before the end of the file.
    # presumably the file ends with a short footer that must not be parsed - TODO confirm
    while i < len(lines) - 5:
        if lines[i].startswith("-----"):
            break
        else:
            # strip() removes leading tabs as well as the newline, so a row that
            # starts with empty columns has its fields shifted to the left.
            linelist = lines[i].strip().split("\t")
            if not linelist[0].isdigit():
                # Not a group header (first field is not a number): skip it.
                i += 1
                continue

            # Group header: field 1 is the first protein of the group.
            group_pro = [linelist[1]]
            i += 1
            # Collect further group members from field 2 of the following rows,
            # stopping at the first row whose field 2 is all digits.
            # NOTE(review): a row with fewer than three fields raises IndexError here.
            while i < len(lines) - 5:
                sub_list = lines[i].strip().split("\t")
                if sub_list[2].isdigit():
                    break
                else:
                    group_pro.append(sub_list[2])
                    i += 1

            # Peptide rows of the group, up to a line containing "-----".
            while i < len(lines) - 5:
                pep_info_list = lines[i].strip().split("\t")
                if "-----" in lines[i]:
                    break
                elif pep_info_list[0].isdigit():
                    # Rows whose first field is all digits are skipped here and
                    # group_pro is left unchanged.
                    # NOTE(review): confirm this is intended rather than
                    # returning to the outer loop to start a new group.
                    i += 1
                    continue
                else:
                    # Extract and process peptide information
                    pep = pep_info_list[3]
                    mod = pep_info_list[8]
                    pros = pep_info_list[10]
                    spec = pep_info_list[-1]
                    # Drop the last character before splitting.
                    # assumes the protein field ends with a trailing "/" - TODO confirm
                    pro_list = pros[:-1].split("/")
                    best_score = pep_info_list[7]
                    is_unique = "not unique" if len(pro_list) > 1 else "is unique"
                    # Record the peptide under each of its proteins that belongs
                    # to the current group; a repeated (pep, mod) key overwrites
                    # the earlier entry for that protein.
                    for pro in pro_list:
                        if pro in group_pro:
                            comple_pros = get_complement_proteins(pro_list, pro, group_pro)
                            if pro not in rep_dic:
                                rep_dic[pro] = {}
                            rep_dic[pro][(pep, mod)] = [spec, best_score, is_unique, comple_pros]
                    i += 1
    return rep_dic

def reformat_dic(info_dic):
    """
    Regroup a {protein: {(peptide, modification): details}} mapping by peptide.

    Parameters:
    - info_dic (dict): Maps each protein to a dictionary keyed by
      (peptide, modification) pairs.

    Returns:
    - dict: {protein: {peptide: [(modification, details), ...]}}. A protein
      whose inner dictionary is empty does not appear in the result.
    """
    regrouped = {}
    for protein, hits in info_dic.items():
        for (peptide, modification), payload in hits.items():
            # The protein entry is only created once it has at least one hit.
            by_peptide = regrouped.setdefault(protein, {})
            by_peptide.setdefault(peptide, []).append((modification, payload))
    return regrouped

def get_all_pros_sites_peps(info_dic):
    """
    Flatten a {protein: {peptide: [(modification, details), ...]}} mapping.

    Parameters:
    - info_dic (dict): Maps each protein to a dictionary of peptides, each
      holding a list of two-item (modification, details) pairs.

    Returns:
    - list: One (protein, modification, peptide) tuple per pair, in the
      iteration order of the input; the details are not included.
    """
    return [
        (protein, modification, peptide)
        for protein, peptides in info_dic.items()
        for peptide, entries in peptides.items()
        for modification, _details in entries
    ]

def compare_results(dic1, dic2):
    """
    Placeholder for comparing two result dictionaries.

    The comparison itself is not implemented yet: both arguments are ignored
    and an empty dictionary is always returned.

    Parameters:
    - dic1 (dict): The first dictionary for comparison (currently unused).
    - dic2 (dict): The second dictionary for comparison (currently unused).

    Returns:
    - dict: Always a new empty dictionary.
    """
    # TODO: iterate over both dictionaries, compare their contents and collect
    # the differences in the returned dictionary.
    return {}

def write_dic2_file(dic, file_path):
    """
    Dump a dictionary to a text file, one "key: value" line per item.

    The file is opened in write mode, so existing content is replaced. Keys
    and values are rendered with their default string formatting.

    Parameters:
    - dic (dict): The dictionary to write.
    - file_path (str): Destination path of the text file.

    Returns:
    - None
    """
    with open(file_path, 'w') as out:
        out.writelines(f"{key}: {value}\n" for key, value in dic.items())


## PTM Functions
### Function: `get_modi_info`

This function retrieves information about specific modifications from a list of protein data.

### Function: `get_complement_proteins`

This function generates a string of proteins, excluding the protein at index `i`.

### Function: `get_info_dic`

This function processes a file to extract and organize information about proteins, peptides, modifications, and other related data, based on a specific modification interest.

### Function: `reformat_dic`

This function reorganizes the dictionary structure obtained from `get_info_dic` for further analysis or reporting.

Functions `analyze_site`, `update_protein_metrics`, and `pro_metrics_summary` would need to be defined to handle specific tasks within `reformat_dic`, following the principle of breaking down complex tasks into smaller, manageable functions.

### Function: `get_all_pros_sites_peps`

This function compiles a comprehensive list of all proteins, sites, and peptides from the given dictionary.


### Function: `compare_results`

This function compares the results across different datasets or experimental conditions.

### Function: `compare_structured_data`

The function `compare_structured_data` needs to be implemented to perform the actual comparison of structured data, identifying common and unique entries across datasets. This design aims to maintain a clear separation of concerns, with each function responsible for a distinct aspect of the data processing pipeline.



In [None]:
#pFind_PTM_contrast functions
def get_modi_info(line_list, mod_target):
    """
    Collect the occurrences of one modification on a peptide row.

    Fields read from `line_list`: 3 (peptide), 7 (score), 8 (modifications),
    10 (proteins), 11 (peptide positions) and the last field (spectrum number).
    The last character of fields 8, 10 and 11 is dropped before splitting.
    # assumes each of these fields ends with its separator (";" or "/") - TODO confirm

    Modification entries have the form "position,name". Entries that are empty
    or contain no comma (for example when the modification field is empty) are
    skipped instead of raising ValueError.

    Parameters:
    - line_list (list): The tab-split fields of one row.
    - mod_target (str): The modification name to look for (exact match).

    Returns:
    - list: One [proteins, protein_positions, peptide, modifications, score,
      spectrum_number] list per matching entry, where `protein_positions` is
      "protein[position]/" repeated for every protein of the row.
    """
    proteins = line_list[10]
    modifications = line_list[8]
    peptide = line_list[3]
    protein_list = proteins[:-1].split('/')
    peptide_position_list = line_list[11][:-1].split('/')
    spectrum_number = line_list[-1]
    score = line_list[7]
    info_list = []

    for mod_info in modifications[:-1].split(";"):
        modification_position, separator, modification_name = mod_info.partition(',')
        if not separator or modification_name != mod_target:
            # Malformed/empty entry, or a different modification.
            continue

        protein_position_list = []
        for i, protein in enumerate(protein_list):
            # Position in the protein = first number of the peptide's position
            # entry for that protein + position of the modification.
            peptide_position = int(peptide_position_list[i].split(',')[0])
            modification_position_int = peptide_position + int(modification_position)
            protein_position_list.append(f"{protein}[{modification_position_int}]")
        protein_positions = "/".join(protein_position_list) + "/"
        info_list.append([proteins, protein_positions, peptide, modifications, score, spectrum_number])

    return info_list

def get_complement_proteins(protein_list, i):
    """
    Joins every protein except the one at index `i` into a "/"-separated string.

    Parameters:
    - protein_list (list): The list of proteins; it is not modified.
    - i (int): The index of the protein to leave out.

    Returns:
    - str: The remaining proteins joined by "/", or "" when none remain.
    """
    # Work on a copy so the caller's list stays untouched.
    others = list(protein_list)
    del others[i]
    # Joining an empty list already yields "".
    return "/".join(others)

In [None]:
def get_info_dic(file_path, modify):
    """
    Builds a protein -> site -> peptide-records index for one modification.

    Only lines that contain `modify` as a substring are parsed; each such line
    is tab-split and handed to `get_modi_info`.

    Parameters:
    - file_path (str): The path to the input file.
    - modify (str): The modification to filter the data by.

    Returns:
    - dict: {protein: {site: [record, ...]}} where each record is
      [peptide, modifications, score, spectrum_number,
       "unique" or "not unique", other proteins joined by "/"].
    """
    uni_dic = {}

    with open(file_path, 'r') as handle:
        rows = list(handle)

    for row in rows:
        if modify not in row:
            continue

        fields = row.rstrip("\n").split("\t")
        for pros, pro_pos, pep, mods, score, spec_num in get_modi_info(fields, modify):
            # Both strings end with "/", hence the slice before splitting.
            proteins = pros[:-1].split("/")
            site_labels = pro_pos[:-1].split('/')
            uniqueness = "not unique" if len(proteins) != 1 else "unique"

            for idx, protein in enumerate(proteins):
                shared_with = get_complement_proteins(proteins, idx)
                record = [pep, mods, score, spec_num, uniqueness, shared_with]
                site = site_labels[idx]
                uni_dic.setdefault(protein, {}).setdefault(site, []).append(record)

    return uni_dic

def reformat_dic(uni_dic):
    """
    Converts the protein -> site -> peptides index into per-protein summaries.

    For every site, `analyze_site` is called on its peptide records and the
    result is folded into a per-protein metrics dict by `update_protein_metrics`.
    `pro_metrics_summary` then turns those metrics into the stored summary.

    Parameters:
    - uni_dic (dict): {protein: {site: peptides}} as the initial, unorganized index.

    Returns:
    - dict: {protein: [summary, {site: site_data}]}, where `summary` is the
      return value of `pro_metrics_summary` and `site_data` the return value
      of `analyze_site`.
    """
    reorganized = {}

    for protein, sites_info in uni_dic.items():
        # Fresh accumulator per protein; the helpers update it in place.
        metrics = {
            "m_num": 0,
            "best_score": float('inf'),
            "p_num": 0,
            "site_num": 0,
            "uni_pep_pro": "no unique peptides",
        }
        per_site = {}

        for site, peptides in sites_info.items():
            analysed = analyze_site(peptides)
            update_protein_metrics(metrics, analysed)
            per_site[site] = analysed

        reorganized[protein] = [pro_metrics_summary(metrics), per_site]

    return reorganized

def get_all_pros_sites_peps(total_dic):
    """
    Merges the proteins, sites and peptides of all samples into one structure.

    Parameters:
    - total_dic (dict): {sample: {protein: {"sites": {site: {"peptides": [...]}}}}}.
      NOTE(review): `reformat_dic` in this file stores each protein as a
      `[summary, sites]` list rather than a dict with a "sites" key -- confirm
      which shape callers are meant to pass.

    Returns:
    - dict: {protein: {"sites": {site: {"peptides": [...]}}}} where each peptide
      list is the concatenation, in sample order, of that site's peptides from
      every sample.
    """
    merged = {}

    for per_sample in total_dic.values():
        for protein, protein_entry in per_sample.items():
            # Register the protein even when it contributes no sites.
            site_map = merged.setdefault(protein, {"sites": {}})["sites"]

            for site, site_entry in protein_entry["sites"].items():
                bucket = site_map.setdefault(site, {"peptides": []})["peptides"]
                bucket.extend(site_entry["peptides"])

    return merged

def compare_results(total_dic):
    """
    Produces the combined result for one or several datasets.

    With exactly one dataset its data is returned unchanged. Otherwise the
    datasets are merged with `get_all_pros_sites_peps` and the merged structure
    is handed, together with `total_dic`, to `compare_structured_data` before
    being returned.

    NOTE(review): `compare_structured_data` is not defined in the visible part
    of this file; the multi-dataset path depends on it being provided.

    Parameters:
    - total_dic (dict): A dictionary containing protein information from various datasets.

    Returns:
    - dict: The single dataset's data, or the merged structure for several datasets.
    """
    if len(total_dic) != 1:
        merged = get_all_pros_sites_peps(total_dic)
        compare_structured_data(merged, total_dic)
        return merged

    # Exactly one dataset: nothing to compare against.
    (only_dataset,) = total_dic.values()
    return only_dataset


def write_dic2_file(final_dic, sp_list, w_name, base_path):
    """
    Writes the protein / site / peptide hierarchy to a tab-separated text file.

    The file starts with four header rows (sample names, protein-level,
    site-level and peptide-level column titles, the last three repeated once
    per sample). Each protein then gets one row, followed by one row per site
    and, under each site, one row per (peptide, modification) pair.

    Parameters:
    - final_dic (dict): {protein: (pro_info, {site: (site_info, {(pep, mod): pep_info})})},
      where pro_info, site_info and pep_info are lists of cell values.
    - sp_list (list): A list of sample identifiers (strings).
    - w_name (str): The name of the output file.
    - base_path (str): The base path where the output file will be saved.

    The file `base_path/w_name` is created or overwritten; nothing is returned.
    """
    target_path = os.path.join(base_path, w_name)

    with open(target_path, 'w') as out:

        def emit(cells):
            # One output row: cells stringified and tab-joined.
            out.write("\t".join(map(str, cells)) + "\n")

        sample_count = len(sp_list)

        # (leading four cells, per-sample cells) for each of the four header rows.
        header_rows = (
            ([""] * 4,
             [sp + "\t" * 3 for sp in sp_list]),
            (["Protein", "", "", ""],
             ["Total_site_num@pro", "Total_pep_num@pro", "Total_spec_num@pro", "have unique peptides?"] * sample_count),
            (["", "site", "", ""],
             ["Total_pep_num@site(Total_unique_pep_num@site)", "Total_spec_num@site(Total_unique_spec_num@site)", "best-score@site", "have unique peptides?"] * sample_count),
            (["", "", "Peptide", "Modification"],
             ["Total_spec_num@pep", "shared proteins", "best-score@pep", "is unique?"] * sample_count),
        )
        for lead_cells, sample_cells in header_rows:
            emit(lead_cells + sample_cells)

        # Data rows: the indentation level is encoded by which lead column is filled.
        for pro, (pro_info, sites_dic) in final_dic.items():
            emit([pro] + [""] * 3 + pro_info)
            for site, (site_info, peps_dic) in sites_dic.items():
                emit(["", site] + [""] * 2 + site_info)
                for (pep, mod), pep_info in peps_dic.items():
                    emit(["", "", pep, mod] + pep_info)


In [None]:
# pFind_protein_contrast settings

# Folder that main() scans for "*.protein" files and where the result is written.
# STUDY_NAME and PFIND_FOLDER are not assigned in this cell -- presumably they
# are set earlier in the notebook; confirm before running.
BASE_PATH = f'/content/drive/MyDrive/Colab_Notebooks/NovorCloud/Dinosaur/{STUDY_NAME}/{PFIND_FOLDER}'
output_name = "pFind_protein_contrast_result.txt" # important! the output file name

In [None]:
# pFind_PTM_contrast settings
# NOTE: this cell re-binds BASE_PATH and output_name, replacing the values set by
# the protein-contrast settings cell if that cell was run earlier.
# STUDY_NAME and PFIND_FOLDER are not assigned here -- presumably set earlier in
# the notebook; confirm before running.
BASE_PATH = f'/content/drive/MyDrive/Colab_Notebooks/NovorCloud/Dinosaur/{STUDY_NAME}/{PFIND_FOLDER}'
output_name = "pFind_PTM_contrast_result.txt" # important! the output file name
# Used by get_info_dic as a substring filter on each line and by get_modi_info
# as the exact modification name to match.
modify = "Oxidation[M]" # target modification name

In [None]:
#pFind_PTM_contrast
def main():
    """
    Runs the PTM contrast over every "*.protein" file in BASE_PATH.

    Each file is parsed with `get_info_dic` for the module-level `modify`
    target and reorganized with `reformat_dic`; the per-sample results are
    combined by `compare_results` and written to BASE_PATH/output_name by
    `write_dic2_file`. The sample name is the file name without ".protein".

    Relies on the module-level settings BASE_PATH, output_name and modify.

    Raises:
    - FileNotFoundError: If BASE_PATH contains no "*.protein" file.
    """
    suffix = ".protein"
    total_dic = {}

    # os.listdir returns entries in arbitrary order; sorting keeps the sample
    # columns of the output in a reproducible order.
    for fl in sorted(os.listdir(BASE_PATH)):
        if not fl.endswith(suffix):
            continue
        fl_path = os.path.join(BASE_PATH, fl)
        fl_name = fl[:-len(suffix)]
        print(fl_path)
        # get_info_dic requires the target modification as second argument.
        total_dic[fl_name] = reformat_dic(get_info_dic(fl_path, modify))

    if not total_dic:
        raise FileNotFoundError(f"No {suffix} files found in {BASE_PATH}")

    final_dic = compare_results(total_dic)
    sp_list = list(total_dic.keys())
    # write_dic2_file requires the output folder as fourth argument.
    write_dic2_file(final_dic, sp_list, output_name, BASE_PATH)


if __name__ == "__main__":
    main()
    print("Well Done!")

In [None]:
#pFind_protein_contrast
def main():
    """
    Runs the contrast over every "*.protein" file in BASE_PATH and prints the result.

    Each file is parsed with `get_info_dic` and reorganized with `reformat_dic`;
    the per-sample results are combined by `compare_results`, printed, and
    written to BASE_PATH/output_name by `write_dic2_file`. The sample name is
    the file name without ".protein".

    Relies on the module-level settings BASE_PATH, output_name and modify.

    NOTE(review): the only `get_info_dic` / `reformat_dic` visible in this file
    are the PTM-oriented ones (they filter by `modify`). The calls below are
    aligned with those signatures so the cell runs; if a protein-level loader
    is intended for this cell, substitute it here.

    Raises:
    - FileNotFoundError: If BASE_PATH contains no "*.protein" file.
    """
    suffix = ".protein"
    total_dic = {}

    # os.listdir returns entries in arbitrary order; sorting keeps the sample
    # columns of the output in a reproducible order.
    for fl in sorted(os.listdir(BASE_PATH)):
        if not fl.endswith(suffix):
            continue
        fl_path = os.path.join(BASE_PATH, fl)
        fl_name = fl[:-len(suffix)]
        # get_info_dic requires the target modification as second argument.
        total_dic[fl_name] = reformat_dic(get_info_dic(fl_path, modify))

    if not total_dic:
        raise FileNotFoundError(f"No {suffix} files found in {BASE_PATH}")

    final_dic = compare_results(total_dic)
    sp_list = list(total_dic.keys())
    print(final_dic)
    # write_dic2_file requires the output folder as fourth argument.
    write_dic2_file(final_dic, sp_list, output_name, BASE_PATH)


if __name__ == "__main__":
    main()
    print("Well Done!")