<a href="https://colab.research.google.com/github/Palaeoprot/pFind/blob/Merge_fasta_files/pFind_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**pFind Analysis** Scripts


# Global Varibles

In [None]:
!pip install Biopython



## Import Packages

In [275]:
# Data manipulation:
import pandas as pd
from itertools import islice
#import sketch
from collections import  defaultdict, Counter

# Debugging:
import traceback

# File I/O and path handling:
import os
import copy, os
import re
import requests
import json
from datetime import datetime
from typing import Dict, Any

# Numerical analysis and statistics:
import numpy as np
from scipy import stats
from statistics import mode, multimode  # Consider removing if unused

# Data visualization:
import matplotlib.pyplot as plt
from matplotlib.colors import to_rgba
import seaborn as sns
import plotly.graph_objects as go


# Third-party modules (Commented out modules can be imported as needed):
from Bio import SeqIO  # Only import if used
# from icecream import ic  # Import on demand if needed

# Google Colab specific for mounting Google Drive:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Sequence dictionaries

In [None]:
# non_sequences_dict = {
#     'TRYUP-Sus_sco': {'species': 'Bos taurus','offset': 0, 'sequence': 'IVGGYTCAANSIPYQVSLNSGSHFCGGSLINSQWVVSAAHCYKSRIQVRLGEHNIDVLEGNEQFINAAKIITHPNFNGNTLDNDIMLIKLSSPATLNSRVATVSLPRSCAAAGTECLISGWGNTKSSGSSYPSLLQCLKAPVLSDSSCKSSYPGQITGNMICVGFLEGGKDSCQGDSGGPVVCNGQLQGIVSWGYGCAQKNKPGVYTKVCNYVNWIQQTIAAN'},
#     'AHSG-Bos_tau' : {'species': 'Bos taurus','offset': 0, 'sequence':'IPLDPVAGYKEPACDDPDTEQAALAAVDYINKHLPRGYKHTLNQIDSVKVWPRRPTGEVYDIEIDTLETTCHVLDPTPLANCSVRQQTQHAVEGDCDIHVLKQDGQFSVLFTKCDSSPDSAEDVRKLCPDCPLLAPLNDSRVVHAVEVALATFNAESNGSYLQLVEISRAQFVPLPVSVSVEFAVAATDCIAKEVVDPTKCNLLAEKQYGFCKGSVIQKALGGEDVRVTCTLFQTQPVIPQPQPDGAEAEAPSAVPDAAGPTPSAAGPPVASVVVGPSVVAVPLPLHRAHYDLRHTFSGVASVESSSGEAFHVGKTPIVGQPSIPGGPVRLCPGRIRYFKI'},
#     'AHSG-Bub_bub' : {'species': 'Bos taurus','offset': 0, 'sequence':'IPLDPVAGYKEPACDDPDTEQAALAAVDYINKHLPRGYKHTLNQIDSVKVWPRRPTGEVYDIEIDTLETTCHVLDPTPLANCSVRQQTEHAVEGDCDIHVLKQDGQFSVLFTKCDSSPDSAEDVRKLCPDCPLLAPLNDSRVVHAVEVALATFNAQSNGSYLQLVEISRAQFVPLPASVSVEFAVAATDCIAKDVVDPTKCNLLAEKQYGFCKGSVIQKALGGEDVAVTCTLFQTQPVILQPQPDGAEAGAPSAVPDAAGPAPSAAGPPVASVVVGPSVVAVPLPLHRAHYDLRHTFSGVASVESASGEAFHVGKTPIVGQPSVPGGPVRLCPGRIRYFKI'},
#     'AHSG-Equ_cab' : {'species': 'Bos taurus','offset': 1, 'sequence':' LPNGLSPAYRQLNCDDPETEQAALLAVDYINSHIHQGYKHVLNQIDKVQVWAQPTGESFKLEIDTLETTCHALDPTPLANCSVRQLTQHAVEGDCDVRLLKQNGQFSVSFVKCKSSPDSAEDVRKVCLDCPLLAPLNDTRVVHAVEAALAAFNAQNNGSYFQLVEISRAQLVPLPVSVHVEFAVAATDCVAKEVIDPAKCNLLAEKQYGFCKATLTEKVGGEDVAVTCTVFQTQPVVLLPQPDGPDVGVPGPVADAVTPAPSPADLPVASLVVGPVVVAASQLPPPVHRAHYDLRHAFAGVGSGESASGEAFHVEKPPKVAHPNTAAAAGPVVRPCPGRIRYFKII'},
#     'AHSG-Ovi_are' : {'species': 'Bos taurus','offset': 1, 'sequence':'IPLDPIAGYKEPACDDPDTEQAALAAVDYINKHLPRGYKHTLNQIDSVKVWPRRPTGEVYDIEIDTLETTCHVLDPTPLVNCSVRQQTEHAVEGDCDIHVLKQDGQFSVLFTKCDSSPDSAEDVRKLCPDCPLLAPLNNSQVVHAAEVALATFNAQNNGSYFQLVEISRAQFVPLPGSVSVEFAVAATDCIAKEVVDPTKCNLLAEKQYGFCKGSVIQKALGGEDVTVTCTLFQTQPVIPQPQPEGAEAGAPSAVPDAAVPDAAVPAPSAAGLPVGSVVAGPSVVAVPLPLHRAHYDLRHTFSGVASVESASGEAFHVGKTPIVGQPSVPGGPVHLCPGRIRYFKI'},
#     'AHSG-Cap_hir' : {'species': 'Bos taurus','offset': 1, 'sequence':'IPLDPIAGYKEPACDDPDTEQAALAAVDYINKHLPRGYKHTLNQIDSVKVWPRRPTGEVYDIEIDTLETTCHVLDPTPLANCSVRQQTEHAVEGDCDIHVLKQDGQFSVLFTKCDSSPDSAEDVRKLCPDCPLLAPLNNSQVVHAAEVALATFNAQNNGSYFQLVEISRAQFVPLPVSVSVEFAVAATDCIAKEVVDPTKCNLLAEKQYGFCKGSVIQKALGGEDVAVTCTLFQTQPVIPQPQPEGAEAGAPSAVPDAAVPAPSAAGLPVGSVVAGPSVVAVALPLHRAHYDLRHTFSGVASVESASGEAFHVGKTPIVGQPSVPGGPVHLCPGRIRYFKI'},
#     'AHSG-Equ_asn' : {'species': 'Bos taurus','offset': 1, 'sequence':' LPNGLSPAYRQLNCDDPETEQAALLAVDYINSHIHQGYKHVLNQIDKVQVWAQPTGESFKLEIDTLETTCHALDPTPLANCSVRQLTQHAVEGDCDVRLLKQNGQFSVSFVKCKSSPDSAEDVRKVCLDCPLLAPLNDTRVVHAVEAALAAFNAQNNGSYFQLVEISRAQLVPLPVSVHVEFAVAATDCVAKEVIDPAKCNLLAEKQYGFCKATLTEKVGGEDVAVTCTVFQTQPVVPLPQPDGPDVGVPGPVADAATPEPSPADLPVASLVVGPVVVAAPQLPPPVHRAHYDLRHAFAGVGSGESASGEAFHVEKPPKVAHPNTAAAAGPVVRPCPGRIRYFKI'},
#     'TPM1-Cam_dro' : {'species': 'Bos taurus','offset': 0, 'sequence':'MDAIKKKMQMLKLDKENAIDRAEQAEADKKQAEDRCKQLEEEQQALQKKLKGTEDEVEKYSESVKDAQEKLEQAEKKATDAEADVASLNRRIQLVEEELDRAQERLATALQKLEEAEKAADESERGMKVIENRAMKDEEKMELQEMQLKEAKHIAEDSDRKYEEVARKLVILEGELERSEERAEVAESRARQLEEELRTMDQALKSLMASEEEEGLGLPNSICCHSHNFALLFSPPPLPHCAISTVSPTVSSHPPPPHTPCSKCGDLEEELKIVTNNLKSLEAQADKYSTKEDKYEEEIKLLEEKLKEAETRAEFAERSVAKLEKTIDDLEDEVYAQKMKYKAISEELDNALNDITSL'},
#     'KS1-Hom_sap' : {'species': 'Bos taurus','offset': 0, 'sequence':'SRQFSSRSGYRSGGGFSSGSAGIINYQRRTTSSSTRRSGGGGGRFSSCGGGGGSFGAGGGFGSRSLVNLGGSKSISISVARGGGRGSGFGGGYGGGGFGGGGFGGGGFGGGGIGGGGFGGFGSGGGGFGGGGFGGGGYGGGYGPVCPPGGIQEVTINQSLLQPLNVEIDPEIQKVKSREREQIKSLNNQFASFIDKVRFLEQQNQVLQTKWELLQQVDTSTRTHNLEPYFESFINNLRRRVDQLKSDQSRLDSELKNMQDMVEDYRNKYEDEINKRTNAENEFVTIKKDVDGAYMTKVDLQAKLDNLQQEIDFLTALYQAELSQMQTQISETNVILSMDNNRSLDLDSIIAEVKAQYEDIAQKSKAEAESLYQSKYEELQITAGRHGDSVRNSKIEISELNRVIQRLRSEIDNVKKQISNLQQSISDAEQRGENALKDAKNKLNDLEDALQQAKEDLARLLRDYQELMNTKLALDLEIATYRTLLEGEESRMSGECAPNVSVSVSTSHTTISGGGSRGGGGGGYGSGGSSYGSGGGSYGSGGGGGGGRGSYGSGGSSYGSGGGSYGSGGGGGGHGSYGSGSSSGGYRGGSGGGGGGSSGGRGSGGGSSGGSIGGRGSSSGGVKSSGGSSSVKFVSTTYSGVTR'},
#     'KS9-Hom_sap' : {'species': 'Bos taurus','offset': 0, 'sequence':'SRQFSSRSGYRSGGGFSSGSAGIINYQRRTTSSSTRRSGGGGGRFSSCGGGGGSFGAGGGFGSRSLVNLGGSKSISISVARGGGRGSGFGGGYGGGGFGGGGFGGGGFGGGGIGGGGFGGFGSGGGGFGGGGFGGGGYGGGYGPVCPPGGIQEVTINQSLLQPLNVEIDPEIQKVKSREREQIKSLNNQFASFIDKVRFLEQQNQVLQTKWELLQQVDTSTRTHNLEPYFESFINNLRRRVDQLKSDQSRLDSELKNMQDMVEDYRNKYEDEINKRTNAENEFVTIKKDVDGAYMTKVDLQAKLDNLQQEIDFLTALYQAELSQMQTQISETNVILSMDNNRSLDLDSIIAEVKAQYEDIAQKSKAEAESLYQSKYEELQITAGRHGDSVRNSKIEISELNRVIQRLRSEIDNVKKQISNLQQSISDAEQRGENALKDAKNKLNDLEDALQQAKEDLARLLRDYQELMNTKLALDLEIATYRTLLEGEESRMSGECAPNVSVSVSTSHTTISGGGSRGGGGGGYGSGGSSYGSGGGSYGSGGGGGGGRGSYGSGGSSYGSGGGSYGSGGGGGGHGSYGSGSSSGGYRGGSGGGGGGSSGGRGSGGGSSGGSIGGRGSSSGGVKSSGGSSSVKFVSTTYSGVTR'},
#     'KS2-Hom_sap' : {'species': 'Bos taurus','offset': 0, 'sequence':'SRQFSSRSGYRSGGGFSSGSAGIINYQRRTTSSSTRRSGGGGGRFSSCGGGGGSFGAGGGFGSRSLVNLGGSKSISISVARGGGRGSGFGGGYGGGGFGGGGFGGGGFGGGGIGGGGFGGFGSGGGGFGGGGFGGGGYGGGYGPVCPPGGIQEVTINQSLLQPLNVEIDPEIQKVKSREREQIKSLNNQFASFIDKVRFLEQQNQVLQTKWELLQQVDTSTRTHNLEPYFESFINNLRRRVDQLKSDQSRLDSELKNMQDMVEDYRNKYEDEINKRTNAENEFVTIKKDVDGAYMTKVDLQAKLDNLQQEIDFLTALYQAELSQMQTQISETNVILSMDNNRSLDLDSIIAEVKAQYEDIAQKSKAEAESLYQSKYEELQITAGRHGDSVRNSKIEISELNRVIQRLRSEIDNVKKQISNLQQSISDAEQRGENALKDAKNKLNDLEDALQQAKEDLARLLRDYQELMNTKLALDLEIATYRTLLEGEESRMSGECAPNVSVSVSTSHTTISGGGSRGGGGGGYGSGGSSYGSGGGSYGSGGGGGGGRGSYGSGGSSYGSGGGSYGSGGGGGGHGSYGSGSSSGGYRGGSGGGGGGSSGGRGSGGGSSGGSIGGRGSSSGGVKSSGGSSSVKFVSTTYSGVTRR'},
#     'ALB-Bos_tau' : {'species': 'Bos taurus','offset': 0, 'sequence':'MKWVTFISLLLLFSSAYSRGVFRRDTHKSEIAHRFKDLGEEHFKGLVLIAFSQYLQQCPFDEHVKLVNELTEFAKTCVADESHAGCEKSLHTLFGDELCKVASLRETYGDMADCCEKQEPERNECFLSHKDDSPDLPKLKPDPNTLCDEFKADEKKFWGKYLYEIARRHPYFYAPELLYYANKYNGVFQECCQAEDKGACLLPKIETMREKVLASSARQRLRCASIQKFGERALKAWSVARLSQKFPKAEFVEVTKLVTDLTKVHKECCHGDLLECADDRADLAKYICDNQDTISSKLKECCDKPLLEKSHCIAEVEKDAIPENLPPLTADFAEDKDVCKNYQEAKDAFLGSFLYEYSRRHPEYAVSVLLRLAKEYEATLEECCAKDDPHACYSTVFDKLKHLVDEPQNLIKQNCDQFEKLGEYGFQNALIVRYTRKVPQVSTPTLVEVSRSLGKVGTRCCTKPESERMPCTEDYLSLILNRLCVLHEKTPVSEKVTKCCTESLVNRRPCFSALTPDETYVPKAFDEKLFTFHADICTLPDTEKQIKKQTALVELLKHKPKATEEQLKTVMENFVAFVDKCCAADDKEACFAVEGPKLVVSTQTALA'},
#     'ALB-Equ_cab' : {'species': 'Bos taurus','offset': 0, 'sequence':'MKWVTFVSLLFLFSSAYSRGVLRRDTHKSEIAHRFNDLGEKHFKGLVLVAFSQYLQQCPFEDHVKLVNEVTEFAKKCAADESAENCDKSLHTLFGDKLCTVATLRATYGELADCCEKQEPERNECFLTHKDDHPNLPKLKPEPDAQCAAFQEDPDKFLGKYLYEVARRHPYFYGPELLFHAEEYKADFTECCPADDKLACLIPKLDALKERILLSSAKERLKCSSFQNFGERAVKAWSVARLSQKFPKADFAEVSKIVTDLTKVHKECCHGDLLECADDRADLAKYICEHQDSISGKLKACCDKPLLQKSHCIAEVKEDDLPSDLPALAADFAEDKEICKHYKDAKDVFLGTFLYEYSRRHPDYSVSLLLRIAKTYEATLEKCCAEADPPACYRTVFDQFTPLVEEPKSLVKKNCDLFEEVGEYDFQNALIVRYTKKAPQVSTPTLVEIGRTLGKVGSRCCKLPESERLPCSENHLALALNRLCVLHEKTPVSEKITKCCTDSLAERRPCFSALELDEGYVPKEFKAETFTFHADICTLPEDEKQIKKQSALAELVKHKPKATKEQLKTVLGNFSAFVAKCCGREDKEACFAEEGPKLVASSQLALA'},
#     'ALB-Equ_asn' : {'species': 'Bos taurus','offset': 0, 'sequence':'MKWVTFVSLLFLFSSAYFRGVLRRDTHKSEIAHRFNDLGEKHFKGLVLVAFSQYLQQCPFEDHVKLVNEVTEFAKKCAADESAENCDKSLHTLFGDKLCTVATLRATYGELADCCEKQEPERNECFLTHKDDHPNLPKLKPEPDAQCAAFQEDPDKFLGKYLYEVARRHPYFYGPELLFHAEEYKADFTECCPADDKAGCLIPKLDALKERILLSSAKERLKCSSFQKFGERAFKAWSVARLSQKFPKADFAEVSKIVTDLTKVHKECCHGDLLECADDRADLTKYICEHQDSISGKLKACCDKPLLQKSHCIAEVKEDDLPSDLPALAADFAEDKEICKHYKDAKDVFLGTFLYEYSRRHPDYSVSLLLRIAKTYEATLEKCCAEADPPACYATVFDQFTPLVEEPKSLVKKNCDLFEEVGEYDFQNALIVRYTKKAPQVSTPTLVEIGRTLGKVGSRCCKLPESERLPCSENHLALALNRLCVLHEKTPVSEKITKCCTDSLAERRPCFSALELDEGYIPKEFKAETFTFHADICTLPEDEKQIKKQSALAELVKHKPKATKEQLKTVLGNFSAFVAKCCGAEDKEACFAEEGPKLVASSQLALA'},
#     'ALB-Sus_scr' : {'species': 'Bos taurus','offset': 0, 'sequence':'MKWVTFISLLFLFSSAYSRGVFRRDTYKSEIAHRFKDLGEQYFKGLVLIAFSQHLQQCPYEEHVKLVREVTEFAKTCVADESAENCDKSIHTLFGDKLCAIPSLREHYGDLADCCEKEEPERNECFLQHKNDNPDIPKLKPDPVALCADFQEDEQKFWGKYLYEIARRHPYFYAPELLYYAIIYKDVFSECCQAADKAACLLPKIEHLREKVLTSAAKQRLKCASIQKFGERAFKAWSLARLSQRFPKADFTEISKIVTDLAKVHKECCHGDLLECADDRADLAKYICENQDTISTKLKECCDKPLLEKSHCIAEAKRDELPADLNPLEHDFVEDKEVCKNYKEAKHVFLGTFLYEYSRRHPDYSVSLLLRIAKIYEATLEDCCAKEDPPACYATVFDKFQPLVDEPKNLIKQNCELFEKLGEYGFQNALIVRYTKKVPQVSTPTLVEVARKLGLVGSRCCKRPEEERLSCAEDYLSLVLNRLCVLHEKTPVSEKVTKCCTESLVNRRPCFSALTPDETYKPKEFVEGTFTFHADLCTLPEDEKQIKKQTALVELLKHKPHATEEQLRTVLGNFAAFVQKCCAAPDHEACFAVEGPKFVIEIRGILA'},

#     }

sequences_dict = {
    'COL1A1-Bos_tau': {'species': 'Bos taurus','offset': -7, 'sequence': 'STGISVPGPMGPSGPRGLPGPPGAPGPQGFQGPPGEPGEPGASGPMGPRGPPGPPGKNGDDGEAGKPGRPGERGPPGPQGARGLPGTAGLPGMKGHRGFSGLDGAKGDAGPAGPKGEPGSPGENGAPGQMGPRGLPGERGRPGAPGPAGARGNDGATGAAGPPGPTGPAGPPGFPGAVGAKGEGGPQGPRGSEGPQGVRGEPGPPGPAGAAGPAGNPGADGQPGAKGANGAPGIAGAPGFPGARGPSGPQGPSGPPGPKGNSGEPGAPGSKGDTGAKGEPGPTGIQGPPGPAGEEGKRGARGEPGPAGLPGPPGERGGPGSRGFPGADGVAGPKGPAGERGAPGPAGPKGSPGEAGRPGEAGLPGAKGLTGSPGSPGPDGKTGPPGPAGQDGRPGPPGPPGARGQAGVMGFPGPKGAAGEPGKAGERGVPGPPGAVGPAGKDGEAGAQGPPGPAGPAGERGEQGPAGSPGFQGLPGPAGPPGEAGKPGEQGVPGDLGAPGPSGARGERGFPGERGVQGPPGPAGPRGANGAPGNDGAKGDAGAPGAPGSQGAPGLQGMPGERGAAGLPGPKGDRGDAGPKGADGAPGKDGVRGLTGPIGPPGPAGAPGDKGEAGPSGPAGPTGARGAPGDRGEPGPPGPAGFAGPPGADGQPGAKGEPGDAGAKGDAGPPGPAGPAGPPGPIGNVGAPGPKGARGSAGPPGATGFPGAAGRVGPPGPSGNAGPPGPPGPAGKEGSKGPRGETGPAGRPGEVGPPGPPGPAGEKGAPGADGPAGAPGTPGPQGIAGQRGVVGLPGQRGERGFPGLPGPSGEPGKQGPSGASGERGPPGPMGPPGLAGPPGESGREGAPGAEGSPGRDGSPGAKGDRGETGPAGPPGAPGAPGAPGPVGPAGKSGDRGETGPAGPAGPIGPVGARGPAGPQGPRGDKGETGEQGDRGIKGHRGFSGLQGPPGPPGSPGEQGPSGASGPAGPRGPPGSAGSPGKDGLNGLPGPIGPPGPRGRTGDAGPAGPPGPPGPPGPPGPPSGGYDLSFLPQPPQEK'},
    'COL1A1-Mam_mam': {'species': 'Mammuthus primigenius','offset': -8, 'sequence': 'SAGGISVPGPMGPSGPRGLPGPPGAPGPQGFQGPPGEPGEPGASGPMGPRGPPGPPGKNGDDGEAGKPGRPGERGPPGPQGARGLPGTAGLPGMKGHRGFSGLDGAKGDAGPAGPKGEPGSPGENGAPGQMGPRGLPGERGRPGAPGPAGARGNDGATGAAGPPGPTGPAGPPGFPGAVGAKGEAGPQGARGSEGPQGVRGEPGPPGPAGAAGPAGNPGADGQPGAKGANGAPGIAGAPGFPGARGPAGPQGPSGAPGPKGNSGEPGAPGSKGDAGAKGEPGPIGIQGPPGPAGEEGKRGARGEPGPTGLPGPPGERGGPGSRGFPGADGVAGPKGPAGERGSPGPAGPKGSPGEAGRPGEAGLPGAKGLTGSPGSPGPDGKTGPPGPAGQDGRPGPPGPPGARGQAGVMGFPGPKGAAGEPGKAGERGVPGPPGAVGAAGKDGEAGAQGPPGPAGPAGERGEQGPAGSPGFQGLPGPAGPPGEAGKPGEQGVPGDLGAPGPSGARGERGFPGERGVQGPPGPAGPRGSNGAPGNDGAKGDAGAPGAPGSQGAPGLQGMPGERGAAGLPGPKGDRGDAGPKGADGSPGKDGPRGLTGPIGPPGPAGAPGDKGEAGPSGPAGPTGARGAPGDRGEPGPPGPAGFAGPPGADGQPGAKGEPGDAGAKGDAGPPGPAGPTGAPGPIGNVGAPGAKGARGSAGPPGATGFPGAAGRVGPPGPSGNAGPPGPPGPAGKEGGKGPRGETGPAGRPGEVGPPGPPGPAGEKGSPGADGPAGAPGTPGPQGIGGQRGVVGLPGQRGERGFPGLPGPSGEPGKQGPSGSSGERGPPGPAGPPGLAGPPGESGREGAPGAEGSPGRDGSPGPKGDRGETGPSGPPGAPGAPGAPGPVGPAGKSGDRGETGPAGPAGPAGPAGVRGPAGPQGPRGDKGETGEQGDRGLKGHRGFSGLQGPPGPPGSPGEQGPSGASGPAGPRGPPGSAGAPGKDGLNGLPGPPGPPGPRGRTGDAGPVGPPGPPGPPGPPGPPSGAFDFSFLPQPPQEK'},
    'COL1A1-Gal_gal': {'species': 'Gallus gallus','offset': -7, 'sequence': 'SAGVAVPGPMGPAGPRGLPGPPGAPGPQGFQGPPGEPGEPGASGPMGPRGPAGPPGKNGDDGEAGKPGRPGQRGPPGPQGARGLPGTAGLPGMKGHRGFSGLDGAKGQPGPAGPKGEPGSPGENGAPGQMGPRGLPGERGRPGPSGPAGARGNDGAPGAAGPPGPTGPAGPPGFPGAAGAKGETGPQGARGSEGPQGARGEPGPPGPAGAAGPAGNPGADGQPGAKGATGAPGIAGAPGFPGARGPSGPQGPSGAPGPKGNSGEPGAPGNKGDTGAKGEPGPAGVQGPPGPAGEEGKRGARGEPGPAGLPGPAGERGAPGSRGFPGADGIAGPKGPPGERGSPGAVGPKGSPGEAGRPGEPGLPGAKGLTGSPGSPGPDGKTGPPGPAGQDGRPGPPGPPGARGQAGVMGFPGPKGAAGEPGKPGERGAPGPPGAVGAAGKDGEAGAQGPPGPTGPAGERGEQGPAGAPGFQGLPGPAGPPGEAGKPGEQGVPGDAGAPGPAGARGERGFPGERGVQGPPGPQGPRGANGAPGNDGAKGDAGAPGAPGNQGPPGLQGMPGERGAAGLPGAKGDRGDPGPKGADGAPGKDGLRGLTGPIGPPGPAGAPGDKGEAGPPGPAGPTGARGAPGDRGEPGPPGPAGFAGPPGADGQPGAKGETGDAGAKGDAGPPGPAGPTGAPGPAGAVGAPGPKGARGSAGPPGATGFPGAAGRVGPPGPSGNIGLPGPPGPSGKEGGKGPRGETGPAGRPGEPGPAGPPGPPGEKGSPGADGPIGAPGTPGPQGIAGQRGVVGLPGQRGERGFPGLPGPSGEPGKQGPSGSPGERGPPGPMGPPGLAGPPGEAGREGAPGAEGAPGRDGAAGPKGDRGETGPAGPPGAPGAPGAPGPVGPAGKNGDRGETGPAGPAGPPGPAGARGPAGPQGPRGDKGETGEQGDRGMKGHRGFSGLQGPPGPPGAPGEQGPSGASGPAGPRGPPGSAGAAGKDGLNGLPGPIGPPGPRGRTGEVGPVGPPGPPGPPGPPGPPSGGFDFSFLPQPPQEK'},
    'COL1A1-Apr_xxx': {'species': 'Apr xxx','offset': -0, 'sequence': 'GPMGPAGPRGLPGPPGAPGPQGFQGPPGEPGEPGASGPMGPRGPAGPPGKNGDDGEAGKPGRPGERGPPGPQGARGLPGTAGLPGMKGHRGFSGLDGAKGEPGPAGPKGEPGSPGENGAPGQMGPRGLPGERGRPGPSGPAGARGNDGSPGAAGPPGPTGPAGPPGFPGAAGAKGETGPQGGRGSEGPQGARGEPGPPGPAGAAGPAGNPGADGQPGAKGATGAPGIAGAPGFPGARGPSGPQGPSGAPGPKGNSGEPGAPGNKGDTGAKGEPGPAGVQGPPGPAGEEGKRGARGEPGPAGLPGPAGERGAPGSRGFPGADGIAGPKGPPGERGSPGPAGPKGSPGESGRPGEPGLPGAKGLTGSPGSPGPDGKTGPPGPAGQDGRPGPPGPPGARGQAGVMGFPGPKGAAGEPGKPGERGAPGPPGAVGAAGKDGEAGAQGPPGPTGPAGERGEQGPAGAPGFQGLPGPAGAPGEAGKPGEQGVPGDAGAPGPAGARGERGFPGERGVQGPPGPQGPRGANGAPGNDGAKGDAGAPGAPGNQGPPGLQGMPGERGAAGLPGAKGDRGDPGPKGADGIPGKDGLRGLTGPIGPPGPAGAPGDKGEAGPPGPAGPTGARGAPGDRGEPGPPGPAGFAGPPGADGQPGAKGETGDAGAKGDAGPPGPAGPTGAPGPAGAVGAPGPKGARGSAGPPGATGFPGAAGRVGPPGPSGNIGLPGPPGPSGKEGGKGPRGETGPAGRPGEPGPAGPPGPPGEKGSPGSDGPIGAPGTPGPQGIAGQRGVVGLPGQRGERGFPGLPGPSGEPGKQGPSGSPGERGPPGPMGPPGLAGPPGEAGREGAPGAEGAPGRDGSAGPKGDRGETGPAGPPGAPGAPGAPGPVGPAGKNGDRGETGPAGPAGPPGPAGARGPSGPQGPRGDKGETGEQGDRGMKGHRGFSGLQGPPGPPGSPGEQGPSGASGPAGPRGPPGSAGAAGKDGLNGLPGPIGPPGPRGRTGDVGPVGPPGPPGPPGPPGPPSGGFDFSFLPQ'},
    'COL1A1-Caa_xxx': {'species': 'Caa xxx','offset': -0, 'sequence': 'GPMGPAGPRGLPGPPGAPGPQGFQGPPGEPGEPGASGPMGPRGPAGPPGKNGEDGEAGKPGRPGERGPPGPQGARGLPGTAGLPGMKGHRGFSGLDGAKGEPGPAGPKGEPGSPGENGAPGQMGPRGLPGERGRPGPSGPAGARGNDGAPGAAGPPGPTGPAGPPGFPGAGGAKGETGPPGARGSEGPQGARGEPGPPGPAGAAGPAGNPGADGQPGAKGATGAPGIAGAPGFPGARGPSGPQGPSGAPGPKGNAGEPGAPGNKGDTGAKGEPGPAGVQGPPGPAGEEGKRGARGEPGPAGLPGPAGERGAPGSRGFPGADGIAGPKGPPGERGSPGPVGPKGSPGEAGRPGEPGLPGAKGLTGSPGSPGPDGKTGPPGPAGQDGRPGPPGPPGARGQGGVMGFPGPKGAAGEPGKPGERGAPGPPGAVGAAGKDGEVGAQGPPGPTGPAGERGEQGPAGAPGFQGLPGPAGPPGEAGKPGEQGVPGDAGAPGPAGARGERGFPGERGVQGPPGPQGPRGANGAPGNDGAKGDAGAPGAPGNQGPPGLQGMPGERGAAGLPGAKGDRGDPGPKGADGAPGKDGLRGLTGPIGPPGPAGAPGDKGEAGPPGPAGPTGARGAPGDRGEPGPPGPAGFAGPPGADGQPGAKGETGETGAKGDAGPPGPAGPTGAPGPAGAVGAPGPKGARGSAGPPGATGFPGAAGRVGPPGPSGNIGLPGPPGPSGKEGGKGPRGETGPAGRPGEPGPAGPPGPPGEKGSPGADGPIGAPGTPGPQGIAGQRGVVGLPGQRGERGFPGLPGPSGEPGKQGPSGAPGERGPPGPMGPPGLAGPPGEAGREGSPGAEGAPGRDGAAGPKGDRGETGPAGPPGAPGAPGAPGPVGPAGKSGDRGETGPQGPAGPPGPAGARGPAGPQGPRGDKGETGEQGDRGMKGHRGFSGLQGPPGPPGSPGEQGPSGASGPAGPRGPPGSAGAAGKDGLNGLPGPIGPPGPRGRTGEAGPVGPPGPPGPPGPPGPPSGGFDFSFMPQ'},
    'COL1A1-Nop_xxx': {'species': 'Nop xxx','offset': -0, 'sequence': 'GPMGPAGPRGLPGPPGAPGPQGFQGPPGEPGEPGASGPMGPRGPAGPPGKNGDDGEAGKPGRPGERGPPGPQGARGLPGTAGLPGMKGHRGFSGLDGAKGEPGPAGPKGEPGSPGENGAPGQMGPRGLPGERGRPGPSGPAGARGNDGATGAAGPPGPTGPAGPPGFPGAAGAKGETGPQGARGSEGPQGARGEPGPPGPAGAAGPAGNPGADGQPGAKGATGAPGIAGAPGFPGARGPSGPQGPSGAPGPKGNSGEPGAPGNKGDTGAKGEPGPAGVQGPPGPAGEEGKRGARGEPGPAGLPGPAGERGAPGSRGFPGADGIAGPKGPPGERGSPGAVGPKGSPGEAGRPGEPGLPGAKGLTGSPGSPGPDGKTGPPGPAGQDGRPGPPGPPGARGQAGVMGFPGPKGAAGEPGKPGERGAPGPPGAVGAAGKDGEAGAQGPPGPTGPAGERGEQGPAGAPGFQGLPGPAGPPGEAGKPGEQGVPGDAGAPGPAGARGERGFPGERGVQGPPGPQGPRGANGAPGNDGAKGDAGAPGAPGNQGPPGLQGMPGERGAAGLPGAKGDRGDPGPKGADGAPGKDGLRGLTGPIGPPGPAGAPGDKGEAGPPGPAGPTGARGAPGDRGEPGPPGPAGFAGPPGADGQPGAKGETGDAGAKGDAGPPGPAGPTGAPGPAGAVGAPGPKGARGSAGPPGATGFPGAAGRVGPPGPSGNIGLPGPPGPSGKEGGKGPRGETGPAGRPGEPGPAGPPGPPGEKGSPGADGPIGAPGTPGPQGIAGQRGVVGLPGQRGERGFPGLPGPSGEPGKQGPSGSPGERGPPGPMGPPGLAGPPGEAGREGAPGAEGAPGRDGAAGPKGDRGETGPAGPPGAPGAPGAPGPVGPAGKNGDRGETGPAGPAGPPGPAGARGPAGPQGPRGDKGETGEQGDRGMKGHRGFSGLQGPPGPPGQPGEQGPSGASGPAGPRGPPGSAGAAGKDGLNGLPGPIGPPGPRGRTGDVGPVGPPGPPGPPGPPGPPSGGFDFSFLPQ'},
    'COL1A1-Pog_vit': {'species': 'Pogona vitticeps','offset': -0, 'sequence': 'GPMGPSGPRGPPGPPGAPGPQGFQGPPGEPGEPGASGPMGPRGPAGPPGKNGDDGEAGKPGRPGERGPPGPQGARGLPGTAGLPGMKGHRGFSGLDGAKGDPGPAGPKGEPGSPGENGAPGQVGPRGLPGERGRPGAPGPAGARGNDGSPGAAGPPGPTGPAGPPGFPGAVGPKGETGAQGSRGSEGPQGARGEPGPPGPAGAAGPSGNPGTDGQPGAKGSPGAPGIAGAPGFPGARGPAGPQGPAGAPGPKGNSGEPGAPGNKGDTGAKGETGPAGVQGPPGPPGEEGKRGSRGEPGPAGLPGPAGERGAPGSRGFPGADGIAGPKGPPGERGSPGPAGPKGSTGEAGRPGEPGLPGAKGLTGSPGSPGPDGKTGPPGPAGQDGRPGPAGPPGARGQAGVMGFPGPKGAAGEPGKPGERGAPGAVGAVGAPGKDGEVGAQGPPGPTGPAGERGEQGPSGAPGFQGLPGPAGAPGEAGKPGEQGVPGDVGAPGPAGARGERGFPGERGVQGAAGPPGPRGANGSPGNDGAKGDAGAPGAPGNQGPPGLQGMPGERGAAGLPGAKGDRGDTGPKGLDGAPGKDGLRGLTGPIGPPGPAGAPGDKGEAGPQGPAGPTGARGAPGDRGEPGPAGPAGFAGPPGTDGQPGAKGEPGDAGAKGDAGPPGPAGATGPAGPAGPIGAPGPKGARGSPGPPGATGFPGAAGRVGPPGPSGNIGLPGPPGPVGKEGAKGPRGETGPAGRPGEAGPAGPPGPPGEKGSPGSDGPAGAPGTPGPQGIAGQRGVVGLPGQRGERGFPGLPGPSGEPGKQGPSGASGERGPPGPMGPPGLAGPPGEAGREGSPGAEGAPGRDGPAGPKGDRGETGPAGPPGAPGAPGAPGPMGPAGKNGDRGETGPAGPAGPAGPAGARGAAGPQGPRGDKGETGEQGDRGMKGHRGFSGLQGPPGPPGSPGEQGPSGASGPAGPRGPPGSAGAPGKDGLNGLPGPIGPPGPRGRTGDVGPAGPPGPPGPPGPPGPPSGGFDFSFLPQPPQEKAHDGR'},
    'COL1A1-Alg_sin': {'species': 'Alligator sinensis','offset': -0, 'sequence': 'GPMGPAGPRGLPGPPGAPGPQGFQGPPGEPGEPGASGPMGPRGPAGPPGKNGDDGEAGKPGRPGERGPPGPQGARGLPGTAGLPGMKGHRGFSGLDGAKGEPGPAGPKGEPGSPGENGAPGQMGPRGLPGERGRPGPSGPAGARGNDGATGAAGPPGPTGPAGPPGFPGAAGAKGETGPQGARGSEGPQGARGEPGPPGPAGAAGPAGNPGADGQPGAKGATGAPGIAGAPGFPGARGPSGPQGPSGAPGPKGNSGEPGAPGNKGDTGAKGEPGPAGVQGPPGPAGEEGKRGARGEPGPAGLPGPAGERGAPGSRGFPGADGIAGPKGPPGERGSPGAVGPKGSPGEAGRPGEPGLPGAKGLTGSPGSPGPDGKTGPPGPAGQDGRPGPPGPPGARGQAGVMGFPGPKGAAGEPGKPGERGAPGPPGAVGAAGKDGEAGAQGPPGPTGPAGERGEQGPAGAPGFQGLPGPAGPPGEAGKPGEQGVPGDAGAPGPAGARGERGFPGERGVQGPPGPQGPRGANGAPGNDGAKGDAGAPGAPGNQGPPGLQGMPGERGAAGLPGAKGDRGDPGPKGADGAPGKDGLRGLTGPIGPPGPAGAPGDKGEAGPPGPAGPTGARGAPGDRGEPGPPGPAGFAGPPGADGQPGAKGETGDAGAKGDAGPPGPAGPTGAPGPAGAVGAPGPKGARGSAGPPGATGFPGAAGRVGPPGPSGNIGLPGPPGPSGKEGGKGPRGETGPAGRPGEPGPAGPPGPPGEKGSPGADGPIGAPGTPGPQGIAGQRGVVGLPGQRGERGFPGLPGPSGEPGKQGPSGSPGERGPPGPMGPPGLAGPPGEAGREGAPGAEGAPGRDGAAGPKGDRGETGPAGPPGAPGAPGAPGPVGPAGKNGDRGETGPAGPAGPPGPAGARGPAGPQGPRGDKGETGEQGDRGMKGHRGFSGLQGPPGPPGQPGEQGPSGASGPAGPRGPPGSAGAAGKDGLNGLPGPIGPPGPRGRTGDVGPVGPPGPPGPPGPPGPPSGGFDFSFLPQ'},
    'COL1A1-Tha_sir': {'species': 'Thamnophis sirtalis','offset': -0, 'sequence': 'GPMGPAGPRGLPGPPGAPGPQGFQGPPGEPGEPGSSGPMGPRGPAGPPGKNGDDGEAGKPGRPGERGPPGPQGARGLPGTAGLPGMKGHRGFSGLDGAKGDAGPAGPKGEPGSPGENGAPGQVGPRGLPGERGRPGPAGSAGARGNDGAAGAAGPTGPTGPAGPPGFPGPAGPKGETGPQGARGGEGAQGARGEPGAAGPAGAAGPAGNPGSDGQPGAKGAPGAPGISGAPGFPGARGAAGPQGPTGAPGPKGNSGEPGAPGNKGDAGAKGEAGPAGVQGPSGPPGEEGKRGSRGEPGPSGLPGPAGERGAPGSRGFPGADGIAGPKGPPGERGAPGPAGPKGSTGEAGRTGEPGLPGAKGLTGSPGSPGADGKTGPAGPAGQDGRPGPAGPPGARGQAGVMGFPGPKGSAGEPGKPGERGAPGATGANGAPGKDGDAGAQGPPGPAGPAGERGEQGPSGAPGFQGLPGPAGAPGESGKPGEQGVPGDVGAPGPSGARGERGFPGERGAQGPAGPSGPRGANGAPGNDGAKGDAGAPGAPGGQGPPGLQGMPGERGAAGLPGAKGDRGDPGAKGTDGSPGKDGPRGLTGPIGPPGPAGSPGDKGESGPSGPAGPTGARGAPGDRGEPGPAGPAGFAGPPGTDGQPGAKGEPGDAGAKGDAGPPGPAGATGPAGPAGPVGAPGPKGARGNAGPPGATGFPGAAGRVGPPGPSGNIGLPGPPGPSGKEGSKGPRGETGPAGRPGEVGPAGPPGPSGEKGSPGADGPAGAPGTPGPQGIAGQRGVVGLPGQRGERGFPGLPGPTGEPGKQGPSGASGERGPPGPSGPPGLAGPPGEAGREGSPGAEGAPGRDGAAGPKGDRGESGPAGAPGAPGAPGAPGPVGPAGKNGERGESGPAGPAGPAGPSGARGPAGAQGPRGDKGETGEQGDRGMKGHRGFSGLQGPPGPPGSPGEQGPSGSSGPAGPRGPPGSAGSSGKDGLNGLPGPIGPPGPRGRTGDVGPAGPAGPPGPPGPPGPPSGGFDFSFLPQPPQEKAHDGGR'},
    'COL1A2-Bos_tau': {'species': 'Bos taurus','offset': -4, 'sequence': 'GGGPGPMGLMGPRGPPGASGAPGPQGFQGPPGEPGEPGQTGPAGARGPPGPPGKAGEDGHPGKPGRPGERGVVGPQGARGFPGTPGLPGFKGIRGHNGLDGLKGQPGAPGVKGEPGAPGENGTPGQTGARGLPGERGRVGAPGPAGARGSDGSVGPVGPAGPIGSAGPPGFPGAPGPKGELGPVGNPGPAGPAGPRGEVGLPGLSGPVGPPGNPGANGLPGAKGAAGLPGVAGAPGLPGPRGIPGPVGAAGATGARGLVGEPGPAGSKGESGNKGEPGAVGQPGPPGPSGEEGKRGSTGEIGPAGPPGPPGLRGNPGSRGLPGADGRAGVMGPAGSRGATGPAGVRGPNGDSGRPGEPGLMGPRGFPGSPGNIGPAGKEGPVGLPGIDGRPGPIGPAGARGEPGNIGFPGPKGPSGDPGKAGEKGHAGLAGARGAPGPDGNNGAQGPPGLQGVQGGKGEQGPAGPPGFQGLPGPAGTAGEAGKPGERGIPGEFGLPGPAGARGERGPPGESGAAGPTGPIGSRGPSGPPGPDGNKGEPGVVGAPGTAGPSGPSGLPGERGAAGIPGGKGEKGETGLRGDIGSPGRDGARGAPGAIGAPGPAGANGDRGEAGPAGPAGPAGPRGSPGERGEVGPAGPNGFAGPAGAAGQPGAKGERGTKGPKGENGPVGPTGPVGAAGPSGPNGPPGPAGSRGDGGPPGATGFPGAAGRTGPPGPSGISGPPGPPGPAGKEGLRGPRGDQGPVGRSGETGASGPPGFVGEKGPSGEPGTAGPPGTPGPQGLLGAPGFLGLPGSRGERGLPGVAGSVGEPGPLGIAGPPGARGPPGNVGNPGVNGAPGEAGRDGNPGNDGPPGRDGQPGHKGERGYPGNAGPVGAAGAPGPQGPVGPVGKHGNRGEPGPAGAVGPAGAVGPRGPSGPQGIRGDKGEPGDKGPRGLPGLKGHNGLQGLPGLAGHHGDQGAPGAVGPAGPRGPAGPSGPAGKDGRIGQPGAVGPAGIRGSQGSQGPAGPPGPPGPPGPPGPSGGGYEFGFDGDFYR'},
    'COL1A2-Lox_afr': {'species': 'Loxodonta africana','offset': -5, 'sequence': 'KGIGLGPGPMGLMGPRGPPGATGPPGSPGFQGPPGEPGEPGQTGPAGSRGPAGPPGKAGEDGHPGKPGRPGERGVVGPQGARGFPGTPGLPGFKGIRGHNGLDGLKGQPGAPGVKGEPGAPGENGTPGQIGARGLPGERGRVGGPGPAGARGSDGSVGPVGPAGPIGSAGPPGFPGAPGPKGEIGPVGNPGPSGPAGPRGEAGLPGVSGPVGPPGNPGANGLAGAKGAAGLPGVAGAPGLPGPRGIPGPVGAAGATGARGLVGEPGPAGSKGESGSKGEPGSAGPQGPPGPSGEEGKRGSSGEAGSAGPAGPPGLRGGPGSRGLPGADGRAGVMGPPGSRGASGPAGVRGPSGDSGRPGEPGVMGPRGLPGSPGNVGPAGKEGPAGLPGIDGRPGPIGPAGARGEPGNIGFPGPKGPAGDPGKNGDKGHAGLAGPRGAPGPDGNNGAQGPPGLQGVQGGKGEQGPAGPPGFQGLPGPSGTAGEAGKPGERGIPGEFGLPGPAGPRGERGPPGQSGAAGPTGPIGSRGPSGPPGPDGNKGEPGVVGAPGTAGPSGPGGLPGERGAAGIPGGKGEKGETGLRGDTGNTGRDGARGAPGAVGAPGPAGATGDRGEAGPAGSAGPAGPRGSPGERGEVGPAGPNGFAGPAGAAGQAGAKGERGTKGPKGENGPVGPTGPVGAAGPAGPNGPPGPAGSRGDGGPPGATGFPGAAGRTGPPGPAGITGPPGPPGAAGKEGLRGPRGDQGPVGRTGETGASGPPGFAGEKGSSGEPGTAGPPGTPGPQGILGPPGILGLPGSRGERGLPGVAGAVGEPGPLGIAGPPGARGPPGAVGSPGVNGAPGEAGRDGNPGSDGPPGRDGLPGHKGERGYPGNAGPVGTAGAPGPQGPLGPAGKHGNRGEPGPAGSVGPVGAVGPRGPSGPQGARGDKGEAGDKGPRGLPGFKGHNGLQGLPGLAGQHGDQGSPGSVGPAGPRGPAGPSGPVGKDGRPGHAGAVGPAGVRGSQGSQGPSGPPGPPGPPGPPGPSGGGYDFGYDGDFYRA'},
    'COL1A2-Chk_COL1A2': {'species': 'Gallus gallus','offset': -4, 'sequence': 'AADFGPGPMGLMGPRGPPGASGPPGPPGFQGVPGEPGEPGQTGPQGPRGPPGPPGKAGEDGHPGKPGRPGERGVAGPQGARGFPGTPGLPGFKGIRGHNGLDGQKGQPGTPGTKGEPGAPGENGTPGQPGARGLPGERGRIGAPGPAGARGSDGSAGPTGPAGPIGAAGPPGFPGAPGAKGEIGPAGNVGPTGPAGPRGEIGLPGSSGPVGPPGNPGANGLPGAKGAAGLPGVAGAPGLPGPRGIPGPPGPAGPSGARGLVGEPGPAGAKGESGNKGEPGAAGPPGPPGPSGEEGKRGSNGEPGSAGPPGPAGLRGVPGSRGLPGADGRAGVMGPAGNRGASGPVGAKGPNGDAGRPGEPGLMGPRGLPGQPGSPGPAGKEGPVGFPGADGRVGPIGPAGNRGEPGNIGFPGPKGPTGEPGKPGEKGNVGLAGPRGAPGPEGNNGAQGPPGVTGNQGAKGETGPAGPPGFQGLPGPSGPAGEAGKPGERGLHGEFGVPGPAGPRGERGLPGESGAVGPAGPIGSRGPSGPPGPDGNKGEPGNVGPAGAPGPAGPGGIPGERGVAGVPGGKGEKGAPGLRGDTGATGRDGARGLPGAIGAPGPAGGAGDRGEGGPAGPAGPAGARGIPGERGEPGPVGPSGFAGPPGAAGQPGAKGERGPKGPKGETGPTGAIGPIGASGPPGPVGAAGPAGPRGDAGPPGMTGFPGAAGRVGPPGPAGITGPPGPPGPAGKDGPRGLRGDVGPVGRTGEQGIAGPPGFAGEKGPSGEAGAAGPPGTPGPQGILGAPGILGLPGSRGERGLPGIAGATGEPGPLGVSGPPGARGPSGPVGSPGPNGAPGEAGRDGNPGNDGPPGRDGAPGFKGERGAPGNPGPSGALGAPGPHGQVGPSGKPGNRGDPGPVGPVGPAGAFGPRGLAGPQGPRGEKGEPGDKGHRGLPGLKGHNGLQGLPGLAGQHGDQGPPGNNGPAGPRGPPGPSGPPGKDGRNGLPGPIGPAGVRGSHGSQGPAGPPGPPGPPGPPGPNGGGYEVGFDAEYYR'},
#    'COL1A2-Gal_gal': {'species': 'Gallus gallus','offset': -2, 'sequence': 'AGSVGMGYPPQPISGFPGPPGPSGPPGPPGHAGPPGSNGYQGPPGEPGQPGPSGPPGPAGMIGPAGPPGKDGEPGRPGRNGDRGIPGLPGHKGHPGMPGMPGMKGARGFDGKDGAKGDSGAPGPKGEAGQPGANGSPGQPGPRGPTGERGRPGNPGGPGAHGKDGAPGAAGPPGPPGPPGTAGFPGSPGFKGEAGPPGPAGASGSPGERGEPGPQGQAGPPGPQGPPGRAGSPGNKGEMGPSGIPGAPGLPGGRGLPGPPGTSGNPGAKGTPGEPGKNGAKGDPGPKGERGENGTPGAPGPPGEEGKRGANGEPGQNGVPGTPGERGSPGFRGLPGSNGLPGEKGPAGERGSPGPPGPSGPAGDRGQDGGPGLPGMRGLPGIPGSPGSDGKPGPPGNQGEPGRSGPPGPAGPRGQPGVMGFPGPKGNEGAPGKNGERGPGGPPGTPGPAGKNGDVGLPGPPGPAGPAGDRGEPGPSGSPGLQGLPGGPGPAGENGKPGEPGPKGDIGGPGFPGPKGENGIPGERGAQGPPGPTGARGGPGPAGSEGAKGPPGPPGAPGGTGLPGLQGMPGERGASGSPGPKGDKGEPGGKGADGLPGARGERGNVGPIGPPGPAGPPGDKGETGPAGAPGPAGSRGGPGERGEQGLPGPAGFPGAPGQNGEPGGKGERGPPGLRGEAGPPGAAGPQGGPGAPGPPGPQGVKGERGSPGGPGAAGFPGARGLPGPPGNNGSPGPPGNAGPPGKDGPPGPPGNTGPPGGSGPPGLRGEPGAPGEKGPPGARGERGTPGDPGPQGIIGSRGSTGLPGPRGLPGPAGMAGGKGEDGKPGVNGVPGERGAPGPQGPMGQRGLPGEPGRDGNPGSDGSPGRDGSPGGKGDRGESGPPGVPGPPGHPGPAGNNGAPGKAGERGFQGPPGPPGSAGPAGARGPAGPQGPRGDKGETGERGSAGIKGHRGFPGTPGLPGPPGPLGPQGAIGSPGASGARGPPGPAGPPGKDGRGGYPGPIGPPGPRGNRGESGPAGPPGQPGLPGPSGPPGPCCGGGVASLGAGEK'},
    'COL1A2-Ayf_COL1A2': {'species': 'Ayf xxx','offset': -0, 'sequence': ' GPMGLMGPRGPPGASGPPGPPGFQGVPGEPGEPGQTGPQGPRGPPGPPGKAGEDGHPGKPGRPGERGVAGPQGARGFPGTPGLPGFKGIRGHNGLDGQKGQPGTPGTKGEPGAPGENGTPGQPGARGLPGERGRVGAPGPAGARGSDGSAGPTGPAGPIGAAGPPGFPGAPGAKGEIGPAGNVGPTGPAGPRGEIGLPGSSGPVGPPGNPGANGLPGAKGAAGLPGVAGAPGLPGPRGIPGPPGPAGPSGARGLVGEPGPAGAKGESGNKGEPGAAGPPGPPGPSGEEGKRGSNGEPGSAGPPGPAGLRGVPGSRGLPGADGRAGVMGPAGNRGASGPVGAKGPNGDGGRPGEPGLMGPRGLPGQPGSPGPAGKEGPVGFPGADGRVGPIGPAGNRGEPGNIGFPGPKGPTGEPGKPGEKGNVGLAGPRGAPGPEGNNGAQGPPGVTGNQGAKGEQGPAGPPGFQGLPGPSGPAGEAGKPGERGLHGEFGVPGPAGPRGERGPPGESGAVGPAGPIGSRGPSGPPGPDGNKGEPGNVGAAGGPGPAGPGGIPGERGVAGVPGGKGEKGAPGLRGDTGATGRDGARGLPGAIGAPGPAGGAGDRGEGGPAGPAGPAGARGIPGERGEPGPVGPSGFAGPPGAAGQPGAKGERGPKGPKGETGPTGAIGPIGASGPPGPAGAAGPAGPRGDAGPPGMTGFPGAAGRVGPPGPAGITGPPGPPGPAGKDGARGLRGDVGPVGRTGEQGIAGPPGFAGEKGPSGEAGAPGPPGTPGPQGILGAPGILGLPGSRGERGLPGISGATGEPGPLGVSGPPGARGPSGPVGSPGPNGAPGEAGRDGNPGNDGPPGRDGAPGFKGERGAPGSPGPSGALGAPGPHGQVGPAGKPGNRGDPGPAGHVGPAGAFGPRGLAGPQGPRGEKGEPGDKGHRGLPGLKGHNGLQGLPGLAGQHGDQGPPGNNGPAGPRGPPGPSGPPGKDGRNGLPGPIGPAGARGSHGSQGPAGPPGPPGPPGPPGPNGGGYEVGYDAEYYRADQPSLRPK'},
    'COL1A2-Com_COL1A2': {'species': 'Com xxx','offset': -0, 'sequence': ' GPMGLMGPRGPPGASGPPGPPGFQGLPGEPGEPGQTGPQGPRGPPGPPGKAGEDGHPGKPGRPGERGVAGPQGARGFPGTPGLPGFKGIRGHNGLDGQKGQPGTPGSKGEPGAPGENGTPGQPGARGLPGERGRIGAPGPAGARGSDGSTGPTGPAGPIGAAGPPGFPGAPGAKGEIGPAGNVGPTGPAGPRGEIGLPGSSGPVGPPGNPGANGLPGAKGAAGLPGVAGAPGLPGPRGILGPPGPAGPSGARGLVGEPGPAGAKGESGMKGEPGAAGPAGPPGPSGEEGKRGSPGEPGSAGPPGPAGLRGVPGSRGLPGADGRAGVMGPAGNRGASGPVGAKGPSGDAGRPGEPGLMGPRGLPGQPGSPGPAGKEGPVGFPGADGRVGPIGPAGNRGEPGNIGFPGPKGPTGEPGKPGEKGNVGLAGPRGAPGPEGNNGAQGPPGVTGNPGGKGETGPAGPPGFQGLPGPSGPAGEAGKPGERGLHGEFGVPGPAGPRGERGPPGESGAVGPAGAIGSRGPSGPPGPDGNKGEPGNVGAAGAPGPAGPGGIPGERGVAGVPGGKGEKGAPGLRGDTGATGRDGARGLPGAIGAPGPAGGAGDRGEGGPAGPAGPAGARGIPGERGEPGPVGPNGFAGPPGAAGQPGAKGERGPKGPKGESGPTGAIGPIGASGPPGPVGAAGPAGPRGDAGPPGMTGFPGAAGRVGPPGPAGITGPPGPPGPAGKDGPRGLRGDVGPVGRTGEQGIAGPPGFAGEKGPSGEAGAAGPPGTPGPQGILGAPGILGLPGSRGERGLPGISGATGEPGPLGVSGPPGARGPSGPVGSPGPNGAPGEAGRDGNPGNDGPPGRDGAPGFKGERGAPGSPGPSGALGAPGPHGQVGPSGKPGNRGEPGPAGAVGPAGAFGPRGLAGPQGPRGEKGLPGDKGPRGLPGLKGHNGLQGLPGLAGQHGDQGPPGNTGPAGPRGPPGPSGPPGKDGRNGLPGPIGPAGVRGSHGSQGPAGPPGPPGPPGPPGPNGGGYEVGFDAEYYRADQPSLRPK'},
    'COL1A2-Stc_COL1A2': {'species': 'Struthio camelus','offset': -0, 'sequence': ' GPMGLMGPRGPPGASGPPGPPGFQGLPGEPGEPGQTGPQGPRGPPGPPGKAGEDGHPGKPGRPGERGVAGPQGARGFPGTPGLPGFKGIRGHNGLDGQKGQPGTPGSKGEPGAPGENGTPGQPGARGLPGERGRIGAPGPAGARGSDGSTGPTGPAGPIGAAGPPGFPGAPGAKGEIGPAGNVGPTGPAGPRGEIGLPGSSGPVGPPGNPGANGLPGAKGAAGLPGVAGAPGLPGPRGILGPPGPAGPSGARGLVGEPGPAGAKGESGMKGEPGAAGPAGPPGPSGEEGKRGSPGEPGSAGPPGPAGLRGVPGSRGLPGADGRAGVMGPAGNRGASGPVGAKGPSGDAGRPGEPGLMGPRGLPGQPGSPGPAGKEGPVGFPGADGRVGPIGPAGNRGEPGNIGFPGPKGPTGEPGKPGEKGNVGLAGPRGAPGPEGNNGAQGPPGVTGNPGGKGETGPAGPPGFQGLPGPSGPAGEAGKPGERGLHGEFGVPGPAGPRGERGPPGESGAVGPAGAIGSRGPSGPPGPDGNKGEPGNVGAAGAPGPAGPGGIPGERGVAGVPGGKGEKGAPGLRGDTGATGRDGARGLPGAIGAPGPAGGAGDRGEGGPAGPAGPAGARGIPGERGEPGPVGPNGFAGPPGAAGQPGAKGERGPKGPKGESGPTGAIGPIGASGPPGPVGAAGPAGPRGDAGPPGMTGFPGAAGRVGPPGPAGITGPPGPPGPAGKDGPRGLRGDVGPVGRTGEQGIAGPPGFAGEKGPSGEAGAAGPPGTPGPQGILGAPGILGLPGSRGERGLPGISGATGEPGPLGVSGPPGARGPSGPVGSPGPNGAPGEAGRDGNPGNDGPPGRDGAPGFKGERGAPGSPGPSGALGAPGPHGQVGPSGKPGNRGEPGPAGAVGPAGAFGPRGLAGPQGPRGEKGLPGDKGPRGLPGLKGHNGLQGLPGLAGQHGDQGPPGNTGPAGPRGPPGPSGPPGKDGRNGLPGPIGPAGVRGSHGSQGPAGPPGPPGPPGPPGPNGGGYEVGFDAEYYRADQPSLRPK'},
    'COL1A2-Hip_amp':  {'species': 'Hippopotamus amphibius','offset': 0, 'sequence': 'GPMGIMGPRGPPGASGAPGPQGFQGPPGEPGEPGQTGPAGARGPPGPPGKAGEDGHPGKPGRSGERGVVGPQGARGFPGTPGIPGFKGIRGHNGIDGIKGQPGAPGVKGEPGAP---------GARGIPGERGRVGAPGPAGARGSDGSVGPVGPAGPIGSAGPPGFPGAPGPKGEIGPVGSPGASGPAGPRGEVGIPGVSGPVGPPGN----------GAAGIPGVAGAPGIPGPRGIPGPVGAAGATGARGIVGEPGPAGSK------------GPQGIPGPSGEEGKRGSTGEIGPAGPPGPPGIRGSPGSRGIPGADGRAGVMGIPGSRGATGPAGVRG------------------FPGSPGNIGPAGKEGPVGIPGIDGRPGPTGPAGAR----NIGFPGPKGPTGD---NGDKGHAGIAGARGAPGPDGNNGAQGPP--Q---GGKGEQGPAGPPGFQGIPGPAGTAGEAGKPGERGIPGEFGIPGPAGPRGERGPPGESGAAGPTGPIGNRGPSGPAGPDGNKGEPGVVGAPGTAGPSGPSGIPGERGAAGIPGPKGEKGEPGIR-------RDGARGAPGAVGAPGPAGANGDRGEAGPAGPAGPAGPRGSPGERGEVGPAGPNGFAGPAGAAGQPGAKGER---GTRGDGGPPGATGFPGAAGR------------------------------TGPPGPSGISGPPGPPGPAGKEGIRGPRGDQGPVGRSGETGASGPPGFAGEK-------------TPGPQGIIGAPGFIGIPGSRGERGIPGVAGSVGEPGPIGIAGPPGARGPPGAVGNPGVNGAPGEAGRDGNPGSDGPPGR----GHKGERGYP-------GAGAPGPQGPVGPTGKHGNRGEPGPAGVVGPTGAVGPRGPSGPQGIRGDKGEPGDKGPRGIPGIKGHNGIQGIPGIAGHHGDQGAPGSVGPAGPRGPAGPSGPVGKDGRTGHPGAVGPAGIRGSQGPAGPPGPPGPPGPPGPS'},
    'COL1A2-Equ_asi':  {'species': 'Equus asinus','offset': -7, 'sequence': 'QFDAKGGGPGPMGLMGPRGPPGASGAPGPQGFQGPAGEPGEPGQTGPAGARGPPGPPGKAGEDGHPGKPGRPGERGVVGPQGARGFPGTPGLPGFKGIRGHKGLDGLKGQPGAPGVKGEPGAPGENGTPGQAGARGLPGERGRVGAPGPAGARGSDGSVGPVGPAGPIGSAGPPGFPGAPGPKGELGPVGNPGPAGPAGPRGEVGLPGLSGPVGPPGNPGANGLTGAKGAAGLPGVAGAPGLPGPRGIPGPAGAAGATGARGLVGEPGPAGSKGESGNKGEPGAAGPQGPPGPSGEEGKRGPNGEPGSTGPAGPPGLRGSPGSRGLPGADGRAGVMGPAGSRGATGPAGVRGPNGDSGRPGEPGLMGPRGFPGSPGNIGPAGKEGPVGLPGIDGRPGPIGPAGARGEPGNIGFPGPKGPTGEPGKPGDKGHAGLAGARGAPGPDGNNGAQGPPGPQGVQGGKGEQGPAGPPGFQGLPGPAGTAGEVGKPGERGLPGEFGLPGPAGARGERGPPGESGAAGPAGPIGSRGPSGPPGPDGNKGEPGVLGAPGTAGPSGPSGLPGERGAAGIPGGKGEKGETGLRGEIGNPGRDGARGAPGAVGAPGPAGANGDRGEAGAAGPAGPAGPRGSPGERGEVGPAGPNGFAGPAGAAGQPGAKGERGTKGPKGENGPVGPTGPVGAAGPSGPNGPPGPAGSRGDGGPPGVTGFPGAAGRTGPPGPSGISGPPGPPGAAGKEGLRGPRGDQGPVGRAGETGASGPPGFAGEKGPSGEPGTAGPPGTPGPQGLLGAPGILGLPGSRGERGLPGVAGSLGEPGPLGIAGPPGARGPPGAVGAPGVNGAPGEAGRDGNPGSDGPPGRDGQPGHKGERGYPGNAGPVGAVGAPGPHGPVGPTGKHGNRGEPGPVGSVGPVGAVGPRGPSGPQGVRGDKGEPGDKGPRGLPGIKGHNGLQGLPGLAGQHGDQGAPGSVGPAGPRGPAGPTGPVGKDGRSGQPGTVGPAGVRGSQGSQGPAGPPGPPGPPGPPGPSGGGYDFGYDGDFYRA'},
    'COL1A2-EquxPrez': {'species': 'EquusxPrezwal','offset': -7, 'sequence': 'QFDAKGGGPGPMGLMGPRGPPGASGAPGPQGFQGPAGEPGEPGQTGPAGARGPPGPPGKAGEDGHPGKPGRPGERGVVGPQGARGFPGTPGLPGFKGIRGHNGLDGLKGQPGAPGVKGEPGAPGENGTPGQAGARGLPGERGRVGAPGPAGARGSDGSVGPVGPAGPIGSAGPPGFPGAPGPKGELGPVGNPGPAGPAGPRGEVGLPGLSGPVGPPGNPGANGLTGAKGAAGLPGVAGAPGLPGPRGIPGPAGAAGATGARGLVGEPGPAGSKGESGNKGEPGAAGPQGPPGPSGEEGKRGPNGEPGSTGPAGPPGLRGSPGSRGLPGADGRAGVMGPAGSRGASGPAGVRGPNGDSGRPGEPGLMGPRGFPGSPGNIGPAGKEGPVGLPGIDGRPGPIGPAGARGEPGNIGFPGPKGPSGEPGKPGDKGHAGLAGARGAPGPDGNNGAQGPPGPQGVQGGKGEQGPAGPPGFQGLPGPAGTAGEVGKPGERGLPGEFGLPGPAGARGERGPPGESGAAGPAGPIGSRGPSGPPGPDGNKGEPGVLGAPGTAGPSGPSGLPGERGAAGIPGGKGEKGETGLRGEIGNPGRDGARGAPGAVGAPGPAGANGDRGEAGAAGPAGPAGPRGSPGERGEVGPAGPNGFAGPAGAAGQPGAKGERGTKGPKGENGPVGPTGPVGAAGPSGPNGPPGPAGSRGDGGPPGVTGFPGAAGRTGPPGPSGISGPPGPPGAAGKEGLRGPRGDQGPVGRAGETGASGPPGFAGEKGPSGEPGTAGPPGTPGPQGLLGAPGILGLPGSRGERGLPGVAGSLGEPGPLGIAGPPGARGPPGAVGAPGVNGAPGEAGRDGNPGSDGPPGRDGQPGHKGERGYPGNAGPVGAVGAPGPHGPVGPTGKHGHRGEPGPVGSVGPVGAVGPRGPSGPQGVRGDKGEPGDKGPRGLPGIKGHNGLQGLPGLAGQHGDQGAPGSVGPAGPRGPAGPTGPVGKDGRSGQPGTVGPAGVRGSQGSQGPAGPPGPPGPPGPPGPSGGGYDFGYDGDFYRA'},
    'COL1A2-Bub_Bub': {'species': 'Bubalus bubalis','offset': -7, 'sequence': 'QFDAKGGGPGPMGLMGPRGPPGASGAPGPQGFQGPPGEPGEPGQTGPAGARGPPGPPGKAGEDGHPGKPGRPGERGVVGPQGARGFPGTPGLPGFKGIRGHNGLDGLKGQPGAPGVKGEPGAPGENGTPGQTGARGLPGERGRVGAPGPAGARGSDGSVGPVGPAGPIGPAGPPGFPGAPGPKGELGPVGNPGPAGPAGPRGEVGLPGLSGPVGPPGNPGANGLPGAKGAAGLPGVAGAPGLPGPRGIPGPVGAAGATGARGLVGEPGPAGSKGESGNKGEPGAVGQPGPPGPSGEEGKRGSTGEIGPAGPPGPPGLRGNPGSRGLPGADGRAGVMGPAGSRGATGPAGVRGPSGDSGRPGEPGLMGPRGFPGSPGNIGPAGKEGPVGLPGIDGRPGPIGPAGARGEPGNIGFPGPKGPTGDPGKAGEKGHAGLAGARGAPGPDGNNGAQGPPGLQGVQGGKGEQGPAGPPGFQGLPGPAGTAGEAGKPGERGIPGEFGLPGPAGVRGERGPPGESGAAGPTGPIGSRGPSGPPGPDGNKGEPGVVGAPGTAGPSGPSGLPGERGAAGIPGGKGEKGETGLRGDIGSPGRDGARGAPGAVGAPGPAGANGDRGEAGPAGPAGPAGPRGSPGERGEVGPAGPNGFAGPAGAAGQPGAKGERGTKGPKGENGPVGPTGPVGAAGPSGPNGPPGPAGSRGDGGPPGATGFPGAAGRTGPPGPAGISGPPGPPGPAGKEGLRGPRGDQGPVGRTGETGASGPPGFVGEKGPSGEPGTAGPPGTPGPQGLLGLPGFLGLPGSRGERGLPGVAGSVGEPGPLGIAGPPGARGPPGNVGNPGVNGAPGEAGRDGNPGNDGPPGRDGQPGHKGERGYPGNAGPVGAAGAPGPQGPVGPVGKHGNRGEPGPAGAVGPAGAVGPRGPSGPQGIRGDKGEPGDKGPRGLPGLKGHNGLQGLPGLAGHHGDQGAPGSVGPAGPRGPAGPSGPAGKDGRIGQPGAVGPAGIRGSQGSQGPAGPPGPPGPPGPPGPSGGGYDFGFDGDFYRA'},
    'COL1A2-Bos_ind': {'species': 'Bos indicus','offset': -7, 'sequence': 'QFDAKGGGPGPMGLMGPRGPPGASGAPGPQGFQGPPGEPGEPGQTGPAGARGPPGPPGKAGEDGHPGKPGRPGERGVVGPQGARGFPGTPGLPGFKGIRGHNGLDGLKGQPGAPGVKGEPGAPGENGTPGQTGARGLPGERGRVGAPGPAGARGSDGSVGPVGPAGPIGSAGPPGFPGAPGPKGELGPVGNPGPAGPAGPRGEVGLPGLSGPVGPPGNPGANGLPGAKGAAGLPGVAGAPGLPGPRGIPGPVGAAGATGARGLVGEPGPAGSKGESGNKGEPGAVGQPGPPGPSGEEGKRGSTGEIGPAGPPGPPGLRGNPGSRGLPGADGRAGVMGPAGSRGATGPAGVRGPNGDSGRPGEPGLMGPRGFPGSPGNIGPAGKEGPVGLPGIDGRPGPIGPAGARGEPGNIGFPGPKGPSGDPGKAGEKGHAGLAGARGAPGPDGNNGAQGPPGLQGVQGGKGEQGPAGPPGFQGLPGPAGTAGEAGKPGERGIPGEFGLPGPAGARGERGPPGESGAAGPTGPIGSRGPSGPPGPDGNKGEPGVVGAPGTAGPSGPSGLPGERGAAGIPGGKGEKGETGLRGDIGSPGRDGARGAPGAIGAPGPAGANGDRGEAGPAGPAGPAGPRGSPGERGEVGPAGPNGFAGPAGAAGQPGAKGERGTKGPKGENGPVGPTGPVGAAGPSGPNGPPGPAGSRGDGGPPGATGFPGAAGRTGPPGPSGISGPPGPPGPAGKEGLRGPRGDQGPVGRSGETGASGPPGFVGEKGPSGEPGTAGPPGTPGPQGLLGAPGFLGLPGSRGERGLPGVAGSVGEPGPLGIAGPPGARGPPGNVGNPGVNGAPGEAGRDGNPGNDGPPGRDGQPGHKGERGYPGNAGPVGAAGAPGPQGPVGPXGKHGNRGEPGPAGAVGPAGAVGPRGPSGPQGIRGDKGEPGDKGPRGLPGLKGHNGLQGLPGLAGHHGDQGAPGAVGPAGPRGPAGPSGPAGKDGRIGQPGAVGPAGIRGSQGSQGPAGPPGPPGPPGPPGPSGGGYEFGFDGDFYRA'},
    'COL1A2-Bis_bis': {'species': 'Bison bison','offset': -7, 'sequence': 'QFDAKGGGPGPMGLMGPRGPPGASGAPGPQGFQGPPGEPGEPGQTGPAGARGPPGPPGKAGEDGHPGKPGRPGERGVVGPQGARGFPGTPGLPGFKGIRGHNGLDGLKGQPGAPGVKGEPGAPGENGTPGQTGARGLPGERGRVGAPGPAGARGSDGSVGPVGPAGPIGSAGPPGFPGAPGPKGELGPVGNPGPAGPAGPRGEVGLPGLSGPVGPPGNPGANGLPGAKGAAGLPGVAGAPGLPGPRGIPGPVGAAGATGARGLVGEPGPAGSKGESGNKGEPGAVGQPGPPGPSGEEGKRGSTGEIGPAGPPGPPGLRGNPGSRGLPGADGRAGVMGPAGSRGATGPAGVRGPNGDSGRPGEPGLMGPRGFPGSPGNIGPAGKEGPVGLPGIDGRPGPIGPAGARGEPGNIGFPGPKGPSGDPGKAGEKGHAGLAGARGAPGPDGNNGAQGPPGLQGVQGGKGEQGPAGPPGFQGLPGPAGTAGEAGKPGERGIPGEFGLPGPAGARGERGPPGESGAAGPTGPIGSRGPSGPPGPDGNKGEPGVVGAPGTAGPSGPSGLPGERGAAGIPGGKGEKGETGLRGDIGSPGRDGARGAPGAIGAPGPAGANGDRGEAGPAGPAGPAGPRGSPGERGEVGPAGPNGFAGPAGAAGQPGAKGERGTKGPKGENGPVGPTGPVGAAGPSGPNGPPGPAGSRGDGGPPGATGFPGAAGRTGPPGPSGISGPPGPPGPAGKEGLRGPRGDQGPVGRSGETGASGPPGFVGEKGPSGEPGTAGPPGTPGPQGLLGAPGFLGLPGSRGERGLPGVAGSVGEPGPLGIAGPPGARGPPGNVGNPGVNGAPGEAGRDGNPGNDGPPGRDGQPGHKGERGYPGNAGPVGAAGAPGPQGPVGPVGKHGNRGEPGPAGAVGPAGAVGPRGPSGPQGIRGDKGEPGDKGPRGLPGLKGHNGLQGLPGLAGHHGDQGAPGAVGPAGPRGPAGPSGPAGKDGRIGQPGAVGPAGIRGSQGSQGPAGPPGPPGPPGPPGPSGGGYEFGFDGDFYRA'},
    'COL1A2-Bosxbos': {'species': 'BosxBos','offset': -7, 'sequence': 'QFDAKGGGPGPMGLMGPRGPPGASGAPGPQGFQGPPGEPGEPGQTGPAGARGPPGPPGKAGEDGHPGKPGRPGERGVVGPQGARGFPGTPGLPGFKGIRGHNGLDGLKGQPGAPGVKGEPGAPGENGTPGQTGARGLPGERGRVGAPGPAGARGSDGSVGPVGPAGPIGSAGPPGFPGAPGPKGELGPVGNPGPAGPAGPRGEVGLPGLSGPVGPPGNPGANGLPGAKGAAGLPGVAGAPGLPGPRGIPGPVGAAGATGARGLVGEPGPAGSKGESGNKGEPGAVGQPGPPGPSGEEGKRGSTGEIGPAGPPGPPGLRGNPGSRGLPGADGRAGVMGPAGSRGATGPAGVRGPNGDSGRPGEPGLMGPRGFPGSPGNIGPAGKEGPVGLPGIDGRPGPIGPAGARGEPGNIGFPGPKGPSGDPGKAGEKGHAGLAGARGAPGPDGNNGAQGPPGLQGVQGGKGEQGPAGPPGFQGLPGPAGTAGEAGKPGERGIPGEFGLPGPAGARGERGPPGESGAAGPTGPIGSRGPSGPPGPDGNKGEPGVVGAPGTAGPSGPSGLPGERGAAGIPGGKGEKGETGLRGDIGSPGRDGARGAPGAIGAPGPAGANGDRGEAGPAGPAGPAGPRGSPGERGEVGPAGPNGFAGPAGAAGQPGAKGERGTKGPKGENGPVGPTGPVGAAGPSGPNGPPGPAGSRGDGGPPGATGFPGAAGRTGPPGPSGISGPPGPPGPAGKEGLRGPRGDQGPVGRSGETGASGPPGFVGEKGPSGEPGTAGPPGTPGPQGLLGAPGFLGLPGSRGERGLPGVAGSVGEPGPLGIAGPPGARGPPGNVGNPGVNGAPGEAGRDGNPGNDGPPGRDGQPGHKGERGYPGNAGPVGAAGAPGPQGPVGPVGKHGNRGEPGPAGAVGPAGAVGPRGPSGPQGIRGDKGEPGDKGPRGLPGLKGHNGLQGLPGLAGHHGDQGAPGAVGPAGPRGPAGPSGPAGKDGRIGQPGAVGPAGIRGSQGSQGPAGPPGPPGPPGPPGPSGGGYEFGFDGDFYRA'},
    'COL1A2-Cap_hir': {'species': 'Capra hircus','offset': -7, 'sequence': 'QFDGKGGGPGPMGLMGPRGPPGASGAPGPQGFQGPPGEPGEPGQTGPAGARGPPGPPGKAGEDGHPGKPGRPGERGVVGPQGARGFPGTPGLPGFKGIRGHNGLDGLKGQPGAPGVKGEPGAPGENGTPGQTGARGLPGERGRVGAPGPAGARGSDGSVGPVGPAGPIGSAGPPGFPGAPGPKGELGPVGNPGPAGPAGPRGEVGLPGLSGPVGPPGNPGANGLPGAKGAAGLPGVAGAPGLPGPRGIPGPVGAAGATGARGLVGEPGPAGSKGESGNKGEPGAVGQPGPPGPSGEEGKRGSTGEIGPAGPPGPPGLRGNPGSRGLPGADGRAGVMGPAGSRGATGPAGVRGPNGDSGRPGEPGLMGPRGFPGSPGNIGPAGKEGPAGLPGIDGRPGPIGPAGARGEPGNIGFPGPKGPTGDPGKAGEKGHAGLAGPRGAPGPDGNNGAQGPPGLQGVQGGKGEQGPAGPPGFQGLPGPAGTAGEAGKPGERGIPGEFGLPGPAGARGERGPPGESGAAGPTGPIGSRGPSGPPGPDGNKGEPGVVGAPGTAGPSGPSGLPGERGAAGIPGGKGEKGETGLRGDVGSPGRDGARGAPGAVGAPGPAGANGDRGEAGPAGPAGPAGPRGSPGERGEVGPAGPNGFAGPAGAAGQPGAKGERGTKGPKGENGPVGPTGPVGAAGPSGPNGPPGPAGSRGDGGPPGATGFPGAAGRTGPPGPAGISGPPGPPGPAGKEGLRGPRGDQGPVGRTGEPGAAGPPGFVGEKGPSGEPGTAGPPGTPGPQGFLGPPGFLGLPGSRGERGLPGVAGSVGEPGPLGIAGPPGARGPPGNVGNPGVNGAPGEAGRDGNPGNDGPPGRDGQPGHKGERGYPGNAGPVGAAGAPGPQGPVGPTGKHGSRGEPGPVGAVGPAGAVGPRGPSGPQGIRGDKGEPGDKGPRGLPGLKGHNGLQGLPGLAGHHGDQGAPGAVGPAGPRGPAGPTGPAGKDGRTGQPGAVGPAGIRGSQGSQGPAGPPGPPGPPGPPGPSGGGYDFGFDGDFYRA'},
    'COL1A2-Phy_cas': {'species': 'Physter catodon','offset': -8, 'sequence': 'QYDGKGVGIGPGPMGLMGPRGPPGASGAPGPQGFQGPPGEPGEPGQTGPAGARGPPGPPGKAGEDGHPGKSGRPGERGVVGPQGARGFPGTPGLPGFKGIRGHNGLDGLKGQPGTPGVKGEPGAPGENGIPGQVGARGLPGERGRVGAPGPAGARGSDGSVGPVGPAGPLGSAGPPGFPGAPGPKGELGPVGNPGPAGPAGSRGEVGLPGVSGPVGPPGNPGVNGLPGAKGAAGLPGVAGAPGLPGPRGIPGPVGAAGATGARGLVGEPGPAGSKGDSGNKGEPGAVGPTGPPGPSGEEGKRGTTGEIGSAGPPGPPGLRGNPGSRGLPGADGRAGVMGLHGSRGGTGPAGVRGPSGDSGRPGEPGLMGPRGFPGSPGNAGPAGKEGPTGLPGIDGRPGPIGPAGTRGEPGNIGFPGPKGPTGDPGKNGEKGHAGLAGPRGAPGPDGNNGAQGPPGLQGVSGGKGEQGPAGPPGFQGLPGPAGTAGEAGKAGERGIPGEFGLPGPAGPRGERGPPGESGAAGPAGPIGSRGPSGPAGPDGNKGEPGVVGAPGTAGPSGPSGLPGERGAAGIPGGKGEKGETGLRGDVGSHGRDGARGAPGAVGAPGPAGANGDRGEAGPAGAAGPAGSRGSPGERGEVGPAGPNGFAGSAGAAGQPGAKGERGTKGPKGENGPVGPTGPVGAAGPAGPNGPPGPAGSRGDGGPPGATGFPGAAGRTGPPGPSGITGPPGPPGPAGKEGLRGPRGDQGPVGRTGETGASGPPGFVGEKGPSGEPGTAGSPGTPGPQGLLGSPGFLGLPGSRGERGLPGVAGSLGEPGPLGIAGPTGARGPPGAVGNPGVNGAPGEAGRDGNPGSDGPPGRDGQAGHKGERGYPGNAGPTGTAGAPGPQGPVGPTGKHGNRGESGPSGPVGLAGAVGPRGPSGPQGIRGDKGEPGDKGPRGLPGLKGHNGLQGLPGLAGHHGDQGAPGTVGPAGPRGPAGPSGPSGKDGRTGHPGAVGPAGIRGSQGSQGPAGPPGPPGPPGPAGPSGGGYDFGFEGDFYRA'},
    'COL1A2-Bis_bis': {'species': 'Bison bison','offset': -7, 'sequence': 'QFDAKGGGPGPMGLMGPRGPPGASGAPGPQGFQGPPGEPGEPGQTGPAGARGPPGPPGKAGEDGHPGKPGRPGERGVVGPQGARGFPGTPGLPGFKGIRGHNGLDGLKGQPGAPGVKGEPGAPGENGTPGQTGARGLPGERGRVGAPGPAGARGSDGSVGPVGPAGPIGSAGPPGFPGAPGPKGELGPVGNPGPAGPAGPRGEVGLPGLSGPVGPPGNPGANGLPGAKGAAGLPGVAGAPGLPGPRGIPGPVGAAGATGARGLVGEPGPAGSKGESGNKGEPGAVGQPGPPGPSGEEGKRGSTGEIGPAGPPGPPGLRGNPGSRGLPGADGRAGVMGPAGSRGATGPAGVRGPNGDSGRPGEPGLMGPRGFPGSPGNIGPAGKEGPVGLPGIDGRPGPIGPAGARGEPGNIGFPGPKGPSGDPGKAGEKGHAGLAGARGAPGPDGNNGAQGPPGLQGVQGGKGEQGPAGPPGFQGLPGPAGTAGEAGKPGERGIPGEFGLPGPAGARGERGPPGESGAAGPTGPIGSRGPSGPPGPDGNKGEPGVVGAPGTAGPSGPSGLPGERGAAGIPGGKGEKGETGLRGDIGSPGRDGARGAPGAIGAPGPAGANGDRGEAGPAGPAGPAGPRGSPGERGEVGPAGPNGFAGPAGAAGQPGAKGERGTKGPKGENGPVGPTGPVGAAGPSGPNGPPGPAGSRGDGGPPGATGFPGAAGRTGPPGPSGISGPPGPPGPAGKEGLRGPRGDQGPVGRSGETGASGPPGFVGEKGPSGEPGTAGPPGTPGPQGLLGAPGFLGLPGSRGERGLPGVAGSVGEPGPLGIAGPPGARGPPGNVGNPGVNGAPGEAGRDGNPGNDGPPGRDGQPGHKGERGYPGNAGPVGAAGAPGPQGPVGPVGKHGNRGEPGPAGAVGPAGAVGPRGPSGPQGIRGDKGEPGDKGPRGLPGLKGHNGLQGLPGLAGHHGDQGAPGAVGPAGPRGPAGPSGPAGKDGRIGQPGAVGPAGIRGSQGSQGPAGPPGPPGPPGPPGPSGGGYEFGFDGDFYRA'},
    'COL1A2-Bos_mut': {'species': 'Bubalus bubalis','offset': -7, 'sequence': 'QFDAKGGGPGPMGLMGPRGPPGASGAPGPQGFQGPPGEPGEPGQTGPAGARGPPGPPGKAGEDGHPGKPGRPGERGVVGPQGARGFPGTPGLPGFKGIRGHNGLDGLKGQPGAPGVKGEPGAPGENGTPGQTGARGLPGERGRVGAPGPAGARGSDGSVGPVGPAGPIGSAGPPGFPGAPGPKGELGPVGNPGPAGPAGPRGEVGLPGLSGPVGPPGNPGANGLPGAKGAAGLPGVAGAPGLPGPRGIPGPVGASGATGARGLVGEPGPAGSKGESGNKGEPGAVGQPGPPGPSGEEGKRGSTGEIGPAGPPGPPGLRGNPGSRGLPGADGRAGVMGPAGSRGATGPAGVRGPNGDSGRPGEPGLMGPRGFPGSPGNIGPAGKEGPVGLPGIDGRPGPIGPAGARGEPGNIGFPGPKGPSGDPGKAGEKGHAGLAGARGAPGPDGNNGAQGPPGLQGVQGGKGEQGPAGPPGFQGLPGPAGTAGEAGKPGERGIPGEFGLPGPAGARGERGPPGESGAAGPTGPIGSRGPSGPPGPDGNKGEPGVVGAPGTAGPSGPSGLPGERGAAGIPGGKGEKGETGLRGDIGSPGRDGARGAPGAIGAPGPAGANGDRGEAGPAGPAGPAGPRGSPGERGEVGPAGPNGFAGPAGAAGQPGAKGERGTKGPKGENGPVGPTGPVGAAGPSGPNGPPGPAGSRGDGGPPGATGFPGAAGRTGPPGPSGISGPPGPPGPAGKEGLRGPRGDQGPVGRSGETGASGPPGFVGEKGPSGEPGTAGPPGTPGPQGLLGAPGFLGLPGSRGERGLPGVAGSVGEPGPLGIAGPPGARGPPGNVGNPGVNGAPGEAGRDGNPGNDGPPGRDGQPGHKGERGYPGNAGPVGAAGAPGPQGPVGPVGKHGNRGEPGPAGAVGPAGAVGPRGPSGPQGIRGDKGEPGDKGPRGLPGLKGHNGLQGLPGLAGHHGDQGAPGAVGPAGPRGPAGPSGPAGKDGRIGQPGAVGPAGIRGSQGSQGPAGPPGPPGPPGPPGPSGGGYEFGFDGDFYRA'},
    'COL1A1-Bos_mut': {'species': 'Bos mutus','offset': 0, 'sequence': 'GPMGPSGPRGLPGPPGAPGPQGFQGPPGEPGEPGASGPMGPRGPPGPPGKNGDDGEAGKPGRPGERGPPGPQGARGLPGTAGLPGMKGHRGFSGLDGAKGDAGPAGPKGEPGSPGENGAPGQMGPRGLPGERGRPGAPGPAGARGNDGATGAAGPPGPTGPAGPPGFPGAVGAKGEGGPQGPRGSEGPQGVRGEPGPPGPAGAAGPAGNPGADGQPGAKGANGAPGIAGAPGFPGARGPSGPQGPSGPPGPKGNSGEPGAPGSKGDTGAKGEPGPTGIQGPPGPAGEEGKRGARGEPGPAGLPGPPGERGGPGSRGFPGADGVAGPKGPAGERGAPGPAGPKGSPGEAGRPGEAGLPGAKGLTGSPGSPGPDGKTGPPGPAGQDGRPGPPGPPGARGQAGVMGFPGPKGAAGEPGKAGERGVPGPPGAVGPAGKDGEAGAQGPPGPAGPAGERGEQGPAGSPGFQGLPGPAGPPGEAGKPGEQGVPGDLGAPGPSGARGERGFPGERGVQGPPGPAGPRGANGAPGNDGAKGDAGAPGAPGSQGAPGLQGMPGERGAAGLPGPKGDRGDAGPKGADGAPGKDGVRGLTGPIGPPGPAGAPGDKGEAGPSGPAGPTGARGAPGDRGEPGPPGPAGFAGPPGADGQPGAKGEPGDAGAKGDAGPPGPAGPAGPPGPIGNVGAPGPKGARGSAGPPGATGFPGAAGRVGPPGPSGNAGPPGPPGPAGKEGSKGPRGETGPAGRPGEVGPPGPPGPAGEKGAPGADGPAGAPGTPGPQGIAGQRGVVGLPGQRGERGFPGLPGPSGEPGKQGPSGASGERGPPGPMGPPGLAGPPGESGREGAPGAEGSPGRDGSPGAKGDRGETGPAGPPGAPGAPGAPGPVGPAGKSGDRGETGPAGPAGPIGPVGARGPAGPQGPRGDKGETGEQGDRGIKGHRGFSGLQGPPGPPGSPGEQGPSGASGPAGPRGPPGSAGSPGKDGLNGLPGPIGPPGPRGRTGDAGPAGPPGPPGPPGPPGPPSGGYDLSFLPQPPQEKAHDGGR'},
    'COL1A1-Equ_cab': {'species': 'Equus caballus','offset': 0, 'sequence': 'GPMGPSGPRGLPGPPGAPGPQGFQGPPGEPGEPGASGPMGPRGPPGPPGKNGDDGEAGKPGRPGERGPPGPQGARGLPGTAGLPGMKGHRGFSGLDGAKGDAGPAGPKGEPGSPGENGAPGQMGPRGLPGERGRPGAPGPAGARGNDGATGAAGPPGPTGPAGPPGFPGAVGAKGEAGPQGARGSEGPQGVRGEPGPPGPAGAAGPAGNPGADGQPGAKGANGAPGIAGAPGFPGARGPSGPQGPSGPPGPKGNSGEPGAPGNKGDTGAKGEPGPTGIQGPPGPAGEEGKRGARGEPGPTGLPGPPGERGGPGARGFPGADGVAGPKGPAGERGAPGPAGPKGSPGEAGRPGEAGLPGAKGLTGSPGSPGPDGKTGPPGPAGQDGRPGPPGPPGARGQAGVMGFPGPKGAAGEPGKAGERGVPGPPGAVGPAGKDGEAGAQGPPGPAGPAGERGEQGPAGSPGFQGLPGPAGPPGESGKPGEQGVPGDLGAPGPSGARGERGFPGERGVQGPPGPAGPRGSNGAPGNDGAKGDAGAPGAPGSQGAPGLQGMPGERGAAGLPGPKGDRGDAGPKGADGSPGKDGVRGLTGPIGPPGPAGAPGDKGETGPSGPAGPTGARGAPGDRGEPGPPGPAGFAGPPGADGQPGAKGEPGDAGAKGDAGPPGPAGPAGPPGPIGSVGAPGPKGARGSAGPPGATGFPGAAGRVGPPGPSGNAGPPGPPGPVGKEGGKGPRGETGPAGRPGEAGPPGPPGPAGEKGSPGADGPAGAPGTPGPQGIAGQRGVVGLPGQRGERGFPGLPGPSGEPGKQGPSGASGERGPPGPVGPPGLAGPPGESGREGSPGAEGSPGRDGSPGPKGDRGETGPAGPPGAPGAPGAPGPVGPAGKSGDRGEAGPAGPAGPIGPVGARGPAGPQGPRGDKGETGEQGDRGIKGHRGFSGLQGPPGPPGSPGEQGPSGASGPAGPRGPPGSAGAPGKDGLNGLPGPIGPPGPRGRTGDAGPVGPPGPPGPPGPPGPPSGGFDFSFLPQPPQEKSHDGGR'},
    'COL1A1-Equ_asi': {'species': 'Equus asinus','offset': 0, 'sequence': 'GPMGPSGPRGLPGPPGAPGPQGFQGPPGEPGEPGASGPMGPRGPPGPPGKNGDDGEAGKPGRPGERGPPGPQGARGLPGTAGLPGMKGHRGFSGLDGAKGDAGPAGPKGEPGSPGENGAPGQMGPRGLPGERGRPGAPGPAGARGNDGATGAAGPPGPTGPAGPPGFPGAVGAKGEAGPQGARGSEGPQGVRGEPGPPGPAGAAGPAGNPGADGQPGAKGANGAPGIAGAPGFPGARGPSGPQGPSGPPGPKGNSGEPGAPGNKGDTGAKGEPGPTGIQGPPGPAGEEGKRGARGEPGPTGLPGPPGERGGPGARGFPGADGVAGPKGPAGERGAPGPAGPKGSPGEAGRPGEAGLPGAKGLTGSPGSPGPDGKTGPPGPAGQDGRPGPPGPPGARGQAGVMGFPGPKGAAGEPGKAGERGVPGPPGAVGPAGKDGEAGAQGPPGPAGPAGERGEQGPAGSPGFQGLPGPAGPPGESGKPGEQGVPGDLGAPGPSGARGERGFPGERGVQGPPGPAGPRGSNGAPGNDGAKGDAGAPGAPGSQGAPGLQGMPGERGAAGLPGPKGDRGDAGPKGADGSPGKDGVRGLTGPIGPPGPAGAPGDKGETGPSGPAGPTGARGAPGDRGEPGPPGPAGFAGPPGADGQPGAKGEPGDAGAKGDAGPPGPAGPAGPPGPIGSVGAPGPKGARGSAGPPGATGFPGAAGRVGPPGPSGNAGPPGPPGPVGKEGGKGPRGETGPAGRPGEAGPPGPPGPAGEKGSPGADGPAGAPGTPGPQGIAGQRGVVGLPGQRGERGFPGLPGPSGEPGKQGPSGASGERGPPGPVGPPGLAGPPGESGREGSPGAEGSPGRDGSPGPKGDRGETGPAGPPGAPGAPGAPGPVGPAGKSGDRGEAGPAGPAGPIGPVGARGPAGPQGPRGDKGETGEQGDRGIKGHRGFSGLQGPPGPPGSPGEQGPSGASGPAGPRGPPGSAGAPGKDGLNGLPGPIGPPGPRGRTGDAGPVGPPGPPGPPGPPGPPSAGFDFSFLPQPPQEKSHDGGR'},
    'COL1A1-Cam_fer': {'species': 'Bos taurus','offset': 0, 'sequence': 'GPMGPSGPRGLPGPPGAPGPQGFQGPPGEPGEPGSSGPMGPRGPPGPPGKNGDDGEAGKPGRPGERGPPGPQGARGLPGTAGLPGMKGHRGFSGLDGAKGDAGPAGPKGEPGSPGENGAPGQMGPRGLPGERGRPGAPGPAGARGNDGATGAAGPPGPTGPAGPPGFPGAVGAKGEAGPQGARGSEGPQGVRGEPGPPGPAGAAGPAGNPGADGQPGAKGANGAPGIAGAPGFPGARGPSGPQGPSGPPGPKGNSGEPGAPGNKGDTGAKGEPGPTGVQGPPGPAGEEGKRGARGEPGPAGLPGPPGERGGPGSRGFPGADGVAGPKGPAGERGSPGPAGPKGSPGEAGRPGEAGLPGAKGLTGSPGSPGPDGKTGPPGPAGQDGRPGPPGPPGARGQAGVMGFPGPKGAAGEPGKAGERGVPGPPGAVGPAGKDGEAGAQGPPGPAGPAGERGEQGPAGSPGFQGLPGPAGPPGEAGKPGEQGVPGDLGAPGPSGARGERGFPGERGVQGPPGPAGPRGANGAPGNDGAKGDAGAPGAPGSQGAPGLQGMPGERGAAGLPGPKGDRGDAGPKGADGSPGKDGVRGLTGPIGPPGPAGAPGDKGETGPSGPAGPTGARGAPGDRGEPGPPGPAGFAGPPGADGQPGAKGEPGDAGAKGDAGPPGPAGPTGPPGPIGSVGAPGPKGARGSAGPPGATGFPGAAGRVGPPGPSGNAGPPGPPGPVGKEGSKGPRGETGPAGRPGEVGPPGPPGPAGEKGAPGADGPAGAPGTPGPQGIAGQRGVVGLPGQRGERGFPGLPGPSGEPGKQGPSGPNGERGPPGPMGPPGLAGPPGESGREGAPGAEGSPGRDGSPGPKGDRGETGPAGPPGAPGAPGAPGPVGPAGKSGDRGETGPAGPAGPIGPVGARGPAGPQGPRGDKGETGEQGDRGIKGHRGFSGLQGPPGPPGSPGEQGPSGASGPAGPRGPPGSAGAPGKDGLNGLPGPIGPPGPRGRTGDAGPVGPPGPPGPPGPPGPPSGGFDFSFLPQPPQEKAGGDGR'},
    'COL1A1-Vul_vul': {'species': 'Bos taurus','offset': 0, 'sequence': 'GPMGPSGPRGLPGPPGAPGPQGFQGPPGEPGEPGASGPMGPRGPPGPPGKNGDDGEAGKPGRPGERGPPGPQGARGLPGTAGLPGMKGHRGFSGLDGAKGDAGPAGPKGEPGSPGENGAPGQMGPRGLPGERGRPGAPGPAGARGNDGATGAAGPPGPTGPAGPPGFPGAVGAKGEAGPQGARGSEGPQGVRGEPGPPGPAGAAGPAGNPGADGQPGAKGANGAPGIAGAPGFPGARGPSGPQGPSGPPGPKGNSGEPGAPGNKGDTGAKGEPGPTGIQGPPGPAGEEGKRGARGEPGPTGLPGPPGERGGPGSRGFPGADGVAGPKGPAGERGSPGPAGPKGSPGEAGRPGEAGLPGAKGLTGSPGSPGPDGKTGPPGPAGQDGRPGPPGPPGARGQAGVMGFPGPKGAAGEPGKAGERGVPGPPGAVGPAGKDGEAGAQGPPGPAGPAGERGEQGPAGSPGFQGLPGPAGPPGEAGKPGEQGVPGDLGAPGPSGARGERGFPGERGVQGPPGPAGPRGANGAPGNDGAKGDAGAPGAPGSQGAPGLQGMPGERGAAGLPGPKGDRGDAGPKGADGSPGKDGVRGLTGPIGPPGPAGAPGDKGEAGPSGPAGPTGARGAPGDRGEPGPPGPAGFAGPPGADGQPGAKGEPGDAGAKGDAGPPGPAGPTGPPGPIGNVGAPGPKGARGSAGPPGATGFPGAAGRVGPPGPSGNAGPPGPPGPAGKEGGKGPRGETGPAGRPGEVGPPGPPGPAGEKGSPGADGPAGAPGTPGPQGIAGQRGVVGLPGQRGERGFPGLPGPSGEPGKQGPSGASGERGPPGPMGPPGLAGPPGESGREGSPGAEGSPGRDGSPGPKGDRGETGPAGPPGAPGAPGAPGPVGPAGKNGDRGETGPAGPAGPIGPVGARGPAGPQGPRGDKGETGEQGDRGIKGHRGFSGLQGPPGPPGSPGEQGPSGASGPAGPRGPPGSAGSPGKDGLNGLPGPIGPPGPRGRTGDAGPVGPPGPPGPPGPPGPPSGGFDFSFLPQPPQEKAHDGGR'},
    'COL1A1-Equ_asn': {'species': 'Equus asinus','offset': 0, 'sequence': 'GPMGPSGPRGLPGPPGAPGPQGFQGPPGEPGEPGASGPMGPRGPPGPPGKNGDDGEAGKPGRPGERGPPGPQGARGLPGTAGLPGMKGHRGFSGLDGAKGDAGPAGPKGEPGSPGENGAPGQMGPRGLPGERGRPGAPGPAGARGNDGATGAAGPPGPTGPAGPPGFPGAVGAKGEAGPQGARGSEGPQGVRGEPGPPGPAGAAGPAGNPGADGQPGAKGANGAPGIAGAPGFPGARGPSGPQGPSGPPGPKGNSGEPGAPGNKGDTGAKGEPGPTGIQGPPGPAGEEGKRGARGEPGPTGLPGPPGERGGPGARGFPGADGVAGPKGPAGERGAPGPAGPKGSPGEAGRPGEAGLPGAKGLTGSPGSPGPDGKTGPPGPAGQDGRPGPPGPPGARGQAGVMGFPGPKGAAGEPGKAGERGVPGPPGAVGPAGKDGEAGAQGPPGPAGPAGERGEQGPAGSPGFQGLPGPAGPPGESGKPGEQGVPGDLGAPGPSGARGERGFPGERGVQGPPGPAGPRGSNGAPGNDGAKGDAGAPGAPGSQGAPGLQGMPGERGAAGLPGPKGDRGDAGPKGADGSPGKDGVRGLTGPIGPPGPAGAPGDKGETGPSGPAGPTGARGAPGDRGEPGPPGPAGFAGPPGADGQPGAKGEPGDAGAKGDAGPPGPAGPAGPPGPIGSVGAPGPKGARGSAGPPGATGFPGAAGRVGPPGPSGNAGPPGPPGPVGKEGGKGPRGETGPAGRPGEAGPPGPPGPAGEKGSPGADGPAGAPGTPGPQGIAGQRGVVGLPGQRGERGFPGLPGPSGEPGKQGPSGASGERGPPGPVGPPGLAGPPGESGREGSPGAEGSPGRDGSPGPKGDRGETGPAGPPGAPGAPGAPGPVGPAGKSGDRGEAGPAGPAGPIGPVGARGPAGPQGPRGDKGETGEQGDRGIKGHRGFSGLQGPPGPPGSPGEQGPSGASGPAGPRGPPGSAGAPGKDGLNGLPGPIGPPGPRGRTGDAGPVGPPGPPGPPGPPGPPSAGFDFSFLPQPPQEKSHDGGR'},
    'COL1A1-Equ_asn': {'species': 'Equus asinus','offset': 0, 'sequence': 'GPMGPSGPRGLPGPPGAPGPQGFQGPPGEPGEPGASGPMGPRGPPGPPGKNGDDGEAGKPGRPGERGPPGPQGARGLPGTAGLPGMKGHRGFSGLDGAKGDAGPAGPKGEPGSPGENGAPGQMGPRGLPGERGRPGAPGPAGARGNDGATGAAGPPGPTGPAGPPGFPGAVGAKGEAGPQGARGSEGPQGVRGEPGPPGPAGAAGPAGNPGADGQPGAKGANGAPGIAGAPGFPGARGPSGPQGPSGPPGPKGNSGEPGAPGNKGDTGAKGEPGPTGIQGPPGPAGEEGKRGARGEPGPTGLPGPPGERGGPGARGFPGADGVAGPKGPAGERGAPGPAGPKGSPGEAGRPGEAGLPGAKGLTGSPGSPGPDGKTGPPGPAGQDGRPGPPGPPGARGQAGVMGFPGPKGAAGEPGKAGERGVPGPPGAVGPAGKDGEAGAQGPPGPAGPAGERGEQGPAGSPGFQGLPGPAGPPGESGKPGEQGVPGDLGAPGPSGARGERGFPGERGVQGPPGPAGPRGSNGAPGNDGAKGDAGAPGAPGSQGAPGLQGMPGERGAAGLPGPKGDRGDAGPKGADGSPGKDGVRGLTGPIGPPGPAGAPGDKGETGPSGPAGPTGARGAPGDRGEPGPPGPAGFAGPPGADGQPGAKGEPGDAGAKGDAGPPGPAGPAGPPGPIGSVGAPGPKGARGSAGPPGATGFPGAAGRVGPPGPSGNAGPPGPPGPVGKEGGKGPRGETGPAGRPGEAGPPGPPGPAGEKGSPGADGPAGAPGTPGPQGIAGQRGVVGLPGQRGERGFPGLPGPSGEPGKQGPSGASGERGPPGPVGPPGLAGPPGESGREGSPGAEGSPGRDGSPGPKGDRGETGPAGPPGAPGAPGAPGPVGPAGKSGDRGEAGPAGPAGPIGPVGARGPAGPQGPRGDKGETGEQGDRGIKGHRGFSGLQGPPGPPGSPGEQGPSGASGPAGPRGPPGSAGAPGKDGLNGLPGPIGPPGPRGRTGDAGPVGPPGPPGPPGPPGPPSAGFDFSFLPQPPQEKSHDGGR'},
    'COL1A1-Sus_Scr': {'species': 'Sus scrofa','offset': 0, 'sequence': 'GPMGPSGPRGLPGPPGAPGPQGFQGPPGEPGEPGASGPMGPRGPPGPPGKNGDDGEAGKPGRPGERGPPGPQGARGLPGTAGLPGMKGHRGFSGLDGAKGDAGPAGPKGEPGSPGENGAPGQMGPRGLPGERGRPGPPGPAGARGNDGATGAAGPPGPTGPAGPPGFPGAVGAKGEAGPQGARGSEGPQGVRGEPGPPGPAGAAGPAGNPGADGQPGGKGANGAPGIAGAPGFPGARGPSGPQGPSGPPGPKGNSGEPGAPGSKGDTGAKGEPGPTGVQGPPGPAGEEGKRGARGEPGPAGLPGPPGERGGPGSRGFPGADGVAGPKGPAGERGSPGPAGPKGSPGEAGRPGEAGLPGAKGLTGSPGSPGPDGKTGPPGPAGQDGRPGPPGPPGARGQAGVMGFPGPKGAAGEPGKAGERGVPGPPGAVGPAGKDGEAGAQGPPGPAGPAGERGEQGPAGSPGFQGLPGPAGPPGEAGKPGEQGVPGDLGAPGPSGARGERGFPGERGVQGPPGPAGPRGANGAPGNDGAKGDAGAPGAPGSQGAPGLQGMPGERGAAGLPGPKGDRGDAGPKGADGAPGKDGVRGLTGPIGPPGPAGAPGDKGETGPSGPAGPTGARGAPGDRGEPGPPGPAGFAGPPGADGQPGAKGEPGDAGAKGDAGPPGPAGPTGPPGPIGSVGAPGPKGARGSAGPPGATGFPGAAGRVGPPGPSGNAGPPGPPGPAGKEGSKGPRGETGPAGRPGEAGPPGPPGPAGEKGSPGADGPAGAPGTPGPQGIAGQRGVVGLPGQRGERGFPGLPGPSGEPGKQGPSGPSGERGPPGPMGPPGLAGPPGESGREGAPGAEGSPGRDGAPGPKGDRGESGPAGPPGAPGAPGAPGPVGPAGKSGDRGETGPAGPAGPVGPVGARGPAGPQGPRGDKGETGEQGDRGIKGHRGFSGLQGPPGPPGSPGEQGPSGASGPAGPRGPPGSAGAPGKDGLNGLPGPIGPPGPRGRTGDAGPVGPPGPPGPPGPPGPPSGGFDFSFLPQPPQEKAHDGGR'},
    'COL3A1-Equ_cab': {'species': 'Equus caballus','offset': -17, 'sequence': 'QYESYDVKAGVAGGGLAGYPGPVGPPGPPGPPGASGHPGSPGSPGYQGPPGEPGQAGPAGPPGPPGAIGPSGPAGKDGESGRPGRPGERGLPGPPGIKGPAGIPGFPGMKGHRGFDGRNGEKGETGAPGLKGENGLPGENGAPGPMGPRGAPGERGRPGLPGAAGSRGNDGARGSDGQPGPPGPPGTAGFPGSPGAKGEVGPAGSPGSSGAPGQRGEPGPQGNAGAPGPPGPPGASGSPGGKGEMGPAGIPGAPGLIGARGPPGPSGANGAPGQRGPAGEPGKNGAKGEPGARGERGEAGSPGIPGAKGEDGKDGSPGEPGANGLPGAAGERGAPGFRGPAGPNGLPGEKGAPGDRGSPGPAGPRGAAGEPGRDGVPGGPGIRGVPGSPGGPGSDGKPGPPGSQGESGRPGPPGPSGPRGQPGVMGFPGPKGNDGAPGKNGERGGPGGPGPPGPAGKNGETGPQGPPGPTGPSGDKGDAGLPGPQGLQGLPGTSGPPGENGKPGEPGPKGEAGSPGIPGGKGDSGAPGERGPPGAAGPPGLRGGAGPPGPEGGKGPAGAPGPPGAAGTPGLQGMPGERGGPGGPGVKGDKGEPGSAGADGAPGKDGPRGPTGPIGPPGPAGQPGDKGEGGAPGLPGIAGPRGGPGERGEVGPPGPAGFPGAPGQNGEPGAKGERGAPGEKGEGGAPGIAGPPGGSGPAGPPGPQGVKGERGSPGGPGAAGFPGARGLPGPPGSNGNPGPPGPSGAPGKDGPAGPPGSSGSPGSPGVSGPKGDAGQPGEKGSPGPQGPPGVPGPSGLIGITGARGLAGPPGMPGARGSPGPQGVKGESGKPGAPGHNGERGPPGPQGLPGLAGTAGEPGRDGNPGSDGLPGRDGAPGGKGDRGENGSPGAPGAPGHPGPPGPVGPAGKNGDRGESGPAGPAGAPGPAGARGAPGPQGPRGDKGETGERGSTGIKGHRGFPGSPGSPGSPGPAGHQGAVGSPGPAGPRGPAGPSGPPGKDGTLGHPGPIGPPGPRGNRGERGSEGSPGHPGQPGPPGPPGAPGPCCGAGVSAISGVIGPEKSGGYAPYYG'},
    'COL3A1-BosxBos': {'species': 'BosxBos','offset': -17, 'sequence': 'QYEAYDVKSGIAGGGIAGYPGPAGPPGPPGPPGTSGHPGAPGAPGYQGPPGEPGQAGPAGPPGPPGAIGPSGPAGKDGESGRPGRPGERGFPGPPGMKGPAGMPGFPGMKGHRGFDGRNGEKGETGAPGLKGENGVPGENGAPGPMGPRGAPGERGRPGLPGAAGARGNDGARGSDGQPGPPGPPGTAGFPGSPGAKGEVGPAGSPGSSGAPGQRGEPGPQGHAGAPGPPGPPGSNGSPGGKGEMGPAGIPGAPGLIGARGPPGPPGTNGVPGQRGAAGEPGKNGAKGDPGPRGERGEAGSPGIAGPKGEDGKDGSPGEPGANGLPGAAGERGVPGFRGPAGANGLPGEKGPPGDRGGPGPAGPRGVAGEPGRDGLPGGPGLRGIPGSPGGPGSDGKPGPPGSQGETGRPGPPGSPGPRGQPGVMGFPGPKGNDGAPGKNGERGGPGGPGPQGPAGKNGETGPQGPPGPTGPSGDKGDTGPPGPQGLQGLPGTSGPPGENGKPGEPGPKGEAGAPGIPGGKGDSGAPGERGPPGAGGPPGPRGGAGPPGPEGGKGAAGPPGPPGSAGTPGLQGMPGERGGPGGPGPKGDKGEPGSSGVDGAPGKDGPRGPTGPIGPPGPAGQPGDKGESGAPGVPGIAGPRGGPGERGEQGPPGPAGFPGAPGQNGEPGAKGERGAPGEKGEGGPPGAAGPAGGSGPAGPPGPQGVKGERGSPGGPGAAGFPGGRGPPGPPGSNGNPGPPGSSGAPGKDGPPGPPGSNGAPGSPGISGPKGDSGPPGERGAPGPQGPPGAPGPLGIAGLTGARGLAGPPGMPGARGSPGPQGIKGENGKPGPSGQNGERGPPGPQGLPGLAGTAGEPGRDGNPGSDGLPGRDGAPGAKGDRGENGSPGAPGAPGHPGPPGPVGPAGKSGDRGETGPAGPSGAPGPAGSRGPPGPQGPRGDKGETGERGAMGIKGHRGFPGNPGAPGSPGPAGHQGAVGSPGPAGPRGPVGPSGPPGKDGASGHPGPIGPPGPRGNRGERGSEGSPGHPGQPGPPGPPGAPGPCCGAGGVAAIAGVGAEKAGGFAPYYG'},
    'COL3A1-Bos_tau': {'species': 'Bos taurus','offset': -9, 'sequence': ' SGVAGGGIAGYPGPAGPPGPPGPPGTSGHPGAPGAPGYQGPPGEPGQAGPAGPPGPPGAIGPSGPAGKDGESGRPGRPGERGFPGPPGMKGPAGMPGFPGMKGHRGFDGRNGEKGETGAPGLKGENGVPGENGAPGPMGPRGAPGERGRPGLPGAAGARGNDGARGSDGQPGPPGPPGTAGFPGSPGAKGEVGPAGSPGSSGAPGQRGEPGPQGHAGAPGPPGPPGSNGSPGGKGEMGPAGIPGAPGLIGARGPPGPPGTNGVPGQRGAAGEPGKNGAKGDPGPRGERGEAGSPGIAGPKGEDGKDGSPGEPGANGLPGAAGERGVPGFRGPAGANGLPGEKGPPGDRGGPGPAGPRGVAGEPGRDGLPGGPGLRGIPGSPGGPGSDGKPGPPGSQGETGRPGPPGSPGPRGQPGVMGFPGPKGNDGAPGKNGERGGPGGPGPQGPAGKNGETGPQGPPGPTGPSGDKGDTGPPGPQGLQGLPGTSGPPGENGKPGEPGPKGEAGAPGIPGGKGDSGAPGERGPPGAGGPPGPRGGAGPPGPEGGKGAAGPPGPPGSAGTPGLQGMPGERGGPGGPGPKGDKGEPGSSGVDGAPGKDGPRGPTGPIGPPGPAGQPGDKGESGAPGVPGIAGPRGGPGERGEQGPPGPAGFPGAPGQNGEPGAKGERGAPGEKGEGGPPGAAGPAGGSGPAGPPGPQGVKGERGSPGGPGAAGFPGGRGPPGPPGSNGNPGPPGSSGAPGKDGPPGPPGSNGAPGSPGISGPKGDSGPPGERGAPGPQGPPGAPGPLGIAGLTGARGLAGPPGMPGARGSPGPQGIKGENGKPGPSGQNGERGPPGPQGLPGLAGTAGEPGRDGNPGSDGLPGRDGAPGAKGDRGENGSPGAPGAPGHPGPPGPVGPAGKSGDRGETGPAGPSGAPGPAGSRGPPGPQGPRGDKGETGERGAMGIKGHRGFPGNPGAPGSPGPAGHQGAVGSPGPAGPRGPVGPSGPPGKDGASGHPGPIGPPGPRGNRGERGSEGSPGHPGQPGPPGPPGAPGPCCGAGGVAAIAGVGAEK'},
    'COL3A1-Bos_ind': {'species': 'Bos indicus','offset': -17, 'sequence': 'QYEAYDVKSGXAGGGIAGYPGPAGPPGPPGPPGTSGHPGAPGAPGYQGPPGEPGQAGPAGPPGPPGAIGPSGPAGKDGESGRPGRPGERGFPGPPGMKGPAGMPGFPGMKGHRGFDGRNGEKGETGAPGLKGENGVPGENGAPGPMGPRGAPGERGRPGLPGAAGARGNDGARGSDGQPGPPGPPGTAGFPGSPGAKGEVGPAGSPGSSGAPGQRGEPGPQGHAGAPGPPGPPGSNGSPGGKGEMGPAGIPGAPGLIGARGPPGPPGTNGVPGQRGAAGEPGKNGAKGDPGPRGERGEAGSPGIAGPKGEDGKDGSPGEPGANGLPGAAGERGVPGFRGPAGANGLPGEKGPPGDRGGPGPAGPRGVAGEPGRDGLPGGPGLRGIPGSPGGPGSDGKPGPPGSQGETGRPGPPGSPGPRGQPGVMGFPGPKGNDGAPGKNGERGGPGGPGPQGPAGKNGETGPQGPPGPTGPSGDKGDTGPPGPQGLQGLPGTSGPPGENGKPGEPGPKGEAGAPGIPGGKGDSGAPGERGPPGAGGPPGPRGGAGPPGPEGGKGAAGPPGPPGSAGTPGLQGMPGERGGPGGPGPKGDKGEPGSSGVDGAPGKDGPRGPTGPIGPPGPAGQPGDKGESGAPGVPGIAGPRGGPGERGEQGPPGPAGFPGAPGQNGEPGAKGERGAPGEKGEGGPPGAAGPAGGSGPAGPPGPQGVKGERGSPGGPGAAGFPGGRGPPGPPGSNGNPGPPGSSGAPGKDGPPGPPGSNGAPGSPGISGPKGDSGPPGERGAPGPQGPPGAPGPLGIAGLTGARGLAGPPGMPGARGSPGPQGIKGENGKPGPSGQNGERGPPGPQGLPGLAGTAGEPGRDGNPGSDGLPGRDGAPGAKGDRGENGSPGAPGAPGHPGPPGPVGPAGKSGDRGETGPAGPSGAPGPAGSRGPPGPQGPRGDKGETGERGAMGIKGHRGFPGNPGAPGSPGPAGHQGAVGSPGPAGPRGPVGPSGPPGKDGASGHPGPIGPPGPRGNRGERGSEGSPGHPGQPGPPGPPGAPGPCCGAGGVAAIAGVGAEKAGGFAPYYG'},
    'COL3A1-Bos_mut' : {'species': 'Bos mutus','offset': -17, 'sequence': 'QYEAYDVKSGIAGGGIAGYPGPAGPPGPPGPPGTSGHPGAPGAPGYQGPPGEPGQAGPAGPPGPPGAIGPSGPAGKDGESGRPGRPGERGFPGPPGMKGPAGMPGFPGMKGHRGFDGRNGEKGETGAPGLKGENGVPGENGAPGPMGPRGAPGERGRPGLPGAAGARGNDGARGSDGQPGPPGPPGTAGFPGSPGAKGEVGPAGSPGSSGAPGQRGEPGPQGHAGAPGPPGPPGSNGSPGGKGEMGPAGIPGAPGLIGARGPPGPPGTNGVPGQRGAAGEPGKNGAKGDPGPRGERGEAGSPGIAGPKGEDGKDGSPGEPGANGLPGAAGERGVPGFRGPAGANGLPGEKGPPGDRGGPGPAGPRGVAGEPGRDGLPGGPGLRGIPGSPGGPGSDGKPGPPGSQGETGRPGPPGSPGPRGQPGVMGFPGPKGNDGAPGKNGERGGPGGPGPQGPAGKNGETGPQGPPGPTGPSGDKGDTGPPGPQGLQGLPGTSGPPGENGKPGEPGPKGEAGAPGIPGGKGDSGAPGERGPPGAGGPPGPRGGAGPPGPEGGKGAAGPPGPPGSAGTPGLQGMPGERGGPGGPGPKGDKGEPGSSGVDGAPGKDGPRGPTGPIGPPGPAGQPGDKGESGAPGVPGIAGPRGGPGERGEQGPPGPAGFPGAPGQNGEPGAKGERGAPGEKGEG------------------------------------------------------------------------------------------------------GPPGAAG---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------PPGKDGASGHPGPIGPPGPRGNRGERGSEGSPGHPGQPGPPGPPGAPGPCCGAGGVAAIAGVGAEKAGGFAPYYG'},
    'TRYUP-Sus_sco': {'species': 'Bos taurus','offset': 0, 'sequence': 'IVGGYTCAANSIPYQVSLNSGSHFCGGSLINSQWVVSAAHCYKSRIQVRLGEHNIDVLEGNEQFINAAKIITHPNFNGNTLDNDIMLIKLSSPATLNSRVATVSLPRSCAAAGTECLISGWGNTKSSGSSYPSLLQCLKAPVLSDSSCKSSYPGQITGNMICVGFLEGGKDSCQGDSGGPVVCNGQLQGIVSWGYGCAQKNKPGVYTKVCNYVNWIQQTIAAN'},
    'AHSG-Bos_tau' : {'species': 'Bos taurus','offset': 0, 'sequence':'IPLDPVAGYKEPACDDPDTEQAALAAVDYINKHLPRGYKHTLNQIDSVKVWPRRPTGEVYDIEIDTLETTCHVLDPTPLANCSVRQQTQHAVEGDCDIHVLKQDGQFSVLFTKCDSSPDSAEDVRKLCPDCPLLAPLNDSRVVHAVEVALATFNAESNGSYLQLVEISRAQFVPLPVSVSVEFAVAATDCIAKEVVDPTKCNLLAEKQYGFCKGSVIQKALGGEDVRVTCTLFQTQPVIPQPQPDGAEAEAPSAVPDAAGPTPSAAGPPVASVVVGPSVVAVPLPLHRAHYDLRHTFSGVASVESSSGEAFHVGKTPIVGQPSIPGGPVRLCPGRIRYFKI'},
    'AHSG-Bub_bub' : {'species': 'Bubalus bubalis','offset': 0, 'sequence':'IPLDPVAGYKEPACDDPDTEQAALAAVDYINKHLPRGYKHTLNQIDSVKVWPRRPTGEVYDIEIDTLETTCHVLDPTPLANCSVRQQTEHAVEGDCDIHVLKQDGQFSVLFTKCDSSPDSAEDVRKLCPDCPLLAPLNDSRVVHAVEVALATFNAQSNGSYLQLVEISRAQFVPLPASVSVEFAVAATDCIAKDVVDPTKCNLLAEKQYGFCKGSVIQKALGGEDVAVTCTLFQTQPVILQPQPDGAEAGAPSAVPDAAGPAPSAAGPPVASVVVGPSVVAVPLPLHRAHYDLRHTFSGVASVESASGEAFHVGKTPIVGQPSVPGGPVRLCPGRIRYFKI'},
    'AHSG-Equ_cab' : {'species': 'Equus caballus','offset': 1, 'sequence':' LPNGLSPAYRQLNCDDPETEQAALLAVDYINSHIHQGYKHVLNQIDKVQVWAQPTGESFKLEIDTLETTCHALDPTPLANCSVRQLTQHAVEGDCDVRLLKQNGQFSVSFVKCKSSPDSAEDVRKVCLDCPLLAPLNDTRVVHAVEAALAAFNAQNNGSYFQLVEISRAQLVPLPVSVHVEFAVAATDCVAKEVIDPAKCNLLAEKQYGFCKATLTEKVGGEDVAVTCTVFQTQPVVLLPQPDGPDVGVPGPVADAVTPAPSPADLPVASLVVGPVVVAASQLPPPVHRAHYDLRHAFAGVGSGESASGEAFHVEKPPKVAHPNTAAAAGPVVRPCPGRIRYFKII'},
    'AHSG-Ovi_are' : {'species': 'Ovis aries','offset': 1, 'sequence':'IPLDPIAGYKEPACDDPDTEQAALAAVDYINKHLPRGYKHTLNQIDSVKVWPRRPTGEVYDIEIDTLETTCHVLDPTPLVNCSVRQQTEHAVEGDCDIHVLKQDGQFSVLFTKCDSSPDSAEDVRKLCPDCPLLAPLNNSQVVHAAEVALATFNAQNNGSYFQLVEISRAQFVPLPGSVSVEFAVAATDCIAKEVVDPTKCNLLAEKQYGFCKGSVIQKALGGEDVTVTCTLFQTQPVIPQPQPEGAEAGAPSAVPDAAVPDAAVPAPSAAGLPVGSVVAGPSVVAVPLPLHRAHYDLRHTFSGVASVESASGEAFHVGKTPIVGQPSVPGGPVHLCPGRIRYFKI'},
    'AHSG-Cap_hir' : {'species': 'Capra hircus','offset': 1, 'sequence':'IPLDPIAGYKEPACDDPDTEQAALAAVDYINKHLPRGYKHTLNQIDSVKVWPRRPTGEVYDIEIDTLETTCHVLDPTPLANCSVRQQTEHAVEGDCDIHVLKQDGQFSVLFTKCDSSPDSAEDVRKLCPDCPLLAPLNNSQVVHAAEVALATFNAQNNGSYFQLVEISRAQFVPLPVSVSVEFAVAATDCIAKEVVDPTKCNLLAEKQYGFCKGSVIQKALGGEDVAVTCTLFQTQPVIPQPQPEGAEAGAPSAVPDAAVPAPSAAGLPVGSVVAGPSVVAVALPLHRAHYDLRHTFSGVASVESASGEAFHVGKTPIVGQPSVPGGPVHLCPGRIRYFKI'},
    'AHSG-Equ_asn' : {'species': 'Equus asinus','offset': 1, 'sequence':' LPNGLSPAYRQLNCDDPETEQAALLAVDYINSHIHQGYKHVLNQIDKVQVWAQPTGESFKLEIDTLETTCHALDPTPLANCSVRQLTQHAVEGDCDVRLLKQNGQFSVSFVKCKSSPDSAEDVRKVCLDCPLLAPLNDTRVVHAVEAALAAFNAQNNGSYFQLVEISRAQLVPLPVSVHVEFAVAATDCVAKEVIDPAKCNLLAEKQYGFCKATLTEKVGGEDVAVTCTVFQTQPVVPLPQPDGPDVGVPGPVADAATPEPSPADLPVASLVVGPVVVAAPQLPPPVHRAHYDLRHAFAGVGSGESASGEAFHVEKPPKVAHPNTAAAAGPVVRPCPGRIRYFKI'},
    'TPM1-Cam_dro' : {'species': 'Camelus dromarius','offset': 0, 'sequence':'MDAIKKKMQMLKLDKENAIDRAEQAEADKKQAEDRCKQLEEEQQALQKKLKGTEDEVEKYSESVKDAQEKLEQAEKKATDAEADVASLNRRIQLVEEELDRAQERLATALQKLEEAEKAADESERGMKVIENRAMKDEEKMELQEMQLKEAKHIAEDSDRKYEEVARKLVILEGELERSEERAEVAESRARQLEEELRTMDQALKSLMASEEEEGLGLPNSICCHSHNFALLFSPPPLPHCAISTVSPTVSSHPPPPHTPCSKCGDLEEELKIVTNNLKSLEAQADKYSTKEDKYEEEIKLLEEKLKEAETRAEFAERSVAKLEKTIDDLEDEVYAQKMKYKAISEELDNALNDITSL'},
    'KS1-Hom_sap' : {'species': 'Homo sapiens','offset': 0, 'sequence':'SRQFSSRSGYRSGGGFSSGSAGIINYQRRTTSSSTRRSGGGGGRFSSCGGGGGSFGAGGGFGSRSLVNLGGSKSISISVARGGGRGSGFGGGYGGGGFGGGGFGGGGFGGGGIGGGGFGGFGSGGGGFGGGGFGGGGYGGGYGPVCPPGGIQEVTINQSLLQPLNVEIDPEIQKVKSREREQIKSLNNQFASFIDKVRFLEQQNQVLQTKWELLQQVDTSTRTHNLEPYFESFINNLRRRVDQLKSDQSRLDSELKNMQDMVEDYRNKYEDEINKRTNAENEFVTIKKDVDGAYMTKVDLQAKLDNLQQEIDFLTALYQAELSQMQTQISETNVILSMDNNRSLDLDSIIAEVKAQYEDIAQKSKAEAESLYQSKYEELQITAGRHGDSVRNSKIEISELNRVIQRLRSEIDNVKKQISNLQQSISDAEQRGENALKDAKNKLNDLEDALQQAKEDLARLLRDYQELMNTKLALDLEIATYRTLLEGEESRMSGECAPNVSVSVSTSHTTISGGGSRGGGGGGYGSGGSSYGSGGGSYGSGGGGGGGRGSYGSGGSSYGSGGGSYGSGGGGGGHGSYGSGSSSGGYRGGSGGGGGGSSGGRGSGGGSSGGSIGGRGSSSGGVKSSGGSSSVKFVSTTYSGVTR'},
    'KS9-Hom_sap' : {'species': 'Homo sapiens','offset': 0, 'sequence':'SRQFSSRSGYRSGGGFSSGSAGIINYQRRTTSSSTRRSGGGGGRFSSCGGGGGSFGAGGGFGSRSLVNLGGSKSISISVARGGGRGSGFGGGYGGGGFGGGGFGGGGFGGGGIGGGGFGGFGSGGGGFGGGGFGGGGYGGGYGPVCPPGGIQEVTINQSLLQPLNVEIDPEIQKVKSREREQIKSLNNQFASFIDKVRFLEQQNQVLQTKWELLQQVDTSTRTHNLEPYFESFINNLRRRVDQLKSDQSRLDSELKNMQDMVEDYRNKYEDEINKRTNAENEFVTIKKDVDGAYMTKVDLQAKLDNLQQEIDFLTALYQAELSQMQTQISETNVILSMDNNRSLDLDSIIAEVKAQYEDIAQKSKAEAESLYQSKYEELQITAGRHGDSVRNSKIEISELNRVIQRLRSEIDNVKKQISNLQQSISDAEQRGENALKDAKNKLNDLEDALQQAKEDLARLLRDYQELMNTKLALDLEIATYRTLLEGEESRMSGECAPNVSVSVSTSHTTISGGGSRGGGGGGYGSGGSSYGSGGGSYGSGGGGGGGRGSYGSGGSSYGSGGGSYGSGGGGGGHGSYGSGSSSGGYRGGSGGGGGGSSGGRGSGGGSSGGSIGGRGSSSGGVKSSGGSSSVKFVSTTYSGVTR'},
    'KS2-Hom_sap' : {'species': 'Homo sapiens','offset': 0, 'sequence':'SRQFSSRSGYRSGGGFSSGSAGIINYQRRTTSSSTRRSGGGGGRFSSCGGGGGSFGAGGGFGSRSLVNLGGSKSISISVARGGGRGSGFGGGYGGGGFGGGGFGGGGFGGGGIGGGGFGGFGSGGGGFGGGGFGGGGYGGGYGPVCPPGGIQEVTINQSLLQPLNVEIDPEIQKVKSREREQIKSLNNQFASFIDKVRFLEQQNQVLQTKWELLQQVDTSTRTHNLEPYFESFINNLRRRVDQLKSDQSRLDSELKNMQDMVEDYRNKYEDEINKRTNAENEFVTIKKDVDGAYMTKVDLQAKLDNLQQEIDFLTALYQAELSQMQTQISETNVILSMDNNRSLDLDSIIAEVKAQYEDIAQKSKAEAESLYQSKYEELQITAGRHGDSVRNSKIEISELNRVIQRLRSEIDNVKKQISNLQQSISDAEQRGENALKDAKNKLNDLEDALQQAKEDLARLLRDYQELMNTKLALDLEIATYRTLLEGEESRMSGECAPNVSVSVSTSHTTISGGGSRGGGGGGYGSGGSSYGSGGGSYGSGGGGGGGRGSYGSGGSSYGSGGGSYGSGGGGGGHGSYGSGSSSGGYRGGSGGGGGGSSGGRGSGGGSSGGSIGGRGSSSGGVKSSGGSSSVKFVSTTYSGVTRR'},
    'ALB-Bos_tau' : {'species': 'Bos taurus','offset': 0, 'sequence':'MKWVTFISLLLLFSSAYSRGVFRRDTHKSEIAHRFKDLGEEHFKGLVLIAFSQYLQQCPFDEHVKLVNELTEFAKTCVADESHAGCEKSLHTLFGDELCKVASLRETYGDMADCCEKQEPERNECFLSHKDDSPDLPKLKPDPNTLCDEFKADEKKFWGKYLYEIARRHPYFYAPELLYYANKYNGVFQECCQAEDKGACLLPKIETMREKVLASSARQRLRCASIQKFGERALKAWSVARLSQKFPKAEFVEVTKLVTDLTKVHKECCHGDLLECADDRADLAKYICDNQDTISSKLKECCDKPLLEKSHCIAEVEKDAIPENLPPLTADFAEDKDVCKNYQEAKDAFLGSFLYEYSRRHPEYAVSVLLRLAKEYEATLEECCAKDDPHACYSTVFDKLKHLVDEPQNLIKQNCDQFEKLGEYGFQNALIVRYTRKVPQVSTPTLVEVSRSLGKVGTRCCTKPESERMPCTEDYLSLILNRLCVLHEKTPVSEKVTKCCTESLVNRRPCFSALTPDETYVPKAFDEKLFTFHADICTLPDTEKQIKKQTALVELLKHKPKATEEQLKTVMENFVAFVDKCCAADDKEACFAVEGPKLVVSTQTALA'},
    'ALB-Equ_cab' : {'species': 'Equus caballus','offset': 0, 'sequence':'MKWVTFVSLLFLFSSAYSRGVLRRDTHKSEIAHRFNDLGEKHFKGLVLVAFSQYLQQCPFEDHVKLVNEVTEFAKKCAADESAENCDKSLHTLFGDKLCTVATLRATYGELADCCEKQEPERNECFLTHKDDHPNLPKLKPEPDAQCAAFQEDPDKFLGKYLYEVARRHPYFYGPELLFHAEEYKADFTECCPADDKLACLIPKLDALKERILLSSAKERLKCSSFQNFGERAVKAWSVARLSQKFPKADFAEVSKIVTDLTKVHKECCHGDLLECADDRADLAKYICEHQDSISGKLKACCDKPLLQKSHCIAEVKEDDLPSDLPALAADFAEDKEICKHYKDAKDVFLGTFLYEYSRRHPDYSVSLLLRIAKTYEATLEKCCAEADPPACYRTVFDQFTPLVEEPKSLVKKNCDLFEEVGEYDFQNALIVRYTKKAPQVSTPTLVEIGRTLGKVGSRCCKLPESERLPCSENHLALALNRLCVLHEKTPVSEKITKCCTDSLAERRPCFSALELDEGYVPKEFKAETFTFHADICTLPEDEKQIKKQSALAELVKHKPKATKEQLKTVLGNFSAFVAKCCGREDKEACFAEEGPKLVASSQLALA'},
    'ALB-Equ_asn' : {'species': 'Equus asinus','offset': 0, 'sequence':'MKWVTFVSLLFLFSSAYFRGVLRRDTHKSEIAHRFNDLGEKHFKGLVLVAFSQYLQQCPFEDHVKLVNEVTEFAKKCAADESAENCDKSLHTLFGDKLCTVATLRATYGELADCCEKQEPERNECFLTHKDDHPNLPKLKPEPDAQCAAFQEDPDKFLGKYLYEVARRHPYFYGPELLFHAEEYKADFTECCPADDKAGCLIPKLDALKERILLSSAKERLKCSSFQKFGERAFKAWSVARLSQKFPKADFAEVSKIVTDLTKVHKECCHGDLLECADDRADLTKYICEHQDSISGKLKACCDKPLLQKSHCIAEVKEDDLPSDLPALAADFAEDKEICKHYKDAKDVFLGTFLYEYSRRHPDYSVSLLLRIAKTYEATLEKCCAEADPPACYATVFDQFTPLVEEPKSLVKKNCDLFEEVGEYDFQNALIVRYTKKAPQVSTPTLVEIGRTLGKVGSRCCKLPESERLPCSENHLALALNRLCVLHEKTPVSEKITKCCTDSLAERRPCFSALELDEGYIPKEFKAETFTFHADICTLPEDEKQIKKQSALAELVKHKPKATKEQLKTVLGNFSAFVAKCCGAEDKEACFAEEGPKLVASSQLALA'},
    'ALB-Sus_scr' : {'species': 'Sus scrofa','offset': 0, 'sequence':'MKWVTFISLLFLFSSAYSRGVFRRDTYKSEIAHRFKDLGEQYFKGLVLIAFSQHLQQCPYEEHVKLVREVTEFAKTCVADESAENCDKSIHTLFGDKLCAIPSLREHYGDLADCCEKEEPERNECFLQHKNDNPDIPKLKPDPVALCADFQEDEQKFWGKYLYEIARRHPYFYAPELLYYAIIYKDVFSECCQAADKAACLLPKIEHLREKVLTSAAKQRLKCASIQKFGERAFKAWSLARLSQRFPKADFTEISKIVTDLAKVHKECCHGDLLECADDRADLAKYICENQDTISTKLKECCDKPLLEKSHCIAEAKRDELPADLNPLEHDFVEDKEVCKNYKEAKHVFLGTFLYEYSRRHPDYSVSLLLRIAKIYEATLEDCCAKEDPPACYATVFDKFQPLVDEPKNLIKQNCELFEKLGEYGFQNALIVRYTKKVPQVSTPTLVEVARKLGLVGSRCCKRPEEERLSCAEDYLSLVLNRLCVLHEKTPVSEKVTKCCTESLVNRRPCFSALTPDETYKPKEFVEGTFTFHADLCTLPEDEKQIKKQTALVELLKHKPHATEEQLRTVLGNFAAFVQKCCAAPDHEACFAVEGPKFVIEIRGILA'},

    }



###Amino Acid cand PTM colours

In [None]:
# Amino acids colors
amino_acids_colors = {
    "I": "#009688", "V": "#8bc34a", "B": "009688", "L": "#009688",
    "F": "#507351", "C": "#ffeb3b", "M": "#ffeb3b", "A": "#bdd54e",
    "G": "#9e9e9e", "T": "#ffc75e", "W": "#f49272", "S": "#ffc107",
    "Y": "#30802f", "P": "#607d8b", "H": "#673ab7", "Z": "average",
    "Q": "#f44336", "E": "#f44336", "N": "#e81e63", "D": "#f44336",
    "X": "#9d9e9e", "K": "#701637", "R": "#bd3e04"
}

# PTM colors - if a colour is not defined, the PTM is not plotted

ptm_colors = {
    "Hydroxylation": "#DFDEE3",
    "Deamidated": "#8F0030",
    "DeamidationN" :"#FF0008",
    "DeamidationQ" :"#C41E23",
    "Oxidation": "#A1CF6B",
    "Dioxidation": "#7AC74F",
    "Trioxidation ": "#61AB36",
    "Carboxy": "#f44336",
    "Dehydrated": "#c6d8f5",
    "Ammonia-loss": "#9A879D",
    "Phospho": "#F4845F",
    "Thiophospho": "#F25C54",
    "Carboxymethyl": "#7A3B69",
    "Formyl": "#9A879D",
    "Pentose": "#CE4257",
    "Arg->Orn": "#563F1B",
    "Sulfide": "#F7F052"
}

##Hail Mary Matching
(for when only prayer will do) This attempts to find additional locations in sequence of peptides which do not match to the collagen dictionary

In [None]:
# Parameters for the Hail Mary approach
allowed_mismatches_short_peptide = 2  # Allowing up to 2 mismatches for shorter peptides
min_length_short_peptide = 10  # Minimum length for considering short peptides
allowed_mismatches_long_peptide = 3  # Allowing up to 3 mismatches for longer peptides
min_length_long_peptide = 15  # Minimum length for considering long peptides

## Functions

### Directory functions

In [None]:
def find_spectra_files(base_path):
    """
    Walk through the directory structure starting from base_path to find all
    pFind-Filtered.spectra files.

    Parameters:
    - base_path (str): The base directory to start the search from.

    Returns:
    - list: A list of paths to pFind-Filtered.spectra files.
    """
    spectra_files = []
    for root, dirs, files in os.walk(base_path):
        for file in files:
            if file == 'pFind-Filtered.spectra':
                spectra_files.append(os.path.join(root, file))
    return spectra_files

def combine_spectra_files(spectra_files):
    """
    Combine spectra files into a single pandas DataFrame.

    Parameters:
    - spectra_files (list): A list of paths to pFind-Filtered.spectra files.

    Returns:
    - DataFrame: A combined DataFrame of all spectra files.
    """
    df_list = []
    for file in spectra_files:
        df = pd.read_csv(file, sep='\t')  # Assuming the file is tab-separated
        df_list.append(df)
    combined_df = pd.concat(df_list, ignore_index=True)
    return combined_df

### Dictionary Manipulations

In [None]:
# def levenshtein_distance(s1, s2):
#     """
#     Calculate the Levenshtein distance between two strings.
#     The Levenshtein distance measures the minimum number of single-character edits
#     (insertions, deletions, or substitutions) required to change one word into the other.

#     Parameters:
#     - s1 (str): The first string to compare.
#     - s2 (str): The second string to compare.

#     Returns:
#     - int: The Levenshtein distance between the two strings.
#     """
#     if len(s1) < len(s2):
#         return levenshtein_distance(s2, s1)

#     if len(s2) == 0:
#         return len(s1)

#     previous_row = range(len(s2) + 1)
#     for i, c1 in enumerate(s1):
#         current_row = [i + 1]
#         for j, c2 in enumerate(s2):
#             insertions = previous_row[j + 1] + 1
#             deletions = current_row[j] + 1
#             substitutions = previous_row[j] + (c1 != c2)
#             current_row.append(min(insertions, deletions, substitutions))
#         previous_row = current_row

#     return previous_row[-1]

def simplified_mismatch(s1, s2): # replaces levenshtein_distance which runs tooo slow
    """
    Compare two strings of the same length and determine if they match,
    allowing for a specified number of mismatches depending on the length
    of the strings. Treats Isoleucine (I) and Leucine (L) as equivalent.

    Parameters:
    - s1 (str): The first string to compare.
    - s2 (str): The second string to compare.

    Returns:
    - bool: True if the strings match within the allowed number of mismatches, False otherwise.
    """
    # Define mismatch allowances based on length
    if len(s1) < 10:
        max_mismatches = 2
    elif len(s1) < 15:
        max_mismatches = 3
    elif len(s1) < 20:
        max_mismatches = 4
    else:
        max_mismatches = 0  # or another number, depending on your criteria for longer sequences

    mismatches = 0

    for c1, c2 in zip(s1, s2):
        if c1 != c2:
            # Treat I/L as equivalent
            if not ((c1 == 'I' and c2 == 'L') or (c1 == 'L' and c2 == 'I')):
                mismatches += 1
                if mismatches > max_mismatches:
                    return False

    return True

def extract_gene_identifier(full_gene_name):
    """
    Extracts the gene identifier from the full gene name.

    Parameters:
    - full_gene_name (str): The full gene name, e.g., 'COL3A1-Bos_mut'.

    Returns:
    - str: The extracted gene identifier, e.g., 'COL3A1'.
    """
    # Split the full gene name at the hyphen and return the first part
    return full_gene_name.split('-')[0]

def exact_or_near_exact_match(peptide, sequence_dict):
    """
    Attempt to find an exact or near-exact match for a peptide in the dictionary.

    Parameters:
    - peptide (str): The peptide sequence to match.
    - sequence_dict (dict): A dictionary with sequences as values.

    Returns:
    - str or None: The key of the matching sequence if found; None otherwise.
    """
    for key, details in sequence_dict.items():
        sequence = details['sequence']
        # Here you could implement near-exact matching criteria, e.g., length comparison
        if peptide == sequence:  # Exact match
            return key
    return None

def find_match_and_update(row, sequence_position_dict, NCP_peptide_positions_dict):
    """
    Update the row based on matches found in sequence_position_dict and NCP_peptide_positions_dict.

    Parameters:
    - row: A pandas Series representing a row in a DataFrame.
    - sequence_position_dict: A dictionary containing matched sequences with gene data.
    - NCP_peptide_positions_dict: A dictionary containing NCP peptide positions.

    Returns:
    - Updated row with match data from sequence_position_dict or NCP_peptide_positions_dict.
    """
    sequence = row['Sequence']
    # Check for a match in the sequence_position_dict
    if sequence in sequence_position_dict:
        # Use the extract_gene_identifier function to simplify gene name extraction
        gene = extract_gene_identifier(sequence_position_dict[sequence]['gene'])
        # Update 'GNC&spp' with the extracted gene, append if already exists
        row['GNC&spp'] = f"{gene}" if pd.isnull(row['GNC&spp']) else f"{row['GNC&spp']},{gene}"

    if sequence in NCP_peptide_positions_dict:
        gene = extract_gene_identifier(NCP_peptide_positions_dict[sequence]['gene'])
        # Append or set the gene identifier for NCP matches
        row['GNC&spp'] = f"{gene}" if pd.isnull(row['GNC&spp']) else f"{row['GNC&spp']},{gene}"

    return row



###Assignment Functions

In [None]:

def are_amino_acids_equivalent(aa1, aa2):
    """Check if two amino acids are equivalent, considering isobaric amino acids and deamidation."""
    if aa1 == aa2:
        return True  # Identical amino acids are always equivalent
    # Define equivalence groups
    equivalence_groups = [('I', 'L'), ('N', 'D'), ('Q', 'E')]
    for group in equivalence_groups:
        if (aa1 in group and aa2 in group):
            return True
    return False

def peptides_match_with_biochemical_equivalence(peptide1, peptide2, max_mismatches):
    """Compare two peptides with a given allowance for mismatches, considering biochemical equivalences."""
    mismatches = 0
    for aa1, aa2 in zip(peptide1, peptide2):
        if not are_amino_acids_equivalent(aa1, aa2):
            mismatches += 1
            if mismatches > max_mismatches:
                return False
    return True


def match_peptides_with_mismatches(peptides, sequence_dict, max_mismatches):
    """
    Match each peptide in a list to sequences in a dictionary, allowing for a specified number of mismatches.

    Parameters:
    - peptides (list of str): The list of peptide sequences to match.
    - sequence_dict (dict): A dictionary with sequences as values.
    - max_mismatches (int): The maximum number of mismatches allowed in the matching.

    Returns:
    - dict: A dictionary mapping peptides to their matched sequences and the number of mismatches.
    """
    matched_peptides = {}

    for peptide in peptides:
        for key, details in sequence_dict.items():
            sequence = details['sequence']
            mismatches = simplified_mismatch(peptide, sequence)
            if mismatches <= max_mismatches:
                matched_peptides[peptide] = {'match': key, 'mismatches': mismatches}
                break  # Stop searching after the first match for this peptide

    return matched_peptides


def find_peptide_positions_in_NCPs(non_sequences_dict, combined_df):
    """
    Identifies positions of peptide fragments in the Non-Collagen Proteins sequence from a DataFrame,
    including the gene name and position for each match.
    """
    unique_peptides = combined_df['Sequence'].unique().tolist()
    NCP_peptide_positions_dict = {}

    for gene, details in non_sequences_dict.items():
        protein_sequence = details['sequence']
        offset = details['offset']

        for peptide in unique_peptides:
            position = protein_sequence.find(peptide)
            if position != -1:
                corrected_position = position + offset
                # Use 'gene' for consistency
                NCP_peptide_positions_dict[peptide] = {'gene': gene, 'position': corrected_position}

    return NCP_peptide_positions_dict

def collagen_specific_criteria(peptide):
    """
    Determine if a peptide meets specific criteria for being considered a collagen peptide.

    Parameters:
    - peptide (str): The peptide sequence to evaluate.

    Returns:
    - bool: True if the peptide meets collagen-specific criteria, False otherwise.
    """
    # Example criterion: 'G' must appear at every third position in the sequence
    return all(peptide[i] == 'G' for i in range(len(peptide) % 3, len(peptide), 3))

def match_with_one_difference(unmatched_peptide, sequence_position_dict):
    """
    Find a peptide in sequence_position_dict with only one amino acid difference.

    Args:
        unmatched_peptide: The peptide to match.
        sequence_position_dict: Dictionary of peptides that have already been matched,
                          where keys are peptide sequences and values are dictionaries
                          containing 'gene' and 'position'.

    Returns:
        The key of the matched peptide in sequence_position_dict if a match is found;
        otherwise, returns None.
    """
    for matched_peptide in sequence_position_dict:
        if len(unmatched_peptide) == len(matched_peptide):
            # Count the number of differences
            differences = sum(1 for p, q in zip(unmatched_peptide, matched_peptide) if p != q)
            if differences == 1:
                return matched_peptide
    return None

def isobaric_and_deamidation_matching(peptide1, peptide2, max_diff=1):
    """
    Compares two peptides for similarity, allowing for isobaric amino acids (I/L) and
    potential deamidation (N/D, Q/E) as equivalent, up to a max difference.

    Parameters:
    - peptide1, peptide2: Peptides to compare.
    - max_diff: Maximum allowed differences (default 1).

    Returns:
    - Boolean indicating whether the peptides are considered similar under the given rules.
    """
    if len(peptide1) != len(peptide2):
        return False  # Peptides of different lengths are automatically not similar

    diff_count = 0
    for a, b in zip(peptide1, peptide2):
        if a == b:
            continue  # Identical amino acids
        elif (a in ['I', 'L'] and b in ['I', 'L']) or (a in ['N', 'D'] and b in ['N', 'D']) or (a in ['Q', 'E'] and b in ['Q', 'E']):
            continue  # Treat specific substitutions as equivalent
        else:
            diff_count += 1
            if diff_count > max_diff:
                return False  # Exceeds max allowed differences

    return True  # Peptides are similar within the allowed differences

def iteratively_match_peptides(peptides, sequence_dict, max_mismatches=2):
    """
    Attempts to iteratively match peptides with sequences, taking into account isobaric amino acids (I/L)
    and deamidation (N/D, Q/E), and allowing for an increasing number of mismatches up to a specified maximum.

    Parameters:
    - peptides (list): A list of peptide sequences to match.
    - sequence_dict (dict): A dictionary of sequences with their associated gene and position details.
    - max_mismatches (int): The maximum number of mismatches allowed in the matching process.

    Returns:
    - tuple: Two dictionaries, the first mapping matched peptides to their details, and the second containing unmatched peptides.
    """
    matched_peptides = {}
    unmatched_peptides = {pep: None for pep in peptides}  # Initialize all peptides as unmatched initially

    # Iteratively increase the allowed number of mismatches
    for allowed_mismatches in range(max_mismatches + 1):
        for peptide in list(unmatched_peptides.keys()):  # Work on a static list of current unmatched peptides
            for gene, details in sequence_dict.items():
                sequence = details['sequence']
                offset = details['offset']
                # Check if the peptide matches with current mismatch allowance
                if isobaric_and_deamidation_matching(peptide, sequence, max_diff=allowed_mismatches):
                    # If a match is found, calculate the position considering the offset
                    position = sequence.find(peptide) + offset
                    matched_peptides[peptide] = {'gene': gene, 'position': position}
                    del unmatched_peptides[peptide]  # Remove matched peptide from the unmatched list
                    break  # Proceed to the next peptide

    return matched_peptides, unmatched_peptides

def hail_mary_match(peptide, candidate_peptides, allowed_mismatches, min_length):
    """
    Attempt to match a peptide with candidates allowing for a specified number of mismatches,
    but only if the peptide meets a minimum length requirement.

    Parameters:
    - peptide (str): The peptide to match.
    - candidate_peptides (dict): Dictionary of peptides already matched with details.
    - allowed_mismatches (int): The number of mismatches allowed in the match. This parameter dynamically
      changes based on whether the peptide is considered short or long.
    - min_length (int): The minimum length of peptide to consider for matching. This parameter dynamically
      changes based on whether the peptide is being evaluated for short or long criteria.

    Returns:
    - str or None: The closest matching peptide if a match is found; otherwise, None.
    """
    if len(peptide) < min_length:
        return None  # Skip peptides shorter than the minimum length requirement.

    for candidate, details in candidate_peptides.items():
        if len(candidate) != len(peptide):
            continue  # Skip if lengths do not match.

        # Calculate the number of mismatches between the candidate and the peptide.
        mismatches = sum(1 for a, b in zip(candidate, peptide) if a != b)

        if mismatches <= allowed_mismatches:
            return candidate  # Return the first candidate that matches within the allowed mismatches.

    return None  # No suitable match found.


def find_collagen_matches(collagen_motif_peptides, sequences_dict):
    """
    Matches filtered peptides against collagen sequences.

    Parameters:
    - collagen_motif_peptides (list): A list of peptides filtered based on collagen-specific criteria.
    - sequences_dict (dict): A dictionary containing collagen sequences, keyed by some identifier (e.g., gene name).

    Returns:
    - dict: A dictionary with peptides as keys and details (gene, position) as values for the matches found.
    """
    sequence_position_dict = {}
    for peptide in collagen_motif_peptides:
        for gene, details in sequences_dict.items():
            sequence = details['sequence']
            offset = details.get('offset', 0)  # Assume offset is 0 if not specified
            position = sequence.find(peptide)
            if position != -1:  # Peptide found in the sequence
                corrected_position = position + offset  # Adjust for sequence offset
                sequence_position_dict[peptide] = {'gene': gene, 'position': corrected_position}
                break  # Exit loop after finding the first match for this peptide
    return sequence_position_dict


def parse_modifications(mod_str):
    """
    Parses a string containing modification data into a structured list.

    This function extracts the position, type of modification, and the amino acid
    involved from a given modification string. The expected format of the modification
    string is "position,Modification[AminoAcid];".

    Parameters:
        mod_str (str): A string representing modifications, or NaN/None for no modifications.

    Returns:
        list of tuples: A list where each tuple contains (position, modification type, amino acid).
    """
    # Check if mod_str is not a string (e.g., NaN represented as float in pandas)
    if not isinstance(mod_str, str):
        return []

    parsed_mods = []
    mods = mod_str.strip(';').split(';')
    for mod in mods:
        parts = mod.split(',')
        if len(parts) == 2:
            pos, mod_detail = parts
            try:
                mod_type, aa = mod_detail.split('[')
                aa = aa.rstrip(']')
                parsed_mods.append((int(pos), mod_type, aa))
            except ValueError:
                # Skip modification if parsing fails
                continue

    return parsed_mods

def parse_peptide(peptide):
    """
    Parses peptide strings to identify amino acids and extract them along with their positions.

    :param peptide: A string representation of a peptide.
    :return: A list of tuples, each representing an amino acid and its position in the peptide.
    """
    # A simple pattern that matches each amino acid in the sequence.
    aa_pattern = re.compile(r'[A-Z]')
    parsed_peptides = []
    pos = 1  # Position initialization

    for match in aa_pattern.finditer(peptide):
        aa = match.group()  # Extract the amino acid
        parsed_peptides.append((aa, pos))
        pos += 1  # Increment position for each amino acid

    return parsed_peptides

###Dataframe manipulation

In [None]:
def parse_and_update_df(sequence, sequence_position_dict): #combined_df
    """
    Check if the sequence matches an entry in the sequence_position_dict and return gene and position.

    Parameters:
    - sequence (str): The peptide sequence to check.
    - sequence_position_dict (dict): Dictionary with peptide sequences as keys and details as values.

    Returns:
    - tuple: (gene, position) if match is found; (None, None) otherwise.
    """
    if sequence in sequence_position_dict:
        # Extract gene and position from the matched entry
        gene = sequence_position_dict[sequence]['gene']
        position = sequence_position_dict[sequence]['start_position']
        return gene, position
    else:
        # Return None or placeholders if no match is found
        return None, None

def update_NCP_matches(row, NCP_peptide_positions_dict):
    """
    Update the row with NCP peptide positions if the sequence matches an NCP peptide.

    Parameters:
    - row: A pandas Series representing a row in a DataFrame.
    - NCP_peptide_positions_dict: A dictionary containing NCP peptide positions.

    Returns:
    - Updated row with NCP peptide match data.
    """
    sequence = row['Sequence']
    if sequence in NCP_peptide_positions_dict:
        match_data = NCP_peptide_positions_dict[sequence]
        # Check and process the gene name specifically if it exists
        if 'gene' in match_data:
            gene = match_data['gene'].split('-', 1)[0]  # Extract gene name before the underscore
            row['GNC&spp'] = gene  # Assuming you're updating to a gene-specific column
        else:
            for key, value in match_data.items():
                row[key] = value
    return row


# def find_match_and_update(row, sequence_position_dict, NCP_peptide_positions_dict):
#     """
#     Update the row based on matches found in sequence_position_dict and NCP_peptide_positions_dict.

#     Parameters:
#     - row: A pandas Series representing a row in a DataFrame.
#     - sequence_position_dict: A dictionary containing matched sequences with gene data.
#     - NCP_peptide_positions_dict: A dictionary containing NCP peptide positions.

#     Returns:
#     - Updated row with match data from sequence_position_dict or NCP_peptide_positions_dict.
#     """
#     sequence = row['Sequence']
#     if sequence in sequence_position_dict:
#         gene = sequence_position_dict[sequence]['gene']
#         if row['GNC&spp']:
#             row['GNC&spp'] = f"{row['GNC&spp']},{gene}"
#         else:
#             row['GNC&spp'] = gene
#     return row
def extract_protein_frequencies_and_associations(df, column_name):
    """
    Extracts the frequency of each protein ID and the five most common proteins
    listed with each ID from a specified column in a DataFrame.

    Parameters:
    df (pd.DataFrame): The DataFrame containing protein data.
    column_name (str): The name of the column with protein IDs separated by '/'.

    Returns:
    pd.DataFrame: A DataFrame with each protein's frequency and its five most common associations.
    """
    # Parse the specified column to create a list of proteins per row
    #df['Proteins_List'] = df[column_name].apply(lambda x: x.split('/'))
    df['Proteins_List'] = df[column_name].apply(lambda x: [p.split('&') for p in x.split('/')])

    # Initialize a dictionary to keep count of each protein's frequency
    protein_counts = Counter()
    # Initialize a dictionary to keep track of proteins listed together
    protein_associations = defaultdict(lambda: Counter())

    # Count frequencies and associations
    for proteins in df['Proteins_List']:
        for protein in proteins:
            protein_counts[protein] += 1
            # Add counts for other proteins in the list to the associations
            for associated_protein in proteins:
                if associated_protein != protein:
                    protein_associations[protein][associated_protein] += 1

    # Prepare the final report data
    report_data = [{
        'Protein': protein,
        'Frequency': count,
        'Most_Common_Associations': [assoc for assoc, _ in protein_associations[protein].most_common(5)]
    } for protein, count in protein_counts.items()]

    # Convert report_data to a DataFrame
    report_df = pd.DataFrame(report_data)

    return report_df

# def expand_df_to(df):
#     """
#     Expands a DataFrame by processing each peptide sequence and constructing detailed rows
#     for each amino acid within the peptide sequences, including modification information and
#     simplified sample name.

#     Parameters:
#         df (pd.DataFrame): A pandas DataFrame containing peptide information.

#     Returns:
#         pd.DataFrame: An expanded DataFrame with detailed amino acid and modification information.
#     """
#     expanded_rows = []
#     for _, row in df.iterrows():
#         start_position = row['start_position']
#         modifications = parse_modifications(row.get('Modification', ''))
#         mod_dict = {mod[0]: {'type': mod[1], 'aa': mod[2]} for mod in modifications}

#         amino_acids = parse_peptide(row['Sequence'])  # Assuming this returns [(aa, position_in_sequence), ...]
#         spp_GNC = row['GNC&spp'].split('_')
#         if len(spp_GNC) != 2:
#             continue
#         GNC_species, GNC = spp_GNC

#         file_name = row['File_Name']
#         simplified_file_name = "_".join(file_name.split("_")[:2])

#         for aa, pos in amino_acids:
#             adjusted_pos = pos - 1 + start_position
#             ptm_info = mod_dict.get(pos, {'type': '', 'aa': ''})
#             ptm = ptm_info['type']

#             expanded_row = {
#                 'id': None,  # Placeholder for ID if available or necessary
#                 'sample_name': simplified_file_name,
#                 'GNC': GNC,
#                 'GNC_species': GNC_species,
#                 'aa': aa,
#                 'position': adjusted_pos,
#                 'ptm': ptm,
#                 'confidence': None,
#                 'spectraId': row['Scan_No'],
#                 'mz': row['Exp.MH+'],
#                 'z': row['Charge'],
#                 'pepMass': row['Calc.MH+'],
#                 'err': row['Mass_Shift(Exp.-Calc.)'],
#                 'score': row['Final_Score'],
#                 'scanNum': row['Scan_No'],
#                 'peptide_gene_name': row['Proteins'],
#                 'RT': None,
#                 'ppm': None,
#             }
#             expanded_rows.append(expanded_row)

#     return pd.DataFrame(expanded_rows)

# import pandas as pd


def expand_df_to(df):
    """
    Expands a DataFrame by processing each peptide sequence and constructing detailed rows
    for each amino acid within the peptide sequences, including modification information and
    simplified sample name.

    Parameters:
        df (pd.DataFrame): A pandas DataFrame containing peptide information.

    Returns:
        pd.DataFrame: An expanded DataFrame with detailed amino acid and modification information.
    """
    expanded_rows = []
    # for _, row in df.iterrows():
    #     try:
    #         start_position = float(row['start_position'])
    #         if not start_position.is_integer():
    #             continue  # Skip if start_position is not a whole number
    #         start_position = int(start_position)
    #     except (ValueError, TypeError):
    #         continue  # Skip if start_position is not convertible to float
    for _, row in df.iterrows():
        try:
            # Directly convert start_position to an integer
            start_position = int(row['start_position'])
        except ValueError:
            # Skip this row if start_position is not a valid integer
            continue

        modifications = parse_modifications(row.get('Modification', ''))
        mod_dict = {mod[0]: {'type': mod[1], 'aa': mod[2]} for mod in modifications}
        amino_acids = parse_peptide(row['Sequence'])

        gnc_spp_split = row.get('GNC&spp', '').split('-', maxsplit=1)
        if len(gnc_spp_split) != 2:
            continue  # Skip if the split does not result in two parts

        GNC, spp = gnc_spp_split
        simplified_file_name = "_".join(row.get('File_Name', '').split("_")[:2])

        for aa, pos in amino_acids:
            mod_info = mod_dict.get(pos, {'type': '', 'aa': ''})
            expanded_row = {
                'id': None,
                'sample_name': simplified_file_name,
                'GNC': GNC,  #Genome Nomenclature
                'spp': spp,  #Species
                'aa': aa,    #amino acid
                'position': pos - 1 + start_position, #-1 as python counts from 0
                'ptm': mod_info['type'], # infomation on any post translational modification
                'confidence': None,  #confidence score for the amino acid
                'spectraId': row['Scan_No'], #the ID of the MS/MS scan number
                'mz': row['Exp.MH+'],  #mass of the peptide detected (m/z, mass over charge)
                'z': row['Charge'],   #charge
                'pepMass': row['Calc.MH+'],  #mass
                'err': row['Mass_Shift(Exp.-Calc.)'],  #error between the observed and calculated mass
                'score': row['Final_Score'], # score for the peptide - note this is difference from the 'confidence' which is for the individual amion acid
                'scanNum': row['Scan_No'], #scan number
                'peptide_gene_name': row['Proteins'], #full information about the ID from mapping algorithm (e.g. MaxQuant, pFind, Orthrus)
                'RT': None, #Retention time
                'ppm': None, #error in ppm
            }
            expanded_rows.append(expanded_row)

    return pd.DataFrame(expanded_rows)

### Export functions

In [None]:
def generate_sequence_with_gaps(group):
    """
    Generate a sequence with gaps for missing positions from a sorted group of sequences.

    Parameters:
    - group: A pandas DataFrame containing 'position' and 'aa' (amino acid) columns.

    Returns:
    - A string representing the sequence with gaps for missing positions.
    """
    group_sorted = group.sort_values('position')
    full_range = np.arange(group_sorted['position'].min(), group_sorted['position'].max() + 1)
    sequence_parts = []

    for position in full_range:
        if position not in group_sorted['position'].values:
            sequence_parts.append("-")
        else:
            aa_list = group_sorted.loc[group_sorted['position'] == position, 'aa'].unique()
            sequence_parts.append(f"({''.join(aa_list)})" if len(aa_list) > 1 else ''.join(aa_list))

    return ''.join(sequence_parts)

def format_fasta_header(row):
    """
    Format a FASTA header string based on the sample name and GNC from a DataFrame row.

    Parameters:
    - row: A pandas Series representing a row in a DataFrame.

    Returns:
    - A string representing the formatted FASTA header.
    """
    return f">{row['sample_name']}_{row['GNC'].replace(' ', '_')}"



###Plotting Functions

In [None]:
def find_cleavage_sites_with_offset(collagen_sequences: dict) -> dict:
    """
    Find the positions of the letters 'K' and 'R' in given protein sequences stored in a dictionary,
    adjusting for an offset provided for each sequence.

    Parameters:
        collagen_sequences (dict): A dictionary where each key is a unique identifier for a protein,
                                   and each value is another dictionary with keys 'offset' and 'sequence'
                                   representing the offset from the start of the protein and the amino
                                   acid sequence, respectively.

    Returns:
        dict: A dictionary where each key is the protein name (GNC) and each value is a list of adjusted
              positions (1-based) where either 'K' or 'R' occurs in the sequence, considering the offset.
    """
    cleavage_sites_results = {}

    for key, value in collagen_sequences.items():
        # Use the new extract_gene_identifier function to get the gene name
        protein_name = extract_gene_identifier(key)

        sequence = value['sequence']
        offset = value['offset']
        cleavage_sites = []

        for i, amino_acid in enumerate(sequence):
            if amino_acid in ['K', 'R']:
                position_with_offset = i + 1 + offset
                cleavage_sites.append(position_with_offset)

        cleavage_sites_results[protein_name] = cleavage_sites

    return cleavage_sites_results

### Protein aggregation functions

In [None]:
# def process_group(group):
#     # Get the GNC
#     gnc = group['GNC'].unique()[0]

#     # Get the position
#     position = group['position'].unique()[0]

#     # Get the sample_name
#     sample_name = group['sample_name'].unique()[0]

#     # Get the counts for each amino acid
#     aa_counts = group['aa'].value_counts()

#     # Get the top 3 amino acids
#     top_3_aas = aa_counts.index[:3].tolist()

#     # Create a dictionary to store the results
#     result = {
#         'GNC': gnc,
#         'position': position,
#         'sample_name': sample_name
#     }

#     for i, aa in enumerate(top_3_aas, start=1):
#         aa_group = group[group['aa'] == aa]
#         result[f'aa{i}'] = aa
#         result[f'n_aa{i}'] = aa_counts[aa]
#         result[f'aa{i}_score_mean'] = aa_group['score'].mean()
#         result[f'aa{i}_score_max'] = aa_group['score'].max()
#         result[f'aa{i}_score_std'] = aa_group['score'].std()

#         # Get the top 3 scores and spectraIds for the amino acid
#         top_scores = aa_group.nlargest(3, 'score')[['score', 'spectraId']]
#         for j, (score, spectra_id) in enumerate(top_scores.itertuples(index=False), start=1):
#             result[f'score{j}_aa{i}'] = score
#             result[f'spectraId{j}_aa{i}'] = spectra_id

#         # Get the top 2 PTMs for the amino acid
#         ptm_counts = aa_group['ptm'].value_counts()
#         top_ptms = ptm_counts.index[:2].tolist()

#         for j, ptm in enumerate(top_ptms, start=1):
#             ptm_group = aa_group[aa_group['ptm'] == ptm]
#             result[f'ptm{j}_aa{i}'] = ptm
#             result[f'%ptm{j}_aa{i}'] = (ptm_group.shape[0] / aa_group.shape[0]) * 100
#             result[f'ptm{j}_aa{i}_score_mean'] = ptm_group['score'].mean()
#             result[f'ptm{j}_aa{i}_score_max'] = ptm_group['score'].max()
#             result[f'ptm{j}_aa{i}_score_std'] = ptm_group['score'].std()

#             # Get the top score and spectraId for the PTM
#             top_ptm_score = ptm_group.nlargest(1, 'score')[['score', 'spectraId']]
#             result[f'score1_ptm{j}_aa{i}'] = top_ptm_score['score'].values[0]
#             result[f'spectraId1_ptm{j}_aa{i}'] = top_ptm_score['spectraId'].values[0]

#     return pd.Series(result)
import pandas as pd

def process_group(group):
    """
    Process a pandas DataFrame group to extract specific analytics and return results as a pd.Series.

    This function computes various metrics including the unique GNC, position, and
    sample_name values, counts for each amino acid, and detailed statistics for the
    top 3 amino acids. It includes mean, max, and standard deviation of scores,
    top scores and spectraIds, and PTM analysis. The result is returned as a pd.Series
    to facilitate easy integration with pandas data structures.

    Parameters:
    group (pandas.DataFrame): The DataFrame group to process.

    Returns:
    pandas.Series: A Series containing the computed metrics and statistics.
    """

    gnc, position, sample_name = group['GNC'].iloc[0], group['position'].iloc[0], group['sample_name'].iloc[0]

    aa_counts = group['aa'].value_counts()
    top_3_aas = aa_counts.index[:3]

    result = {'GNC': gnc, 'position': position, 'sample_name': sample_name}

    for i, aa in enumerate(top_3_aas, start=1):
        aa_group = group[group['aa'] == aa]
        scores = aa_group['score']
        top_scores = aa_group.nlargest(3, 'score')[['score', 'spectraId']]

        ptm_counts = aa_group['ptm'].value_counts()
        top_ptms = ptm_counts.index[:2]

        aa_info = {
            f'aa{i}': aa,
            f'n_aa{i}': aa_counts[aa],
            f'aa{i}_score_mean': scores.mean(),
            f'aa{i}_score_max': scores.max(),
            f'aa{i}_score_std': scores.std(),
        }

        aa_info.update({
            **{f'score{j}_aa{i}': score for j, (score, _) in enumerate(top_scores['score'], start=1)},
            **{f'spectraId{j}_aa{i}': spectra_id for j, (_, spectra_id) in enumerate(top_scores['spectraId'], start=1)}
        })

        for j, ptm in enumerate(top_ptms, start=1):
            ptm_group = aa_group[aa_group['ptm'] == ptm]
            scores = ptm_group['score']
            top_ptm_score = ptm_group.nlargest(1, 'score')[['score', 'spectraId']].iloc[0]

            aa_info.update({
                f'ptm{j}_aa{i}': ptm,
                f'%ptm{j}_aa{i}': (ptm_group.shape[0] / aa_group.shape[0]) * 100,
                f'ptm{j}_aa{i}_score_mean': scores.mean(),
                f'ptm{j}_aa{i}_score_max': scores.max(),
                f'ptm{j}_aa{i}_score_std': scores.std(),
                f'score1_ptm{j}_aa{i}': top_ptm_score['score'],
                f'spectraId1_ptm{j}_aa{i}': top_ptm_score['spectraId']
            })

        result.update(aa_info)

    return pd.Series(result)

In [None]:
def get_top_n(df, column, n=3):
    """
    Returns the top N items from a dataframe based on the count of a specified column.
    """
    return df[column].value_counts().nlargest(n).reset_index().rename(columns={'index': column, column: f'n_{column}'})

def get_scores_info(df, aa, prefix):
    """
    Aggregate score information for a given amino acid.
    """
    filtered_df = df[df['aa'] == aa]
    top_scores = filtered_df.nlargest(3, 'score')[['score', 'spectraId']]
    score_info = {
        f'{prefix}_score_mean': filtered_df['score'].mean(),
        f'{prefix}_score_max': filtered_df['score'].max(),
        f'{prefix}_score_SD': filtered_df['score'].std(),
    }
    for i, (score, spectraId) in enumerate(zip(top_scores['score'], top_scores['spectraId']), start=1):
        score_info.update({
            f'{prefix}_score{i}': score,
            f'{prefix}_spectraId{i}': spectraId,
        })
    return score_info

def get_ptm_info(df, aa, prefix):
    """
    Aggregate PTM information for a given amino acid.
    """
    # Filtering for non-null PTMs
    filtered_df = df[(df['aa'] == aa) & (df['ptm'].notnull())]
    if filtered_df.empty:
        return {f'{prefix}_ptm': None}  # Return None or appropriate value if no PTMs are found

    # Assuming PTM data is categorical and can be directly counted
    top_ptms = filtered_df['ptm'].value_counts().nlargest(2).index
    ptm_info = {}
    for i, ptm in enumerate(top_ptms, start=1):
        ptm_group = filtered_df[filtered_df['ptm'] == ptm]
        ptm_info.update({
            f'{prefix}_ptm{i}': ptm,
            f'{prefix}_ptm{i}_count': ptm_group.shape[0],
            f'{prefix}_ptm{i}_percentage': (ptm_group.shape[0] / filtered_df.shape[0]) * 100,
            # Additional PTM-specific stats can be added here
        })

    return ptm_info

def aggregate_info(group):
    """
    Aggregates detailed information for each group of data.
    """
    top_aas = get_top_n(group, 'aa')
    aggregated_data = []

    for _, row in top_aas.iterrows():
        aa = row['aa']
        aa_info = get_scores_info(group, aa, 'aa1')
        ptm_info = get_ptm_info(group, aa, 'aa1')

        record = {
            'aa': aa,
            'n_aa': row['n_aa'],
            **aa_info,
            **ptm_info
        }

        aggregated_data.append(record)

    return pd.DataFrame(aggregated_data)

###Enzymatic Cleavage Sites

In [None]:
# Merge sequence dictionaries
all_sequences = {**sequences_dict,
                # **non_sequences_dict
                 }
# Find cleavage sites considering the offsets
cleavage_sites_with_offset = find_cleavage_sites_with_offset(all_sequences)

for protein, sites in cleavage_sites_with_offset.items():
    print(f"{protein}: {sites}")

COL1A1: [9, 42, 50, 59, 62, 66, 75, 87, 90, 99, 108, 126, 132, 134, 144, 174, 183, 192, 219, 237, 252, 264, 270, 290, 291, 294, 309, 315, 327, 333, 342, 350, 360, 374, 386, 396, 408, 416, 420, 434, 453, 479, 498, 501, 507, 519, 531, 555, 564, 567, 573, 581, 585, 603, 618, 624, 648, 657, 684, 687, 704, 725, 729, 732, 740, 756, 780, 789, 792, 806, 816, 836, 848, 855, 858, 884, 888, 906, 915, 918, 927, 930, 933, 963, 974, 990, 992, 1030, 1036]
COL1A2: [-2, 11, 44, 52, 61, 64, 68, 77, 89, 92, 101, 110, 128, 134, 136, 146, 176, 194, 221, 239, 254, 266, 272, 292, 293, 311, 317, 325, 335, 344, 352, 362, 376, 388, 398, 410, 418, 422, 431, 455, 481, 485, 500, 503, 521, 533, 557, 566, 569, 575, 583, 587, 605, 620, 626, 650, 653, 656, 659, 689, 706, 727, 731, 734, 742, 758, 791, 794, 818, 838, 850, 857, 860, 886, 890, 908, 917, 920, 926, 929, 935, 965, 976, 979, 992, 1030]
COL3A1: [-9, 59, 65, 68, 72, 81, 93, 96, 101, 105, 114, 132, 138, 140, 150, 156, 180, 198, 225, 243, 258, 266, 270, 276, 279,

# Load files

In [None]:
# Set the base path as per your Google Drive structure
STUDY_NAME = 'CE&Tuuli'
PFIND_FOLDER = f'pFind_{STUDY_NAME}'
BASE_PATH = f'/content/drive/MyDrive/Colab_Notebooks/NovorCloud/{STUDY_NAME}/{PFIND_FOLDER}/'

##Data Path

### Muli-directories

In [None]:
# # Find pFind-Filtered.spectra files
# spectra_files = find_spectra_files(BASE_PATH)
# # Combine into a single DataFrame
# combined_df = combine_spectra_files(spectra_files)
# #Add the length of the peptide
# combined_df['peptide_length'] = combined_df['Sequence'].apply(len)
# #----------------------Reporting
# #combined_df.shape
# #combined_df.info()
# ##combined_df.describe()
# #combined_df['Sequence'].nunique()
# #combined_df['Miss.Clv.Sites'].nunique()
# combined_df.head()

### Single directory

In [None]:
#Find pFind-Filtered.spectra files
#spectra_files = find_spectra_files(BASE_PATH)

spectra_files = [f'/content/drive/MyDrive/Colab_Notebooks/NovorCloud/{STUDY_NAME}/{PFIND_FOLDER}/result/pFind-Filtered.spectra']
# Combine into a single DataFrame
combined_df = combine_spectra_files(spectra_files)
#Add the length of the peptide

combined_df['peptide_length'] = combined_df['Sequence'].apply(len)
#----------------------Reporting
#combined_df.shape
#combined_df.info()
##combined_df.describe()
combined_df['Sequence'].nunique()
#combined_df['Miss.Clv.Sites'].nunique()
#combined_df.head()

5093

###Find Collagen motif
Iterates through each peptide in unique_peptides.
Uses a generator expression within all() to check if every third character, starting from the offset determined by len(peptide) % 3, is a 'G'.
This method directly reflects the requirement for 'G' to be every third character after an initial offset and accounts for sequences that may not start with 'G' or might be truncated.

# Dictionary Matching

In [None]:
sequence_position_dict = {}  # Stores peptides found in sequences with gene and position
matched_peptides = set()

unique_peptides = combined_df['Sequence'].unique().tolist()  #: Extract Unique Peptides


collagen_motif_peptides = [peptide for peptide in unique_peptides if collagen_specific_criteria(peptide)] #Pre-filtering collagen peptides
#print (collagen_motif_peptides)

for peptide in collagen_motif_peptides:
    for gene, details in sequences_dict.items():
        sequence = details['sequence']
        offset = details['offset']
        position = sequence.find(peptide)
        if position != -1:  # Peptide found in the sequence
            corrected_position = position + offset  # Adjust for sequence offset
            sequence_position_dict[peptide] = {'gene': gene, 'position': corrected_position}
            break

ncp_peptides = [peptide for peptide in unique_peptides if peptide not in collagen_motif_peptides]  #The Non Collagenous Proteins peptides

for peptide in ncp_peptides:
    for gene, details in sequences_dict.items():
        sequence = details['sequence']
        offset = details['offset']
        position = sequence.find(peptide)
        if position != -1:  # Peptide found in the sequence
            corrected_position = position + offset  # Adjust for sequence offset
            sequence_position_dict[peptide] = {'gene': gene, 'position': corrected_position}
            break

# Calculate totals
total_matched = len(sequence_position_dict)
total_unmatched = len(unique_peptides) - total_matched

# Reporting
print(f"Total matched peptides round 1: {total_matched}")
print(f"Total unmatched peptides: {total_unmatched}")



Total matched peptides round 1: 964
Total unmatched peptides: 4129


In [None]:
def isobaric_and_deamidation_matching(peptide, subsequence, max_mismatches):
    """
    Compares two peptides for similarity, allowing for isobaric amino acids (I/L) and
    potential deamidation (N/D, Q/E) as equivalent, up to a max difference.

    Parameters:
    - peptide, subsequence: Peptides to compare.
    - max_diff: Maximum allowed differences (default 1).

    Returns:
    - Boolean indicating whether the peptides are considered similar under the given rules.
    """
    if len(peptide) != len(subsequence):
        return False  # Peptides of different lengths are automatically not similar

    diff_count = 0
    for a, b in zip(peptide, subsequence):
        if a == b:
            continue  # Identical amino acids
        elif (a in ['I', 'L'] and b in ['I', 'L']) or (a in ['N', 'D'] and b in ['N', 'D']) or (a in ['Q', 'E'] and b in ['Q', 'E']):
            continue  # Treat specific substitutions as equivalent
        else:
            diff_count += 1
            if diff_count > max_mismatches:
                return False  # Exceeds max allowed differences

    return True  # Peptides are similar within the allowed differences


def match_peptides_with_mismatches(peptides, sequences_dict, max_mismatches):
    matched_peptides = {}
    remaining_peptides = peptides[:]  # Create a copy of the peptides list for safe iteration

    for peptide in peptides:
        for gene, details in sequences_dict.items():
            sequence = details['sequence']
            offset = details.get('offset', 0)  # Use get to provide a default of 0 for offset
            found_match = False

            for start_idx in range(len(sequence) - len(peptide) + 1):
                subsequence = sequence[start_idx:start_idx+len(peptide)]

                if isobaric_and_deamidation_matching(peptide, subsequence, max_mismatches):
                    corrected_position = start_idx + offset
                    matched_peptides[peptide] = {'gene': gene, 'position': corrected_position}
                    found_match = True
                    break  # Found a match, stop checking this peptide against other sequences

            if found_match:
                remaining_peptides.remove(peptide)  # Safe to remove since we're not iterating over `remaining_peptides`
                break  # Move on to the next peptide

    return matched_peptides, remaining_peptides


In [None]:
# Combine collagen and NCP peptides after initial matching for iterative mismatch matching
all_peptides = collagen_motif_peptides + ncp_peptides
unmatched_peptides = [pep for pep in all_peptides if pep not in sequence_position_dict]

# Iteratively match with mismatches
for max_mismatches in range(1, 3):  # 1 and then 2 mismatches allowed
    matched, unmatched_peptides = match_peptides_with_mismatches(unmatched_peptides, sequences_dict, max_mismatches)
    sequence_position_dict.update(matched)  # Update matched peptides into the main dictionary
    if not unmatched_peptides:  # Break early if all peptides have been matched
        break

# Reporting
total_matched = len(sequence_position_dict)
total_unmatched = len(unmatched_peptides)
print(f"Total matched peptides after iterative matching: {total_matched}")
print(f"Total unmatched peptides after iterative matching: {total_unmatched}")


Total matched peptides after iterative matching: 2240
Total unmatched peptides after iterative matching: 2853


In [279]:
# Get the current timestamp in a format suitable for filenames
now = datetime.now().strftime("%Y-%m-%d_%H")  # Example format YYYY-MM-DD_HH-MM-SS

# Create the filename (consider adding timestamp or version for clarity)
filename = f'{directory_path}{now}_sequence_positions.json'

# Open the file in write mode ('w')
with open(filename, 'w') as json_file:
    # Use json.dump() to serialize the dictionary and write it to the file
    json.dump(sequence_position_dict, json_file, indent=4)  # Add indentation for readability

print(f'Dictionary saved to JSON file: {filename}')


Dictionary saved to JSON file: /content/drive/MyDrive/Colab_Notebooks/NovorCloud/CE&Tuuli/pFind_CE&Tuuli/result/2024-03-13_18_sequence_positions.json


###   Direct Matching for Collagen-Motif Peptides

In [272]:
peptides_to_examine = [
    'IITHPNFNGNTLDNDIMLIK',
    'LGEHNIDVLEGNEQFINAAK',
    'YSQGNVSAVGVTYDGHTALTR',
    'TPPAGVFYQGWSATPIANGSLGHDIHHPR',
    'GPPGPMGPPGIAGPPGESGR',
    'SDLEMQYETLQEELMALKK'
]


print("Peptides from sequence_position_dict:")
for peptide in peptides_to_examine:
    if peptide in sequence_position_dict:
        details = sequence_position_dict[peptide]
        # Check if 'gene' and 'position' keys exist before accessing
        gene = details.get('gene', 'Gene information not available')
        position = details.get('position', 'Position information not available')
        print(f"Peptide: {peptide}, Gene: {gene}, Position: {position}")
    else:
        print(f"Peptide: {peptide} not found in sequence_position_dict.")


Peptides from sequence_position_dict:
Peptide: IITHPNFNGNTLDNDIMLIK, Gene: TRYUP-Sus_sco, Position: 69
Peptide: LGEHNIDVLEGNEQFINAAK, Gene: TRYUP-Sus_sco, Position: 49
Peptide: YSQGNVSAVGVTYDGHTALTR not found in sequence_position_dict.
Peptide: TPPAGVFYQGWSATPIANGSLGHDIHHPR not found in sequence_position_dict.
Peptide: GPPGPMGPPGIAGPPGESGR, Gene: COL1A1-Bos_tau, Position: 816
Peptide: SDLEMQYETLQEELMALKK not found in sequence_position_dict.


In [None]:
# Identifying collagen-motif peptides not matched directly
unmatched_collagen_peptides = [peptide for peptide in collagen_motif_peptides if peptide not in sequence_position_dict]

# Mismatch-tolerant matching for collagen-motif peptides
collagen_mismatched_peptides = match_peptides_with_mismatches(
    peptides=unmatched_collagen_peptides,
    sequence_dict=sequences_dict,
    max_mismatches=1  # Adjust based on tolerance
)
# Update the sequence_position_dict with these peptides
for peptide, match_info in collagen_mismatched_peptides.items():
    sequence_position_dict[peptide] = match_info

    # Calculate totals
total_matched = len(sequence_position_dict)
total_unmatched = len(unique_peptides) - total_matched

# Reporting
print(f"Total matched peptides round 1: {total_matched}")
print(f"Total unmatched peptides: {total_unmatched}")


# Parameters for the Hail Mary approach
allowed_mismatches_short_peptide = 2  # Allowing up to 2 mismatches for shorter peptides
min_length_short_peptide = 10  # Minimum length for considering short peptides
allowed_mismatches_long_peptide = 3  # Allowing up to 3 mismatches for longer peptides
min_length_long_peptide = 15  # Minimum length for considering long peptides

# Attempt Hail Mary matching for unmatched collagen peptides
hail_mary_matches_short = {}
hail_mary_matches_long = {}
for peptide in unmatched_collagen_peptides:
    if peptide in sequence_position_dict:  # Skip already matched peptides
        continue
    # Hail Mary matching criteria
    if len(peptide) >= min_length_short_peptide:
        match = hail_mary_match(peptide, sequence_position_dict, allowed_mismatches_short_peptide, min_length_short_peptide)
        if match:
            hail_mary_matches_short[peptide] = sequence_position_dict[match]
    if len(peptide) >= min_length_long_peptide:
        match = hail_mary_match(peptide, sequence_position_dict, allowed_mismatches_long_peptide, min_length_long_peptide)
        if match:
            hail_mary_matches_long[peptide] = sequence_position_dict[match]

# Update the main dictionary with Hail Mary matches
sequence_position_dict.update(hail_mary_matches_short)
sequence_position_dict.update(hail_mary_matches_long)

# Calculate totals
total_matched = len(sequence_position_dict)
total_unmatched = len(unique_peptides) - total_matched

# Reporting
print(f"Total matched peptides (Hail Mary Matching): {total_matched}")
print(f"Total unmatched peptides: {total_unmatched}")


Total matched peptides round 1: 5093
Total unmatched peptides: 0
Total matched peptides (Hail Mary Matching): 5093
Total unmatched peptides: 0


In [None]:
def iteratively_match_peptides_to_sequences(peptides, sequences_dict, max_mismatches=2):
    """
    Iteratively matches peptides to sequences, allowing for a specified number of mismatches while
    considering biochemical equivalences such as isobaric amino acids and potential deamidation.

    Args:
        peptides (list of str): The list of peptide sequences to match.
        sequences_dict (dict): A dictionary with sequences as values and additional details like species and offset.
        max_mismatches (int): The maximum number of mismatches allowed in the matching process.

    Returns:
        dict: A dictionary mapping matched peptides to their matched sequences and the position, adjusted for offset.
    """
    sequence_position_dict = {}

    for peptide in peptides:
        for gene, details in sequences_dict.items():
            sequence = details['sequence']
            offset = details['offset']

            # Attempt to match the peptide within the sequence
            for i in range(len(sequence) - len(peptide) + 1):
                subsequence = sequence[i:i+len(peptide)]

                if isobaric_and_deamidation_matching(peptide, subsequence, max_mismatches):
                    # Correct the position for the offset and record the match
                    corrected_position = i + offset
                    sequence_position_dict[peptide] = {'gene': gene, 'position': corrected_position}
                    break  # Found a match, move on to the next peptide

        # If a match is found, it exits the loop for the current peptide and continues with the next one
        if peptide in sequence_position_dict:
            continue  # Move to the next peptide if the current one has been matched

    return sequence_position_dict


sequence_position_dict = {}  # Stores peptides found in sequences with gene and position
matched_peptides = set()

unique_peptides = combined_df['Sequence'].unique().tolist()  # Extract Unique Peptides

collagen_motif_peptides = [peptide for peptide in unique_peptides if collagen_specific_criteria(peptide)]  # Pre-filtering collagen peptides
print(collagen_motif_peptides)

# One-pass loop with mismatch handling
for peptide in collagen_motif_peptides:
  matched_collagen, unmatched_collagen = iteratively_match_peptides([peptide], sequences_dict, max_mismatches=1)
  sequence_position_dict.update(matched_collagen)

['GPVGPTGPVGAAGPSGPNGPPGPAGSR', 'GDAGPPGPAGPAGPPGPIGNVGAPGPK', 'GITGPIGPPGPAGAPGDKGEAGPSGPAGPTGAR', 'GFSGIQGPPGPPGSPGEQGPSGASGPAGPR', 'GETGPAGRPGEVGPPGPPGPAGEK', 'SGDRGETGPAGPAGPIGPVGAR', 'GAPGPQGPPGAPGPLGIAGLTGAR', 'GEPGPPGPAGAAGPAGNPGADGQPGAK', 'GAPGAIGAPGPAGANGDRGEAGPAGPAGPAGPR', 'GNDGATGAAGPPGPTGPAGPPGFPGAVGAK', 'GLTGPIGPPGPAGAPGDKGETGPSGPAGPTGAR', 'GITGPIGPPGPAGAPGDKGETGPSGPAGPTGAR', 'GPIGPPGPAGAPGDKGEAGPSGPAGPTGAR', 'GEPGPPGPAGAAGPAGNPGADGEPGAK', 'GLTGPIGPPGPAGAPGDKGEAGPSGPAGPTGAR', 'GLPGPPGSPGPQGFQGPPGEPGEPGASGPMGPR', 'GENGPVGPTGPVGAAGPSGPNGPPGPAGSR', 'GAPGADGPAGAPGTPGPQGIAGQR', 'GSDGSVGPVGPAGPIGSAGPPGFPGAPGPK', 'TGPIGPPGPAGAPGDKGEAGPSGPAGPTGAR', 'GLPGVAGSLGEPGPLGIAGPPGAR', 'GHNGLQGLPGLAGHHGDQGASGPVGPAGPR', 'GFSGLQGPPGPPGAPGEQGPSGASGPAGPR', 'SGDRGEAGPAGPAGPIGPVGAR', 'GPPGESGAAGPTGPIGSR', 'GETGPAGPAGPIGPVGAR', 'AGPPGPTGPAGPPGFPGAVGAK', 'GDSGPPGPAGPAGPPGPIGNVGAPGPK', 'GERGPPGESGAAGPTGPIGSR', 'GIPGVAGSIGEPGPIGIAGPPGAR', 'GNDGGPGAAGPPGPTGPAGPPGFPGAVGAK', 'GEPGPPGPAGFAGPPGADGQPGAK', 

KeyError: 'GPVGPTGPVGAAGPSGPNGPPGPAGSR'

In [None]:
genuinely_matched = [pep for pep, info in sequence_position_dict.items() if info['Gene'] != "Gene information not available"]
total_matched = len(genuinely_matched)
total_unmatched = len(unique_peptides) - total_matched

print(f"Total matched peptides round 1: {total_matched}")
print(f"Total unmatched peptides: {total_unmatched}")

KeyError: 'Gene'

In [None]:
peptides_to_examine = [
    'IITHPNFNGNTLDNDIMLIK',
    'LGEHNIDVLEGNEQFINAAK',
    'YSQGNVSAVGVTYDGHTALTR',
    'TPPAGVFYQGWSATPIANGSLGHDIHHPR',
    'GPPGPMGPPGIAGPPGESGR',
    'SDLEMQYETLQEELMALKK'
]


print("Peptides from sequence_position_dict:")
for peptide in peptides_to_examine:
    if peptide in sequence_position_dict:
        details = sequence_position_dict[peptide]
        # Check if 'gene' and 'position' keys exist before accessing
        gene = details.get('gene', 'Gene information not available')
        position = details.get('position', 'Position information not available')
        print(f"Peptide: {peptide}, Gene: {gene}, Position: {position}")
    else:
        print(f"Peptide: {peptide} not found in sequence_position_dict.")


Peptides from sequence_position_dict:
Peptide: IITHPNFNGNTLDNDIMLIK, Gene: TRYUP-Sus_sco, Position: 69
Peptide: LGEHNIDVLEGNEQFINAAK, Gene: TRYUP-Sus_sco, Position: 49
Peptide: YSQGNVSAVGVTYDGHTALTR, Gene: Gene information not available, Position: Position information not available
Peptide: TPPAGVFYQGWSATPIANGSLGHDIHHPR, Gene: Gene information not available, Position: Position information not available
Peptide: GPPGPMGPPGIAGPPGESGR, Gene: Gene information not available, Position: Position information not available
Peptide: SDLEMQYETLQEELMALKK, Gene: Gene information not available, Position: Position information not available


In [None]:
import random

random_entries = random.sample(list(sequence_position_dict.items()), 100)
for key, value in random_entries:
    print(f"Key: {key}, Value: {value}")

ValueError: Sample larger than population or is negative

In [None]:
#Peek at the output of the Dictionaries =============================== report

print(sequence_position_dict)
#  matched_peptides in sequence_position_dict  dictionary
for peptide, details in islice(sequence_position_dict.items(), 10):
    print(f"Peptide: {peptide}, Gene: {details['gene']}, Position: {details['position']}")

# print(NCP_peptide_positions_dict)
# #  matched_peptides is dictionary
# for peptide, details in islice(NCP_peptide_positions_dict.items(), 10):
#     print(f"Peptide: {peptide}, Gene: {details['gene']}, Position: {details['position']}")

{'GPVGPTGPVGAAGPSGPNGPPGPAGSR': {'gene': 'COL1A2-Bos_tau', 'position': 660}, 'GDAGPPGPAGPAGPPGPIGNVGAPGPK': {'gene': 'COL1A1-Bos_tau', 'position': 657}, 'GETGPAGRPGEVGPPGPPGPAGEK': {'gene': 'COL1A1-Bos_tau', 'position': 732}, 'SGDRGETGPAGPAGPIGPVGAR': {'gene': 'COL1A1-Bos_tau', 'position': 884}, 'GAPGPQGPPGAPGPLGIAGLTGAR': {'gene': 'COL3A1-BosxBos', 'position': 762}, 'GEPGPPGPAGAAGPAGNPGADGQPGAK': {'gene': 'COL1A1-Bos_tau', 'position': 192}, 'GAPGAIGAPGPAGANGDRGEAGPAGPAGPAGPR': {'gene': 'COL1A2-Bos_tau', 'position': 585}, 'GNDGATGAAGPPGPTGPAGPPGFPGAVGAK': {'gene': 'COL1A1-Bos_tau', 'position': 144}, 'GLTGPIGPPGPAGAPGDKGETGPSGPAGPTGAR': {'gene': 'COL1A1-Equ_cab', 'position': 585}, 'GPIGPPGPAGAPGDKGEAGPSGPAGPTGAR': {'gene': 'COL1A1-Bos_tau', 'position': 588}, 'GLTGPIGPPGPAGAPGDKGEAGPSGPAGPTGAR': {'gene': 'COL1A1-Bos_tau', 'position': 585}, 'GENGPVGPTGPVGAAGPSGPNGPPGPAGSR': {'gene': 'COL1A2-Bos_tau', 'position': 657}, 'GAPGADGPAGAPGTPGPQGIAGQR': {'gene': 'COL1A1-Bos_tau', 'position': 756}

### Step 4: Identify Unmatched Peptides

After attempting direct matching, identify peptides that remain unmatched:

In [None]:
# Identifying NCP peptides not matched directly
unmatched_ncp_peptides = [peptide for peptide in ncp_peptides if peptide not in sequence_position_dict]

# Mismatch-tolerant matching for NCP peptides
ncp_mismatched_peptides = match_peptides_with_mismatches(
    peptides=unmatched_ncp_peptides,
    sequence_dict=sequences_dict,
    max_mismatches=2  # Adjust based on tolerance for NCPs
)

# Update the sequence_position_dict with these peptides
for peptide, match_info in ncp_mismatched_peptides.items():
    sequence_position_dict[peptide] = match_info


# Calculate totals
total_matched = len(sequence_position_dict)
total_unmatched = len(unique_peptides) - total_matched

# Reporting
print(f"Total matched peptides (Hail Mary Matching): {total_matched}")
print(f"Total unmatched peptides: {total_unmatched}")

print (unmatched_peptides)

Total matched peptides (Hail Mary Matching): 5093
Total unmatched peptides: 0
[]


In [None]:
# Identify all remaining unmatched peptides
unmatched_peptides = [peptide for peptide in unique_peptides if peptide not in sequence_position_dict]

# Mismatch-tolerant matching for all remaining unmatched peptides
for peptide in unmatched_peptides:
    match_info = match_peptides_with_mismatches(
        peptides=[peptide],
        sequence_dict=sequences_dict,
        max_mismatches=1  # Or adjust based on specific criteria
    )
    if match_info:
        sequence_position_dict.update(match_info)

# Calculate totals
total_matched = len(sequence_position_dict)
total_unmatched = len(unique_peptides) - total_matched

# Reporting
print(f"Total matched peptides (direct and mismatch allowance): {total_matched}")
print(f"Total unmatched peptides: {total_unmatched}")

print (unmatched_peptides)


Total matched peptides (direct and mismatch allowance): 5093
Total unmatched peptides: 0
[]


In [None]:
#Peek at the output of the Dictionaries =============================== report

print(sequence_position_dict)
#  matched_peptides in sequence_position_dict  dictionary
for peptide, details in islice(sequence_position_dict.items(), 10):
    print(f"Peptide: {peptide}, Gene: {details['gene']}, Position: {details['position']}")

# print(NCP_peptide_positions_dict)
# #  matched_peptides is dictionary
# for peptide, details in islice(NCP_peptide_positions_dict.items(), 10):
#     print(f"Peptide: {peptide}, Gene: {details['gene']}, Position: {details['position']}")

{'GPVGPTGPVGAAGPSGPNGPPGPAGSR': {'gene': 'COL1A2-Bos_tau', 'position': 660}, 'GDAGPPGPAGPAGPPGPIGNVGAPGPK': {'gene': 'COL1A1-Bos_tau', 'position': 657}, 'GETGPAGRPGEVGPPGPPGPAGEK': {'gene': 'COL1A1-Bos_tau', 'position': 732}, 'SGDRGETGPAGPAGPIGPVGAR': {'gene': 'COL1A1-Bos_tau', 'position': 884}, 'GAPGPQGPPGAPGPLGIAGLTGAR': {'gene': 'COL3A1-BosxBos', 'position': 762}, 'GEPGPPGPAGAAGPAGNPGADGQPGAK': {'gene': 'COL1A1-Bos_tau', 'position': 192}, 'GAPGAIGAPGPAGANGDRGEAGPAGPAGPAGPR': {'gene': 'COL1A2-Bos_tau', 'position': 585}, 'GNDGATGAAGPPGPTGPAGPPGFPGAVGAK': {'gene': 'COL1A1-Bos_tau', 'position': 144}, 'GLTGPIGPPGPAGAPGDKGETGPSGPAGPTGAR': {'gene': 'COL1A1-Equ_cab', 'position': 585}, 'GPIGPPGPAGAPGDKGEAGPSGPAGPTGAR': {'gene': 'COL1A1-Bos_tau', 'position': 588}, 'GLTGPIGPPGPAGAPGDKGEAGPSGPAGPTGAR': {'gene': 'COL1A1-Bos_tau', 'position': 585}, 'GENGPVGPTGPVGAAGPSGPNGPPGPAGSR': {'gene': 'COL1A2-Bos_tau', 'position': 657}, 'GAPGADGPAGAPGTPGPQGIAGQR': {'gene': 'COL1A1-Bos_tau', 'position': 756}

In [None]:
peptides_to_examine = [
    'IITHPNFNGNTLDNDIMLIK',
    'LGEHNIDVLEGNEQFINAAK',
    'YSQGNVSAVGVTYDGHTALTR',
    'TPPAGVFYQGWSATPIANGSLGHDIHHPR',
    'GPPGPMGPPGIAGPPGESGR',
    'SDLEMQYETLQEELMALKK'
]


print("Peptides from sequence_position_dict:")
for peptide in peptides_to_examine:
    if peptide in sequence_position_dict:
        details = sequence_position_dict[peptide]
        # Check if 'gene' and 'position' keys exist before accessing
        gene = details.get('gene', 'Gene information not available')
        position = details.get('position', 'Position information not available')
        print(f"Peptide: {peptide}, Gene: {gene}, Position: {position}")
    else:
        print(f"Peptide: {peptide} not found in sequence_position_dict.")


Peptides from sequence_position_dict:
Peptide: IITHPNFNGNTLDNDIMLIK, Gene: TRYUP-Sus_sco, Position: 69
Peptide: LGEHNIDVLEGNEQFINAAK, Gene: TRYUP-Sus_sco, Position: 49
Peptide: YSQGNVSAVGVTYDGHTALTR, Gene: Gene information not available, Position: Position information not available
Peptide: TPPAGVFYQGWSATPIANGSLGHDIHHPR, Gene: Gene information not available, Position: Position information not available
Peptide: GPPGPMGPPGIAGPPGESGR, Gene: Gene information not available, Position: Position information not available
Peptide: SDLEMQYETLQEELMALKK, Gene: Gene information not available, Position: Position information not available


In [None]:
combined_df

Unnamed: 0,File_Name,Scan_No,Exp.MH+,Charge,Q-value,Sequence,Calc.MH+,Mass_Shift(Exp.-Calc.),Raw_Score,Final_Score,Modification,Specificity,Proteins,Positions,Label,Target/Decoy,Miss.Clv.Sites,Avg.Frag.Mass.Shift,Others,peptide_length
0,20240209-0359_QEHF2_1007819_ONJ_TR_MC_CE16.638...,6381,2300.162702,2,0.000000,IITHPNFNGNTLDNDIMLIK,2300.159002,0.003700,41.306849,1.528900e-10,"8,Deamidated[N];17,Oxidation[M];",3,5XW1_A/P00761.1/1AKS_A/1AVW_A/1FMG_A/1AN1_E/3M...,"77,K,L/77,K,L/69,K,L/69,K,L/69,K,L/69,K,L/69,K...",1|0|0|,target,0,0.000153,32,20
1,20240209-0359_QEHF2_1007819_ONJ_TR_MC_CE16.622...,6226,2299.176369,2,0.000000,IITHPNFNGNTLDNDIMLIK,2299.174986,0.001383,38.866520,2.943950e-10,"17,Oxidation[M];",3,5XW1_A/P00761.1/1AKS_A/1AVW_A/1FMG_A/1AN1_E/3M...,"77,K,L/77,K,L/69,K,L/69,K,L/69,K,L/69,K,L/69,K...",1|0|,target,0,0.000168,32,20
2,20240209-0359_QEHF2_1007819_ONJ_TR_MC_CE16.631...,6317,2300.164047,2,0.000000,IITHPNFNGNTLDNDIMLIK,2300.159002,0.005045,38.503685,5.677730e-10,"8,Deamidated[N];17,Oxidation[M];",3,5XW1_A/P00761.1/1AKS_A/1AVW_A/1FMG_A/1AN1_E/3M...,"77,K,L/77,K,L/69,K,L/69,K,L/69,K,L/69,K,L/69,K...",1|0|0|,target,0,0.000505,32,20
3,20240209-0359_QEHF2_1007819_ONJ_TR_MC_CE16.635...,6355,2300.162845,2,0.000000,IITHPNFNGNTLDNDIMLIK,2300.159002,0.003843,39.159320,6.162310e-10,"8,Deamidated[N];17,Oxidation[M];",3,5XW1_A/P00761.1/1AKS_A/1AVW_A/1FMG_A/1AN1_E/3M...,"77,K,L/77,K,L/69,K,L/69,K,L/69,K,L/69,K,L/69,K...",1|0|0|,target,0,-0.001049,32,20
4,20240209-0223_QEHF2_1007815_ONJ_TR_MC_CE12.446...,4465,2300.164123,2,0.000000,IITHPNFNGNTLDNDIMLIK,2300.159002,0.005121,39.978869,7.295450e-10,"8,Deamidated[N];17,Oxidation[M];",3,5XW1_A/P00761.1/1AKS_A/1AVW_A/1FMG_A/1AN1_E/3M...,"77,K,L/77,K,L/69,K,L/69,K,L/69,K,L/69,K,L/69,K...",1|0|0|,target,0,0.000511,32,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48111,20240209-0311_QEHF2_1007817_ONJ_TR_MC_CE14.368...,3684,1296.660347,2,0.009923,GFPGLPGPAGEPGK,1296.658220,0.002127,9.373214,3.387720e-01,"8,Oxidation[P];",3,XP_049416991.1/XP_033501735.1/XP_033501736.1/Q...,"954,R,P/954,R,P/900,R,P/925,R,Q/",1|0|,target,0,0.004944,36,14
48112,20240209-0311_QEHF2_1007817_ONJ_TR_MC_CE14.143...,1438,1845.901582,2,0.009923,TGPPGPSGISGPPGPPGPSGK,1845.897650,0.003932,13.103194,3.399330e-01,"3,Oxidation[P];4,Oxidation[P];13,Oxidation[P];",3,XP_028700691.1/XP_005550260.1/NP_001253266.1/,"702,R,E/794,R,E/794,R,E/",1|0|0|0|,target,0,0.002101,36,21
48113,20240209-0311_QEHF2_1007817_ONJ_TR_MC_CE14.521...,5211,1188.602257,2,0.009923,QPGFLGLADAR,1188.600710,0.001547,10.195329,3.404520e-01,"0,Formyl[AnyN-term];2,Oxidation[P];",1,OQX03951.1/,"68,G,I/",1|0|0|,target,0,0.001502,36,11
48114,20240209-0311_QEHF2_1007817_ONJ_TR_MC_CE14.672...,6724,1264.698667,2,0.009923,APGPLGIAGITGAR,1264.700750,-0.002083,7.586292,3.414180e-01,"2,Pro->pyro-Glu[P];",1,MBZ3891101.1/EFB27495.1/P04258.1/AAB59383.1/KA...,"557,G,G/916,G,G/786,G,G/82,G,G/756,G,G/936,G,G...",1|0|,target,0,0.001936,37,14


###Add Gene IDs and start postion to peptides

In [None]:
# Initialize new columns to avoid KeyError
combined_df['GNC&spp'] = None
combined_df['start_position'] = None

In [None]:
def annotate_row(row, positions_dict):
    """
    Annotate a single row with gene and start position based on a given positions dictionary.

    Parameters:
    - row: A row of a pandas DataFrame.
    - positions_dict: A dictionary with sequences as keys and dictionaries with 'gene' and 'position' as values.

    Returns:
    - row: The modified row with 'GNC&spp' and 'start_position' annotated if the sequence is found in positions_dict.
    """
    sequence = row['Sequence']
    if sequence in positions_dict:
        row['GNC&spp'] = positions_dict[sequence].get('gene')
        row['start_position'] = positions_dict[sequence].get('position')
    return row

# Apply the annotation function to each row of the DataFrame
combined_df = combined_df.apply(annotate_row, positions_dict=sequence_position_dict, axis=1)


In [None]:
# # Ensure all sequences are correctly annotated with 'GNC&spp' and 'position'
# for index, row in combined_df.iterrows():
#     sequence = row['Sequence']

#     # Check in NCP_peptide_positions_dict first
#     if sequence in NCP_peptide_positions_dict:
#         combined_df.at[index, 'GNC&spp'] = NCP_peptide_positions_dict[sequence].get('gene', None)
#         combined_df.at[index, 'start_position'] = NCP_peptide_positions_dict[sequence].get('position', None)
#     # Then, check in sequence_position_dict if not found in NCP_peptide_positions_dict
#     elif sequence in sequence_position_dict:
#         combined_df.at[index, 'GNC&spp'] = sequence_position_dict[sequence].get('gene', None)
#         combined_df.at[index, 'start_position'] = sequence_position_dict[sequence].get('position', None)


In [None]:
# # Identify NCP Matches and Update DataFrame
# NCP_peptide_positions_dict = find_peptide_positions_in_NCPs(non_sequences_dict, combined_df)
# combined_df = combined_df.apply(update_NCP_matches, axis=1, args=(NCP_peptide_positions_dict,))

# # Extract unique peptides without applying collagen-specific filters
# unique_peptides = combined_df['Sequence'].unique().tolist()

# # Apply collagen-specific filtering and matching logic
# collagen_motif_peptides = [peptide for peptide in unique_peptides if collagen_specific_criteria(peptide)]
# sequence_position_dict = find_collagen_matches(collagen_motif_peptides, sequences_dict)

# # Update DataFrame with collagen matches
# combined_df = combined_df.apply(update_collagen_matches, axis=1, args=(sequence_position_dict,))


In [None]:
combined_df.head()

Unnamed: 0,File_Name,Scan_No,Exp.MH+,Charge,Q-value,Sequence,Calc.MH+,Mass_Shift(Exp.-Calc.),Raw_Score,Final_Score,...,Proteins,Positions,Label,Target/Decoy,Miss.Clv.Sites,Avg.Frag.Mass.Shift,Others,peptide_length,GNC&spp,start_position
0,20240209-0359_QEHF2_1007819_ONJ_TR_MC_CE16.638...,6381,2300.162702,2,0.0,IITHPNFNGNTLDNDIMLIK,2300.159002,0.0037,41.306849,1.5289e-10,...,5XW1_A/P00761.1/1AKS_A/1AVW_A/1FMG_A/1AN1_E/3M...,"77,K,L/77,K,L/69,K,L/69,K,L/69,K,L/69,K,L/69,K...",1|0|0|,target,0,0.000153,32,20,TRYUP-Sus_sco,69.0
1,20240209-0359_QEHF2_1007819_ONJ_TR_MC_CE16.622...,6226,2299.176369,2,0.0,IITHPNFNGNTLDNDIMLIK,2299.174986,0.001383,38.86652,2.94395e-10,...,5XW1_A/P00761.1/1AKS_A/1AVW_A/1FMG_A/1AN1_E/3M...,"77,K,L/77,K,L/69,K,L/69,K,L/69,K,L/69,K,L/69,K...",1|0|,target,0,0.000168,32,20,TRYUP-Sus_sco,69.0
2,20240209-0359_QEHF2_1007819_ONJ_TR_MC_CE16.631...,6317,2300.164047,2,0.0,IITHPNFNGNTLDNDIMLIK,2300.159002,0.005045,38.503685,5.67773e-10,...,5XW1_A/P00761.1/1AKS_A/1AVW_A/1FMG_A/1AN1_E/3M...,"77,K,L/77,K,L/69,K,L/69,K,L/69,K,L/69,K,L/69,K...",1|0|0|,target,0,0.000505,32,20,TRYUP-Sus_sco,69.0
3,20240209-0359_QEHF2_1007819_ONJ_TR_MC_CE16.635...,6355,2300.162845,2,0.0,IITHPNFNGNTLDNDIMLIK,2300.159002,0.003843,39.15932,6.16231e-10,...,5XW1_A/P00761.1/1AKS_A/1AVW_A/1FMG_A/1AN1_E/3M...,"77,K,L/77,K,L/69,K,L/69,K,L/69,K,L/69,K,L/69,K...",1|0|0|,target,0,-0.001049,32,20,TRYUP-Sus_sco,69.0
4,20240209-0223_QEHF2_1007815_ONJ_TR_MC_CE12.446...,4465,2300.164123,2,0.0,IITHPNFNGNTLDNDIMLIK,2300.159002,0.005121,39.978869,7.29545e-10,...,5XW1_A/P00761.1/1AKS_A/1AVW_A/1FMG_A/1AN1_E/3M...,"77,K,L/77,K,L/69,K,L/69,K,L/69,K,L/69,K,L/69,K...",1|0|0|,target,0,0.000511,32,20,TRYUP-Sus_sco,69.0


In [None]:
# #To view the matches
# print("Collagen Matches:")
# for peptide, details in sequence_position_dict.items():
#     print(f"Peptide: {peptide}, Gene: {details['gene']}, Position: {details['position']}")
# print("\nNCP Matches:")
# for peptide, details in NCP_peptide_positions_dict.items():
#     print(f"Peptide: {peptide}, Gene: {details['gene']}, Position: {details['position']}")

In [None]:
# # Report on the most frequent proteins - this is very SLOW!!!
# report_df = extract_protein_frequencies_and_associations(combined_df, 'Proteins')

# # Display or export `report_df` as needed
# print(report_df)

In [None]:
# #Writing this report.csv
# # Construct the file path first
# csv_file_path = f'{BASE_PATH}/{STUDY_NAME}_prot_freq.csv'

# # Export the DataFrame to CSV using the correct method name
# report_df.to_csv(csv_file_path)

In [None]:
##Check the df again!
#combined_df.head()
#combined_df.shape
#combined_df.info()
##combined_df.describe()
#combined_df['Sequence'].nunique()
#combined_df['Miss.Clv.Sites'].nunique()
combined_df.head()
list(combined_df.columns)
#print(combined_df[['GNC&spp', 'Sequence','Proteins', 'Positions', 'start_position']].head(40))
spp_GNC_counts = combined_df['GNC&spp'].value_counts()

# Print the counts of different entries
print(spp_GNC_counts)

COL1A1-Bos_tau       8240
COL1A2-Bos_tau       5316
TRYUP-Sus_sco        3163
COL3A1-BosxBos       1558
COL1A2-Hip_amp       1115
KS1-Hom_sap          1084
COL1A2-Equ_asi        932
COL3A1-Equ_cab        710
COL1A2-Cap_hir        517
COL1A1-Equ_cab        385
COL1A2-Lox_afr        300
COL1A1-Pog_vit        192
COL1A1-Tha_sir        112
COL1A1-Mam_mam         89
COL1A1-Gal_gal         80
ALB-Bos_tau            33
COL1A2-Bos_mut         25
COL1A2-Phy_cas         24
COL1A2-Bub_Bub         18
COL1A1-Vul_vul         17
COL1A2-Chk_COL1A2       8
COL1A1-Apr_xxx          6
COL1A1-Sus_Scr          5
ALB-Equ_cab             4
COL1A2-Bos_ind          4
COL1A1-Cam_fer          2
COL1A1-Nop_xxx          2
ALB-Sus_scr             1
COL1A1-Caa_xxx          1
Name: GNC&spp, dtype: int64


In [None]:
#Generate smaller df - iltered_combined_df- for the Expand step
# Replace 'None' and empty strings with np.nan for uniformity
combined_df['GNC&spp'] = combined_df['GNC&spp'].replace({None: np.nan, '': np.nan})

# Filter rows where 'GNC&spp' is not NaN
filtered_combined_df = combined_df.dropna(subset=['GNC&spp'])

# Now, filtered_combined_df contains only the rows where 'GNC&spp' is not None, NaN, or blank
# You can perform your operations on filtered_combined_df

# Print the head of the filtered DataFrame to verify the filtering
print(filtered_combined_df.head())


                                           File_Name  Scan_No      Exp.MH+  \
0  20240209-0359_QEHF2_1007819_ONJ_TR_MC_CE16.638...     6381  2300.162702   
1  20240209-0359_QEHF2_1007819_ONJ_TR_MC_CE16.622...     6226  2299.176369   
2  20240209-0359_QEHF2_1007819_ONJ_TR_MC_CE16.631...     6317  2300.164047   
3  20240209-0359_QEHF2_1007819_ONJ_TR_MC_CE16.635...     6355  2300.162845   
4  20240209-0223_QEHF2_1007815_ONJ_TR_MC_CE12.446...     4465  2300.164123   

   Charge  Q-value              Sequence     Calc.MH+  Mass_Shift(Exp.-Calc.)  \
0       2      0.0  IITHPNFNGNTLDNDIMLIK  2300.159002                0.003700   
1       2      0.0  IITHPNFNGNTLDNDIMLIK  2299.174986                0.001383   
2       2      0.0  IITHPNFNGNTLDNDIMLIK  2300.159002                0.005045   
3       2      0.0  IITHPNFNGNTLDNDIMLIK  2300.159002                0.003843   
4       2      0.0  IITHPNFNGNTLDNDIMLIK  2300.159002                0.005121   

   Raw_Score   Final_Score  ...  \
0  41.306

In [None]:
#COUNT GNC
unique_names_GNC = filtered_combined_df['GNC&spp'].unique()
name_counts_GNC = filtered_combined_df['GNC&spp'].value_counts()
print("Unique GNC:", unique_names_GNC)
print("Counts of each GNC:", name_counts_GNC)

Unique GNC: ['TRYUP-Sus_sco' 'COL1A1-Bos_tau' 'COL1A2-Bos_tau' 'KS1-Hom_sap'
 'COL3A1-BosxBos' 'COL1A1-Gal_gal' 'COL1A1-Equ_cab' 'COL1A2-Equ_asi'
 'COL1A1-Mam_mam' 'COL1A2-Hip_amp' 'COL1A2-Cap_hir' 'COL1A2-Phy_cas'
 'COL1A2-Lox_afr' 'COL3A1-Equ_cab' 'COL1A1-Tha_sir' 'ALB-Bos_tau'
 'COL1A1-Pog_vit' 'COL1A1-Sus_Scr' 'COL1A2-Bub_Bub' 'COL1A1-Vul_vul'
 'ALB-Equ_cab' 'ALB-Sus_scr' 'COL1A2-Chk_COL1A2' 'COL1A2-Bos_ind'
 'COL1A2-Bos_mut' 'COL1A1-Cam_fer' 'COL1A1-Apr_xxx' 'COL1A1-Caa_xxx'
 'COL1A1-Nop_xxx']
Counts of each GNC: COL1A1-Bos_tau       8240
COL1A2-Bos_tau       5316
TRYUP-Sus_sco        3163
COL3A1-BosxBos       1558
COL1A2-Hip_amp       1115
KS1-Hom_sap          1084
COL1A2-Equ_asi        932
COL3A1-Equ_cab        710
COL1A2-Cap_hir        517
COL1A1-Equ_cab        385
COL1A2-Lox_afr        300
COL1A1-Pog_vit        192
COL1A1-Tha_sir        112
COL1A1-Mam_mam         89
COL1A1-Gal_gal         80
ALB-Bos_tau            33
COL1A2-Bos_mut         25
COL1A2-Phy_cas         24
COL1A2-B

## Expand df

In [None]:
# Assuming 'combined_df' is original dataframe with columns [''GNC&spp'', 'start_position', 'peptide_length', 'sequence']
df_sorted = combined_df.sort_values(by=['GNC&spp','start_position', 'peptide_length','Sequence'], ascending=[True, False, False,True])

In [None]:
# Expand combined_df to detail each peptide's amino acid positions and associated PTMs
#exp_df = expand_df_to(filtered_combined_df) only contains collagen
exp_df = expand_df_to(combined_df)

In [None]:
# Display the first few rows of the expanded DataFrame to verify the result
print(exp_df.head())
list(exp_df.columns)
exp_df.shape

     id          sample_name    GNC      spp aa  position ptm confidence  \
0  None  20240209-0359_QEHF2  TRYUP  Sus_sco  I        69           None   
1  None  20240209-0359_QEHF2  TRYUP  Sus_sco  I        70           None   
2  None  20240209-0359_QEHF2  TRYUP  Sus_sco  T        71           None   
3  None  20240209-0359_QEHF2  TRYUP  Sus_sco  H        72           None   
4  None  20240209-0359_QEHF2  TRYUP  Sus_sco  P        73           None   

   spectraId           mz  z      pepMass     err         score  scanNum  \
0       6381  2300.162702  2  2300.159002  0.0037  1.528900e-10     6381   
1       6381  2300.162702  2  2300.159002  0.0037  1.528900e-10     6381   
2       6381  2300.162702  2  2300.159002  0.0037  1.528900e-10     6381   
3       6381  2300.162702  2  2300.159002  0.0037  1.528900e-10     6381   
4       6381  2300.162702  2  2300.159002  0.0037  1.528900e-10     6381   

                                   peptide_gene_name    RT   ppm  
0  5XW1_A/P00761.1/

(400588, 18)

In [None]:
unique_names_GNC = exp_df['GNC'].unique()
name_counts_GNC = exp_df['GNC'].value_counts()
print("Unique sample names:", unique_names_GNC)
print("Counts of each sample name:", name_counts_GNC)

Unique sample names: ['TRYUP' 'COL1A1' 'COL1A2' 'KS1' 'COL3A1' 'ALB']
Counts of each sample name: COL1A1    156458
COL1A2    136899
TRYUP      54993
COL3A1     38277
KS1        13556
ALB          405
Name: GNC, dtype: int64


In [None]:
# Define a dictionary to map old names to new names
rename_dict = {
    'M3_20231109-1713': 'M3',
    'M1_20231109-1625': 'M1',
    'M4_20231109-1737': 'M4',
    'P3_20231109-1848': 'P3',
    'P4_20231109-1912': 'P4',
    'P2_20231109-1824': 'P2',
    'P1_20231109-1801': 'P1',
    'M2_20231109-1649': 'M2'
}

# Rename entries in 'sample_name' column using the rename dictionary
exp_df['sample_name'] = exp_df['sample_name'].replace(rename_dict)

In [None]:
unique_names = exp_df['sample_name'].unique()
name_counts = exp_df['sample_name'].value_counts()
print("Unique sample names:", unique_names)
print("Counts of each sample name:", name_counts)

Unique sample names: ['20240209-0359_QEHF2' '20240209-0223_QEHF2' 'M2' '20240209-0247_QEHF2'
 '20240208-2249_QEHF2' '20240208-2313_QEHF2' '20240209-0422_QEHF2'
 '20240209-0112_QEHF2' '20240209-0534_QEHF2' '20240209-0645_QEHF2'
 '20240209-0510_QEHF2' 'M1' 'P3' '20240209-0048_QEHF2'
 '20240209-0311_QEHF2' '20240209-0622_QEHF2' '20240209-0000_QEHF2'
 '20240209-0335_QEHF2' '20240208-2201_QEHF2' 'P4' '20240208-2337_QEHF2'
 '20240209-0024_QEHF2' 'M3' 'M4' '20240209-0136_QEHF2'
 '20240208-2225_QEHF2' '20240209-0159_QEHF2' 'P2' 'P1'
 '20240209-0558_QEHF2']
Counts of each sample name: 20240209-0000_QEHF2    42192
20240209-0112_QEHF2    39568
20240209-0422_QEHF2    34469
20240209-0645_QEHF2    27276
20240208-2201_QEHF2    22789
20240209-0534_QEHF2    21514
20240209-0311_QEHF2    20790
20240209-0159_QEHF2    19918
20240209-0510_QEHF2    18446
M3                     17703
P3                     17164
20240208-2337_QEHF2    14141
20240209-0048_QEHF2    12474
20240208-2249_QEHF2    12045
20240208-23

In [None]:
#COUNT GNC
unique_names_GNC = exp_df['GNC'].unique()
name_counts_GNC = exp_df['GNC'].value_counts()
print("Unique GNC:", unique_names_GNC)
print("Counts of each GNC:", name_counts_GNC)

Unique GNC: ['TRYUP' 'COL1A1' 'COL1A2' 'KS1' 'COL3A1' 'ALB']
Counts of each GNC: COL1A1    156458
COL1A2    136899
TRYUP      54993
COL3A1     38277
KS1        13556
ALB          405
Name: GNC, dtype: int64


###Explore PTMs

In [None]:
# Filter for rows where ptm is not null and not an empty string
df_filtered = exp_df[exp_df['ptm'].notnull() & exp_df['ptm'] != '']

#Rename PTMs based on conditions
df_filtered['ptm'] = df_filtered.apply(
    lambda x: 'Hydroxylation' if x['aa'] == 'P' and x['ptm'] == 'Oxidation' else
             'DeamidationN' if x['aa'] == 'N' and x['ptm'] == 'Deamidated' else
             'DeamidationQ' if x['aa'] == 'Q' and x['ptm'] == 'Deamidated' else
             x['ptm'],  # Leave other PTMs unchanged
    axis=1
)

In [None]:
# Get the value counts of ptm entries
ptm_counts = df_filtered['ptm'].value_counts()

# Print the ptm entries and their counts
print(ptm_counts.head(20))

# Calculate ptm_counts without aggregating sample_name
ptm_counts = df_filtered.groupby(['sample_name','GNC', 'position', 'aa', 'ptm']).size().reset_index(name='ptm_count')

# Calculate total_counts without aggregating sample_name
total_counts = df_filtered.groupby(['sample_name','GNC', 'position', 'aa']).size().reset_index(name='total_count')

# Merge ptm_counts with the original df_filtered to preserve sample_name and score
df_merged = pd.merge(df_filtered, ptm_counts, on=['sample_name','GNC', 'position', 'aa', 'ptm'], how='left')
df_merged = pd.merge(df_merged, total_counts, on=['sample_name','GNC', 'position', 'aa'], how='left')

# Calculate the percentage of aa in each position having a specific ptm
df_merged['ptm_percentage'] = (df_merged['ptm_count'] / df_merged['total_count']) * 100

# Filter to include only rows where ptm is not None or empty (i.e., ptm is present)
result_with_ptms = df_merged[df_merged['ptm'].notna() & (df_merged['ptm'] != '')]

# Display the filtered result with ptm details included
print(result_with_ptms[['GNC', 'sample_name', 'position', 'aa', 'ptm', 'ptm_percentage', 'sample_name', 'score']].head())



                  366769
Hydroxylation      27255
Oxidation           2498
DeamidationQ        1909
DeamidationN        1503
Methyl                72
Dioxidation           57
Ammonia-loss          52
Pro->pyro-Glu         43
Dehydrated            35
Ethyl                 33
Carbonyl              27
Carboxymethyl         23
Arg->Asn              21
Delta_H(4)C(2)        20
Trioxidation          19
Ethanolyl             18
Pro->Asn              16
Asn->Met              16
Formyl                13
Name: ptm, dtype: int64
      GNC          sample_name  position aa           ptm  ptm_percentage  \
7   TRYUP  20240209-0359_QEHF2        76  N  DeamidationN       18.627451   
16  TRYUP  20240209-0359_QEHF2        85  M     Oxidation       86.792453   
36  TRYUP  20240209-0359_QEHF2        85  M     Oxidation       86.792453   
47  TRYUP  20240209-0359_QEHF2        76  N  DeamidationN       18.627451   
56  TRYUP  20240209-0359_QEHF2        85  M     Oxidation       86.792453   

            s

###Reshaping exp_df

In [None]:
def create_grouped_df(exp_df):
    """
    Create a grouped dataframe from the input dataframe `exp_df`.

    Args:
        exp_df (pandas.DataFrame): The input dataframe containing protein identification data.

    Returns:
        pandas.DataFrame: The grouped dataframe containing the requested analysis.
    """
    # Group the data by GNC, sample_name, and position
    grouped = exp_df.groupby(['GNC', 'sample_name', 'position'])

    # Create a list to store the grouped analysis
    grouped_analysis = []

    # Iterate over each group
    for group_keys, group_data in grouped:
        # Unpack the group keys
        gnc, sample_name, position = group_keys

        # Get the top 3 amino acids and their counts
        aa_counts = group_data['aa'].value_counts().head(3)
        top_aas = aa_counts.index.tolist()
        aa_counts = aa_counts.to_dict()

        # Initialize a dictionary to store the analysis for this group
        group_analysis = {
            'GNC': gnc,
            'sample_name': sample_name,
            'position': position
        }

        # Analyze the top 3 amino acids
        for i, aa in enumerate(top_aas, start=1):
            aa_key = f'aa{i}'
            group_analysis[aa_key] = aa  # Add the amino acid name
            n_aa_key = f'n_{aa_key}'
            score_mean_key = f'{aa_key}_score_mean'
            score_max_key = f'{aa_key}_score_max'
            score_sd_key = f'{aa_key}_score_SD'
            score_keys = [f'score{j}_{aa_key}' for j in range(1, 4)]
            spectra_keys = [f'spectraId{j}_{aa_key}' for j in range(1, 4)]

            # Count of the amino acid
            group_analysis[n_aa_key] = aa_counts[aa]

            # Mean, max, and standard deviation of the confidence scores
            aa_scores = group_data.loc[group_data['aa'] == aa, 'confidence']
            group_analysis[score_mean_key] = aa_scores.mean()
            group_analysis[score_max_key] = aa_scores.max()
            group_analysis[score_sd_key] = aa_scores.std()

            # Top 3 scores and spectra IDs
            aa_scores = group_data.loc[group_data['aa'] == aa, ['score', 'spectraId']].nlargest(3, 'score')
            for j, (score_key, spectra_key) in enumerate(zip(score_keys, spectra_keys)):
                if j < len(aa_scores):
                    group_analysis[score_key] = aa_scores.iloc[j]['score']
                    group_analysis[spectra_key] = aa_scores.iloc[j]['spectraId']
                else:
                    group_analysis[score_key] = None
                    group_analysis[spectra_key] = None

            # Analyze the PTMs for this amino acid
            aa_ptms = group_data.loc[group_data['aa'] == aa, 'ptm'].value_counts()
            top_ptms = aa_ptms.index.tolist()[:2]
            aa_ptms = aa_ptms.to_dict()

            for j, ptm in enumerate(top_ptms, start=1):
                ptm_key = f'ptm{j}_{aa_key}'
                ptm_perc_key = f'%{ptm_key}'
                ptm_score_mean_key = f'{ptm_key}_score_mean'
                ptm_score_max_key = f'{ptm_key}_score_max'
                ptm_score_sd_key = f'{ptm_key}_score_SD'
                ptm_score_keys = [f'score{k}_{ptm_key}' for k in range(1, 4)]
                ptm_spectra_keys = [f'spectraId{k}_{ptm_key}' for k in range(1, 4)]

                # PTM name and percentage
                group_analysis[ptm_key] = ptm
                group_analysis[ptm_perc_key] = aa_ptms.get(ptm, 0) / group_analysis[n_aa_key] * 100

                # Mean, max, and standard deviation of the confidence scores for the PTM
                ptm_scores = group_data.loc[(group_data['aa'] == aa) & (group_data['ptm'] == ptm), 'confidence']
                group_analysis[ptm_score_mean_key] = ptm_scores.mean()
                group_analysis[ptm_score_max_key] = ptm_scores.max()
                group_analysis[ptm_score_sd_key] = ptm_scores.std()

                # Top 3 scores and spectra IDs for the PTM
                ptm_scores = group_data.loc[(group_data['aa'] == aa) & (group_data['ptm'] == ptm), ['score', 'spectraId']].nlargest(3, 'score')
                for k, (score_key, spectra_key) in enumerate(zip(ptm_score_keys, ptm_spectra_keys)):
                    if k < len(ptm_scores):
                        group_analysis[score_key] = ptm_scores.iloc[k]['score']
                        group_analysis[spectra_key] = ptm_scores.iloc[k]['spectraId']
                    else:
                        group_analysis[score_key] = None
                        group_analysis[spectra_key] = None

        # Append the analysis for this group to the list
        grouped_analysis.append(group_analysis)

    # Create the grouped dataframe from the list of dictionaries
    grouped_df = pd.DataFrame(grouped_analysis)

    return grouped_df

In [None]:
group_df = create_grouped_df(exp_df)

In [None]:
# with list constructor
col_list = list(group_df.columns)
# with tolist method
col_list = group_df.columns.tolist()
print(col_list)

['GNC', 'sample_name', 'position', 'aa1', 'n_aa1', 'aa1_score_mean', 'aa1_score_max', 'aa1_score_SD', 'score1_aa1', 'spectraId1_aa1', 'score2_aa1', 'spectraId2_aa1', 'score3_aa1', 'spectraId3_aa1', 'ptm1_aa1', '%ptm1_aa1', 'ptm1_aa1_score_mean', 'ptm1_aa1_score_max', 'ptm1_aa1_score_SD', 'score1_ptm1_aa1', 'spectraId1_ptm1_aa1', 'score2_ptm1_aa1', 'spectraId2_ptm1_aa1', 'score3_ptm1_aa1', 'spectraId3_ptm1_aa1', 'ptm2_aa1', '%ptm2_aa1', 'ptm2_aa1_score_mean', 'ptm2_aa1_score_max', 'ptm2_aa1_score_SD', 'score1_ptm2_aa1', 'spectraId1_ptm2_aa1', 'score2_ptm2_aa1', 'spectraId2_ptm2_aa1', 'score3_ptm2_aa1', 'spectraId3_ptm2_aa1', 'aa2', 'n_aa2', 'aa2_score_mean', 'aa2_score_max', 'aa2_score_SD', 'score1_aa2', 'spectraId1_aa2', 'score2_aa2', 'spectraId2_aa2', 'score3_aa2', 'spectraId3_aa2', 'ptm1_aa2', '%ptm1_aa2', 'ptm1_aa2_score_mean', 'ptm1_aa2_score_max', 'ptm1_aa2_score_SD', 'score1_ptm1_aa2', 'spectraId1_ptm1_aa2', 'score2_ptm1_aa2', 'spectraId2_ptm1_aa2', 'score3_ptm1_aa2', 'spectraId3

In [None]:
group_df

Unnamed: 0,GNC,sample_name,position,aa1,n_aa1,aa1_score_mean,aa1_score_max,aa1_score_SD,score1_aa1,spectraId1_aa1,...,%ptm2_aa3,ptm2_aa3_score_mean,ptm2_aa3_score_max,ptm2_aa3_score_SD,score1_ptm2_aa3,spectraId1_ptm2_aa3,score2_ptm2_aa3,spectraId2_ptm2_aa3,score3_ptm2_aa3,spectraId3_ptm2_aa3
0,ALB,20240209-0311_QEHF2,346,D,1,,,,0.208305,8543.0,...,,,,,,,,,,
1,ALB,20240209-0311_QEHF2,347,A,1,,,,0.208305,8543.0,...,,,,,,,,,,
2,ALB,20240209-0311_QEHF2,348,F,1,,,,0.208305,8543.0,...,,,,,,,,,,
3,ALB,20240209-0311_QEHF2,349,L,1,,,,0.208305,8543.0,...,,,,,,,,,,
4,ALB,20240209-0311_QEHF2,350,G,1,,,,0.208305,8543.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31615,TRYUP,P4,102,V,1,,,,0.127446,2890.0,...,,,,,,,,,,
31616,TRYUP,P4,103,S,1,,,,0.127446,2890.0,...,,,,,,,,,,
31617,TRYUP,P4,104,L,1,,,,0.127446,2890.0,...,,,,,,,,,,
31618,TRYUP,P4,105,P,1,,,,0.127446,2890.0,...,,,,,,,,,,


In [None]:
print(group_df)

         GNC          sample_name  position aa1  n_aa1  aa1_score_mean  \
0        ALB  20240209-0311_QEHF2       346   D      1             NaN   
1        ALB  20240209-0311_QEHF2       347   A      1             NaN   
2        ALB  20240209-0311_QEHF2       348   F      1             NaN   
3        ALB  20240209-0311_QEHF2       349   L      1             NaN   
4        ALB  20240209-0311_QEHF2       350   G      1             NaN   
...      ...                  ...       ...  ..    ...             ...   
31615  TRYUP                   P4       102   V      1             NaN   
31616  TRYUP                   P4       103   S      1             NaN   
31617  TRYUP                   P4       104   L      1             NaN   
31618  TRYUP                   P4       105   P      1             NaN   
31619  TRYUP                   P4       106   R      1             NaN   

       aa1_score_max  aa1_score_SD  score1_aa1  spectraId1_aa1  ...  \
0                NaN           NaN    0.

In [None]:
directory_path = f'/content/drive/MyDrive/Colab_Notebooks/NovorCloud/{STUDY_NAME}/{PFIND_FOLDER}/result/'
group_df.to_csv(f'{directory_path}pFind_grouped3.csv', index=False)

In [None]:
#Writing exp.csv
# Construct the file path first
csv_file_path = f'{BASE_PATH}/{STUDY_NAME}exp.csv'

# Export the DataFrame to CSV using the correct method name
exp_df.to_csv(csv_file_path)

In [None]:
#grouped_CGPT_df = analyze_protein_data(test_df)

In [None]:
#print(grouped_CGPT_df)

In [None]:
# # Filter the DataFrame to find rows with ptm "Deamidated"
# filtered_ptm_df = df_filtered[df_filtered['ptm'] == "Deamidated"]

# # Check if any rows were found
# if not filtered_ptm_df.empty:
#   # Print a limited number of rows (e.g., the first 3)
#   print(filtered_ptm_df.head(3))
# else:
#   # If no rows are found, print a message
#   print("No rows found with ptm='Deamidated'")

In [None]:
# unique_names = result_with_ptms['sample_name'].unique()
# name_counts = result_with_ptms['sample_name'].value_counts()
# print("Unique sample names:", unique_names)
# print("Counts of each sample name:", name_counts)