# Pfam Score Retrieval

## Pfam Score Parser

This notebook contains functions to download and parse Pfam domain data for a given protein accession number. The main functionalities include:

1. **Download Pfam Data**: The `download_pfam_domains` function takes a protein accession number and retrieves the Pfam domain data from the InterPro API.
2. **Create Domain Dictionary**: The `create_domain_dict` function processes the downloaded Pfam data and creates a list of dictionaries, each containing the start and end positions of the domains along with their scores.
3. **Assign Conservation Value**: The `assign_conservation` function assigns a conservation value to each row based on the domain dictionary.

These functions facilitate the analysis of protein domains and their conservation scores.

In [None]:
import requests
import pandas as pd
import re

# Function to download Pfam data of a protein
def download_pfam_domains(protein_accession, timeout=30):
    """Fetch the Pfam entries annotated on a UniProt protein from InterPro.

    Args:
        protein_accession: UniProt accession of the protein (e.g. "P01112").
        timeout: Seconds to wait for the server before giving up. Without an
            explicit timeout ``requests`` would block indefinitely on a
            stalled connection.

    Returns:
        The decoded JSON payload when the API answers with HTTP 200,
        otherwise ``None`` (the status code is printed).

    Raises:
        requests.exceptions.RequestException: If the connection fails or the
            timeout expires.
    """
    url = f"https://www.ebi.ac.uk/interpro/api/entry/pfam/protein/uniprot/{protein_accession}/"
    response = requests.get(url, timeout=timeout)
    if response.status_code == 200:
        return response.json()

    # Any non-200 answer is reported and signalled to the caller with None.
    print(f"Errore nel download dei domini Pfam per {protein_accession}: {response.status_code}")
    return None

# Function to create list of dicts of Pfam domains
def create_domain_dict(pfam_data):
    """Flatten an InterPro Pfam payload into a list of domain fragments.

    The payload is walked as
    ``results -> proteins -> entry_protein_locations -> fragments``; every
    fragment becomes one dictionary carrying the score of its enclosing
    location.

    Args:
        pfam_data: Decoded JSON returned by the InterPro API, or ``None`` /
            an empty mapping when nothing could be downloaded.

    Returns:
        A list of ``{"start": ..., "end": ..., "score": ...}`` dictionaries,
        in payload order. The list is empty when there is no data.

    Raises:
        KeyError: If a fragment lacks ``"start"`` or ``"end"``.
    """
    domain_list = []
    if not pfam_data:
        # A failed download yields None; treat it like an empty payload.
        return domain_list

    # The API can emit explicit nulls for keys, in which case
    # ``.get(key, [])`` would hand back None; ``or []`` covers both the
    # missing-key and the null-value case.
    for result in pfam_data.get("results") or []:
        for protein in result.get("proteins") or []:
            for location in protein.get("entry_protein_locations") or []:
                # The score lives on the location, not on each fragment.
                score = location.get("score")
                if score is None:
                    score = 0
                for fragment in location.get("fragments") or []:
                    domain_list.append(
                        {
                            "start": fragment["start"],
                            "end": fragment["end"],
                            "score": score,
                        }
                    )
    return domain_list

# Function to assign domain and conservation value
def assign_conservation(row, domain_dict):
    """Return the Pfam score of the domain that covers a variant position.

    Args:
        row: Mapping-like record (a ``pandas.Series`` row or a ``dict``) with
            a ``"Position"`` entry holding the coding-sequence nucleotide
            position, and optionally an ``"Intron"`` flag.
        domain_dict: List of ``{"start", "end", "score"}`` dictionaries whose
            ``start``/``end`` are 1-based, inclusive amino-acid positions.

    Returns:
        ``0`` for intronic variants and for positions outside every domain,
        otherwise the score of the first domain containing the position.

    Raises:
        KeyError: If ``row`` has no ``"Position"`` entry.
    """
    # NOTE(review): the InterPro "score" looks like an E-value (smaller means
    # more significant) -- confirm that using it directly as a conservation
    # value is intended.

    # The intron flag is optional. ``== True`` (rather than ``is True``) is
    # deliberate so that numpy booleans coming out of a DataFrame match too.
    if row.get("Intron", False) == True:  # noqa: E712
        return 0  # Introns

    position = row["Position"]
    for domain in domain_dict:
        # Residue n is encoded by nucleotides 3n-2 .. 3n, so the domain spans
        # from the first base of its first codon to the last base of its
        # last codon.
        first_nt = domain["start"] * 3 - 2
        last_nt = domain["end"] * 3
        if first_nt <= position <= last_nt:
            return domain["score"]
    return 0  # Unknown mutations

# Example: annotate two HRAS variants with the score of the Pfam domain
# that covers them.
protein_accession = "P01112"  # Accession of HRAS
pfam_data = download_pfam_domains(protein_accession)
print(pfam_data)

# Start from an empty domain list so that `domain_dict` is always defined,
# even when the download failed and `pfam_data` is None; previously that
# case ended in a NameError further down.
domain_dict = []
if pfam_data:
    domain_dict = create_domain_dict(pfam_data)
    print(f"Dict: {domain_dict}")

# Example dataset
data = pd.DataFrame({
    "Name": ["NM_005343.4(HRAS):c.451-4C>T", "NM_005343.4(HRAS):c.567C>T"],
    "Position": [451, 567]
})

# Add conservation column
data["Conservation"] = data.apply(lambda row: assign_conservation(row, domain_dict), axis=1)

# Print result
print(data)

{'count': 1, 'next': None, 'previous': None, 'results': [{'metadata': {'accession': 'PF00071', 'name': 'Ras family', 'source_database': 'pfam', 'type': 'domain', 'integrated': 'IPR001806', 'member_databases': None, 'go_terms': None}, 'proteins': [{'accession': 'p01112', 'protein_length': 189, 'source_database': 'reviewed', 'organism': '9606', 'in_alphafold': True, 'entry_protein_locations': [{'fragments': [{'start': 5, 'end': 164, 'dc-status': 'CONTINUOUS'}], 'representative': False, 'model': 'PF00071', 'score': 2.3e-57}]}]}]}
Dict: [{'start': 5, 'end': 164, 'score': 2.3e-57}]
                           Name  Position  Conservation
0  NM_005343.4(HRAS):c.451-4C>T       451  2.300000e-57
1    NM_005343.4(HRAS):c.567C>T       567  0.000000e+00
