In [7]:
import os
import pandas as pd
import numpy as np

import requests

def extract_uniprot_id(protein_id_with_chain):
    """
    Extrahiert den UniProt-konformen Teil der Protein-ID.
    """
    # Extrahiert den Teil vor dem Unterstrich, falls vorhanden
    return protein_id_with_chain.split('_')[0]

def fetch_ec_numbers(protein_ids):
    """
    Holt EC-Nummern für eine Liste von Protein-IDs von UniProt.
    """
    ec_numbers = {}
    base_url = "https://rest.uniprot.org/uniprotkb/search?"
    query = ' OR '.join([f'accession:{extract_uniprot_id(pid)}' for pid in protein_ids])
    params = {
        'query': query,
        'fields': 'pdb,ec,protein_name',
        'format': 'json',
        'size': len(protein_ids)
    }
    
    response = requests.get(base_url, params=params)
    if response.status_code == 200:
        data = response.json()
        if 'results' in data:
            for item in data['results']:
                ec_list = item.get('ecNumbers', [])
                if ec_list:
                    ec_numbers[item['primaryAccession']] = ec_list[0]['value']
    else:
        print(f"Failed to fetch data: {response.status_code}")
        print(response.text)
    return ec_numbers

def add_ec_numbers(features_df):
    """
    Fügt EC-Nummern basierend auf Protein-IDs in einem DataFrame hinzu.
    """
    unique_proteins = features_df['protein'].unique()
    ec_numbers = fetch_ec_numbers(list(unique_proteins))
    # Abbildung der originalen IDs auf EC-Nummern
    features_df['EC_Number'] = features_df['protein'].apply(lambda x: ec_numbers.get(extract_uniprot_id(x)))
    return features_df


def read_ply_features(file_path):
    """
    Liest die Feature-Daten aus einer PLY-Datei und erstellt einen DataFrame.
    """
    with open(file_path, 'r') as file:
        data = file.readlines()
    
    start = next(i for i, line in enumerate(data) if line.startswith('end_header'))
    vertex_data = data[start+1:]
    
    features = [line.strip().split() for line in vertex_data]
    
    num_columns = len(features[0]) if features else 0
    columns = ['x', 'y', 'z', 'nx', 'ny', 'nz', 'charge', 'hphob', 'hbond']  # Beispiel für typische Spalten
    if num_columns > len(columns):
        columns.extend([f'extra_{i}' for i in range(num_columns - len(columns))])
    
    return pd.DataFrame(features, columns=columns, dtype=float)

def extract_protein_name(filename):
    """
    Extrahiert den Proteinnamen oder eine eindeutige ID aus dem Dateinamen.
    Hier muss ggf. angepasst werden, basierend auf dem Format der Dateinamen.
    """
    # Beispiel: '1A0G_A.ply' wird zu '1A0G_A'
    protein_name = filename.split('.')[0]
    return protein_name

def load_features_from_folder(folder_path):
    """
    Lädt alle PLY-Dateien in einem Ordner und kombiniert ihre Features in einem DataFrame.
    """
    all_features = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.ply'):
            file_path = os.path.join(folder_path, filename)
            features = read_ply_features(file_path)
            features['filename'] = filename  # Optional: Füge Dateinamen als Identifikator hinzu
            features['protein'] = extract_protein_name(filename)  # Hinzufügen der Protein-Spalte
            all_features.append(features)
    
    # Kombiniere alle Features in einem einzigen DataFrame
    combined_features = pd.concat(all_features, ignore_index=True)
    # Only use the first 10 proteins for now
    combined_features = combined_features[combined_features['protein'].isin(combined_features['protein'].unique()[:10])]
    return combined_features


In [2]:
# Pfad zum Ordner, der die PLY-Dateien enthält
folder_path = '../masif_seed/masif/data/masif_peptides/data_preparation/01-benchmark_surfaces'
all_features = load_features_from_folder(folder_path)
# Save the features to a CSV file
all_features.to_csv('features.csv', index=False)

In [8]:
all_features = pd.read_csv('features.csv')
all_features = add_ec_numbers(all_features)
all_features.to_csv('features_with_ec.csv', index=False)
all_features.head()

Failed to fetch data: 400
{"url":"http://rest.uniprot.org/uniprotkb/search","messages":["The 'accession' filter value '2NNN' has invalid format. It should be a valid UniProtKB accession","The 'accession' filter value '2J12' has invalid format. It should be a valid UniProtKB accession","The 'accession' filter value '4YDL' has invalid format. It should be a valid UniProtKB accession","The 'accession' filter value '1M93' has invalid format. It should be a valid UniProtKB accession","The 'accession' filter value '3G3Z' has invalid format. It should be a valid UniProtKB accession","The 'accession' filter value '3K1I' has invalid format. It should be a valid UniProtKB accession","Invalid fields parameter value 'ecNumbers'","The 'accession' filter value '1Z7M' has invalid format. It should be a valid UniProtKB accession","The 'accession' filter value '3FOK' has invalid format. It should be a valid UniProtKB accession","The 'accession' filter value '1I07' has invalid format. It should be a val

Unnamed: 0,x,y,z,nx,ny,nz,charge,hphob,hbond,extra_0,filename,protein,EC_Number
0,47.037,53.95,48.229,0.136246,0.0,2.8,1.0,0.705926,-0.595034,-0.384192,3Q4H_A.ply,3Q4H_A,
1,40.468,51.22,68.075,0.226074,0.0,-3.2,0.0,0.082086,0.041442,0.995763,3Q4H_A.ply,3Q4H_A,
2,51.2935,57.6295,44.894,0.688055,0.0,4.2,0.0,0.96657,0.207327,-0.150857,3Q4H_A.ply,3Q4H_A,
3,44.8835,54.724,25.4115,-0.333874,0.0,2.8,1.0,0.372051,-0.278963,-0.885301,3Q4H_A.ply,3Q4H_A,
4,31.484,46.07,62.097,-0.039893,0.0,2.8,0.0,-0.545154,-0.837331,0.041033,3Q4H_A.ply,3Q4H_A,


In [12]:
def pdb_to_uniprot(pdb_ids):
    """
    Map PDB IDs to UniProt Accessions using the RCSB PDB API.
    """
    url = "https://data.rcsb.org/graphql"
    query = """
    {
      entries(entry_ids: [%s]) {
        rcsb_entry_container_identifiers {
          relatedEntries {
            entry {
              id
              relation
              resource
            }
          }
        }
      }
    }
    """ % ', '.join([f'"{pdb}"' for pdb in pdb_ids])

    response = requests.post(url, json={'query': query})
    mapping = {}
    if response.status_code == 200:
        try:
            data = response.json()
            entries = data['data']['entries']
            if entries:
                for entry in entries:
                    relations = entry['rcsb_entry_container_identifiers']['relatedEntries']
                    for relation in relations:
                        if relation['resource'] == 'UniProt':
                            mapping[relation['entry']['id']] = relation['entry']['id']
        except (KeyError, TypeError) as e:
            print(f"Error processing data: {e}")
            print("Response content:", response.text)
    else:
        print(f"Failed to fetch data: {response.status_code}")
        print("Response:", response.text)
    return mapping

# Beispielanwendung
pdb_ids = ['2J12', '4YDL', '1M93', '1I07', '2NNN', '3Q4H', '1Z7M', '3FOK', '3G3Z', '3K1I']
mapping = pdb_to_uniprot(pdb_ids)
print(mapping)

Error processing data: 'NoneType' object is not subscriptable
Response content: {"errors":[{"message":"Validation error (FieldUndefined@[entries/rcsb_entry_container_identifiers/relatedEntries]) : Field 'relatedEntries' in type 'RcsbEntryContainerIdentifiers' is undefined","locations":[{"line":5,"column":11}],"extensions":{"classification":"ValidationError"}}],"data":null}
{}
