In [5]:
"Converts PDB codes to Uniprot accession codes. This script only converts the 4 alphanumeric characters, not the chain and residue number."
#!/usr/bin/env python3

from urllib import request
import sys
import re
import pandas as pd
import numpy as np
import time



In [13]:
def clean_data(file):
    """Load the dataset CSV and return a cleaned, label-encoded DataFrame.

    Parameters
    ----------
    file : str or path-like or file-like
        CSV to read. It must contain a ``dataset`` column; the ``drop`` below
        requires that at least one row has the value ``snp``.

    Returns
    -------
    pandas.DataFrame
        Rows containing NaN removed, spaces inside values replaced with
        underscores, index renumbered from 0, and the ``dataset`` column
        replaced by one numeric indicator column per remaining class
        (``dataset_snp`` is dropped, so e.g. ``dataset_pd`` is the label).

    Raises
    ------
    KeyError
        If one-hot encoding does not produce a ``dataset_snp`` column.
    """
    # Read the file that was actually passed in (previously the argument was
    # ignored and 'E2.csv' was always loaded).
    df = (
        pd.read_csv(file)
        .dropna()                           # remove rows with missing values
        .replace(' ', '_', regex=True)      # blank spaces -> underscores
        .reset_index(drop=True)             # index runs from 0 again
    )

    # Encode the class label as numeric 0/1 indicator columns. The dtype is
    # given explicitly because newer pandas releases default to bool here.
    cleaned_encoded = pd.get_dummies(df, columns=['dataset'], dtype='uint8')

    # The snp indicator is redundant next to the remaining class column(s).
    cleaned = cleaned_encoded.drop(['dataset_snp'], axis=1)

    return cleaned

In [22]:
def group_data(cleaned):
    """Sort rows by mutation identifier and split off the PDB codes.

    Parameters
    ----------
    cleaned : pandas.DataFrame
        Frame containing a ``'pdbcode:chain:resnum:mutation'`` column of
        colon-separated string identifiers. The input frame is not modified.

    Returns
    -------
    group_cleaned : pandas.DataFrame
        The rows of ``cleaned`` sorted by identifier, with the identifier
        column removed and the index renumbered from 0.
    PDB_codes : list of str
        For each row of ``group_cleaned`` (same order), the text before the
        first ``':'`` of its identifier, i.e. the PDB code.
    """
    id_column = 'pdbcode:chain:resnum:mutation'

    ordered = cleaned.sort_values(by=[id_column])

    # Read the identifier column by name rather than by position 0 of each
    # row: this does not depend on column order and avoids the deprecated
    # positional `Series[int]` lookup, as well as building a Series per row.
    PDB_codes = [identifier.partition(':')[0] for identifier in ordered[id_column]]

    # Drop the identifier column now that the codes have been extracted.
    group_cleaned = ordered.drop(columns=[id_column]).reset_index(drop=True)

    return group_cleaned, PDB_codes

In [23]:
""" Main program """
# Time the cleaning + grouping stage. perf_counter() is a monotonic,
# high-resolution clock intended for measuring intervals; time.time() is the
# wall clock and can jump if the system time is adjusted.
start = time.perf_counter()

# Load and clean the dataset, then sort it and pull out the PDB codes.
file = "E2.csv"
cleaned = clean_data(file)
group_cleaned, PDB_codes = group_data(cleaned)

end = time.perf_counter()

In [25]:
# Display the grouped feature table returned by group_data(); the notebook
# output abbreviates the middle rows and columns of a large frame with "...".
group_cleaned

Unnamed: 0,Binding,SProtFT0,SProtFT1,SProtFT2,SProtFT3,SProtFT4,SProtFT5,SProtFT6,SProtFT7,SProtFT8,...,NLargest6,NLargest7,NLargest8,NLargest9,NLargest10,Clash,Glycine,Proline,CisPro,dataset_pd
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,72.924,72.765,71.665,70.664,64.469,-4.19,-100.0,-100.0000,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,72.924,72.765,71.665,70.664,64.469,-0.29,-100.0,-100.0000,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,58.939,56.680,52.867,49.065,44.336,-5.72,-100.0,-100.0000,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,69.638,64.705,60.273,60.189,54.902,-4.87,-100.0,-100.0000,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,69.638,64.705,60.273,60.189,54.902,-7.57,-100.0,-100.0000,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3360,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,18.348,17.344,16.832,16.270,15.453,-7.94,-100.0,-100.0000,0.0,1
3361,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,18.348,17.344,16.832,16.270,15.453,-10.50,-100.0,-100.0000,0.0,1
3362,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,18.348,17.344,16.832,16.270,15.453,-6.17,-100.0,-100.0000,0.0,1
3363,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,18.348,17.344,16.832,16.270,15.453,3551.52,-100.0,2.4384,0.0,1


In [None]:
def ReadPDBSWS(PDB_codes):
    """Look up an accession code for each PDB code via the PDBSWS web service.

    Parameters
    ----------
    PDB_codes : sequence of str
        PDB codes to query. Duplicates are allowed; each distinct code is
        requested from the server only once.

    Returns
    -------
    list of str
        One accession code per entry of ``PDB_codes``, in the same order.
        An empty input gives an empty list.

    Raises
    ------
    ValueError
        If the response for a PDB code contains no ``AC:`` field.
    urllib.error.URLError
        If the HTTP request itself fails.
    """
    base_url = 'http://www.bioinf.org.uk/servers/pdbsws/query.cgi?plain=1&qtype=pdb' #REST output

    # Compiled once, as a raw string so `\s` is a regex escape rather than an
    # invalid string escape. The leading `.*` is greedy, so when a response
    # holds several `AC:` fields the last one is captured.
    pattern = re.compile(r'.*AC:\s+(.*?)#')

    looked_up = {}  # PDB code -> accession code, avoids repeated requests
    AC_codes = []
    for pdb_code in PDB_codes:
        if pdb_code not in looked_up:
            url = base_url + '&id=' + pdb_code  # URL for the specific PDB code

            # The context manager closes the connection even if read() fails.
            with request.urlopen(url) as response:
                result = response.read().decode('utf-8')

            # Replace newlines with '#' so the pattern can span the whole
            # response and each field has a single-character terminator.
            result = result.replace('\n', '#')

            match = pattern.match(result)
            if match is None:
                raise ValueError(
                    'No accession code (AC:) found in the PDBSWS response '
                    'for PDB code {!r}'.format(pdb_code)
                )
            looked_up[pdb_code] = match.group(1)

        AC_codes.append(looked_up[pdb_code])
    return AC_codes

In [None]:
def Input_Code(Input_CV_NoCode, AC_codes):
    """Add the accession codes as the first column, named ``'AC Code'``.

    Parameters
    ----------
    Input_CV_NoCode : pandas.DataFrame
        Frame to extend. It is modified in place.
    AC_codes : sequence
        One value per row of ``Input_CV_NoCode``.

    Returns
    -------
    pandas.DataFrame
        The same ``Input_CV_NoCode`` object, now with ``'AC Code'`` at
        column position 0.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.insert`` if an ``'AC Code'`` column already
        exists or the length of ``AC_codes`` does not match the frame.
    """
    # DataFrame.insert works in place and returns None, so its result must
    # not be used as the return value; hand back the mutated frame instead.
    Input_CV_NoCode.insert(0, 'AC Code', AC_codes)
    return Input_CV_NoCode

In [None]:
# Elapsed time in seconds between the `start` and `end` timestamps taken
# around the clean_data() / group_data() calls in the main-program cell.
end - start