In [1]:
"Converts PDB codes to Uniprot accession codes. This script only converts the 4 alphanumeric characters, not the chain and residue number."
#!/usr/bin/env python3

from urllib import request
import sys
import re
import pandas as pd
import numpy as np
import time



In [2]:
def Clean_data(file):
    """Load the dataset CSV and return a cleaned, one-hot-encoded DataFrame.

    Parameters
    ----------
    file : str or path-like or file-like
        Location of the CSV to read; passed straight to ``pandas.read_csv``.
        The CSV must contain a ``dataset`` column.

    Returns
    -------
    pandas.DataFrame
        The rows that contain no missing values, re-indexed from 0, with
        spaces inside string cells replaced by underscores. The ``dataset``
        column is replaced by its indicator columns, of which ``dataset_snp``
        is dropped (for a two-class pd/snp dataset it is redundant with
        ``dataset_pd``).

    Raises
    ------
    KeyError
        If the ``dataset`` column is missing, or if no ``snp`` rows exist so
        that ``dataset_snp`` cannot be dropped.
    """
    # Read the file the caller asked for rather than a hard-coded name.
    df = pd.read_csv(file)

    # Drop incomplete rows, replace spaces in string cells, and renumber the
    # index from 0. Done without inplace=True so each step yields a new frame.
    df = (
        df.dropna()
          .replace(' ', '_', regex=True)
          .reset_index(drop=True)
    )

    # One-hot encode the 'dataset' label column, then keep only one indicator.
    Cleaned_encoded = pd.get_dummies(df, columns=['dataset'])
    Cleaned = Cleaned_encoded.drop(['dataset_snp'], axis=1)

    return Cleaned

In [3]:
def Group_data(Cleaned):
    """Sort the cleaned data by identifier and split it into inputs, labels and PDB codes.

    Parameters
    ----------
    Cleaned : pandas.DataFrame
        Frame containing a ``pdbcode:chain:resnum:mutation`` string column and
        a ``dataset_pd`` indicator column. It is not modified.

    Returns
    -------
    Input_CV_NoCode : pandas.DataFrame
        The remaining columns (identifier and ``dataset_pd`` removed), sorted
        by identifier and re-indexed from 0.
    Output_CV : pandas.Series
        The ``dataset_pd`` column as ``int32``, in the same row order.
    PDB_codes : list of str
        For each row, the part of the identifier before the first ``':'``.
    """
    id_column = 'pdbcode:chain:resnum:mutation'

    Group_df = Cleaned.sort_values(by=[id_column])

    # Read the identifier by column name instead of "first cell of each row":
    # integer keys on a label-indexed Series are no longer positional in
    # recent pandas, and the column is not guaranteed to be first.
    PDB_codes = [identifier.partition(':')[0] for identifier in Group_df[id_column]]

    # The identifier has served its purpose; remove it and renumber the rows.
    Group_df = Group_df.drop(columns=[id_column]).reset_index(drop=True)

    Input_CV_NoCode = Group_df.drop(columns=['dataset_pd'])
    Output_CV = Group_df['dataset_pd'].astype('int32')

    return Input_CV_NoCode, Output_CV, PDB_codes

In [7]:
def ReadPDBSWS(PDB_codes):
    """Look up an accession code for each PDB code via the PDBSWS web service.

    Each distinct PDB code is requested once from the PDBSWS plain-text query
    endpoint; repeated codes in ``PDB_codes`` reuse the first answer.

    Parameters
    ----------
    PDB_codes : sequence of str
        PDB codes to look up. May contain duplicates; may be empty.

    Returns
    -------
    list of str
        One accession code per entry of ``PDB_codes``, in the same order.
        When a response contains several ``AC:`` fields, the last one is used.

    Raises
    ------
    ValueError
        If a response contains no ``AC:`` field.
    urllib.error.URLError
        If the request itself fails.
    """
    base_url = 'http://www.bioinf.org.uk/servers/pdbsws/query.cgi?plain=1&qtype=pdb'  # REST output

    # Compiled once, as a raw string so '\s' is a regex class rather than an
    # invalid string escape. The greedy leading '.*' makes the match land on
    # the LAST 'AC:' field in the (newline-flattened) response.
    ac_pattern = re.compile(r'.*AC:\s+(.*?)#')

    looked_up = {}  # PDB code -> accession code, so each code is fetched once
    AC_codes = []
    for code in PDB_codes:
        if code not in looked_up:
            url = base_url + '&id=' + code

            # The context manager closes the connection even if read() fails.
            with request.urlopen(url) as response:
                raw = response.read()

            # Flatten the reply to one line ('\n' -> '#') so a single pattern
            # can both scan across lines and stop at the end of the AC line.
            text = str(raw, encoding='utf-8').replace('\n', '#')

            match = ac_pattern.match(text)
            if match is None:
                raise ValueError(
                    'No accession code (AC) found in PDBSWS response for PDB code '
                    + repr(code)
                )
            looked_up[code] = match.group(1)

        AC_codes.append(looked_up[code])
    return AC_codes

In [8]:
def Input_Code(Input_CV_NoCode, AC_codes):
    """Return the feature table with the accession codes added as its first column.

    Parameters
    ----------
    Input_CV_NoCode : pandas.DataFrame
        Feature table without an ``'AC Code'`` column. It is not modified.
    AC_codes : sequence
        One accession code per row of ``Input_CV_NoCode``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``Input_CV_NoCode`` with ``AC_codes`` inserted at position 0
        under the column name ``'AC Code'``.

    Raises
    ------
    ValueError
        If ``AC_codes`` does not match the number of rows, or if an
        ``'AC Code'`` column already exists.
    """
    # DataFrame.insert works in place and returns None, so its result must not
    # be returned. Insert into a copy and hand that copy back; this also keeps
    # the caller's frame intact so the step can be re-run safely.
    Input_CV = Input_CV_NoCode.copy()
    Input_CV.insert(0, 'AC Code', AC_codes)
    return Input_CV

In [9]:
""" Main program """
# Time the whole pipeline, including the network look-ups in ReadPDBSWS.
start = time.time()

file = "E2.csv"
Cleaned = Clean_data(file)
Input_CV_NoCode, Output_CV, PDB_codes = Group_data(Cleaned)
AC_codes = ReadPDBSWS(PDB_codes)
Input_CV = Input_Code(Input_CV_NoCode, AC_codes)

end = time.time()

# Report the measurement; previously it was taken but never shown.
print('Pipeline finished in {:.1f} s'.format(end - start))



In [10]:
# Show Input_CV, the value returned by Input_Code in the main-program cell.
Input_CV