In [1]:
#!/usr/bin/env python3

from urllib import request
import sys
import re
import pandas as pd
import numpy as np
import time



In [2]:
def Clean_data(file):

    df = pd.read_csv('E2.csv')

    #Remove unrequired NaNs, blank spaces, reset index to run from 0
    df.dropna(inplace = True)
    df.replace(' ', '_', regex=True, inplace=True)
    df.reset_index(drop=True, inplace = True)

    Cleaned_encoded = pd.get_dummies(df, columns=['dataset']) #Encode the PD and SNP columns
    Cleaned = Cleaned_encoded.drop(['dataset_snp'],axis = 1)
    
    return Cleaned

In [3]:
def Group_data(Cleaned):
    Group_df = Cleaned.sort_values(by=['pdbcode:chain:resnum:mutation'])

    PDB_codes = []
    for i in range(len(Group_df)):
        PDB_codes.append(Group_df.iloc[i][0].partition(':')[0]) #Split the identifier and takes only PDB code

    Group_df.drop(['pdbcode:chain:resnum:mutation'], axis=1, inplace=True) #Remove 'pdbcode:chain:resnum:mutation' column
    Group_df.insert(0, 'PDB code', PDB_codes)
    Group_df.reset_index(inplace = True, drop = True)

    Input_CV = Group_df.drop(['dataset_pd'], axis =1)
    Output_CV = Group_df['dataset_pd'].copy().astype('int32') 
    Protein_Groups = Group_df['PDB code'].to_list()

    return Input_CV, Output_CV, Protein_Groups

In [4]:
def Convert(Input_CV):
    codes_list = []
    for i in range(len(Input_CV)):
        codes = Input_CV.iloc[i][0]
        codes_list.append(codes)
        
    return codes_list

In [5]:
def ReadPDBSWS(codes_list):
    AC_codes = []
    for i in range(len(codes_list)):
        url = 'http://www.bioinf.org.uk/servers/pdbsws/query.cgi?plain=1&qtype=pdb' #REST output
        url += '&id=' + codes_list[i] #URL for the specific PDB code of interest

        result = request.urlopen(url).read() #Reads the URL
        result = str(result, encoding='utf-8') #Encodes the URL into utf-8 format
        result = result.replace('\n', '#') #Replaces all the new line returns with #, allowing easy pattern matches

        pattern  = re.compile('.*AC:\s+(.*?)#') #Recognises the accession code pattern
        match    = pattern.match(result) #Saves the pattern to match
        AC_codes.append(match.group(1)) #Saves only the accession code to variable
    return AC_codes

In [6]:
""" Main program """
file = "E2.csv"
Cleaned = Clean_data(file)
Input_CV, Output_CV, Protein_Groups = Group_data(Cleaned)
codes_list = Convert(Input_CV)

start = time.time()
AC_codes = ReadPDBSWS(codes_list)
end = time.time()



In [7]:
print(f"Total time: {end-start} \n\n {AC_codes}")

Total time: 499.6717882156372 

 ['P09211', 'P09211', 'Q13469', 'P10912', 'P10912', 'P10912', 'P10912', 'P10912', 'P00709', 'P42771', 'P42771', 'P42771', 'P42771', 'P42771', 'P42771', 'P42771', 'P42771', 'P42771', 'P42771', 'P42771', 'P42771', 'P42771', 'P42771', 'P42771', 'P23526', 'P23526', 'P11362', 'P11362', 'P11362', 'P11362', 'P04075', 'Q5TC78', 'Q5TC78', 'Q5TC78', 'Q5TC78', 'Q5TC78', 'Q5TC78', 'Q5TC78', 'Q5TC78', 'Q5TC78', 'Q5TC78', 'Q5TC78', 'Q5TC78', 'Q5TC78', 'Q5TC78', 'Q5TC78', 'Q5TC78', 'Q5TC78', 'Q5TC78', 'Q5TC78', 'Q5TC78', 'Q5TC78', 'Q5TC78', 'Q5TC78', 'Q5TC78', 'Q5TC78', 'Q5TC78', 'P20933', 'P20933', 'P20933', 'P20933', 'P01011', 'P08311', 'P04070', 'P04070', 'P04070', 'P04070', 'P04070', 'P04070', 'P04070', 'P04070', 'P04070', 'P04070', 'P04070', 'P04070', 'P04070', 'P04070', 'P04070', 'P04070', 'P04070', 'P04070', 'P04070', 'P04070', 'P04070', 'P04070', 'P04070', 'P29218', 'P41159', 'P12429', 'P12429', 'P35247', 'P35247', 'P03950', 'P03950', 'P03950', 'P03950', 'P1014