In [17]:
#!/usr/bin/env python3

from urllib import request
import sys
import re
import pandas as pd
import numpy as np



In [18]:
def Clean_data(file):
    """ Input:      file        Path (or file-like object) of the CSV dataset to read

        Returns:    Cleaned     Cleaned dataframe in which the 'dataset' column has been
                                replaced by the indicator column(s) produced by
                                pd.get_dummies, minus 'dataset_snp'

        Read the CSV given by `file` into a PD dataframe, drop every row that contains a
        NaN, replace blank spaces inside string values with '_', renumber the index from 0,
        then One Hot Encode the 'dataset' column and drop the redundant 'dataset_snp'
        indicator.

        Raises KeyError if the data has no 'dataset' column, or if none of its values
        yields a 'dataset_snp' indicator column.
    """
    # Read from the path supplied by the caller; the previous version ignored
    # `file` and always opened 'E2.csv'.
    raw = pd.read_csv(file)

    # Built as a chain (no inplace mutation) so every step returns a fresh frame:
    # drop NaN rows, swap spaces for underscores in string values, reindex from 0.
    tidy = (
        raw.dropna()
           .replace(' ', '_', regex=True)
           .reset_index(drop=True)
    )

    # One indicator column per distinct 'dataset' value; 'dataset_snp' is dropped
    # because it is the complement of the remaining indicator when there are two classes.
    encoded = pd.get_dummies(tidy, columns=['dataset'])
    Cleaned = encoded.drop(['dataset_snp'], axis=1)

    return Cleaned

In [None]:
def ReadPDBSWS(pdbcode, resnum=None):
    """ Input:      pdbcode     PDB code to look up in PDBSWS
                    resnum      Optional residue number; when given it is sent as the
                                'res' query parameter (default None: parameter omitted)

        Returns:    ac          Accession code captured from the 'AC:' line of the reply

        Raises:     ValueError  If the reply contains no 'AC:' line to extract

        Query the PDBSWS plain-text REST interface for `pdbcode` and pull the accession
        code out of the response. Performs a network request on every call.
    """
    url = 'http://www.bioinf.org.uk/servers/pdbsws/query.cgi?plain=1&qtype=pdb' #REST output
    url += '&id=' + pdbcode #URL for the specific PDB code of interest
    # `resnum` used to be read from an undefined global (NameError on a fresh
    # kernel); it is now an explicit, optional argument.
    if resnum is not None:
        url += '&res=' + str(resnum)

    # The context manager closes the HTTP response even if reading/decoding fails.
    with request.urlopen(url) as response:
        result = response.read().decode('utf-8')
    # Newlines become '#' so the single-line regex below can span the whole reply.
    result = result.replace('\n', '#')

    # Raw string avoids the invalid '\s' escape warning. The greedy '.*' prefix
    # means that, if several 'AC:' entries are present, the last one that is
    # followed by a line break is the one captured.
    pattern = re.compile(r'.*AC:\s+(.*?)#')
    match = pattern.match(result)
    if match is None:
        raise ValueError(f"No accession code (AC:) found in PDBSWS reply for '{pdbcode}'")

    return match.group(1)


In [22]:
""" Main program """
file = "E2.csv"
Cleaned = Clean_data(file)

# NOTE(review): `pdbcode` is not assigned in any visible cell, so on a fresh
# kernel this call raises NameError. Define it before this line.
# TODO confirm which PDB code was intended.
ac = ReadPDBSWS(pdbcode)

# Two separate statements: joining the prints with a comma built the tuple
# (None, None), which the notebook then echoed as the cell's result.
print(f"Accession:{ac}")
print(Cleaned)

Accession:Q16647
     pdbcode:chain:resnum:mutation  Binding  SProtFT0  SProtFT1  SProtFT2  \
0                      3odq:B:90:K      0.0       0.0       0.0       0.0   
1                     3thc:A:214:Y      0.0       0.0       0.0       0.0   
2                     2ary:A:103:A      0.0       0.0       0.0       0.0   
3                     2r3v:A:301:T      0.0       0.0       0.0       0.0   
4                     1bhg:A:408:S      0.0       0.0       0.0       0.0   
...                            ...      ...       ...       ...       ...   
3360                  3cot:A:261:C      0.0       0.0       0.0       0.0   
3361                   2l0w:A:47:V      0.0       0.0       0.0       0.0   
3362                  2l9n:A:126:T      0.0       0.0       0.0       0.0   
3363                  2xwt:C:252:P      0.0       0.0       0.0       0.0   
3364                  3b63:B:177:C      0.0       0.0       0.0       0.0   

      SProtFT3  SProtFT4  SProtFT5  SProtFT6  SProtFT7  ..

(None, None)