In [1]:
# prointvar.pdbx -- Using PDBXreader to parse an mmCIF-formatted macromolecular structure.

import os
from prointvar.config import config as cfg
from prointvar.pdbx import PDBXreader
from prointvar.fetchers import download_structure_from_pdbe

# os.chdir('/homes/2524591/data')

2022-08-17 12:31:29,743 numexpr.utils INFO     Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2022-08-17 12:31:29,744 numexpr.utils INFO     NumExpr defaulting to 8 threads.




In [2]:
# Fetch entry 1a9x, then parse the atom records of its mmCIF file
# into a pandas DataFrame.
download_structure_from_pdbe('1a9x')
input_struct = os.path.join(cfg.db_root, cfg.db_pdbx, '1a9x.cif')

reader = PDBXreader(inputfile=input_struct)
df = reader.atoms(format_type="mmcif")

# Preview the first rows of the atom table.
df.head()

2022-08-17 12:31:33,355 prointvar    INFO     Parsing mmCIF atoms from lines...
  table['label_seq_id_full'] = table['label_seq_id'] + table['pdbx_PDB_ins_code'].str.replace('?', '')

  table['auth_seq_id_full'] = table['auth_seq_id'] + table['pdbx_PDB_ins_code'].str.replace('?', '')

2022-08-17 12:31:33,882 prointvar    INFO     PDBx added full res (res + ins_code)...
2022-08-17 12:31:33,948 prointvar    INFO     PDBx removed existing hydrogens...
2022-08-17 12:31:33,994 prointvar    INFO     PDBx reset atom numbers...


Unnamed: 0,group_PDB,id,type_symbol,label_atom_id,label_alt_id,label_comp_id,label_asym_id,label_entity_id,label_seq_id,pdbx_PDB_ins_code,...,auth_asym_id,auth_atom_id,pdbx_PDB_model_num,pdbx_label_index,pdbx_sifts_xref_db_name,pdbx_sifts_xref_db_acc,pdbx_sifts_xref_db_num,pdbx_sifts_xref_db_res,label_seq_id_full,auth_seq_id_full
0,ATOM,1,N,N,.,MET,A,1,1,?,...,A,N,1,1,UNP,P00968,1,M,1,1
1,ATOM,2,C,CA,.,MET,A,1,1,?,...,A,CA,1,1,UNP,P00968,1,M,1,1
2,ATOM,3,C,C,.,MET,A,1,1,?,...,A,C,1,1,UNP,P00968,1,M,1,1
3,ATOM,4,O,O,.,MET,A,1,1,?,...,A,O,1,1,UNP,P00968,1,M,1,1
4,ATOM,5,C,CB,.,MET,A,1,1,?,...,A,CB,1,1,UNP,P00968,1,M,1,1


In [3]:
# Write the atom table parsed from the mmCIF file back out as a PDB-format file.
from prointvar.pdbx import PDBXwriter

output_struct = os.path.join(cfg.db_root, cfg.db_pdbx, '1a9x.pdb')
PDBXwriter(outputfile=output_struct).run(df, format_type="pdb")

In [4]:
# Produce a DSSP file for the 1a9x mmCIF structure, then read that file
# back into a pandas DataFrame.

from prointvar.dssp import DSSPrunner, DSSPreader

input_struct = os.path.join(cfg.db_root, cfg.db_pdbx, '1a9x.cif')
output_dssp = os.path.join(cfg.db_root, cfg.db_dssp, '1a9x.dssp')

# Step 1: write the DSSP output file.
dssp_runner = DSSPrunner(inputfile=input_struct, outputfile=output_dssp)
dssp_runner.write()

# Step 2: parse it, asking the reader to add the reduced secondary-structure column.
dssp_reader = DSSPreader(inputfile=output_dssp)
df2 = dssp_reader.read(add_ss_reduced=True)

df2.head()

2022-08-17 12:31:46,286 prointvar    INFO     Parsing DSSP from lines...
2022-08-17 12:31:46,512 prointvar    INFO     DSSP added full chain...
2022-08-17 12:31:46,576 prointvar    INFO     DSSP added reduced SS...
2022-08-17 12:31:46,709 prointvar    INFO     DSSP added RSA...


Unnamed: 0,RES,CHAIN,AA,SS,ACC,TCO,KAPPA,ALPHA,PHI,PSI,CHAIN_FULL,SS_CLASS,RSA
0,1,A,M,,83,0.0,360.0,360.0,360.0,-65.8,A,C,44.149
1,2,A,P,,82,-0.022,360.0,-34.8,-82.3,-171.0,A,C,60.294
2,3,A,K,,96,-0.212,67.5,-110.5,-57.2,133.3,A,C,46.829
3,4,A,R,,85,-0.218,23.3,-162.6,-58.6,139.9,A,C,34.274
4,5,A,T,S,121,0.747,76.9,70.1,-101.1,-28.1,A,C,85.211


In [5]:
# SIFTS: download the SIFTS file for 1a9x and parse it into a pandas DataFrame.

from prointvar.sifts import SIFTSreader
from prointvar.fetchers import download_sifts_from_ebi

download_sifts_from_ebi('1a9x')
input_sifts = os.path.join(cfg.db_root, cfg.db_sifts, '1a9x.xml')

sifts_reader = SIFTSreader(inputfile=input_sifts)
df3 = sifts_reader.read()

# Preview the first rows of the SIFTS table.
df3.head()

2022-08-17 12:31:52,770 prointvar    INFO     Decompressed ./sifts/1a9x.xml.gz to ./sifts/1a9x.xml
2022-08-17 12:31:52,771 prointvar    INFO     Parsing SIFTS residues from lines...
2022-08-17 12:31:52,771 prointvar    INFO     Parsing SIFTS regions from lines...


Unnamed: 0,PDB_regionId,PDB_regionStart,PDB_regionEnd,PDB_regionResNum,PDB_dbAccessionId,PDB_dbResNum,PDB_dbResName,PDB_dbChainId,PDB_Annotation,PDB_entityId,...,SCOP2B_regionEnd,SCOP2B_regionResNum,SCOP2B_dbAccessionId,PDB_codeSecondaryStructure,PDB_nameSecondaryStructure,Pfam_regionId,Pfam_regionStart,Pfam_regionEnd,Pfam_regionResNum,Pfam_dbAccessionId
0,1,1,1073,1,1a9x,1,MET,A,Observed,A,...,127,1,SF-DOMID:8044230,T,loop,-,0,0,,
1,1,1,1073,2,1a9x,2,PRO,A,Observed,A,...,127,2,SF-DOMID:8044230,T,loop,-,0,0,,
2,1,1,1073,3,1a9x,3,LYS,A,Observed,A,...,127,3,SF-DOMID:8044230,T,loop,-,0,0,,
3,1,1,1073,4,1a9x,4,ARG,A,Observed,A,...,127,4,SF-DOMID:8044230,T,loop,-,0,0,,
4,1,1,1073,5,1a9x,5,THR,A,Observed,A,...,127,5,SF-DOMID:8044230,T,loop,-,0,0,,


In [6]:
# Merge the atom (mmCIF), DSSP and SIFTS tables into one pandas DataFrame.

from prointvar.merger import TableMerger

mdf = TableMerger(pdbx_table=df, dssp_table=df2, sifts_table=df3).merge()

# Save under the configured prointvar data root rather than a user-specific
# absolute path, so the cell does not fail on machines without that directory.
merged_csv = os.path.join(cfg.db_root, 'trial.csv')
mdf.to_csv(merged_csv)

# The preview must be the cell's final expression: a bare expression in the
# middle of a cell is evaluated but never displayed.
mdf.head()

2022-08-17 12:32:09,392 prointvar    INFO     Merged mmCIF and DSSP tables...
2022-08-17 12:32:09,472 prointvar    INFO     Merged mmCIF and SIFTS tables...


In [13]:
# List every column name available in the merged table `mdf`.
mdf.columns

Index(['group_PDB', 'id', 'type_symbol', 'label_atom_id', 'label_alt_id',
       'label_comp_id', 'label_asym_id', 'label_entity_id', 'label_seq_id',
       'pdbx_PDB_ins_code', 'Cartn_x', 'Cartn_y', 'Cartn_z', 'occupancy',
       'B_iso_or_equiv', 'pdbx_formal_charge', 'auth_seq_id', 'auth_comp_id',
       'auth_asym_id', 'auth_atom_id', 'pdbx_PDB_model_num',
       'pdbx_label_index', 'pdbx_sifts_xref_db_name', 'pdbx_sifts_xref_db_acc',
       'pdbx_sifts_xref_db_num', 'pdbx_sifts_xref_db_res', 'label_seq_id_full',
       'auth_seq_id_full', 'RES', 'CHAIN', 'AA', 'SS', 'ACC', 'TCO', 'KAPPA',
       'ALPHA', 'PHI', 'PSI', 'CHAIN_FULL', 'SS_CLASS', 'RSA', 'PDB_regionId',
       'PDB_regionStart', 'PDB_regionEnd', 'PDB_regionResNum',
       'PDB_dbAccessionId', 'PDB_dbResNum', 'PDB_dbResName', 'PDB_dbChainId',
       'PDB_Annotation', 'PDB_entityId', 'UniProt_regionId',
       'UniProt_regionStart', 'UniProt_regionEnd', 'UniProt_regionResNum',
       'UniProt_dbAccessionId', 'UniProt_db

In [40]:
# Keep only a subset of the merged columns and save that subset as CSV.
#
# Left out on purpose: the atom-level identifiers, coordinates, occupancy,
# B-factor and formal charge; the *_seq_id_full columns; the region
# start/end/residue-number bookkeeping; PDB_Annotation and PDB_entityId; and
# all SCOP, Ensembl, CATH, SCOP2, Pfam and SCOP2B columns.
selected_columns = [
    # author residue numbering / naming and model bookkeeping
    'auth_seq_id',
    'auth_comp_id',
    'pdbx_PDB_model_num',
    'pdbx_label_index',
    # SIFTS cross-reference carried in the mmCIF file
    'pdbx_sifts_xref_db_name',
    'pdbx_sifts_xref_db_acc',
    'pdbx_sifts_xref_db_num',
    'pdbx_sifts_xref_db_res',
    # DSSP columns
    'RES',
    'CHAIN',
    'AA',
    'SS',
    'ACC',
    'TCO',
    'KAPPA',
    'ALPHA',
    'PHI',
    'PSI',
    'CHAIN_FULL',
    'SS_CLASS',
    'RSA',
    # SIFTS: PDB side
    'PDB_regionId',
    'PDB_dbAccessionId',
    'PDB_dbResNum',
    'PDB_dbResName',
    'PDB_dbChainId',
    # SIFTS: UniProt side
    'UniProt_dbAccessionId',
    'UniProt_dbResNum',
    'UniProt_dbResName',
    # SIFTS: secondary structure annotation
    'PDB_codeSecondaryStructure',
    'PDB_nameSecondaryStructure',
]

x = mdf[selected_columns]

x.to_csv('/homes/2524591/data/trial.csv')

In [47]:
# Display the PDB_dbResName column on its own, still as a one-column DataFrame.
mdf.loc[:, ['PDB_dbResName']]

Unnamed: 0,PDB_dbResName
0,LYS
1,LYS
2,LYS
3,LYS
4,LYS
...,...
9879,
9880,
9881,
9882,


In [3]:
# Show the `dssp_bin` setting held in the prointvar configuration object.
# NOTE(review): presumably this is the path prointvar uses to launch DSSP;
# confirm against the prointvar documentation.
from prointvar.config import config

config.dssp_bin

'/path/to/dssp/bin/dssp'

In [4]:
# Ask the shell where the mkdssp executable is found on PATH.
! which mkdssp

/homes/2524591/miniconda3/envs/prointvar-dssp/bin/mkdssp


In [8]:
# Write DSSP output for the 1a12 mmCIF file and read it back as a pandas DataFrame.
from prointvar.dssp import DSSPrunner, DSSPreader

input_struct = os.path.join(cfg.db_root, cfg.db_pdbx, '1a12.cif')
output_dssp = os.path.join(cfg.db_root, cfg.db_dssp, '1a12.dssp')

# (A leftover `type(input_struct)` debugging expression was removed here: as a
# non-final expression it was evaluated and discarded without being shown.)
DSSPrunner(inputfile=input_struct, outputfile=output_dssp).write()

# Parse the DSSP file, asking the reader to add the reduced secondary-structure column.
df2 = DSSPreader(inputfile=output_dssp).read(add_ss_reduced=True)

# Preview the first rows of the DSSP table.
df2.head()

2022-07-21 14:12:56,428 prointvar    INFO     DSSP for ./dssp/1a12.dssp already available...
2022-07-21 14:12:56,429 prointvar    INFO     Parsing DSSP from lines...
2022-07-21 14:12:56,465 prointvar    INFO     DSSP added full chain...
2022-07-21 14:12:56,474 prointvar    INFO     DSSP added reduced SS...
2022-07-21 14:12:56,493 prointvar    INFO     DSSP added RSA...


Unnamed: 0,RES,CHAIN,AA,SS,ACC,TCO,KAPPA,ALPHA,PHI,PSI,CHAIN_FULL,SS_CLASS,RSA
0,21,A,K,,173,0.0,360.0,360.0,360.0,129.7,A,C,84.39
1,22,A,K,,164,-0.433,360.0,-153.3,-64.1,85.0,A,C,80.0
2,23,A,V,,41,-0.429,8.7,-128.1,-66.5,129.9,A,C,28.873
3,24,A,K,,133,-0.476,21.9,-164.8,-79.9,149.3,A,C,64.878
4,25,A,V,,11,-0.942,6.3,-177.4,-126.4,157.4,A,C,7.746


In [10]:
# Print the structure paths currently held in the kernel, one per line.
print(input_struct, output_struct, sep='\n')

./pdbx/1a12.cif
./pdbx/2pah.pdb


In [None]:
# Parse the DSSP file that corresponds to every AlphaFold structure file.
from prointvar.dssp import DSSPrunner, DSSPreader

# Both directories are stated once, as absolute paths, so the directory that is
# listed is guaranteed to be the same one the input paths are built from
# (previously the listing used a path relative to the current directory).
af_pdb_dir = '/cluster/gjb_lab/2524591/data/Alphafold/AF_pdb_files'
af_dssp_dir = '/cluster/gjb_lab/2524591/data/Alphafold/AF_dssp_files'

parsed_entries = []   # file stems whose DSSP file was read successfully
failed_entries = []   # (file name, exception) pairs for everything that was not

# sorted() makes the processing order reproducible between runs.
for pdb_name in sorted(os.listdir(af_pdb_dir)):
    input_struct = os.path.join(af_pdb_dir, pdb_name)

    # Strip the extension whatever its length, instead of assuming 4 characters.
    entry_stem, _extension = os.path.splitext(pdb_name)
    output_dssp = os.path.join(af_dssp_dir, entry_stem + '.dssp')

    try:
        # DSSP generation is switched off; only existing .dssp files are parsed.
        # DSSPrunner(inputfile=input_struct, outputfile=output_dssp).write()
        df2 = DSSPreader(inputfile=output_dssp).read(add_ss_reduced=True)
        #df2.to_csv('/cluster/gjb_lab/2524591/data/Alphafold/AF_initial/AF_pred_day2.csv', index=False)
    except Exception as err:
        # Keep going (best effort, as before), but record and report the
        # failure instead of hiding it. `Exception` rather than a bare
        # `except` so that interrupting the kernel still stops the loop.
        failed_entries.append((pdb_name, err))
        print('DSSP parsing failed for {}: {!r}'.format(pdb_name, err))
        continue

    parsed_entries.append(entry_stem)

print('Parsed {} DSSP file(s); {} failed.'.format(len(parsed_entries), len(failed_entries)))

In [11]:
# SIFTS: download the SIFTS file for 2pah and parse it into a pandas DataFrame.
# NOTE(review): the recorded run of this cell ended with
# FileNotFoundError for './sifts/2pah.xml.gz' -- confirm that the download step
# actually produces that file before relying on this cell.
from prointvar.sifts import SIFTSreader
from prointvar.fetchers import download_sifts_from_ebi

download_sifts_from_ebi('2pah')
input_sifts = os.path.join(cfg.db_root, cfg.db_sifts, '2pah.xml')

sifts_reader = SIFTSreader(inputfile=input_sifts)
df3 = sifts_reader.read()

# Preview the first rows of the SIFTS table.
df3.head()

FileNotFoundError: [Errno 2] No such file or directory: './sifts/2pah.xml.gz'