In [14]:
import os
import glob

In [2]:
# Open the PDB file for reading and show what kind of object open() returns
with open('data/1OLG.pdb', 'r') as pdb_file:
    print(type(pdb_file))

<class '_io.TextIOWrapper'>


In [1]:
# Load the entire file into a single string
with open('data/1OLG.pdb', 'r') as pdb_file:
    f_str = pdb_file.read()

# Preview only the first 1000 characters rather than the whole string
f_str[:1000]

'HEADER    ANTI-ONCOGENE                           13-JUN-94   1OLG              \nTITLE     HIGH-RESOLUTION SOLUTION STRUCTURE OF THE OLIGOMERIZATION             \nTITLE    2 DOMAIN OF P53 BY MULTI-DIMENSIONAL NMR                               \nCOMPND    MOL_ID: 1;                                                            \nCOMPND   2 MOLECULE: TUMOR SUPPRESSOR P53 (OLIGOMERIZATION DOMAIN);             \nCOMPND   3 CHAIN: A, B, C, D;                                                   \nCOMPND   4 ENGINEERED: YES                                                      \nSOURCE    MOL_ID: 1;                                                            \nSOURCE   2 ORGANISM_SCIENTIFIC: HOMO SAPIENS;                                   \nSOURCE   3 ORGANISM_COMMON: HUMAN;                                              \nSOURCE   4 ORGANISM_TAXID: 9606                                                 \nKEYWDS    ANTI-ONCOGENE                                                         \nEXPDTA    SOLUT

In [2]:
# Load the file as a list of lines; each entry keeps its trailing newline
with open('data/1OLG.pdb', 'r') as pdb_file:
    f_list = pdb_file.readlines()

# Show just the first ten entries of the list
f_list[:10]

['HEADER    ANTI-ONCOGENE                           13-JUN-94   1OLG              \n',
 'TITLE     HIGH-RESOLUTION SOLUTION STRUCTURE OF THE OLIGOMERIZATION             \n',
 'TITLE    2 DOMAIN OF P53 BY MULTI-DIMENSIONAL NMR                               \n',
 'COMPND    MOL_ID: 1;                                                            \n',
 'COMPND   2 MOLECULE: TUMOR SUPPRESSOR P53 (OLIGOMERIZATION DOMAIN);             \n',
 'COMPND   3 CHAIN: A, B, C, D;                                                   \n',
 'COMPND   4 ENGINEERED: YES                                                      \n',
 'SOURCE    MOL_ID: 1;                                                            \n',
 'SOURCE   2 ORGANISM_SCIENTIFIC: HOMO SAPIENS;                                   \n',
 'SOURCE   3 ORGANISM_COMMON: HUMAN;                                              \n']

In [3]:
# Remove trailing whitespace (including the newline) from the first line
f_list[0].rstrip()

'HEADER    ANTI-ONCOGENE                           13-JUN-94   1OLG'

In [4]:
# Print the first 11 lines (indices 0 through 10) by iterating over the file
# object directly, so the whole file is never loaded into a list
with open('data/1OLG.pdb', 'r') as f:
    for line_no, text in enumerate(f):
        if line_no > 10:
            break
        print(text.strip())

HEADER    ANTI-ONCOGENE                           13-JUN-94   1OLG
TITLE     HIGH-RESOLUTION SOLUTION STRUCTURE OF THE OLIGOMERIZATION
TITLE    2 DOMAIN OF P53 BY MULTI-DIMENSIONAL NMR
COMPND    MOL_ID: 1;
COMPND   2 MOLECULE: TUMOR SUPPRESSOR P53 (OLIGOMERIZATION DOMAIN);
COMPND   3 CHAIN: A, B, C, D;
COMPND   4 ENGINEERED: YES
SOURCE    MOL_ID: 1;
SOURCE   2 ORGANISM_SCIENTIFIC: HOMO SAPIENS;
SOURCE   3 ORGANISM_COMMON: HUMAN;
SOURCE   4 ORGANISM_TAXID: 9606


In [7]:
# Alternative to iterating over the file object: pull lines one at a time
# with readline(); this also prints the first 11 lines
with open('data/1OLG.pdb','r') as f:
    for line_no in range(11):
        print(f.readline().rstrip())

HEADER    ANTI-ONCOGENE                           13-JUN-94   1OLG
TITLE     HIGH-RESOLUTION SOLUTION STRUCTURE OF THE OLIGOMERIZATION
TITLE    2 DOMAIN OF P53 BY MULTI-DIMENSIONAL NMR
COMPND    MOL_ID: 1;
COMPND   2 MOLECULE: TUMOR SUPPRESSOR P53 (OLIGOMERIZATION DOMAIN);
COMPND   3 CHAIN: A, B, C, D;
COMPND   4 ENGINEERED: YES
SOURCE    MOL_ID: 1;
SOURCE   2 ORGANISM_SCIENTIFIC: HOMO SAPIENS;
SOURCE   3 ORGANISM_COMMON: HUMAN;
SOURCE   4 ORGANISM_TAXID: 9606


In [8]:
# os.path.isfile() returns True if the path exists and is a regular file
os.path.isfile('data/1OLG.pdb')

True

In [1]:
# Refuse to overwrite: opening with 'w' would silently truncate an existing file
if os.path.isfile('mastery.txt'):
    raise RuntimeError('File mastery.txt already exists')

# Lines to store; each one already carries its own newline
creed = (
    'This is my file\n',
    'There are many like it, but this one is mine\n',
    'I must master my file like I must master my life.\n',
)
with open('mastery.txt', 'w') as f:
    f.writelines(creed)

In [8]:
# write() only accepts str arguments, so the float is formatted into a
# string (8 decimal places) before it is written
phi_text = '{phi:.8f}'.format(phi=1.61803398875)
with open('gimme_phi.txt', 'w') as f:
    f.write('The golden ratio is phi = ')
    f.write(phi_text)

## **An exercise: extract atomic coordinates for first chain in tetramer**

As an example of how to do file I/O, we will take the PDB file and extract only the `ATOM` records for the first chain of the tetramer and write only those entries to a new file.

It is useful to know that according to the [PDB format specification](http://www.wwpdb.org/documentation/file-format-content/format33/sect9.html#ATOM), the chain ID of an `ATOM` entry is in column 22, which is index 21 in Python's zero-based string indexing.

We also conveniently use the fact that we can have multiple files open in our `with` block, separating them with commas.

In [12]:
with open('data/1OLG.pdb', 'r') as f, open('atoms_chain_A.txt', 'w') as f_out:
    # Copy only the ATOM records whose chain identifier (index 21) is 'A';
    # the length check keeps short lines from raising an IndexError
    f_out.writelines(
        record
        for record in f
        if record.startswith('ATOM') and len(record) > 21 and record[21] == 'A'
    )

## **Finding files with glob**

In the above snippet of code, we extracted the `ATOM` records for chain A from a PDB file. We might want to do this (or some other operation) for many files. For example, the directory `~/git/data/` has four PDB files in it. For the present discussion, let’s say we want to pull the sequence of chain A out of each PDB file.

The `glob` module from the standard library enables us to get a list of all files that match a pattern. In our case, we want all files matching `data/*.pdb`, where `*` is a **wildcard character** that matches any sequence of characters (including none). Let’s see what `glob.glob()` gives us.

In [15]:
# glob.glob() returns matches in arbitrary (OS-dependent) order, so sort them
# to make the list, and everything built from it, reproducible across runs
file_list = sorted(glob.glob('data/*.pdb'))

file_list

['data\\1FAG.pdb', 'data\\1J6Z.pdb', 'data\\1OLG.pdb', 'data\\2ERK.pdb']

In [20]:
# Dictionary mapping each PDB ID to the sequence of its chain A
seqs = {}

# Loop through all matching files
for file_name in file_list:
    # The PDB ID is the file name without its directory or extension.
    # os.path handles the platform's path separator, so this works whether
    # glob returned 'data/1OLG.pdb' or 'data\\1OLG.pdb'.
    pdb_id = os.path.splitext(os.path.basename(file_name))[0]

    # Collect the three-letter residue codes from chain A's SEQRES records;
    # index 11 holds the chain ID and the residue names start at index 19
    residues = []
    with open(file_name, 'r') as f:
        for line in f:
            if len(line) > 11 and line[:6] == 'SEQRES' and line[11] == 'A':
                residues.extend(line[19:].split())

    # Store the sequence as dash-joined three-letter codes
    seqs[pdb_id] = '-'.join(residues)
seqs

{'1FAG': 'THR-ILE-LYS-GLU-MET-PRO-GLN-PRO-LYS-THR-PHE-GLY-GLU-LEU-LYS-ASN-LEU-PRO-LEU-LEU-ASN-THR-ASP-LYS-PRO-VAL-GLN-ALA-LEU-MET-LYS-ILE-ALA-ASP-GLU-LEU-GLY-GLU-ILE-PHE-LYS-PHE-GLU-ALA-PRO-GLY-ARG-VAL-THR-ARG-TYR-LEU-SER-SER-GLN-ARG-LEU-ILE-LYS-GLU-ALA-CYS-ASP-GLU-SER-ARG-PHE-ASP-LYS-ASN-LEU-SER-GLN-ALA-LEU-LYS-PHE-VAL-ARG-ASP-PHE-ALA-GLY-ASP-GLY-LEU-PHE-THR-SER-TRP-THR-HIS-GLU-LYS-ASN-TRP-LYS-LYS-ALA-HIS-ASN-ILE-LEU-LEU-PRO-SER-PHE-SER-GLN-GLN-ALA-MET-LYS-GLY-TYR-HIS-ALA-MET-MET-VAL-ASP-ILE-ALA-VAL-GLN-LEU-VAL-GLN-LYS-TRP-GLU-ARG-LEU-ASN-ALA-ASP-GLU-HIS-ILE-GLU-VAL-PRO-GLU-ASP-MET-THR-ARG-LEU-THR-LEU-ASP-THR-ILE-GLY-LEU-CYS-GLY-PHE-ASN-TYR-ARG-PHE-ASN-SER-PHE-TYR-ARG-ASP-GLN-PRO-HIS-PRO-PHE-ILE-THR-SER-MET-VAL-ARG-ALA-LEU-ASP-GLU-ALA-MET-ASN-LYS-LEU-GLN-ARG-ALA-ASN-PRO-ASP-ASP-PRO-ALA-TYR-ASP-GLU-ASN-LYS-ARG-GLN-PHE-GLN-GLU-ASP-ILE-LYS-VAL-MET-ASN-ASP-LEU-VAL-ASP-LYS-ILE-ILE-ALA-ASP-ARG-LYS-ALA-SER-GLY-GLU-GLN-SER-ASP-ASP-LEU-LEU-THR-HIS-MET-LEU-ASN-GLY-LYS-ASP-PRO-GLU-THR-GLY-GLU-PR

In [21]:
# Look up the chain A sequence stored under the PDB ID '1J6Z'
seqs['1J6Z']

'ASP-GLU-ASP-GLU-THR-THR-ALA-LEU-VAL-CYS-ASP-ASN-GLY-SER-GLY-LEU-VAL-LYS-ALA-GLY-PHE-ALA-GLY-ASP-ASP-ALA-PRO-ARG-ALA-VAL-PHE-PRO-SER-ILE-VAL-GLY-ARG-PRO-ARG-HIS-GLN-GLY-VAL-MET-VAL-GLY-MET-GLY-GLN-LYS-ASP-SER-TYR-VAL-GLY-ASP-GLU-ALA-GLN-SER-LYS-ARG-GLY-ILE-LEU-THR-LEU-LYS-TYR-PRO-ILE-GLU-HIC-GLY-ILE-ILE-THR-ASN-TRP-ASP-ASP-MET-GLU-LYS-ILE-TRP-HIS-HIS-THR-PHE-TYR-ASN-GLU-LEU-ARG-VAL-ALA-PRO-GLU-GLU-HIS-PRO-THR-LEU-LEU-THR-GLU-ALA-PRO-LEU-ASN-PRO-LYS-ALA-ASN-ARG-GLU-LYS-MET-THR-GLN-ILE-MET-PHE-GLU-THR-PHE-ASN-VAL-PRO-ALA-MET-TYR-VAL-ALA-ILE-GLN-ALA-VAL-LEU-SER-LEU-TYR-ALA-SER-GLY-ARG-THR-THR-GLY-ILE-VAL-LEU-ASP-SER-GLY-ASP-GLY-VAL-THR-HIS-ASN-VAL-PRO-ILE-TYR-GLU-GLY-TYR-ALA-LEU-PRO-HIS-ALA-ILE-MET-ARG-LEU-ASP-LEU-ALA-GLY-ARG-ASP-LEU-THR-ASP-TYR-LEU-MET-LYS-ILE-LEU-THR-GLU-ARG-GLY-TYR-SER-PHE-VAL-THR-THR-ALA-GLU-ARG-GLU-ILE-VAL-ARG-ASP-ILE-LYS-GLU-LYS-LEU-CYS-TYR-VAL-ALA-LEU-ASP-PHE-GLU-ASN-GLU-MET-ALA-THR-ALA-ALA-SER-SER-SER-SER-LEU-GLU-LYS-SER-TYR-GLU-LEU-PRO-ASP-GLY-GLN-VAL-ILE-THR-ILE

In [22]:
# Record the Python, IPython, and jupyterlab versions used to run this notebook
%load_ext watermark
%watermark -v -p jupyterlab

Python implementation: CPython
Python version       : 3.8.11
IPython version      : 7.27.0

jupyterlab: 3.1.7

