In [2]:
def load_CSV(filename, column_names=[], separator=","):
    l_data = []
    column_names.reverse()
    
    try:
        with open(filename, 'r') as file:
            for line in file.readlines():
                temp = line.strip().split(separator)

                data = {}
                for column in column_names:
                    data[column] = temp.pop()
                    data[column] = data[column].strip()
                    
                l_data.append(data)
    except:
        print("/!\ Malformed file {} /!\ Expecting a CSV with following columns: {}".format(filename, column_names))
        
    return l_data

In [3]:
def load_CATH(filename="cath_info.txt"):
    return load_CSV(filename=filename, column_names=['PDB_code', 'PDB_chain_code', 'protein_family'], separator="\t")

def load_DSSP(filename="dssp_info.txt"):
    return load_CSV(filename=filename, column_names=['PDB_code', 'PDB_chain_code', 'PDB_seq_code', 'residue_name', 'secondary_structure'], separator="\t")

def load_STRIDE(filename="stride_info.txt"):
    return load_CSV(filename=filename, column_names=['PDB_code', 'PDB_chain_code', 'PDB_seq_code', 'residue_name', 'secondary_structure'], separator="\t")

In [4]:
cath_info = load_CATH()
dssp_info = load_DSSP()
stride_info = load_STRIDE()

print("Loaded {} entries for CATH".format(len(cath_info)))
print("Loaded {} entries for DSSP".format(len(dssp_info)))
print("Loaded {} entries for STRIDE".format(len(stride_info)))

Loaded 498 entries for CATH
Loaded 71391 entries for DSSP
Loaded 71077 entries for STRIDE


In [5]:
l_residue = ['ALA', 'ARG', 'ASN', 'ASP', 'CYS',
             'GLN', 'GLU', 'GLY', 'HIS', 'ILE', 
             'LEU', 'LYS', 'MET', 'PHE', 'PRO', 
             'SER', 'THR', 'TRP', 'TYR', 'VAL']

l_conformation = ['Alpha', 'Beta', 'Coil']

print("{} valid amino acid".format(len(l_residue)))
print("{} valid conformation".format(len(l_conformation)))

20 valid amino acid
3 valid conformation


In [6]:
# Merge DSSP/STRIDE entries according to the PDB code
def merge(data):
    l_error = []
    
    merged = {}
    for i in range(len(data)):
        entry = data[i]
        
        key = "{}_{}".format(entry['PDB_code'], entry['PDB_chain_code'])
        
        if key not in merged:
            merged[key] = {}
            merged[key]['chain'] = entry['PDB_chain_code']
            merged[key]['seq'] = []
        
        try:
            # Test data consistency
            #assert (entry['PDB_chain_code'] == merged[key]['chain']), "Inconsistent data: Chain code differs for same PDB entry {}, {}".format(
            #    entry['PDB_chain_code'], merged[entry['PDB_code']]['chain']
            #)

            # Valid amino acid
            assert (entry['residue_name'] in l_residue), "Invalid amino acid: {}".format(entry['residue_name'])
        except AssertionError as e:
            l_error.append("Entry #{}: {}".format(i+1, e.args[0]))
            continue
        
        # Assignement requierement
        if entry['secondary_structure'] == 'Other':
            entry['secondary_structure'] = 'Coil'
        elif entry['secondary_structure'] == 'Helix':
            entry['secondary_structure'] = 'Alpha'
        
        merged[key]['seq'].append([
            entry['PDB_seq_code'],
            entry['residue_name'],
            entry['secondary_structure'],
        ])
    
    print("{} entries ignored".format(len(l_error)))
    #print("{}".format('\n'.join(l_error)))
    
    return merged

print("Merging DSSP")
dssp_info_merged = merge(dssp_info)
print("{} entries in DSSP".format(len(dssp_info_merged)))

print("Merging STRIDE")
stride_info_merged = merge(stride_info)
print("{} entries in STRIDE".format(len(stride_info_merged)))

Merging DSSP
804 entries ignored
498 entries in DSSP
Merging STRIDE
4 entries ignored
498 entries in STRIDE


In [7]:
print("Protein count: CATH: {}, DSSP: {}, STRIDE: {}".format(len(cath_info), len(dssp_info_merged), len(stride_info_merged)))

# Check loading results
try:
    assert len(cath_info) == len(dssp_info_merged) == len(stride_info_merged), "Warning: Protein count does not match !"
except AssertionError as e:
    print(e)
else:
    print("Loading went well !")

Protein count: CATH: 498, DSSP: 498, STRIDE: 498
Loading went well !


In [37]:
# Processing I(S;R) using the following formula:
# I(S;R)=log[(fS,R/fR)/(fS/N)]
# S=conformation, R=amino acid, N=total amino acid
# fR=total R, fS=total conformation, fS,R=total R->S
def process_f(cath, dssp, stride):
    global l_residue, l_conformation
    assert len(cath_info) == len(dssp_info_merged) == len(stride_info_merged)
    
    # Initialize data structure
    N = 0
    fr = {}
    fs = {}
    f = {}
    
    for conformation in l_conformation:
        fs[conformation] = 0
    
    for residue in l_residue:
        fr[residue] = 0
        f[residue] = {}
        
        for conformation in l_conformation:
            f[residue][conformation] = 0
    
    # Process all protein
    for cath_protein in cath:
        PDB_code = cath_protein['PDB_code']
        PDB_chain_code = cath_protein['PDB_chain_code']
        cath_familly = cath_protein['protein_family']
        
        key = "{}_{}".format(PDB_code, PDB_chain_code)
        
        assert key in dssp and key in stride, "CATH data inconsistent with DSSP and STRIDE"
            
        all_seq = []
        all_seq.extend(dssp[key]['seq'])
        all_seq.extend(stride[key]['seq'])
        
        assert len(all_seq) == len(dssp[key]['seq']) + len(stride[key]['seq']), "Sequences tables not fully merged for uni-processing !"
        
        for seq in all_seq[:200]:
            position = seq[0]
            residue = seq[1]
            conformation = seq[2]
            
            assert conformation in l_conformation, "Unknown conformation: {}".format(conformation)
            
            N += 1
            fr[residue] += 1
            fs[conformation] += 1
            f[residue][conformation] += 1
            
            # Exploit CATH information
            # The whole protein residues are considered to have the same familly
            if cath_familly == "None":
                continue
            elif cath_familly == "Alpha/beta":
                f[residue]['Alpha'] += 1
                cath_familly = "Beta"
            
            assert cath_familly in l_conformation, "Unknown conformation: {}".format(cath_familly)
            
            f[residue][cath_familly] += 1

    return N, fr, fs, f

def display_f(N, fr, fs, f):
    global l_residue, l_conformation
    
    content_html = ""
    conformation_html = ""
    total_html = ""
    
    for conformation in l_conformation:
        conformation_html += "<th>{}</th>".format(conformation)
        total_html += "<td>{}</td>".format(fs[conformation])
    
    for residue in l_residue:
        content_html += "<td>{}</td>".format(residue)
        
        for conformation in l_conformation:
            content_html += "<td>{}</td>".format(f[residue][conformation])
            
        content_html += "<td>{}</td>".format(fr[residue])
        content_html = "<tr>{}</tr>".format(content_html)
    
    from IPython.display import HTML, display
    display(HTML("""
        <table>
            <tr><th>Residue</th>{}<th>Total</th></tr>
            {}
            <tr><th>Total</th>{}<td>{}</td></tr>
        </table>""".format(conformation_html, content_html, total_html, N)))

N, fr, fs, f = process_f(cath_info, dssp_info_merged, stride_info_merged)
display_f(N, fr, fs, f)

Residue,Alpha,Beta,Coil,Total
,,,,
,,,,
,,,,
,,,,
,,,,
,,,,
,,,,
,,,,
,,,,
,,,,


In [38]:
def process_ISR(N, fr, fs, f):
    global l_residue, l_conformation
    pass
    