In [1]:
def load_CSV(filename, column_names=None, separator=","):
    """Read a delimited text file into a list of dicts, one dict per line.

    Columns are aligned on the right-hand side of each line: the last name in
    ``column_names`` receives the last field, the one before it the previous
    field, and so on. Extra leading fields are ignored. Every stored value is
    stripped of surrounding whitespace.

    Parameters:
        filename: path of the file to read.
        column_names: ordered column names (defaults to no columns). The
            sequence is copied, never modified, so the same list can safely be
            reused for several calls.
        separator: field delimiter.

    Returns:
        A list of ``{column: value}`` dicts. If the file cannot be read, or a
        line has fewer fields than there are columns, a warning is printed and
        the rows parsed so far are returned.
    """
    # Work on a private copy: reversing the caller's list in place would flip
    # the column mapping on every second call made with the same list object.
    columns = list(column_names) if column_names else []
    l_data = []

    try:
        with open(filename, 'r') as file:
            for line in file:
                fields = line.strip().split(separator)

                if len(fields) < len(columns):
                    raise ValueError("not enough fields")

                # Keep only the right-most fields, one per column
                values = fields[len(fields) - len(columns):]
                l_data.append({column: value.strip() for column, value in zip(columns, values)})
    except (OSError, ValueError):
        print("/!\\ Malformed file {} /!\\ Expecting a CSV with following columns: {}".format(filename, columns))

    return l_data

In [2]:
def load_CATH(filename="cath_info.txt"):
    """Load the tab-separated CATH file through load_CSV.

    Each returned row has the keys 'PDB_code', 'PDB_chain_code' and
    'protein_family'.
    """
    cath_columns = ['PDB_code', 'PDB_chain_code', 'protein_family']
    return load_CSV(filename=filename, column_names=cath_columns, separator="\t")

def load_DSSP(filename="dssp_info.txt"):
    """Load the tab-separated DSSP file through load_CSV.

    Each returned row has the keys 'PDB_code', 'PDB_chain_code',
    'PDB_seq_code', 'residue_name' and 'secondary_structure'.
    """
    dssp_columns = [
        'PDB_code',
        'PDB_chain_code',
        'PDB_seq_code',
        'residue_name',
        'secondary_structure',
    ]
    return load_CSV(filename=filename, column_names=dssp_columns, separator="\t")

def load_STRIDE(filename="stride_info.txt"):
    """Load the tab-separated STRIDE file through load_CSV.

    Each returned row has the keys 'PDB_code', 'PDB_chain_code',
    'PDB_seq_code', 'residue_name' and 'secondary_structure'.
    """
    stride_columns = [
        'PDB_code',
        'PDB_chain_code',
        'PDB_seq_code',
        'residue_name',
        'secondary_structure',
    ]
    return load_CSV(filename=filename, column_names=stride_columns, separator="\t")

In [3]:
# Read the three input tables from their default file names.
cath_info = load_CATH()
dssp_info = load_DSSP()
stride_info = load_STRIDE()

# Report how many rows each loader returned.
for _message, _rows in (
    ("Loaded {} entries for CATH", cath_info),
    ("Loaded {} entries for DSSP", dssp_info),
    ("Loaded {} entries for STRIDE", stride_info),
):
    print(_message.format(len(_rows)))

Loaded 498 entries for CATH
Loaded 71391 entries for DSSP
Loaded 71077 entries for STRIDE


In [4]:
# One-letter code -> three-letter code for the 20 amino acids handled here.
l_short_residue_mapping = dict([
    ('A', 'ALA'), ('R', 'ARG'), ('N', 'ASN'), ('D', 'ASP'), ('C', 'CYS'),
    ('Q', 'GLN'), ('E', 'GLU'), ('G', 'GLY'), ('H', 'HIS'), ('I', 'ILE'),
    ('L', 'LEU'), ('K', 'LYS'), ('M', 'MET'), ('F', 'PHE'), ('P', 'PRO'),
    ('S', 'SER'), ('T', 'THR'), ('W', 'TRP'), ('Y', 'TYR'), ('V', 'VAL'),
])

# Derived views of the table above: the one-letter codes, the three-letter
# codes, and the reverse (three-letter -> one-letter) lookup.
l_short_residue = list(l_short_residue_mapping.keys())
l_residue = list(l_short_residue_mapping.values())
l_long_residue_mapping = dict(zip(l_residue, l_short_residue))

# Conformation names, their one-letter codes, and the name -> code lookup.
l_conformation_mapping = {'Alpha': 'H', 'Beta': 'E', 'Coil': 'C'}
l_conformation = ['Alpha', 'Beta', 'Coil']
l_short_conformation = ['H', 'E', 'C']

print("{} valid amino acid".format(len(l_residue)))
print("{} valid conformation".format(len(l_conformation)))

20 valid amino acid
3 valid conformation


In [5]:
# Merge DSSP/STRIDE entries according to the PDB code
def merge(data):
    """Group per-residue records into one entry per protein chain.

    Records are grouped under the key "<PDB_code>_<PDB_chain_code>". Within a
    group, residues are ordered by the integer value of 'PDB_seq_code'; a
    later record with the same position replaces the earlier one.

    Parameters:
        data: sequence of dicts with the keys 'PDB_code', 'PDB_chain_code',
            'PDB_seq_code', 'residue_name' and 'secondary_structure'. The
            records are read only; they are not modified.

    Returns:
        dict mapping each chain key to a dict with:
            'PDB_chain_code': chain code of the first record seen for the key,
            'temp': {position: [one-letter residue, one-letter conformation]},
            'sequence': one-letter residues joined in position order,
            'prediction': one-letter conformations joined in position order.

    Records whose residue name or secondary-structure label is not recognised
    are skipped; the number of skipped records is printed. A chain whose
    records are all skipped is still present, with empty strings.

    Raises:
        ValueError: if a 'PDB_seq_code' is not an integer literal.
    """
    # Assignment requirement: fold these source labels into the names used here
    relabel = {'Other': 'Coil', 'Helix': 'Alpha'}

    l_error = []
    merged = {}

    for index, entry in enumerate(data, start=1):
        key = "{}_{}".format(entry['PDB_code'], entry['PDB_chain_code'])

        # Register the chain on first sight, even if this record is rejected
        if key not in merged:
            merged[key] = {
                'temp': {},
                'PDB_chain_code': entry['PDB_chain_code'],
                'sequence': '',
                'prediction': '',
            }

        # Explicit checks (not `assert`) so validation also runs under `python -O`
        residue_name = entry['residue_name']
        if residue_name not in l_long_residue_mapping:
            l_error.append("Entry #{}: Invalid amino acid: {}".format(index, residue_name))
            continue

        structure = relabel.get(entry['secondary_structure'], entry['secondary_structure'])
        if structure not in l_conformation_mapping:
            l_error.append("Entry #{}: Invalid secondary structure: {}".format(index, structure))
            continue

        position = int(entry['PDB_seq_code'])
        merged[key]['temp'][position] = [
            l_long_residue_mapping[residue_name],
            l_conformation_mapping[structure],
        ]

    # Flatten each chain into two parallel strings, ordered by position
    for chain in merged.values():
        ordered = [chain['temp'][position] for position in sorted(chain['temp'])]
        chain['sequence'] = ''.join(pair[0] for pair in ordered)
        chain['prediction'] = ''.join(pair[1] for pair in ordered)

    print("{} entries ignored".format(len(l_error)))

    return merged

# Group the per-residue DSSP records into one entry per "<PDB_code>_<PDB_chain_code>" key.
print("Merging DSSP")
dssp_info_merged = merge(dssp_info)
print("{} entries in DSSP".format(len(dssp_info_merged)))

# Same grouping for the STRIDE records.
print("Merging STRIDE")
stride_info_merged = merge(stride_info)
print("{} entries in STRIDE".format(len(stride_info_merged)))

Merging DSSP
804 entries ignored
498 entries in DSSP
Merging STRIDE
4 entries ignored
498 entries in STRIDE


In [6]:
print("Protein count: CATH: {}, DSSP: {}, STRIDE: {}".format(len(cath_info), len(dssp_info_merged), len(stride_info_merged)))

# Check loading results
# The checks raise explicitly instead of using `assert` statements: asserts are
# removed when Python runs with -O, which would make this cell report success
# without checking anything. Only the first problem found is reported.
try:
    # Every CATH protein is in DSSP and STRIDE
    for cath_protein in cath_info:
        PDB_code = cath_protein['PDB_code']
        PDB_chain_code = cath_protein['PDB_chain_code']

        key = "{}_{}".format(PDB_code, PDB_chain_code)

        if key not in dssp_info_merged or key not in stride_info_merged:
            raise AssertionError("CATH data inconsistent with DSSP and STRIDE: {}".format(key))

    # The three data sets must describe the same number of proteins
    if not (len(cath_info) == len(dssp_info_merged) == len(stride_info_merged)):
        raise AssertionError("Warning: Protein count does not match !")
except AssertionError as e:
    print(e)
else:
    print("Loading went well !")

Protein count: CATH: 498, DSSP: 498, STRIDE: 498
Loading went well !


In [7]:
# Frequency tables used for I(S;R)=log[(fS,R/fR)/(fS/N)]
# S=conformation, R=amino acid, N=total amino acid
# fR=total R, fS=total conformation, fS,R=total R->S
def process_f(protein_set, without_protein):
    """Count residue/conformation frequencies over a set of proteins.

    Parameters:
        protein_set: dict mapping a protein name to a dict holding two
            equally indexed strings, 'sequence' (one-letter residues) and
            'prediction' (one-letter conformations).
        without_protein: name of the protein to leave out of the counts.

    Returns:
        (N, fr, fs, f, Pair) where
            N is the number of residues counted,
            fr[R] is the number of occurrences of residue R,
            fs[S] is the number of occurrences of conformation S,
            f[R][S] is the number of residues R found in conformation S,
            Pair[R][S][8 + m][R2] is the number of times residue R2 sits at
                offset m (-8..8) from a residue R found in conformation S.

    Raises:
        AssertionError: if a conformation letter is not in l_short_conformation.
    """
    offsets = range(-8, 9)

    # Zero-initialised tables covering every residue / conformation combination
    fs = {state: 0 for state in l_short_conformation}
    fr = {amino: 0 for amino in l_short_residue}
    f = {amino: {state: 0 for state in l_short_conformation} for amino in l_short_residue}
    Pair = {
        amino: {
            state: {
                8 + offset: {neighbour: 0 for neighbour in l_short_residue}
                for offset in offsets
            }
            for state in l_short_conformation
        }
        for amino in l_short_residue
    }
    N = 0

    for name, record in protein_set.items():
        # Skip protein being predicted
        if name == without_protein:
            continue

        chain = record['sequence']
        length = len(chain)

        for i, residue in enumerate(chain):
            conformation = record['prediction'][i]

            assert conformation in l_short_conformation, "Unknown conformation: {}".format(conformation)

            # Pair information: neighbours inside the window and inside the chain
            window = Pair[residue][conformation]
            for m in offsets:
                if 0 <= i + m < length:
                    window[8 + m][chain[i + m]] += 1

            N += 1
            fr[residue] += 1
            fs[conformation] += 1
            f[residue][conformation] += 1

    return N, fr, fs, f, Pair

def display_f(N, fr, fs, f):
    """Render the residue / conformation frequency table as an HTML table.

    Parameters:
        N: grand total shown in the bottom-right cell.
        fr: dict, one-letter residue -> total count (last column).
        fs: dict, one-letter conformation -> total count (last row).
        f: dict, f[one-letter residue][one-letter conformation] -> count.

    The table has one row per entry of l_residue and one column per entry of
    l_conformation. It is shown through IPython.display; nothing is returned.
    """
    from IPython.display import HTML, display

    conformation_html = "".join("<th>{}</th>".format(name) for name in l_conformation)
    total_html = "".join(
        "<td>{}</td>".format(fs[l_conformation_mapping[name]]) for name in l_conformation
    )

    # Build each row separately and wrap it in its own <tr>. Wrapping the
    # accumulated string on every iteration nests the rows inside each other
    # and breaks the table.
    rows = []
    for residue in l_residue:
        short_residue = l_long_residue_mapping[residue]

        cells = ["<td>{}</td>".format(residue)]
        cells.extend(
            "<td>{}</td>".format(f[short_residue][l_conformation_mapping[name]])
            for name in l_conformation
        )
        cells.append("<td>{}</td>".format(fr[short_residue]))

        rows.append("<tr>{}</tr>".format("".join(cells)))
    content_html = "".join(rows)

    display(HTML("""
        <table>
            <tr><th>Residue</th>{}<th>Total</th></tr>
            {}
            <tr><th>Total</th>{}<td>{}</td></tr>
        </table>""".format(conformation_html, content_html, total_html, N)))

# Usage:
#N, fr, fs, f, Pair = process_f(dssp_info_merged, "")
#display_f(N, fr, fs, f)
#
# N, fr, fs, f, Pair = process_f(stride_info_merged, "")
# display_f(N, fr, fs, f)

In [8]:
import os
    
def load_protein(filename):
    """Read one file from the 'protein' directory.

    Parsing rules:
      * the first line starting with '>' provides the protein name, taken
        from characters 1 to 6 of that line;
      * further '>' lines and empty lines are ignored;
      * the first remaining line is the sequence;
      * every later remaining line replaces the secondary structure, so the
        last one wins.

    Returns:
        (protein, sequence, secondary_structure); a part that was not found
        is returned as ''.
    """
    protein = sequence = secondary_structure = ''
    path = "protein/{}".format(filename)

    with open(path, 'r') as handle:
        for raw in handle:
            text = raw.strip('\n')
            is_header = text.startswith('>')

            if is_header and protein == '':
                protein = text[1:7]
            elif is_header or text == '':
                pass
            elif sequence == '':
                sequence = text
            else:
                secondary_structure = text

    return protein, sequence, secondary_structure

# Parse every file found in the 'protein' directory into a record.
l_protein = []
for filename in os.listdir('protein'):
    protein, sequence, secondary_structure = load_protein(filename)

    # Both strings are indexed residue by residue, so they must line up
    assert len(sequence) == len(secondary_structure), "Protein size and secondary structure size does not match ! Protein: {}".format(filename)

    l_protein.append(dict(
        protein=protein,
        sequence=sequence,
        secondary_structure=secondary_structure,
    ))

In [9]:
import math

# Self-information of residue R for every conformation S:
# I(delta Sj; Rj) = log(fSj,Rj / fn-Sj,Rj) + log(fn-S / fS)
# with fn-S,R = fR - fS,R and fn-S = N - fS
def compute_self_information(N, fr, fs, f, R):
    """Return {S: I(delta S; R)} for every S in l_short_conformation.

    Parameters:
        N: total number of residues counted.
        fr: dict, residue -> count.
        fs: dict, conformation -> count.
        f: dict, f[residue][conformation] -> count.
        R: one-letter residue to evaluate.

    Raises:
        ZeroDivisionError / ValueError: when a count involved in a ratio is
            zero (division by zero, or logarithm of zero).
    """
    def information(S):
        in_state = f[R][S]
        # Occurrences of R in any other conformation
        in_other_states = fr[R] - in_state
        # Residues, of any kind, in any other conformation
        others_total = N - fs[S]
        return math.log(in_state / in_other_states) + math.log(others_total / fs[S])

    return {S: information(S) for S in l_short_conformation}

In [48]:
# Compute pair information
def compute_pair_information(N, fr, fs, f, Pair, sequence, j):
    """Sum the pair information of residue j over its 17-residue window.

    For every conformation S and every offset m in -8..8 that stays inside
    the sequence, this adds

        log(fS,Rj+m,Rj / fn-S,Rj+m,Rj) + log(fn-S,Rj / fS,Rj)

    where the pair counts come from Pair[Rj][S][8 + m][Rj+m] and the "n-S"
    counts are the totals over the other conformations.

    Parameters:
        N, fs: unused here; kept so the signature matches the other helpers.
        fr: dict, residue -> count.
        f: dict, f[residue][conformation] -> count.
        Pair: nested counts, Pair[R][S][8 + m][R2].
        sequence: one-letter residue string.
        j: index of the residue being evaluated.

    Returns:
        dict, conformation -> accumulated information.

    Raises:
        ZeroDivisionError / ValueError: when f[Rj][S] or fr[Rj] - f[Rj][S]
            is zero.
    """
    Rj = sequence[j]
    I = {S: 0 for S in l_short_conformation}

    # log(fn-S,Rj / fS,Rj) does not depend on the offset: compute it once per S
    residue_term = {
        S: math.log((fr[Rj] - f[Rj][S]) / f[Rj][S]) for S in l_short_conformation
    }

    for m in range(-8, 9):
        # Ignore window positions that fall outside the sequence
        if j + m < 0 or j + m >= len(sequence):
            continue

        Rjm = sequence[j + m]

        for S in l_short_conformation:
            fsrr = Pair[Rj][S][8 + m][Rjm]
            fnsrr = sum(
                Pair[Rj][nS][8 + m][Rjm] for nS in l_short_conformation if nS != S
            )

            # A pair never observed in S (log of 0) or only observed in S
            # (division by 0) contributes nothing. Only these two arithmetic
            # errors are absorbed; anything else propagates.
            try:
                pair_term = math.log(fsrr / fnsrr)
            except (ZeroDivisionError, ValueError):
                pair_term = 0

            I[S] += pair_term + residue_term[S]

    return I

In [49]:
def MCC(real, predicted, positive='H'):
    """Matthews correlation coefficient of one conformation against the rest.

    Each position is classified with respect to the ``positive`` state:
        TP: real and predicted are both ``positive``
        FN: real is ``positive``, predicted is not
        FP: predicted is ``positive``, real is not
        TN: neither is ``positive`` (even when the two differ, e.g. real 'E'
            and predicted 'C')

    Parameters:
        real: observed conformation string.
        predicted: predicted conformation string, at least as long as ``real``.
        positive: state treated as the positive class.

    Returns:
        (TP*TN - FP*FN) / sqrt((TP+FP)(TP+FN)(TN+FP)(TN+FN)) as a float, or
        the string 'ERROR' when the denominator is zero.
    """
    TP = TN = FP = FN = 0.0

    for i in range(len(real)):
        is_positive = real[i] == positive
        predicted_positive = predicted[i] == positive

        if is_positive and predicted_positive:
            TP += 1
        elif is_positive:
            FN += 1
        elif predicted_positive:
            FP += 1
        else:
            TN += 1

    up = (TP * TN) - (FP * FN)
    bottom = math.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN))

    # Undefined when a whole row or column of the confusion matrix is empty
    if bottom == 0:
        return 'ERROR'

    return up / bottom

In [50]:
def Q3(real, predicted):
    """Fraction of positions where the prediction equals the real state.

    Positions are compared index by index over the length of ``real``.

    Raises:
        ZeroDivisionError: if ``real`` is empty.
        IndexError: if ``predicted`` is shorter than ``real``.
    """
    total = len(real)
    matches = sum(1 for position in range(total) if real[position] == predicted[position])

    return float(matches) / total

In [51]:
def prediction(set_info, protein):
    """Predict the conformation of one protein from all the others.

    The frequency tables are built with process_f while leaving ``protein``
    out. For every residue, the pair information and the self-information
    are then added per conformation, and the conformation with the highest
    total is kept.

    Parameters:
        set_info: dict, protein name -> dict with 'sequence' and 'prediction'.
        protein: name of the protein to predict (a key of ``set_info``).

    Returns:
        (protein, predicted conformation string, Q3 value, MCC value, family)
        where family is always ''.
    """
    target = set_info[protein]
    sequence = target['sequence']
    secondary_structure = target['prediction']

    N, fr, fs, f, Pair = process_f(set_info, protein)

    # Self-information does not depend on the position: compute it once per residue
    self_information = {
        residue: compute_self_information(N, fr, fs, f, residue)
        for residue in l_short_residue
    }

    predicted_states = []
    for j, R in enumerate(sequence):
        I = compute_pair_information(N, fr, fs, f, Pair, sequence, j)

        si = self_information[R]
        for S in l_short_conformation:
            I[S] += si[S]

        best_state, _best_score = max(I.items(), key=lambda item: item[1])
        predicted_states.append(best_state)

    predicted = "".join(predicted_states)

    Q3_value = Q3(secondary_structure, predicted)
    MCC_value = MCC(secondary_structure, predicted)

    return protein, predicted, Q3_value, MCC_value, ''

In [52]:
# Example of f used
# Passing "" as the protein to skip leaves every chain in the counts: merged keys
# have the form "<PDB_code>_<PDB_chain_code>", so none of them equals "".
N, fr, fs, f, Pair = process_f(dssp_info_merged, "")
display_f(N, fr, fs, f)

Residue,Alpha,Beta,Coil,Total
,,,,
,,,,
,,,,
,,,,
,,,,
,,,,
,,,,
,,,,
,,,,
,,,,


# Results

The first block shows the DSSP results for every protein in the CATH file; the second block shows the STRIDE results for every protein in the CATH file.

## Explanations

The self-information function is made using the table shown one cell above.

For the pair information: when processing the DSSP or STRIDE data, we count how often each amino acid appears at each position of the window considered (positions 1 to 17, i.e. offsets -8 to +8 around the current residue), in order to compute the influence that observing a given amino acid at a given window position has on the conformation of the current residue. The influence is computed using the formula at the bottom right of page 5 of the course.

In [None]:
# DSSP results
count = 0
q3_sum = 0.0
for data in cath_info[:25]:
    key = "{}_{}".format(data['PDB_code'], data['PDB_chain_code'])

    # The predicted string is unpacked into `predicted`, not `prediction`:
    # reusing the function's name would rebind it to a string and make the
    # next iteration fail with "'str' object is not callable".
    protein, predicted, Q3_value, MCC_value, family = prediction(dssp_info_merged, key)
    print("{} {} {} {} {}".format(protein, predicted, Q3_value, MCC_value, family))

    count += 1
    q3_sum += Q3_value

# No mean to report when nothing was predicted
if count:
    print("Mean Q3: {}".format(q3_sum / count))

In [None]:
# Stride results
for data in cath_info:
    key = "{}_{}".format(data['PDB_code'], data['PDB_chain_code'])

    # The predicted string is unpacked into `predicted`, not `prediction`:
    # reusing the function's name would rebind it to a string and make the
    # next iteration fail with "'str' object is not callable".
    protein, predicted, Q3_value, MCC_value, family = prediction(stride_info_merged, key)
    print("{} {} {} {} {}".format(protein, predicted, Q3_value, MCC_value, family))