In [88]:
def load_CSV(filename, column_names=[], separator=","):
    l_data = []
    column_names.reverse()
    
    try:
        with open(filename, 'r') as file:
            for line in file.readlines():
                temp = line.strip().split(separator)

                data = {}
                for column in column_names:
                    data[column] = temp.pop()
                    data[column] = data[column].strip()
                    
                l_data.append(data)
    except:
        print("/!\ Malformed file {} /!\ Expecting a CSV with following columns: {}".format(filename, column_names))
        
    return l_data

In [89]:
def load_CATH(filename="cath_info.txt"):
    return load_CSV(filename=filename, column_names=['PDB_code', 'PDB_chain_code', 'protein_family'], separator="\t")

def load_DSSP(filename="dssp_info.txt"):
    return load_CSV(filename=filename, column_names=['PDB_code', 'PDB_chain_code', 'PDB_seq_code', 'residue_name', 'secondary_structure'], separator="\t")

def load_STRIDE(filename="stride_info.txt"):
    return load_CSV(filename=filename, column_names=['PDB_code', 'PDB_chain_code', 'PDB_seq_code', 'residue_name', 'secondary_structure'], separator="\t")

In [90]:
cath_info = load_CATH()
dssp_info = load_DSSP()
stride_info = load_STRIDE()

print("Loaded {} entries for CATH".format(len(cath_info)))
print("Loaded {} entries for DSSP".format(len(dssp_info)))
print("Loaded {} entries for STRIDE".format(len(stride_info)))

Loaded 498 entries for CATH
Loaded 71391 entries for DSSP
Loaded 71077 entries for STRIDE


In [178]:
l_short_residue_mapping = {
    'A': 'ALA',
    'R': 'ARG', 
    'N': 'ASN', 
    'D': 'ASP', 
    'C': 'CYS',
    'Q': 'GLN', 
    'E': 'GLU', 
    'G': 'GLY', 
    'H': 'HIS', 
    'I': 'ILE', 
    'L': 'LEU', 
    'K': 'LYS', 
    'M': 'MET', 
    'F': 'PHE', 
    'P': 'PRO',  
    'S': 'SER', 
    'T': 'THR', 
    'W': 'TRP', 
    'Y': 'TYR', 
    'V': 'VAL',
}
l_long_residue_mapping = {y:x for x,y in l_short_residue_mapping.items()}
l_residue = [y for x,y in l_short_residue_mapping.items()]
l_short_residue = [x for x,y in l_short_residue_mapping.items()]

l_conformation = ['Alpha', 'Beta', 'Coil']
l_conformation_mapping = {'Alpha': 'H', 'Beta': 'E', 'Coil': 'C'}

print("{} valid amino acid".format(len(l_residue)))
print("{} valid conformation".format(len(l_conformation)))

20 valid amino acid
3 valid conformation


In [123]:
# Merge DSSP/STRIDE entries according to the PDB code
def merge(data):
    l_error = []
    
    merged = {}
    for i in range(len(data)):
        entry = data[i]
        
        key = "{}_{}".format(entry['PDB_code'], entry['PDB_chain_code'])
        
        if key not in merged:
            merged[key] = {}
            merged[key]['chain'] = entry['PDB_chain_code']
            merged[key]['seq'] = []
        
        try:
            # Test data consistency
            #assert (entry['PDB_chain_code'] == merged[key]['chain']), "Inconsistent data: Chain code differs for same PDB entry {}, {}".format(
            #    entry['PDB_chain_code'], merged[entry['PDB_code']]['chain']
            #)

            # Valid amino acid
            assert (entry['residue_name'] in l_residue), "Invalid amino acid: {}".format(entry['residue_name'])
        except AssertionError as e:
            l_error.append("Entry #{}: {}".format(i+1, e.args[0]))
            continue
        
        # Assignement requierement
        if entry['secondary_structure'] == 'Other':
            entry['secondary_structure'] = 'Coil'
        elif entry['secondary_structure'] == 'Helix':
            entry['secondary_structure'] = 'Alpha'
        
        merged[key]['seq'].append([
            entry['PDB_seq_code'],
            entry['residue_name'],
            entry['secondary_structure'],
        ])
    
    print("{} entries ignored".format(len(l_error)))
    #print("{}".format('\n'.join(l_error)))
    
    return merged

print("Merging DSSP")
dssp_info_merged = merge(dssp_info)
print("{} entries in DSSP".format(len(dssp_info_merged)))

print("Merging STRIDE")
stride_info_merged = merge(stride_info)
print("{} entries in STRIDE".format(len(stride_info_merged)))

Merging DSSP
804 entries ignored
498 entries in DSSP
Merging STRIDE
4 entries ignored
498 entries in STRIDE


In [124]:
print("Protein count: CATH: {}, DSSP: {}, STRIDE: {}".format(len(cath_info), len(dssp_info_merged), len(stride_info_merged)))

# Check loading results
try:
    assert len(cath_info) == len(dssp_info_merged) == len(stride_info_merged), "Warning: Protein count does not match !"
   
    # Every CATH protein is in DSSP and STRIDE
    for cath_protein in cath_info:
        PDB_code = cath_protein['PDB_code']
        PDB_chain_code = cath_protein['PDB_chain_code']
        
        key = "{}_{}".format(PDB_code, PDB_chain_code)
        
        assert key in dssp_info_merged and key in stride_info_merged, "CATH data inconsistent with DSSP and STRIDE"
except AssertionError as e:
    print(e)
else:
    print("Loading went well !")

Protein count: CATH: 498, DSSP: 498, STRIDE: 498
Loading went well !


In [125]:
# Processing I(S;R) using the following formula:
# I(S;R)=log[(fS,R/fR)/(fS/N)]
# S=conformation, R=amino acid, N=total amino acid
# fR=total R, fS=total conformation, fS,R=total R->S
def process_f(protein_set):
    global l_residue, l_conformation
    
    # Initialize data structure
    N = 0
    fr = {}
    fs = {}
    f = {}
    
    for conformation in l_conformation:
        fs[conformation] = 0
    
    for residue in l_residue:
        fr[residue] = 0
        f[residue] = {}
        
        for conformation in l_conformation:
            f[residue][conformation] = 0
    
    # Process all protein
    for key, protein in protein_set.items():
        for seq in protein['seq']:
            position = seq[0]
            residue = seq[1]
            conformation = seq[2]
            
            assert conformation in l_conformation, "Unknown conformation: {}".format(conformation)
            
            N += 1
            fr[residue] += 1
            fs[conformation] += 1
            f[residue][conformation] += 1

    return N, fr, fs, f

def display_f(N, fr, fs, f):
    global l_residue, l_conformation
    
    content_html = ""
    conformation_html = ""
    total_html = ""
    
    for conformation in l_conformation:
        conformation_html += "<th>{}</th>".format(conformation)
        total_html += "<td>{}</td>".format(fs[conformation])
    
    for residue in l_residue:
        content_html += "<td>{}</td>".format(residue)
        
        for conformation in l_conformation:
            content_html += "<td>{}</td>".format(f[residue][conformation])
            
        content_html += "<td>{}</td>".format(fr[residue])
        content_html = "<tr>{}</tr>".format(content_html)
    
    from IPython.display import HTML, display
    display(HTML("""
        <table>
            <tr><th>Residue</th>{}<th>Total</th></tr>
            {}
            <tr><th>Total</th>{}<td>{}</td></tr>
        </table>""".format(conformation_html, content_html, total_html, N)))

# Usage:
# N, fr, fs, f = process_f(dssp_info_merged)
# display_f(N, fr, fs, f)
#
# N, fr, fs, f = process_f(stride_info_merged)
# display_f(N, fr, fs, f)

In [197]:
import os
    
def load_protein(filename):
    protein = ''
    secondary_structure = ''
    
    with open("protein/{}".format(filename), 'r') as f:
        for line in f:
            line = line.strip('\n')
            if line.startswith('>') or line == '':
                continue
            
            if protein == '':
                protein = line
            else:
                secondary_structure = line
    
    return protein, secondary_structure

l_protein = []
for filename in os.listdir('protein'):
    protein, secondary_structure = load_protein(filename)
    
    assert len(protein) == len(secondary_structure), "Protein size and secondary structure size does not match ! Protein: {}".format(filename)
    
    l_protein.append({
        'name': filename.split('.')[0],
        'protein': protein,
        'secondary_structure': secondary_structure,
    })

In [198]:
import math

# Compute self-information using the following formula:
# I(delta Sj; Rj) = log(fSj,Rj / fn-Sj,Rj) + log(fn-S / fS)
# fn-S,R = fR - fS,R
# fn-S   = N - fS
def compute_self_information(N, fr, fs, f, R):
    global l_residue, l_conformation
    
    I = {}
    for S in l_conformation:
        fnsr = fr[R] - f[R][S]
        fns = N - fs[S]
        
        I[S] = math.log(f[R][S] / fnsr) + math.log(fns / fs[S])
    
    return I

In [199]:
# Compute pair information
def compute_pair_information(N, fr, fs, f, l_R, Rj, self_information):
    I = {S:0 for S in l_conformation}
    
    for Rjm in l_R:
        Rjm = l_short_residue_mapping[Rjm]
        
        for S in l_conformation:
            fsrr = f[Rj][S] + f[Rjm][S]
            fnsr = fr[Rj] - f[Rj][S]
            fnsrr = fnsr + (fr[Rjm] - f[Rjm][S])
            I[S] += math.log(fsrr / fnsrr) + math.log(fnsr / f[Rj][S])

    return I

In [200]:
def MCC(real, predicted):
    TP, TN, FP, FN = 0.0, 0.0, 0.0, 0.0
    
    for i in range(len(real)):
        if real[i] == predicted[i]:
            if real[i] == 'H':
                TP += 1
            else:
                TN += 1
        else:
            if real[i] == 'H':
                FN += 1
            else:
                FP += 1
        
    up = (TP * TN) - (FP * FN)
    bottom = math.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN))
    
    return up / bottom

In [201]:
def Q3(real, predicted):
    correct = 0.0
    total = len(real)
    
    for i in range(total):
        if real[i] == predicted[i]:
            correct += 1
        
    return correct / total

In [205]:
# Process all proteins
for protein in l_protein:
    name = protein['name']
    secondary_structure = protein['secondary_structure']
    protein = protein['protein']
    
    N, fr, fs, f = process_f(dssp_info_merged)
    #display_f(N, fr, fs, f)
    
    # Compute local informations for each amino acid
    self_information = {}
    for residue in l_residue:
        self_information[residue] = compute_self_information(N, fr, fs, f, residue)
    
    prediction = ""
    for j in range(len(protein)):
        R = l_short_residue_mapping[protein[j]]
        
        # Validation for the window
        # a ='abcdefghijklmnopqrstuvwxyz'
        # for j in range(len(a)):
        #    print(a[max(0,j-8):j] + a[j+1:min(j+8,len(a))])
        
        window = protein[max(0, j - 8) : j]
        window += protein[j + 1 : min(j + 8, len(protein))]
        
        I = compute_pair_information(N, fr, fs, f, window, R, self_information)
        
        si = self_information[R]
        for S in l_conformation:
            I[S] += si[S]
            
        import operator
        gor_prediction = max(I.items(), key=operator.itemgetter(1))[0]
        prediction += l_conformation_mapping[gor_prediction]
    
    print("Protein:", name)
    print("PROT>", protein)
    print("REAL>", secondary_structure)
    print("PRED>", prediction)
    print("Q3:", Q3(secondary_structure, prediction))
    print("MCC:", MCC(secondary_structure, prediction))
    break # DEBUG: Remove

Protein: 1arl
PROT> ARSTNTFNYATYHTLDEIYDFMDLLVAEHPQLVSKLQIGRSYEGRPIYVLKFSTGGSNRPAIWIDLGIHSREWITQATGVWFAKKFTEDYGQDPSFTAILDSMDIFLEIVTNPDGFAFTHSQNRLWRKTRSVTSSSLCVGVDANRNWDAGFGKAGASSSPCSETYHGKYANSEVEVKSIVDFVKDHGNFKAFLSIHSYSQLLLYPYGYTTQSIPDKTELNQVAKSAVAALKSLYGTSYKYGSIITTIYQASGGSIDWSYNQGIKYSFTFELRDTGRYGFLLPASQIIPTAQETWLGVLTIMEHTVNN
REAL> CCCCCCCCCCCCCCHHHHHHHHHHHHHHCCCCEEEEEEEECCCCCEEEEEEECCCCCCCCEEEEEECCCCCCHHHHHHHHHHHHHHHHHCCCCHHHHHHHHHCEEEEECCCCHHHHHHHHHCCCCCCCCCCCCCCCCCCCCCHHHCCCCCCCCCCCECCCCCCCECCCCCCCCHHHHHHHHHHHHHCCEEEEEEEEECCCEEEECCCCCCCCCCCHHHHHHHHHHHHHHHHHHHCCCCEEEEHHHHCCCCCCCHHHHHHHCCCCEEEEEEECCCCCCHHHCCHHHHHHHHHHHHHHHHHHHHHHHHC
PRED> CCHHEHCECEHCHHCEECCEHEECCCEEHHECCHECECHCECEHEECCCCCCEHHHEECECCCCECHCHHEECCHEEHHCCCEEECHCECHCEHHCHCCCEHEECCCECCHEEEHCCCHHHCECCCCEHCHCHEEECCCHCECECECECHCHCCHCHHHHCHCHCCHCCEEEECECEHCCECCEEHHECEECCHCHHCHECCCCHCHCHHCECHEEHECEECEEHECEECEHCCHHECECHECCHHCCEEEHHECECECECHCECHCHCECCEHHCCHCCCECHECCHHEEEHCCHCCHCEEEHCEE
Q3: 0.28664495114006516
MCC: -0.436674282356