In [1]:
def load_CSV(filename, column_names=[], separator=","):
    l_data = []
    column_names.reverse()
    
    try:
        with open(filename, 'r') as file:
            for line in file.readlines():
                temp = line.strip().split(separator)

                data = {}
                for column in column_names:
                    data[column] = temp.pop()
                    data[column] = data[column].strip()
                    
                l_data.append(data)
    except:
        print("/!\ Malformed file {} /!\ Expecting a CSV with following columns: {}".format(filename, column_names))
        
    return l_data

In [2]:
def load_CATH(filename="cath_info.txt"):
    return load_CSV(filename=filename, column_names=['PDB_code', 'PDB_chain_code', 'protein_family'], separator="\t")

def load_DSSP(filename="dssp_info.txt"):
    return load_CSV(filename=filename, column_names=['PDB_code', 'PDB_chain_code', 'PDB_seq_code', 'residue_name', 'secondary_structure'], separator="\t")

def load_STRIDE(filename="stride_info.txt"):
    return load_CSV(filename=filename, column_names=['PDB_code', 'PDB_chain_code', 'PDB_seq_code', 'residue_name', 'secondary_structure'], separator="\t")

In [3]:
cath_info = load_CATH()
dssp_info = load_DSSP()
stride_info = load_STRIDE()

print("Loaded {} entries for CATH".format(len(cath_info)))
print("Loaded {} entries for DSSP".format(len(dssp_info)))
print("Loaded {} entries for STRIDE".format(len(stride_info)))

Loaded 498 entries for CATH
Loaded 71391 entries for DSSP
Loaded 71077 entries for STRIDE


In [65]:
l_short_residue_mapping = {
    'A': 'ALA',
    'R': 'ARG', 
    'N': 'ASN', 
    'D': 'ASP', 
    'C': 'CYS',
    'Q': 'GLN', 
    'E': 'GLU', 
    'G': 'GLY', 
    'H': 'HIS', 
    'I': 'ILE', 
    'L': 'LEU', 
    'K': 'LYS', 
    'M': 'MET', 
    'F': 'PHE', 
    'P': 'PRO',  
    'S': 'SER', 
    'T': 'THR', 
    'W': 'TRP', 
    'Y': 'TYR', 
    'V': 'VAL',
}
l_long_residue_mapping = {y:x for x,y in l_short_residue_mapping.items()}
l_residue = [y for x,y in l_short_residue_mapping.items()]
l_short_residue = [x for x,y in l_short_residue_mapping.items()]

l_conformation = ['Alpha', 'Beta', 'Coil']
l_short_conformation = ['H', 'E', 'C']
l_conformation_mapping = {'Alpha': 'H', 'Beta': 'E', 'Coil': 'C'}
l_short_conformation_mapping = {'H': 'Alpha', 'E': 'Beta', 'C': 'Coil'}

print("{} valid amino acid".format(len(l_residue)))
print("{} valid conformation".format(len(l_conformation)))

20 valid amino acid
3 valid conformation


In [5]:
# Merge DSSP/STRIDE entries according to the PDB code
def merge(data):
    l_error = []
    
    merged = {}
    for i in range(len(data)):
        entry = data[i]
        
        key = "{}_{}".format(entry['PDB_code'], entry['PDB_chain_code'])
        
        if key not in merged:
            merged[key] = {}
            merged[key]['temp'] = {}
            merged[key]['PDB_chain_code'] = entry['PDB_chain_code']
            merged[key]['sequence'] = ''
            merged[key]['prediction'] = ''
        
        try:
            # Test data consistency
            #assert (entry['PDB_chain_code'] == merged[key]['chain']), "Inconsistent data: Chain code differs for same PDB entry {}, {}".format(
            #    entry['PDB_chain_code'], merged[entry['PDB_code']]['chain']
            #)

            # Valid amino acid
            assert (entry['residue_name'] in l_residue), "Invalid amino acid: {}".format(entry['residue_name'])
        except AssertionError as e:
            l_error.append("Entry #{}: {}".format(i+1, e.args[0]))
            continue
        
        # Assignement requierement
        if entry['secondary_structure'] == 'Other':
            entry['secondary_structure'] = 'Coil'
        elif entry['secondary_structure'] == 'Helix':
            entry['secondary_structure'] = 'Alpha'
        
        position = int(entry['PDB_seq_code'])
            
        merged[key]['temp'][position] = [
            l_long_residue_mapping[entry['residue_name']],
            l_conformation_mapping[entry['secondary_structure']]
        ]
    
    for protein, data in merged.items():
        import collections
        od = collections.OrderedDict(sorted(data['temp'].items()))
        merged[protein]['sequence'] = ''.join([value[0] for key, value in od.items()])
        merged[protein]['prediction'] = ''.join([value[1] for key, value in od.items()])
    
    print("{} entries ignored".format(len(l_error)))
    #print("{}".format('\n'.join(l_error)))
    
    return merged

print("Merging DSSP")
dssp_info_merged = merge(dssp_info)
print("{} entries in DSSP".format(len(dssp_info_merged)))

print("Merging STRIDE")
stride_info_merged = merge(stride_info)
print("{} entries in STRIDE".format(len(stride_info_merged)))

Merging DSSP
804 entries ignored
498 entries in DSSP
Merging STRIDE
4 entries ignored
498 entries in STRIDE


In [6]:
print("Protein count: CATH: {}, DSSP: {}, STRIDE: {}".format(len(cath_info), len(dssp_info_merged), len(stride_info_merged)))

# Check loading results
try:
    # Every CATH protein is in DSSP and STRIDE
    for cath_protein in cath_info:
        PDB_code = cath_protein['PDB_code']
        PDB_chain_code = cath_protein['PDB_chain_code']
        
        key = "{}_{}".format(PDB_code, PDB_chain_code)
        
        assert key in dssp_info_merged and key in stride_info_merged, "CATH data inconsistent with DSSP and STRIDE: {}".format(key)
    
    assert len(cath_info) == len(dssp_info_merged) == len(stride_info_merged), "Warning: Protein count does not match !"
except AssertionError as e:
    print(e)
else:
    print("Loading went well !")

Protein count: CATH: 498, DSSP: 498, STRIDE: 498
Loading went well !


# Leave one out

The leave-one-out magic occurs here. We simply ignore the given protein in the computation of the data needed to compute self-information and pair-information

In [7]:
# Processing I(S;R) using the following formula:
# I(S;R)=log[(fS,R/fR)/(fS/N)]
# S=conformation, R=amino acid, N=total amino acid
# fR=total R, fS=total conformation, fS,R=total R->S
# Skip the given protein
def process_f(protein_set, without_protein):
    global l_residue, l_conformation
    
    #if without_protein in protein_set:
    #    print("Protein {} is in provided set, ignored !".format(without_protein))
    #else:
    #    print("Protein {} is not in provided set".format(without_protein))
    
    # Initialize data structure
    N = 0
    fr = {}
    fs = {}
    f = {}
    
    Pair = {}
    
    for conformation in l_short_conformation:
        fs[conformation] = 0
    
    for residue in l_short_residue:
        fr[residue] = 0
        f[residue] = {}
        Pair[residue] = {}
        
        for conformation in l_short_conformation:
            f[residue][conformation] = 0
            Pair[residue][conformation] = {}
            
            for i in range(-8, 9):
                Pair[residue][conformation][8 + i] = {}
                
                for _residue in l_short_residue:
                    Pair[residue][conformation][8 + i][_residue] = 0
    
    # Process all protein
    for name, data in protein_set.items():
        # Skip protein being predicted
        if name == without_protein:
            continue
        
        for i in range(len(data['sequence'])):
            residue = data['sequence'][i]
            conformation = data['prediction'][i]
            
            assert conformation in l_short_conformation, "Unknown conformation: {}".format(conformation)
            
            # Pair information
            for m in range(-8, 9):
                if i+m < 0 or i+m >= len(data['sequence']):
                    continue
                
                _residueim = data['sequence'][i + m]
                Pair[residue][conformation][8 + m][_residueim] += 1
            
            N += 1
            fr[residue] += 1
            fs[conformation] += 1
            f[residue][conformation] += 1

    return N, fr, fs, f, Pair

def display_f(N, fr, fs, f):
    global l_residue, l_conformation
    
    content_html = ""
    conformation_html = ""
    total_html = ""
    
    for conformation in l_conformation:
        short_conformation = l_conformation_mapping[conformation]
        
        conformation_html += "<th>{}</th>".format(conformation)
        total_html += "<td>{}</td>".format(fs[short_conformation])
    
    for residue in l_residue:
        short_residue = l_long_residue_mapping[residue]
        
        content_html += "<td>{}</td>".format(residue)
        
        for conformation in l_conformation:
            short_conformation = l_conformation_mapping[conformation]
            content_html += "<td>{}</td>".format(f[short_residue][short_conformation])
            
        content_html += "<td>{}</td>".format(fr[short_residue])
        content_html = "<tr>{}</tr>".format(content_html)
    
    from IPython.display import HTML, display
    display(HTML("""
        <table>
            <tr><th>Residue</th>{}<th>Total</th></tr>
            {}
            <tr><th>Total</th>{}<td>{}</td></tr>
        </table>""".format(conformation_html, content_html, total_html, N)))

# Usage:
#N, fr, fs, f, Pair = process_f(dssp_info_merged, "")
#display_f(N, fr, fs, f)
#
# N, fr, fs, f, Pair = process_f(stride_info_merged, "")
# display_f(N, fr, fs, f)

In [8]:
import os
    
def load_protein(filename):
    protein = ''
    sequence = ''
    secondary_structure = ''
    
    with open("protein/{}".format(filename), 'r') as f:
        for line in f:
            line = line.strip('\n')
            if line.startswith('>') and protein == '':
                protein = line[1:7]
                continue
            elif line.startswith('>') or line == '':
                continue
            
            if sequence == '':
                sequence = line
            else:
                secondary_structure = line
                
    return protein, sequence, secondary_structure

l_protein = []
for filename in os.listdir('protein'):
    protein, sequence, secondary_structure = load_protein(filename)
    
    assert len(sequence) == len(secondary_structure), "Protein size and secondary structure size does not match ! Protein: {}".format(filename)
    
    l_protein.append({
        'protein': protein,
        'sequence': sequence,
        'secondary_structure': secondary_structure,
    })

In [9]:
import math

# Compute self-information using the following formula:
# I(delta Sj; Rj) = log(fSj,Rj / fn-Sj,Rj) + log(fn-S / fS)
# fn-S,R = fR - fS,R
# fn-S   = N - fS
def compute_self_information(N, fr, fs, f, R):
    global l_residue, l_conformation
    
    I = {}
    for S in l_short_conformation:
        fnsr = fr[R] - f[R][S]
        fns = N - fs[S]
        
        I[S] = math.log(f[R][S] / fnsr) + math.log(fns / fs[S])
    
    return I

In [10]:
# Compute pair information
def compute_pair_information(N, fr, fs, f, Pair, sequence, j):
    I = {S:0 for S in l_short_conformation}
    
    Rj = sequence[j]
    
    for m in range(-8,9):
        for S in l_short_conformation:
            if j + m < 0 or j + m >= len(sequence):
                continue
            
            Rjm = sequence[j + m]
            
            fsrr = Pair[Rj][S][8 + m][Rjm]
            
            fnsrr = 0
            for nS in l_short_conformation:
                if nS == S:
                    continue
                fnsrr += Pair[Rj][nS][8 + m][Rjm]
            
            fnsr = fr[Rj] - f[Rj][S]
            
            temp = 0
            try:
                temp = math.log(fsrr / fnsrr)
            except:
                pass
            
            I[S] += temp + math.log(fnsr / f[Rj][S])

    return I

In [11]:
def MCC(real, predicted):
    TP, TN, FP, FN = 0.0, 0.0, 0.0, 0.0
    
    for i in range(len(real)):
        if real[i] == predicted[i]:
            if real[i] == 'H':
                TP += 1
            else:
                TN += 1
        else:
            if real[i] == 'H':
                FN += 1
            else:
                FP += 1
        
    up = (TP * TN) - (FP * FN)
    bottom = math.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN))
    
    try:
        return up / bottom
    except: # Bottom is =0
        return 'ERROR'

In [12]:
def Q3(real, predicted):
    correct = 0.0
    total = len(real)
    
    for i in range(total):
        if real[i] == predicted[i]:
            correct += 1
        
    return correct / total

In [63]:
def compute_prediction(set_info, protein, sequence='', secondary_structure=''):
    if sequence == '':
        sequence = set_info[protein]['sequence']
    if secondary_structure == '':
        secondary_structure = set_info[protein]['prediction']
    
    N, fr, fs, f, Pair = process_f(set_info, protein)
    #display_f(N, fr, fs, f)
    
    # Compute local informations for each amino acid
    self_information = {}
    for residue in l_short_residue:
        self_information[residue] = compute_self_information(N, fr, fs, f, residue)
    
    prediction = ""
    for j in range(len(sequence)):
        R = sequence[j]
        I = compute_pair_information(N, fr, fs, f, Pair, sequence, j)
        
        si = self_information[R]
        for S in l_short_conformation:
            I[S] += si[S]
            
        import operator
        gor_prediction = max(I.items(), key=operator.itemgetter(1))[0]
        prediction += gor_prediction
        
    Q3_value = Q3(secondary_structure, prediction)
    MCC_value = MCC(secondary_structure, prediction)
    
    # Familly prediction
    conformation_count = {}
    for conformation in l_short_conformation:
        conformation_count[conformation] = 0
    
    for x in prediction:
        conformation_count[x] += 1
        
    family = l_short_conformation_mapping[max(conformation_count, key=conformation_count.get)]
    
    return protein, prediction, Q3_value, MCC_value, family

In [14]:
# Example of f used
N, fr, fs, f, Pair = process_f(dssp_info_merged, "")
display_f(N, fr, fs, f)

Residue,Alpha,Beta,Coil,Total
,,,,
,,,,
,,,,
,,,,
,,,,
,,,,
,,,,
,,,,
,,,,
,,,,


# Results

First block is DSSP results for every protein in CATH file and second block is STRIDE results for every protein in CATH file

## Explainations

The self-information function is made using the table shown one cell above.

For the pair information: When processing DSSP or STRIDE informations, we count the appearance of a given amino acid in position 1 to 17 to copute the influence of observing a given Amino Acid in a given position within the windows considered (1 to 17) on the given conformation. The  influence is computed using the formula on course page 5 on the bottom right.

### Family prediction

I implemented a naive family prediction, it counts the number of occurence of each conformation in the predicted secondary-structure and take the maximum value as the predicted family.

### Results

Results are already pre-computed in the file 'results.json' To avoid painful computation by yourself, you can skip the next three blocks and run the 'loading' block to parse previous results. Of course, you can run those three block as well and get the exact same results.

In [73]:
from IPython.display import clear_output
results = {}

# DSSP results
results['DSSP'] = {}
for i in range(len(cath_info)):
    clear_output()
    print("({}/{})".format(i+1, len(cath_info)))
    
    data = cath_info[i]
    key = "{}_{}".format(data['PDB_code'], data['PDB_chain_code'])
    
    protein, prediction, Q3_value, MCC_value, family = compute_prediction(dssp_info_merged, key)
    results['DSSP'][key] = [protein, prediction, Q3_value, MCC_value, family]
print("DONE !")

(498/498)
family Alpha
DONE !


In [74]:
# Stride results
results['STRIDE'] = {}
for i in range(len(cath_info)):
    clear_output()
    print("({}/{})".format(i+1, len(cath_info)))
    
    data = cath_info[i]
    key = "{}_{}".format(data['PDB_code'], data['PDB_chain_code'])
    
    protein, prediction, Q3_value, MCC_value, family = compute_prediction(stride_info_merged, key)
    results['STRIDE'][key] = [protein, prediction, Q3_value, MCC_value, family]
print("DONE !")

(498/498)
family Alpha
DONE !


In [76]:
# Save results to a file
import json
with open('results.json', 'w') as fp:
    json.dump(results, fp)

In [31]:
# Load results
import json
with open('results.json') as data_file:    
    results = json.load(data_file)

In [32]:
# Display of results for DSSP
print("DSSP results:")
for _, data in results['DSSP'].items():
    protein = data[0]
    prediction = data[1]
    Q3_value = data[2]
    MCC_value = data[3]
    family = data[4]
    try:
        print("{} {} {:.2f} {:.2f} {}".format(protein, prediction, Q3_value, MCC_value, family))
    except:
        print("{} {} {:.2f} {} {}".format(protein, prediction, Q3_value, MCC_value, family))

DSSP results:
2aeb_A ECEEEEECCCECCCCCCCCEECCCHEHHHHCHHHHHHHHHHCHHCHCCCCECCCCCCCCEEEEECCCHHCHHHHHHHHHHHHHHCCCEEEEEECCCCEEEEEEEEECEEEECCCCEEEEEECCCECECEEEECCCCCCCCHHEHHHHHCCCCCCCCCCEEEEEEECCCHEEEEEEECCCCCCEEEEEHECEEHEEECHEEHECEEHEHHHHHHHHHCCCCCCEEEEECCCCCCCCECCCCCECEEEEEEEECHEEEEHEEHHHEHECCECEHEECCCCCCCCHHEEHEHHHHEHEHHHHHHHHHCCCCCCEECE 0.59 0.15 
1kq6_A CCHHEHHHEHHCHECCECCCCEEEEEHEEHHHCCCHHEEEEHECEEHHEHHHCHCEEHCCCECCCHEEEEECCCCCHCCCHHHHHHHHCEEEHECEHCECCCCHCCCCCHEHEHCEECCCCCCCCCHCCCCCHHHH 0.53 0.02 
2j8b_A EEEECCCCCCCEEEECCCCHHHHHHHHCHHEHHHEEHECCCCEEEHHHHHHHHHEHHHCCHHHHHHHH 0.37 -0.17 
1p1x_A HHHHHHHHHHHHHHHCHEEECCCCCCHHHHHHHHCCCCCCCCHEEEEECCEEEHHEHHEHCHCCCHCEEEEEEECCCCCCCCEHEHHHHHHHHHHHCCCEEEEHEEHHHHHCCCHHHCCHHHHHHHHHCHHHHEHHHEHEHHCCHHHHHHHHHHHHEHEHCCCHEEHEHECEEEEECCHHHHHEHHHHHHHHCEEEEECEECCCCEHEHHHHHHHHHHHHHHHHHCHHHHHHHEEHHHHHHHHHHHHHCC 0.66 0.37 
1ybk_A CEEHHEHCCHHEEEEEEEEHCCHHCHHHHHHHHHHHHHEHHHHCHHEHHEHH 0.60 -0.03 
3mol_A EEEEEEECEECEEEHHCHHHCHHHEECCCCCCCCCEEECECECCECCCCCCCCHEHHHHHHH

In [33]:
# Display of results for STRIDE
print("STRIDE results:")
for _, data in results['STRIDE'].items():
    protein = data[0]
    prediction = data[1]
    Q3_value = data[2]
    MCC_value = data[3]
    family = data[4]
    try:
        print("{} {} {:.2f} {:.2f} {}".format(protein, prediction, Q3_value, MCC_value, family))
    except:
        print("{} {} {:.2f} {} {}".format(protein, prediction, Q3_value, MCC_value, family))

STRIDE results:
2aeb_A EEEEEEECCEEECCCCCCCEEECHHEHHHHCHHHHHHHHHHEEHCHCCCCECCCCCCCCEEEEECCCHHCHHHHHHHHHHHHHEHCCEEEEEECCCCEEEEEEEEECEEEECCECEEEEEECCCECECEEEECCCCCCCCHHEHHHHHECCCCCCCCCEEEECEECCCHEEEEEEEECCCCCEEEEEHHCEEEEEHCHEEHHCEEHEHHHHHHHHHHHCCCCEEEEECCCCCCCCECCCCCEEEEEEEEEECEEEEEHEEHHHEHCCCECEHEEECCHCCCCHHEEHEHCCEHHEHHHHHHHHHHCCCCCEECE 0.57 0.12 
1kq6_A CCHHEHHHHHHCHECCECCCCEHEEEHEEHCCCCHHHEEEEHHCEEHHHHHHCHHEEHHCCCCCCHEEECECCCCCHCCCHHHHHHHHCEEEHECEHCCCCCCHCCCCCHEHEHCECCCCCCCCCCHCCCCCHHHH 0.57 0.09 
2j8b_A HHCCCCCCCCCCCEEHEHCCCCCCCEHHEHHHCEHEHHHEEEEECECCHECCHHHHHCCHHEEEHCHCHEHHHHHHHH 0.44 -0.04 
1p1x_A HHHHHHHHHHHHHHHCHEEECCCCCCHHHHHHHECCCCCCCCHEEEEECCCEEEHEHHEHCHCCCHHEEEEEEECCCCCCCCEHEHHHHHHHHHHHCCCEEEEHEEHHHHHHCCHHHCCHHHHHHHHHHHHHHEHEHEHEHHCHHHHHHHHHHHHHEEEHHCCHEEHEHECEEEEECCHHHHHEHHHHHHHHCEEEEECEECCCCEHEHHHHHHHHHHHHHHHHHCHHHHHHHEEHHHHHHHHHHHHHCC 0.68 0.37 
1ybk_A CEEHHCCCCHHEEEEEEEEECCHHHHHHHHHHHHHHHHEHHHHCCHEEHEHH 0.54 -0.05 
3mol_A EEEEEEECEECEEEHHCHHHCHHHEHCHHCCCECEEEECECCCCECCCCC

In [77]:
# Comparaison of results
sum_dssp_q3 = 0.0
sum_stride_q3 = 0.0
sum_dssp_mcc = 0.0
sum_stride_mcc = 0.0
    
dssp_q3_better = 0
stride_q3_better = 0
dssp_mcc_better = 0
stride_mcc_better = 0

content_html = ""
for i in range(len(cath_info)):
    data = cath_info[i]
    key = "{}_{}".format(data['PDB_code'], data['PDB_chain_code'])
    
    dssp_q3 = results['DSSP'][key][2]
    dssp_mcc = results['DSSP'][key][3]
    stride_q3 = results['STRIDE'][key][2]
    stride_mcc = results['STRIDE'][key][3]
    dssp_family = results['DSSP'][key][4]
    stride_family = results['STRIDE'][key][4]
    
    if dssp_mcc == 'ERROR':
        dssp_mcc = 0.0
    if stride_mcc == 'ERROR':
        stride_mcc = 0.0
    
    sum_dssp_q3 += dssp_q3
    sum_stride_q3 += stride_q3
    sum_dssp_mcc += dssp_mcc
    sum_stride_mcc += stride_mcc
    
    dssp_q3_color = ''
    stride_q3_color = ''
    dssp_mcc_color = ''
    stride_mcc_color = ''
    
    if dssp_q3 > stride_q3:
        dssp_q3_better += 1
        
        dssp_q3_color = "green"
        stride_q3_color = "red"
    elif dssp_q3 < stride_q3:
        stride_q3_better += 1
        
        dssp_q3_color = "red"
        stride_q3_color = "green"
        
    if dssp_mcc > stride_mcc:
        dssp_mcc_better += 1
        
        dssp_mcc_color = "green"
        stride_mcc_color = "red"
    elif dssp_mcc < stride_mcc:
        stride_mcc_better += 1
        
        dssp_mcc_color = "red"
        stride_mcc_color = "green"
    
    content_html += """
        <tr>
            <td>{protein}</td>
            <td style='color: {dssp_q3_color};'>{dssp_q3:.2f}</td>
            <td style='color: {stride_q3_color};'>{stride_q3:.2f}</td>
            <td style='color: {dssp_mcc_color};'>{dssp_mcc:.2f}</td>
            <td style='color: {stride_mcc_color};'>{stride_mcc:.2f}</td>
            <td>{dssp_family}</td>
            <td>{stride_family}</td>
        </tr>""".format(
        protein=key,
        dssp_q3=dssp_q3, 
        dssp_mcc=dssp_mcc, 
        stride_q3=stride_q3,
        stride_mcc=stride_mcc,
        dssp_q3_color=dssp_q3_color,
        stride_q3_color=stride_q3_color,
        dssp_mcc_color=dssp_mcc_color,
        stride_mcc_color=stride_mcc_color,
        dssp_family=dssp_family,
        stride_family=stride_family
    )

mean_dssp_q3 = sum_dssp_q3 / len(cath_info)
mean_stride_q3 = sum_stride_q3 / len(cath_info)
mean_dssp_mcc = sum_dssp_mcc / len(cath_info)
mean_stride_mcc = sum_stride_mcc / len(cath_info)

print("0.0 in MCC means ERROR, calculation tried to divide by zero")
print("For Q3: STRIDE data set is better {} times".format(stride_q3_better))
print("For Q3: DSSP data set is better {} times".format(dssp_q3_better))
print("For Q3: Draw {} times".format(len(cath_info) - stride_q3_better - dssp_q3_better))
print("For MCC: STRIDE data set is better {} times".format(stride_mcc_better))
print("For MCC: DSSP data set is better {} times".format(dssp_mcc_better))
print("For MCC: Draw {} times".format(len(cath_info) - stride_mcc_better - dssp_mcc_better))

from IPython.display import HTML, display
display(HTML("""
<table>
  <tr>
    <th rowspan="2">Protein</th>
    <th colspan="2">Q3</th>
    <th colspan="2">MCC</th>
    <th colspan="2">Family</th>
  </tr>
  <tr>
    <th>DSSP</th>
    <th>STRIDE</th>
    <th>DSSP</th>
    <th>STRIDE</th>
    <th>DSSP</th>
    <th>STRIDE</th>
  </tr>
  <tr>
      <th>Mean</th>
      <td>{:.2f}</td>
      <td>{:.2f}</td>
      <td>{:.2f}</td>
      <td>{:.2f}</td>
      <td></td>
      <td></td>
  </tr>
  {}
</table>
""".format(mean_dssp_q3, mean_stride_q3, mean_dssp_mcc, mean_stride_mcc, content_html)))

0.0 in MCC means ERROR, calculation tried to divide by zero
For Q3: STRIDE data set is better 246 times
For Q3: DSSP data set is better 198 times
For Q3: Draw 54 times
For MCC: STRIDE data set is better 223 times
For MCC: DSSP data set is better 232 times
For MCC: Draw 43 times


Protein,Q3,Q3,MCC,MCC,Family,Family
Protein,DSSP,STRIDE,DSSP,STRIDE,DSSP,STRIDE
Mean,0.59,0.59,0.17,0.18,,
1w0n_A,0.57,0.53,0.0,0.0,Beta,Beta
2gpi_A,0.58,0.58,0.35,0.37,Alpha,Alpha
1vbw_A,0.62,0.68,0.3,0.35,Coil,Coil
2odk_A,0.64,0.72,0.28,0.43,Alpha,Alpha
2zxy_A,0.64,0.66,0.35,0.35,Alpha,Alpha
2pr7_A,0.67,0.66,0.34,0.32,Beta,Alpha
2pyq_A,0.55,0.59,0.08,0.18,Alpha,Alpha
1jy2_N,0.62,0.63,0.26,0.22,Coil,Alpha
1j3a_A,0.6,0.57,0.22,0.16,Alpha,Alpha


# For the 6 particuliar protein given

In [66]:
# DSSP results
print("DSSP")
for data in l_protein:
    key = data['protein']
    sequence = data['sequence']
    secondary_structure = data['secondary_structure']
    
    protein, prediction, Q3_value, MCC_value, family = compute_prediction(dssp_info_merged, key, sequence, secondary_structure)
    try:
        print("{} {} {:.2f} {:.2f} {}".format(protein, prediction, Q3_value, MCC_value, family))
    except:
        print("{} {} {:.2f} {} {}".format(protein, prediction, Q3_value, MCC_value, family))

# Stride results
print()
print("STRIDE")
for data in l_protein:
    key = data['protein']
    sequence = data['sequence']
    secondary_structure = data['secondary_structure']
    
    protein, prediction, Q3_value, MCC_value, family = compute_prediction(stride_info_merged, key, sequence, secondary_structure)
    try:
        print("{} {} {:.2f} {:.2f} {}".format(protein, prediction, Q3_value, MCC_value, family))
    except:
        print("{} {} {:.2f} {} {}".format(protein, prediction, Q3_value, MCC_value, family))

DSSP
family Beta
1arl_A EHCECCCEEEEEHCHHHHHHHHHHHHHCHHHHHHHHHECCECCCCCEEEEEEECCCCCCCCEHEECCCECHHEHHEHCEEHEEHHHECEECCCCCEEEHHHCHHEHEEEECCCCCEEEHECCHHHHEEECHCECCEHHEECEEEHCHCCEHECCCCCCCCCEHEEEHEHECCEEEEHEEEHEHHCCCCHCEHEEHHEHEEHEECCCCECECCCCCCHHHHHHHHHHHHHHHHEHCEEEECCEEEEEEEEECCCCECEEECCCCEEEEEEEECHHCCEECECCCCCHHHCHCHHEHEHEEEHHHEHCH 0.50 -0.01 Beta
family Coil
1ava_C CCCCCEECCCCHEHHHCHCEEHEEHCHCCCCCEEECCCCCCCCEEEECCCCCCCCCCEEEEEECCEECCCCEHEEEECCCCEHCEEHCCCHHCHCHHHHHHHHHCHHEEEEECECCCCCCCHHCHEEHEHCCCCEHHEHCEHCHHHCHHCHHCHEHHCCCEEECCCCCCHEEEEEECCCCC 0.50 0.00 Coil
family Alpha
1avm_A EEEECCCCCCECEHEEHCHHHHHHHEHHHHHEHEEHHCHHHHHHHHHHHHHHHCHHHHHHHHHHHHEHHCCCCCHCEEECCCCCCCCCCCCCCHHHCHEHHHHHECHHCHHHHHEHHHHCEECCCEEHEHHCCCCCCHHEEHEHECCCCCCCCECHEHHHEHHHHHHHEHHECECCCHEHCEEEEEEECHHHEHEHHHHHH 0.59 0.17 Alpha
family Alpha
1hge_B HECHEHEEEHECCCEEEECEEEEECECCCCCHCHHHHHHHHHHHHHHHHCCHHHEHHHHCHHHEHHEHHHHHHHHHHHHHHHHHHHHHEEHHHHHHHHHEHHHCHHEEHCHHHHHHHHHHHHHHHHHHCHHHHCCEECEEEHCCCCCEEHEHCCCCECEEHEHCHHHHHHEEEEE 0.