In [16]:
def load_CSV(filename, column_names=None, separator=","):
    """Load a delimited text file into a list of dicts, one per line.

    Fields are matched to ``column_names`` starting from the right-hand side
    of each line: the last name receives the last field, the second-to-last
    name the field before it, and so on. Extra leading fields are ignored.
    Every stored value is stripped of surrounding whitespace.

    Args:
        filename: Path of the file to read.
        column_names: Iterable of column names in file order. It is never
            modified. ``None`` (the default) or an empty iterable yields an
            empty dict per line.
        separator: Field delimiter passed to ``str.split``.

    Returns:
        The list of row dicts. If the file cannot be read, or a line has
        fewer fields than ``column_names``, a warning is printed and the rows
        parsed so far are returned (possibly an empty list).
    """
    # Private copy: reversing the caller's list in place would flip the
    # field/column mapping on every second call made with the same list.
    names = list(column_names) if column_names is not None else []
    l_data = []

    try:
        with open(filename, 'r') as file:
            # Iterate lazily instead of loading every line with readlines().
            for line in file:
                fields = line.strip().split(separator)

                data = {}
                # Consume fields from the end so names align from the right.
                for column in reversed(names):
                    data[column] = fields.pop().strip()

                l_data.append(data)
    except (OSError, ValueError, IndexError):
        # OSError: unreadable/missing file; IndexError: too few fields on a
        # line; ValueError: undecodable bytes or an empty separator.
        print("/!\\ Malformed file {} /!\\ Expecting a CSV with following columns: {}".format(filename, names))

    return l_data

In [17]:
def load_CATH(filename="cath_info.txt"):
    """Read the tab-separated CATH file through load_CSV and return its rows."""
    # Built fresh on every call so each invocation hands load_CSV its own list.
    cath_columns = ['PDB_code', 'PDB_chain_code', 'protein_family']
    return load_CSV(
        filename=filename,
        column_names=cath_columns,
        separator="\t",
    )

def load_DSSP(filename="dssp_info.txt"):
    """Read the tab-separated DSSP file through load_CSV and return its rows."""
    # Built fresh on every call so each invocation hands load_CSV its own list.
    dssp_columns = [
        'PDB_code',
        'PDB_chain_code',
        'PDB_seq_code',
        'residue_name',
        'secondary_structure',
    ]
    return load_CSV(
        filename=filename,
        column_names=dssp_columns,
        separator="\t",
    )

def load_STRIDE(filename="stride_info.txt"):
    """Read the tab-separated STRIDE file through load_CSV and return its rows."""
    # Built fresh on every call so each invocation hands load_CSV its own list.
    stride_columns = [
        'PDB_code',
        'PDB_chain_code',
        'PDB_seq_code',
        'residue_name',
        'secondary_structure',
    ]
    return load_CSV(
        filename=filename,
        column_names=stride_columns,
        separator="\t",
    )

In [33]:
# Read the three annotation tables, each from its loader's default file name.
cath_info, dssp_info, stride_info = load_CATH(), load_DSSP(), load_STRIDE()

# Report how many rows each loader returned (one line per source).
print("\n".join([
    "Loaded {} entries for CATH".format(len(cath_info)),
    "Loaded {} entries for DSSP".format(len(dssp_info)),
    "Loaded {} entries for STRIDE".format(len(stride_info)),
]))

Loaded 498 entries for CATH
Loaded 71391 entries for DSSP
Loaded 71077 entries for STRIDE


In [34]:
# Three-letter residue codes accepted as valid amino acids.
l_residue = [
    'ALA', 'ARG', 'ASN', 'ASP',
    'CYS', 'GLN', 'GLU', 'GLY',
    'HIS', 'ILE', 'LEU', 'LYS',
    'MET', 'PHE', 'PRO', 'SER',
    'THR', 'TRP', 'TYR', 'VAL',
]

# Sanity check on the size of the list above.
print("{} valid amino acid".format(len(l_residue)))

20 valid amino acid


In [39]:
# Merge DSSP/STRIDE entries per chain (key: "<PDB code>_<chain code>")
def merge(data, valid_residues=None):
    """Group per-residue rows into one record per PDB chain.

    Args:
        data: Sequence of dicts with the keys 'PDB_code', 'PDB_chain_code',
            'PDB_seq_code', 'residue_name' and 'secondary_structure'.
        valid_residues: Container of accepted residue names. Defaults to the
            notebook-level ``l_residue`` list when omitted.

    Returns:
        A dict keyed by "<PDB_code>_<PDB_chain_code>". Each value holds
        'chain' (the chain code) and 'seq', a list of
        [PDB_seq_code, residue_name, secondary_structure] triples in input
        order. A chain whose rows are all rejected still gets a record with
        an empty 'seq'.

    Side effects:
        Prints the number of rejected rows. Rows whose secondary structure is
        'Other' are rewritten to 'Coil' in place, so ``data`` itself is
        modified.
    """
    if valid_residues is None:
        # Resolved at call time so the function tracks the notebook's list.
        valid_residues = l_residue

    l_error = []
    merged = {}

    # Entry numbers in error messages are 1-based.
    for number, entry in enumerate(data, start=1):
        key = "{}_{}".format(entry['PDB_code'], entry['PDB_chain_code'])

        # Register the chain before validating so it is counted even when
        # every one of its residues is rejected.
        if key not in merged:
            merged[key] = {
                'chain': entry['PDB_chain_code'],
                'seq': [],
            }

        # Explicit check instead of `assert`: assertions are removed under
        # `python -O`, which would silently let invalid residues through.
        if entry['residue_name'] not in valid_residues:
            l_error.append("Entry #{}: Invalid amino acid: {}".format(number, entry['residue_name']))
            continue

        # Assignment requirement: 'Other' is reported as 'Coil'.
        if entry['secondary_structure'] == 'Other':
            entry['secondary_structure'] = 'Coil'

        merged[key]['seq'].append([
            entry['PDB_seq_code'],
            entry['residue_name'],
            entry['secondary_structure'],
        ])

    print("{} error(s) found".format(len(l_error)))

    return merged

# Group the per-residue DSSP rows by chain. merge() prints its own error
# count, which appears between the two messages below.
print("Merging DSSP")
dssp_info_merged = merge(dssp_info)
print("{} entries in DSSP".format(len(dssp_info_merged)))

# Same grouping for the STRIDE rows.
print("Merging STRIDE")
stride_info_merged = merge(stride_info)
print("{} entries in STRIDE".format(len(stride_info_merged)))

Merging DSSP
804 error(s) found
498 entries in DSSP
Merging STRIDE
4 error(s) found
498 entries in STRIDE


In [43]:
print("Protein count: CATH: {}, DSSP: {}, STRIDE: {}".format(len(cath_info), len(dssp_info_merged), len(stride_info_merged)))

# Check loading results: the three sources must describe the same number of
# chains. A plain comparison is used rather than `assert`, because assertions
# are removed under `python -O` and the success message would then be printed
# unconditionally.
if len(cath_info) == len(dssp_info_merged) == len(stride_info_merged):
    print("Loading went well !")
else:
    print("Warning: Protein count does not match !")

Protein count: CATH: 498, DSSP: 498, STRIDE: 498
