In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Categorical physico-chemical description of the 20 standard amino acids,
# keyed by one-letter code. Every value is a 5-item list whose positions hold,
# in order: polarity, charge, hydropathy, aromaticity, ionizability.
property_dict = {
    # one-letter code -> [polarity, charge, hydropathy, aromaticity, ionizability]
    "A": ["NonPolar", "Neutral", "Hydrophobic", "NonAromatic", "NonIonizable"],
    "R": ["Polar", "Positive", "Hydrophilic", "NonAromatic", "Ionizable"],
    "N": ["Polar", "Neutral", "Hydrophilic", "NonAromatic", "NonIonizable"],
    "D": ["Polar", "Negative", "Hydrophilic", "NonAromatic", "Ionizable"],
    "C": ["Polar", "Neutral", "Hydrophilic", "NonAromatic", "Ionizable"],
    "Q": ["Polar", "Neutral", "Hydrophilic", "NonAromatic", "NonIonizable"],
    "E": ["Polar", "Negative", "Hydrophilic", "NonAromatic", "Ionizable"],
    "G": ["NonPolar", "Neutral", "Hydrophobic", "NonAromatic", "NonIonizable"],
    "H": ["Polar", "Positive", "Hydrophilic", "NonAromatic", "Ionizable"],
    "I": ["NonPolar", "Neutral", "Hydrophobic", "NonAromatic", "NonIonizable"],
    "L": ["NonPolar", "Neutral", "Hydrophobic", "NonAromatic", "NonIonizable"],
    "K": ["Polar", "Positive", "Hydrophilic", "NonAromatic", "Ionizable"],
    "M": ["NonPolar", "Neutral", "Hydrophobic", "NonAromatic", "NonIonizable"],
    "F": ["NonPolar", "Neutral", "Hydrophobic", "Aromatic", "NonIonizable"],
    "P": ["NonPolar", "Neutral", "Hydrophobic", "NonAromatic", "NonIonizable"],
    "S": ["Polar", "Neutral", "Hydrophilic", "NonAromatic", "NonIonizable"],
    "T": ["Polar", "Neutral", "Hydrophilic", "NonAromatic", "NonIonizable"],
    "W": ["NonPolar", "Neutral", "Hydrophobic", "Aromatic", "NonIonizable"],
    "Y": ["Polar", "Neutral", "Hydrophobic", "Aromatic", "Ionizable"],
    "V": ["NonPolar", "Neutral", "Hydrophobic", "NonAromatic", "NonIonizable"],
}

### Transform property dictionary

## Notes

The random forest (RF) was trained with 5-fold cross-validation: in each fold it was trained on 4 of the 5 partitions, using the epitope residues (positive dataset) and a randomly selected subset of the non-epitope residues (negative dataset).

Each amino acid was encoded using its volume (5), hydrophobicity (6), and polarity (7), combined with the relative surface accessibility (RSA) and secondary structure (SS) predicted using NetSurfP (8).

Possible feature: sum of the volumes of the residues (Equation 1 in the supplementary material).

### Load epitope data of SARS

In [6]:
# File name of the epitope table export (CSV); a relative path, so it is
# resolved against the notebook's current working directory.
path = "epitope_table_export_1582825906.csv"

In [7]:
# Read the epitope export into a DataFrame using pandas' default CSV settings.
data = pd.read_csv(path)

In [8]:
# Select the "Epitope.2" column and drop its first row (explicit positional slice).
# NOTE(review): presumably the first row is a second header line of the export
# and "Epitope.2" holds the epitope sequence/description — confirm against the CSV.
epitopes_sars = data["Epitope.2"].iloc[1:]

In [9]:
# Print every entry of `epitopes_sars`, one per line, for visual inspection.
for epi in epitopes_sars:
    print(epi)

AALVSGTATAGWTFGAG
AATKMSECVLGQSKRVD
AATVLQLPQGTTLPK
AAYFVGYLKPTTFMLKY
AGCLIGAEHVDTSYECD
AIPTNFSISITTEVMPV
AISSVLNDILSRLDKVE
ALALLLLDRLNQLESKV
ALNCYW
ALNCYWPLNDYGFYTTTGIGYQPYRVVVLSFEL + ACET(A1)
AMDPIYDEPTTTTSVPL
AMQMAYRF
ANKEGIVWVATEGALN
APRITFGGPTDSTDNNQN
AQKFNGLTVLPPLLTDD
ASSRSSSRSRGNSRNST
ASWFTALTQHGKEELRFPRGQ
ATEKSNVVRGWV
AYFPREGVFVFNGTSWF
AYSNNTIAIPTNFSISI
CANLLLQYGSFCTQLNRALSGIA
CASYHTVSLLRSTSQKS
CDIPIGAGICASYHTVS
CGPKLSTDLIKNQCVNF
CGPKLSTDLIKNQCVNFNFNGLTGTGVLTPSSKRFQPFQQFG
CGPKLSTDLIKNQCVNFNFNGLTGTGVLTPSSKRFQPFQQFGRDVSDFTD
CKFDEDDSEPVLKGVKLHYT
CSQNPLAELKCSVKSFE
CSQNPLAELKCSVKSFEIDKGIYQTSNFRVVPSGD
CTDVSTAIHADQLTPAW
CTDVSTAIHADQLTPAWRIYSTGNNVFQTQAG
CTPPALNCYWPLNDYGF
CTTFDDVQAPNYTQHTSSMRGVYYPDEIFR
CVLAWNTRNIDATSTGN
CYGVSATKLNDLCFSNV
CYWPLNDYGFYTTTGIG
DDKDPQFKDNVILLNKHIDA
DDSEPVLKGVKLHYT
DFCGKGYHLMSFPQAAP
DFSRQLQNSMSGASA
DGIYFAATEKSNVVRGW
DLGDISGINASVVNIQK
DLPSGFNTLKPIFKLPL
DMIAAYTAALVSGTATA
DNIKDC
DRCTTFDDVQAPNYTQH
DRLNQLESKVSGKGQQQQ
DVNLHSSR
DVSEKSGNFKHLREFVF
DVVNQNAQALNTLVKQL
EAE

In [14]:
# Mean length (in characters) of the loaded epitope entries.
# NOTE(review): this cell originally iterated over `epis`, a name that is not
# defined anywhere in the notebook, so it failed with NameError on a fresh
# kernel. `epitopes_sars` is assumed to be the intended collection — confirm
# that the result still matches the recorded output.
np.mean([len(epi) for epi in epitopes_sars])

17.42

### Load structural data of paper

In [10]:
# Read the whole structural FASTA file into memory.
with open("pdb_chains.fasta.txt", "r") as infile:
    txt = infile.read()
# Split on the FASTA record marker ">"; the first piece is whatever precedes
# the first marker and is discarded, leaving one string per record.
structured = txt.split(">")[1:]

In [29]:
# Parse the structural FASTA records into three parallel collections:
#   stuct_metadata - header line of each record (variable name kept as-is,
#                    including its spelling, so other cells keep working)
#   struct_sequ    - sequence line of each record (second line of the record)
#   struct_lab_arr - per-residue binary labels: 1 where the sequence character
#                    is upper-case, 0 otherwise
stuct_metadata = []
struct_sequ = []
per_chain_labels = []
for entry in structured:
    split_entry = entry.split("\n")
    stuct_metadata.append(split_entry[0])
    struct_sequ.append(split_entry[1])
    labels = [int(s.isupper()) for s in split_entry[1]]
    per_chain_labels.append(labels)

# The chains have different lengths, so the label lists are ragged.
# np.array(list_of_ragged_lists) raises ValueError on NumPy >= 1.24 (and only
# warned before that). Build the 1-D object array explicitly instead: one
# Python list per chain, filled element by element so NumPy never tries to
# broadcast the inner lists into a 2-D array.
struct_lab_arr = np.empty(len(per_chain_labels), dtype=object)
for chain_idx, chain_labels in enumerate(per_chain_labels):
    struct_lab_arr[chain_idx] = chain_labels

In [30]:
# TESTING
# Spot-check one parsed record (index 20): print its sequence, then the
# per-residue labels derived from it, so the two can be compared by eye.
print(struct_sequ[20])
print(struct_lab_arr[20])

statlclghhavpngtlvktitddqievtnatelvqsssTGKicnnphriLDgIDctlidallgdPHcdVFqnEtwdlfveRsKaFsncypydvpdyaslrslvassgtlefitegftwtgvtqnggsnackrgpgsgffsrlnwltksgstypvlnvtmpnndnfdklyiwgihhpstnqeqtslyvqasgrvtvstrrsqqtiipnigsrpwvrglssrisiywtivkpgdvlvinsngnliaprgyfkmrtgkssimrsDAPIdtcisecitpngsipndkpfqnvnkitygacpkyvkqntlklatgmrnvpekq
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [38]:
# Size of the structural dataset: total number of residues over all sequences
# and the number of sequences. The sequence count is taken from
# `struct_lab_arr`; the original cell referenced `lab_arr`, which is not
# defined anywhere in the notebook and raised NameError on a fresh kernel.
print("number amino acids", len("".join(struct_sequ)), "number sequences", len(struct_lab_arr))

number amino acids 183887 number sequences 776


### Load linear data of paper

In [25]:
# Read the whole linear-epitope FASTA file into memory.
with open("iedb_linear_epitopes.fasta.txt", "r") as infile:
    txt_linear = infile.read()

In [28]:
# Split on the FASTA record marker ">"; the piece before the first marker is
# discarded, leaving one string (header + sequence) per record.
linear = txt_linear.split(">")[1:]

In [45]:
# Parse the linear-epitope FASTA records into four parallel lists:
#   linear_metadata - header with the leading class word removed
#   linear_sequ     - sequence line (second line of the record)
#   linear_epi      - only the upper-case characters of the sequence, joined
#   linear_labs     - 1 for headers starting with "Pos", 0 for "Neg"
linear_metadata = []
linear_sequ = []
linear_epi = []
linear_labs = []

# Headers look like "PositiveID_46198" / "NegativeID_...": an 8-character
# class word followed by the identifier. The original slice `[9:]` cut one
# character too many and produced "D_46198"; slice by the class-word length.
class_word_len = len("Positive")  # same length as "Negative"

for entry in linear:
    split_entry = entry.split("\n")
    header = split_entry[0]
    sequence = split_entry[1]

    if header[:3] == "Pos":
        label = 1
    elif header[:3] == "Neg":
        label = 0
    else:
        # Previously such a record was appended to every list except
        # `linear_labs`, silently shifting all later labels against their
        # sequences. Fail loudly instead.
        raise ValueError(
            "Unrecognised class prefix in FASTA header: {!r}".format(header)
        )

    linear_labs.append(label)
    linear_metadata.append(header[class_word_len:])
    linear_sequ.append(sequence)
    linear_epi.append("".join([s for s in sequence if s.isupper()]))

In [52]:
# TESTING
# Spot-check one record: show the raw record next to the sequence, the
# upper-case-only substring and the class label parsed from it.
test_int = 50  # index of the record to inspect
print(linear[test_int])
print(linear_sequ[test_int])
print(linear_epi[test_int])
print(linear_labs[test_int])

PositiveID_46198
msdngpqsnqrsapritfggptdstdnnqnggrngarpkqrrpqglpnntaswftaltqhgkeelrfprgqgvpiNTNSGPDDQIGYYRRATRrvrggdgkmkelsprwyfyylgtgpeaslpygankegivwvategalntpkdhigtrnpnnnaatvlqlpqgttlpkgfyaegsrggsqassrsssrsrgnsrnstpgssrgnsparmasgggetalalllldrlnqleskvsgkgqqqqgqtvtkksaaeaskkprqkrtatkqynvtqafgrrgpeqtqgnfgdqdlirqgtdykhwpqiaqfapsasaffgmsrigmevtpsgtwltyhgaiklddkdpqfkdnvillnkhidayktfpptepkkdkkkktdeaqplpqrqkkqptvtllpaadmddfsrqlqnsmsgasadstqa

msdngpqsnqrsapritfggptdstdnnqnggrngarpkqrrpqglpnntaswftaltqhgkeelrfprgqgvpiNTNSGPDDQIGYYRRATRrvrggdgkmkelsprwyfyylgtgpeaslpygankegivwvategalntpkdhigtrnpnnnaatvlqlpqgttlpkgfyaegsrggsqassrsssrsrgnsrnstpgssrgnsparmasgggetalalllldrlnqleskvsgkgqqqqgqtvtkksaaeaskkprqkrtatkqynvtqafgrrgpeqtqgnfgdqdlirqgtdykhwpqiaqfapsasaffgmsrigmevtpsgtwltyhgaiklddkdpqfkdnvillnkhidayktfpptepkkdkkkktdeaqplpqrqkkqptvtllpaadmddfsrqlqnsmsgasadstqa
NTNSGPDDQIGYYRRATR
1


In [55]:
# Count how many records carry each class label (0 and 1).
# np.unique(..., return_counts=True) returns (sorted unique labels, their counts).
print("class balance", np.unique(linear_labs, return_counts=True))

class balance (array([0, 1]), array([18722, 11834]))


### Define features

In [None]:
def seq2features(seq):
    for s in seq