In [305]:
import pandas as pd

# Read the training table (input_bcell.csv) and the test table (input_sars.csv).
train, test = map(
    pd.read_csv,
    (
        "../data/covid19_epitope_prediction/input_bcell.csv",
        "../data/covid19_epitope_prediction/input_sars.csv",
    ),
)

In [306]:
# Names of the two columns that bound each record's interval.
interval_fields = ["start_position", "end_position"]
# Extract those columns from both tables as NumPy arrays (one row per record).
train_itv, test_itv = (frame[interval_fields].values for frame in (train, test))

In [307]:
# Peptide sequence column of each table, kept as a pandas Series.
train_peptide_seq, test_peptide_seq = (
    frame["peptide_seq"] for frame in (train, test)
)

In [308]:
from sklearn.feature_extraction.text import TfidfVectorizer

def encode_protein_sequences(sequences, k=3):
    """Fit a character n-gram TF-IDF model on ``sequences`` and encode them.

    Parameters
    ----------
    sequences : iterable of str
        Sequences used both to fit the vectorizer and to produce the encoding.
    k : int, default 3
        N-gram length; only character n-grams of exactly ``k`` characters
        are used (``ngram_range=(k, k)``).

    Returns
    -------
    tuple
        ``(dense_matrix, fitted_vectorizer)``: the TF-IDF matrix converted to
        a dense array, and the fitted ``TfidfVectorizer`` so that other
        sequences can be transformed with the same vocabulary.
    """
    kmer_tfidf = TfidfVectorizer(analyzer='char', ngram_range=(k, k))
    dense_matrix = kmer_tfidf.fit_transform(sequences).toarray()
    return dense_matrix, kmer_tfidf

# Fit the vectorizer on the training sequences only, then reuse the fitted
# vectorizer for the test sequences so both matrices share the same columns.
train_peptide_seq_data, vectorizer = encode_protein_sequences(train_peptide_seq)
test_peptide_seq_data = vectorizer.transform(test_peptide_seq).toarray()
test_peptide_seq_data.shape


(520, 7379)

In [309]:
# Number of nearest training peptides that `evaluate` reports per query.
topk = 3
# NOTE(review): `nsample` and the module-level `tidx` are not read by the
# cells shown here — presumably used elsewhere in the notebook; verify.
nsample = 3
tidx = 10

In [310]:
import rtree
import numpy as np

rt_index = rtree.index.Index()
for i, (start, end) in enumerate(train_itv):
    # Each interval is stored as the degenerate box (start, end, start, end),
    # i.e. the single 2-D point (x=start, y=end). A box query over
    # [l, r] x [l, r] then returns the intervals whose start and end both
    # lie within [l, r].
    assert start <= end
    rt_index.insert(i, (start, end, start, end))


def evaluate(tidx, k=None):
    """Print the nearest contained training peptides for one test row.

    Training rows whose ``[start, end]`` interval lies inside the test row's
    interval are ranked by squared Euclidean distance between TF-IDF vectors,
    and the closest ``k`` are printed together with their
    ``difflib.SequenceMatcher`` similarity ratio to the query sequence.

    Parameters
    ----------
    tidx : int
        Positional index of the query row in the test arrays.
    k : int, optional
        Number of neighbours to report. Defaults to the notebook-level
        ``topk``. Fewer are reported when fewer candidates exist.

    Returns
    -------
    None
        Results are printed, not returned.

    Raises
    ------
    ValueError
        If ``k`` is negative.
    """
    from difflib import SequenceMatcher

    if k is None:
        k = topk
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    l, r = test_itv[tidx]
    # Positional lookup, consistent with the NumPy arrays indexed by `tidx`.
    query_seq = test_peptide_seq.iloc[tidx]

    # Intervals are indexed as points (start, end), so this box query returns
    # the training rows with l <= start and end <= r.
    hits = list(rt_index.intersection((l, l, r, r), objects=False))
    # Explicit integer dtype: an empty hit list would otherwise become a float
    # array, which NumPy rejects as an index.
    idx = np.array(hits, dtype=np.intp)
    # Re-check containment on the original integer bounds, then sort so that
    # distance ties are broken deterministically by training row position.
    contained = (train_itv[idx, 0] >= l) & (train_itv[idx, 1] <= r)
    idx = np.sort(idx[contained])

    X_cand = train_peptide_seq_data[idx]
    q = test_peptide_seq_data[tidx].reshape(1, -1)
    dists = np.sum((X_cand - q) ** 2, axis=1)

    # A stable argsort handles any candidate count (including 0 and <= k) and
    # returns the neighbours ordered from nearest to farthest; argpartition
    # would raise when k >= len(dists) and leaves the top-k unordered.
    nearest = np.argsort(dists, kind="stable")[:k]
    topk_idx = idx[nearest]
    answer_peptide_seqs = train_peptide_seq.iloc[topk_idx].values

    # eval
    print("start_position:", l)
    print("end_position:", r)
    print("query petide seq:", query_seq)
    print("answer petide seqs:", answer_peptide_seqs)
    similarities = [
        SequenceMatcher(None, query_seq, ans_seq).ratio()
        for ans_seq in answer_peptide_seqs
    ]
    print("similarities:", similarities)
    

In [312]:
# Run the retrieval check for the test row at position 4.
evaluate(4)

start_position: 9
end_position: 25
query petide seq: TLTSGSDLDRCTTFDDV
answer petide seqs: ['TSGSDLDRCTTFDDV' 'TNIILDLD' 'LDLDQEVK']
similarities: [0.9375, 0.4, 0.4]
