In [1]:
import json
import os
from copy import deepcopy
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn

In [2]:
%load_ext autoreload
%autoreload 2

In [103]:
from utils import Utils
from isolate_aa import IsolateAA



if not os.path.isdir('/home/yuan/results/epitope/seq_vector_1d'):
    os.mkdir('/home/yuan/results/epitope/seq_vector_1d')



### epitopes and non-epitopes

In [113]:
# collect all epitopes
json_dir = '/home/yuan/data/omics_data/epitope/mysql'
rec_iter = Utils.scan_json_record(json_dir)

n = 0
outfile = '/home/yuan/results/epitope/epitopes.txt'
with open(outfile, 'w') as f:
    for rec in rec_iter:
        for item in rec['epitopes'].values():
            f.write(item['seq']+ '\t' + '1' + '\n')
            n += 1
print(f"Number of epitopes: {n}")

Number of epitopes: 3552504


In [121]:
# build non-epitopes
json_dir = '/home/yuan/data/omics_data/epitope/mysql'
rec_iter = Utils.scan_json_record(json_dir)

n, m = 0, 0
outfile = '/home/yuan/results/epitope/others.txt'
with open(outfile, 'w') as f:
    for rec in rec_iter:
        try:
            slicer = IsolateAA(record)
            other_seq = slicer.random_seq()
            for seq in other_seq:
                f.write(seq + '\t' + '0' + '\n')
                n += 1
        except Exception as e:
            # print(e)
            # break
            m +=1
print(f"Number of random seq: {n}-{m}")

Number of random seq: 7589872-0


In [124]:
df1 = pd.read_csv('/home/yuan/results/epitope/epitopes.txt', sep='\t', header=None, index_col=None)
df2 = pd.read_csv('/home/yuan/results/epitope/others.txt', sep='\t', header=None, index_col=None)
df = pd.concat([df1, df2])
print(df1.shape, df2.shape, df.shape)

(3552504, 2) (7589872, 2) (11142376, 2)


In [141]:
# encode 
from constants import PROPERTY
from encode_aa import EncodeAA

encoder = EncodeAA()
columns = ['hydrophobicity_ph7', 'hydrophobicity', 'polarity', \
  'polarizability', 'ven_der_waals_volume',] + list(PROPERTY) + ['label']

outfile = '/home/yuan/results/epitope/epi_comp_vector.txt'
dfv = df.apply(lambda x: encoder.mean_comp(x[0], x[1]), axis=1, result_type='expand')
dfv.columns = columns
dfv.to_csv(outfile, header=True, index=False, sep='\t')

In [135]:
from collections import Counter
s = Counter('AATTGAC')
s['X']

0

## retrieve sequence segment

In [40]:
# slice epitopes
def kmer_expand(size):
    json_dir = '/home/yuan/data/omics_data/epitope/mysql'
    rec_iter = Utils.scan_json_record(json_dir)
    info = {i:0 for i in ['epitope', 'wrong_size',  'other', 'epi_err', 'other_err']}
    outfile = f'../data/seq/epitopes_{size}_kmer_expand.txt'
    with open(outfile, 'w') as f:
        for record in rec_iter:
            slicer = IsolateAA(record)
            try:
                epi_seq = slicer.slice_kmer_expand(size)
                for seq in epi_seq:
                    if len(seq) == size:
                        f.write(f"{seq}\t1\n")
                        info['epitope'] += 1
                    else:
                        info['wrong_size'] += 1
            except Exception as e:
                info['epi_err'] += 1
            try:
                other_seq = slicer.random_other_seq(size, len(epi_seq))
                for seq in other_seq:
                    if len(seq) == size:
                        f.write(f"{seq}\t0\n")
                        info['other'] += 1
                    else:
                        info['wrong_size'] += 1
            except Exception as e:
                info['other_err'] += 1
    print(f"Statistics of {size}: {info}")

for size in range(7, 21):
    kmer_expand(size)

Statistics of 7: {'epitope': 6743386, 'wrong_size': 0, 'other': 7481760, 'epi_err': 0, 'other_err': 1365}
Statistics of 8: {'epitope': 5827214, 'wrong_size': 0, 'other': 6239401, 'epi_err': 0, 'other_err': 1365}
Statistics of 9: {'epitope': 4941268, 'wrong_size': 2, 'other': 5080210, 'epi_err': 1, 'other_err': 1365}
Statistics of 10: {'epitope': 4430072, 'wrong_size': 18, 'other': 4449388, 'epi_err': 8, 'other_err': 1365}
Statistics of 11: {'epitope': 4033441, 'wrong_size': 18, 'other': 3988950, 'epi_err': 12, 'other_err': 1365}
Statistics of 12: {'epitope': 3659498, 'wrong_size': 21, 'other': 3613152, 'epi_err': 16, 'other_err': 1365}
Statistics of 13: {'epitope': 3302545, 'wrong_size': 22, 'other': 3299348, 'epi_err': 27, 'other_err': 1365}
Statistics of 14: {'epitope': 2936909, 'wrong_size': 22, 'other': 2983422, 'epi_err': 35, 'other_err': 1365}
Statistics of 15: {'epitope': 2592107, 'wrong_size': 23, 'other': 2661587, 'epi_err': 47, 'other_err': 1365}
Statistics of 16: {'epitope':

In [41]:
from utils import Utils
from isolate_aa import IsolateAA

def shrink_expand(size):
    json_dir = '/home/yuan/data/omics_data/epitope/mysql'
    rec_iter = Utils.scan_json_record(json_dir)
    info = {i:0 for i in ['epitope', 'other', 'epi_err', 'other_err', 'wrong_size']}
    outfile = f'../data/seq/epitopes_{size}_shrink_expand.txt'
    with open(outfile, 'w') as f:
        for record in rec_iter:
            slicer = IsolateAA(record)
            try:
                epi_seq = slicer.slice_shrink_expand(size)
                for seq in epi_seq:
                    if len(seq) == size:
                        f.write(f"{seq}\t1\n")
                        info['epitope'] += 1
                    else:
                        info['wrong_size'] += 1
            except Exception as e:
                info['epi_err'] += 1
            try:
                other_seq = slicer.random_other_seq(size, len(epi_seq))
                for seq in other_seq:
                    if len(seq) == size:
                        f.write(f"{seq}\t0\n")
                        info['other'] += 1
                    else:
                        info['wrong_size'] += 1
            except Exception as e:
                info['other_err'] += 1
    print(f"Statistics of {size}: {info}")

for size in range(7, 21):
    shrink_expand(size)

Statistics of 7: {'epitope': 1721421, 'other': 2640675, 'epi_err': 5131, 'other_err': 1365, 'wrong_size': 12}
Statistics of 8: {'epitope': 1845888, 'other': 2607655, 'epi_err': 2058, 'other_err': 1365, 'wrong_size': 15}
Statistics of 9: {'epitope': 1888182, 'other': 2544836, 'epi_err': 1260, 'other_err': 1365, 'wrong_size': 10}
Statistics of 10: {'epitope': 1902931, 'other': 2474149, 'epi_err': 788, 'other_err': 1365, 'wrong_size': 24}
Statistics of 11: {'epitope': 1922760, 'other': 2428598, 'epi_err': 507, 'other_err': 1365, 'wrong_size': 22}
Statistics of 12: {'epitope': 1942434, 'other': 2376708, 'epi_err': 298, 'other_err': 1365, 'wrong_size': 25}
Statistics of 13: {'epitope': 1956754, 'other': 2331685, 'epi_err': 107, 'other_err': 1365, 'wrong_size': 24}
Statistics of 14: {'epitope': 1951227, 'other': 2277030, 'epi_err': 93, 'other_err': 1365, 'wrong_size': 23}
Statistics of 15: {'epitope': 1952874, 'other': 2236675, 'epi_err': 83, 'other_err': 1365, 'wrong_size': 24}
Statistics o

In [108]:
# encode 
encoder = EncodeAA()

outdir = '/home/yuan/results/epitope/seq_vector_1d'
indir = '/home/yuan/results/epitope/seq'
df_iter = Utils.scan_text(indir, '\t')
for df, file_name in df_iter:
    df = df.dropna()
    dfv = df.apply(lambda x: encoder.vector_1d(x[0], x[1]), axis=1, result_type='expand')
    outfile = os.path.join(outdir, file_name)
    dfv.to_csv(outfile, header=False, index=False, sep='\t')
    print(outfile)

/home/yuan/results/epitope/seq_vector_1d/epitopes_16_kmer_expand.txt
/home/yuan/results/epitope/seq_vector_1d/epitopes_13_kmer_expand.txt
/home/yuan/results/epitope/seq_vector_1d/epitopes_9_shrink_expand.txt
/home/yuan/results/epitope/seq_vector_1d/epitopes_15_shrink_expand.txt
/home/yuan/results/epitope/seq_vector_1d/epitopes_18_shrink_expand.txt
/home/yuan/results/epitope/seq_vector_1d/epitopes_19_shrink_expand.txt
/home/yuan/results/epitope/seq_vector_1d/epitopes_11_kmer_expand.txt
/home/yuan/results/epitope/seq_vector_1d/epitopes_12_shrink_expand.txt
/home/yuan/results/epitope/seq_vector_1d/epitopes_17_kmer_expand.txt
/home/yuan/results/epitope/seq_vector_1d/epitopes_15_kmer_expand.txt
/home/yuan/results/epitope/seq_vector_1d/epitopes_13_shrink_expand.txt
/home/yuan/results/epitope/seq_vector_1d/epitopes_14_kmer_expand.txt
/home/yuan/results/epitope/seq_vector_1d/epitopes_20_shrink_expand.txt
/home/yuan/results/epitope/seq_vector_1d/epitopes_12_kmer_expand.txt
/home/yuan/results/ep

In [105]:
df.shape

(4311092, 2)

In [106]:
dfv.shape

(4311092, 101)