In [1]:
import json
import os
from copy import deepcopy
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn

In [2]:
%load_ext autoreload
%autoreload 2

In [208]:
from utils import Utils
from isolate_aa import IsolateAA



if not os.path.isdir('/home/yuan/results/epitope/seq_vector_1d'):
    os.mkdir('/home/yuan/results/epitope/seq_vector_1d')

outdir = '/home/yuan/results/epitope'

seq_dir = os.path.join(outdir, 'seq')
if not os.path.isdir(seq_dir):
    os.mkdir(seq_dir)


json_dir = '/home/yuan/data/omics_data/epitope/mysql'

sizes = [7, 9, 10, 12, 15, 20, 30, 50]
info = {i:0 for i in ['wrong_size',  'seq', 'err']}

## random sequences

In [209]:
# collect wrong sequence by random sequences
from constants import PROPERTY
aa = list(PROPERTY)

for size in sizes:
    outfile = os.path.join(seq_dir, f'random_{size}.txt')
    print(outfile)
    with open(outfile, 'w') as f:
        for _ in range(4_000_000):
            seq = np.random.choice(aa, size)
            seq = ''.join(seq)
            f.write(seq + '\t' + 'random\n')

/home/yuan/results/epitope/seq/random_7.txt
/home/yuan/results/epitope/seq/random_9.txt
/home/yuan/results/epitope/seq/random_10.txt
/home/yuan/results/epitope/seq/random_12.txt
/home/yuan/results/epitope/seq/random_15.txt
/home/yuan/results/epitope/seq/random_20.txt
/home/yuan/results/epitope/seq/random_30.txt
/home/yuan/results/epitope/seq/random_50.txt


In [216]:
# collect wrong sequence by random sequences
from constants import PROPERTY
aa = list(PROPERTY)

key = 'random'
n, m, pool = 0, 0, {}
outfile = f'/home/yuan/results/epitope/{key}.txt'
with open(outfile, 'w') as f:
    seq_iter = Utils.scan_json_seq(json_dir, 'epitopes')
    for seq in seq_iter:
        if len(seq) >= 6 and seq not in pool:
            _seq = np.random.choice(aa, len(seq))
            _seq = ''.join(_seq)
            f.write(_seq + '\t' + key + '\n')
            pool[seq] = 1
            n += 1
        else:
            m += 1
print(f"Number of random sequence: {n}-{m}")

Number of random sequence: 1863999-1688505


### epitopes

In [213]:
# collect all epitopes
key = 'epitope'
n, m = 0, 0
pool = {}
outfile = f'/home/yuan/results/epitope/{key}.txt'
with open(outfile, 'w') as f:
    seq_iter = Utils.scan_json_seq(json_dir, 'epitopes')
    for seq in seq_iter:
        if seq not in pool:
            f.write(seq + '\t' + key + '\n')
            pool[seq] = 1
            n += 1
        else:
            pool[seq] += 1
            m += 1
print(f"Number of unique epitopes: {n}")
print(f"Number of duplicated epitopes: {m}")

Number of unique epitopes: 1865242
Number of duplicated epitopes: 1687262


### shuffled non-epitopes

In [214]:
# collect wrong sequence by shuffling epitopes
key = 'shuffle'
n , m, pool = 0, 0, {}
outfile = f'/home/yuan/results/epitope/{key}.txt'
with open(outfile, 'w') as f:
    seq_iter = Utils.scan_json_seq(json_dir, 'epitopes')
    for seq in seq_iter:
        if len(seq) >= 6 and seq not in pool:
            _seq = list(seq)
            np.random.shuffle(_seq)
            _seq = ''.join(_seq)
            f.write(_seq + '\t' + key + '\n')
            pool[seq] = 1
            n += 1
        else:
            m += 1
print(f"Number of shuffled epitopes: {n}")
print(f"Number of skipped epitopes: {m}")

Number of shuffled epitopes: 1863999
Number of skipped epitopes: 1688505


### non-epitopes: other sequences

In [241]:
# build non-epitopes from antigens
num, key = 2, f'other'
n, m, pool = 0, 0, {}
outfile = f'/home/yuan/results/epitope/{key}.txt'
with open(outfile, 'w') as f:
    rec_iter = Utils.scan_json_record(json_dir)
    for acc, record in rec_iter:
        slicer = IsolateAA(record)
        try:
            seqs = slicer.random_size_seq(num=num)
            for seq in seqs:
                if seq not in pool:
                    f.write(seq + '\t' + key + '\n')
                    pool[seq] = 1
                    n += 1
                else:
                    m += 1
        except Exception as e:
            m +=1
        if n % 10_000 == 0:
            print(n, end=',')
print(f"Number of random seq: {n}. failed: {m}")

790000,1150000,1150000,1590000,1600000,1830000,1830000,2340000,2400000,2680000,2990000,Number of random seq: 3185991. failed: 271864


In [232]:

a= np.array([0,1,0,0,0,0,1,1,0,0])
sum(a[3:6])

0

###

In [124]:
df1 = pd.read_csv('/home/yuan/results/epitope/epitopes.txt', sep='\t', header=None, index_col=None)
df2 = pd.read_csv('/home/yuan/results/epitope/others.txt', sep='\t', header=None, index_col=None)
df = pd.concat([df1, df2])
print(df1.shape, df2.shape, df.shape)

(3552504, 2) (7589872, 2) (11142376, 2)


In [141]:
# encode 
from constants import PROPERTY
from encode_aa import EncodeAA

encoder = EncodeAA()
columns = ['hydrophobicity_ph7', 'hydrophobicity', 'polarity', \
  'polarizability', 'ven_der_waals_volume',] + list(PROPERTY) + ['label']

outfile = '/home/yuan/results/epitope/epi_comp_vector.txt'
dfv = df.apply(lambda x: encoder.mean_comp(x[0], x[1]), axis=1, result_type='expand')
dfv.columns = columns
dfv.to_csv(outfile, header=True, index=False, sep='\t')

In [135]:
from collections import Counter
s = Counter('AATTGAC')
s['X']

0

## retrieve sequence segment

In [207]:
# slice epitopes
def kmer_expand(size):
    outfile = os.path.join(seq_dir, f'epitope_{size}_kmer_expand.txt')
    print(outfile)
    with open(outfile, 'w') as f:
        rec_iter = Utils.scan_json_record(json_dir)
        for acc, record in rec_iter:
            slicer = IsolateAA(record)
            try:
                epi_seq = slicer.slice_kmer_expand(size)
                for seq in epi_seq:
                    if len(seq) == size:
                        f.write(seq + '\t' + 'epitope\n')
                        info['seq'] += 1
                    else:
                        info['wrong_size'] += 1
            except Exception as e:
                info['err'] += 1
    print(f"Statistics of {size}: {info}")

for size in sizes:
    kmer_expand(size)

/home/yuan/results/epitope/seq/epitope_7_kmer_expand.txt
Statistics of 7: {'wrong_size': 629, 'seq': 139044510, 'err': 36049}
/home/yuan/results/epitope/seq/epitope_9_kmer_expand.txt
Statistics of 9: {'wrong_size': 631, 'seq': 143985778, 'err': 36050}
/home/yuan/results/epitope/seq/epitope_10_kmer_expand.txt
Statistics of 10: {'wrong_size': 649, 'seq': 148415850, 'err': 36058}
/home/yuan/results/epitope/seq/epitope_12_kmer_expand.txt
Statistics of 12: {'wrong_size': 670, 'seq': 152075348, 'err': 36074}
/home/yuan/results/epitope/seq/epitope_15_kmer_expand.txt
Statistics of 15: {'wrong_size': 693, 'seq': 154667455, 'err': 36121}
/home/yuan/results/epitope/seq/epitope_20_kmer_expand.txt
Statistics of 20: {'wrong_size': 718, 'seq': 156790158, 'err': 36312}
/home/yuan/results/epitope/seq/epitope_30_kmer_expand.txt
Statistics of 30: {'wrong_size': 743, 'seq': 158736609, 'err': 36845}
/home/yuan/results/epitope/seq/epitope_50_kmer_expand.txt
Statistics of 50: {'wrong_size': 768, 'seq': 16059

In [206]:
def shrink_expand(size):
    outfile = os.path.join(seq_dir, f'epitope_{size}_shrink_expand.txt')
    print(outfile)
    with open(outfile, 'w') as f:
        rec_iter = Utils.scan_json_record(json_dir)
        for acc, record in rec_iter:
            slicer = IsolateAA(record)
            try:
                epi_seq = slicer.slice_shrink_expand(size)
                for seq in epi_seq:
                    if len(seq) == size:
                        f.write(seq + '\t' + 'epitope\n')
                        info['seq'] += 1
                    else:
                        info['wrong_size'] += 1
            except Exception as e:
                info['err'] += 1
    print(f"Statistics of {size}: {info}")

for size in sizes:
    shrink_expand(size)

/home/yuan/results/epitope/seq/epitope_7_shrink_expand.txt
Statistics of 7: {'wrong_size': 471, 'seq': 118909918, 'err': 31132}
/home/yuan/results/epitope/seq/epitope_9_shrink_expand.txt
Statistics of 9: {'wrong_size': 481, 'seq': 120798100, 'err': 32392}
/home/yuan/results/epitope/seq/epitope_10_shrink_expand.txt
Statistics of 10: {'wrong_size': 505, 'seq': 122701031, 'err': 33180}
/home/yuan/results/epitope/seq/epitope_12_shrink_expand.txt
Statistics of 12: {'wrong_size': 530, 'seq': 124643465, 'err': 33478}
/home/yuan/results/epitope/seq/epitope_15_shrink_expand.txt
Statistics of 15: {'wrong_size': 554, 'seq': 126596339, 'err': 33561}
/home/yuan/results/epitope/seq/epitope_20_shrink_expand.txt
Statistics of 20: {'wrong_size': 579, 'seq': 128536009, 'err': 33770}
/home/yuan/results/epitope/seq/epitope_30_shrink_expand.txt
Statistics of 30: {'wrong_size': 604, 'seq': 130448651, 'err': 34319}
/home/yuan/results/epitope/seq/epitope_50_shrink_expand.txt
Statistics of 50: {'wrong_size': 6

In [205]:
# random other sequences
def random_other(size):
    outfile = os.path.join(seq_dir, f'other_{size}.txt')
    print(outfile)
    with open(outfile, 'w') as f:
        rec_iter = Utils.scan_json_record(json_dir)
        for acc, record in rec_iter:
            slicer = IsolateAA(record)
            num_epi = slicer.num_epitopes()
            try:
                other_seq = slicer.random_other_seq(size, num_epi)
                for seq in other_seq:
                    if len(seq) == size:
                        f.write(seq + '\t' + 'other\n')
                        info['seq'] += 1
                    else:
                        info['wrong_size'] += 1
            except Exception as e:
                info['err'] += 1
    print(f"Statistics of {size}: {info}")

for size in sizes:
    random_other(size)

Statistics of 7: {'wrong_size': 459, 'seq': 97202567, 'err': 16446}
Statistics of 9: {'wrong_size': 459, 'seq': 100486468, 'err': 17811}
Statistics of 10: {'wrong_size': 459, 'seq': 103689215, 'err': 19176}
Statistics of 12: {'wrong_size': 459, 'seq': 106763250, 'err': 20541}
Statistics of 15: {'wrong_size': 459, 'seq': 109672910, 'err': 21906}
Statistics of 20: {'wrong_size': 459, 'seq': 112404864, 'err': 23271}
Statistics of 30: {'wrong_size': 459, 'seq': 114918908, 'err': 24636}
Statistics of 50: {'wrong_size': 459, 'seq': 117188497, 'err': 26001}


## encode

In [108]:
# encode 
encoder = EncodeAA()

outdir = '/home/yuan/results/epitope/seq_vector_1d'
indir = '/home/yuan/results/epitope/seq'
df_iter = Utils.scan_text(indir, '\t')
for df, file_name in df_iter:
    df = df.dropna()
    dfv = df.apply(lambda x: encoder.vector_1d(x[0], x[1]), axis=1, result_type='expand')
    outfile = os.path.join(outdir, file_name)
    dfv.to_csv(outfile, header=False, index=False, sep='\t')
    print(outfile)

/home/yuan/results/epitope/seq_vector_1d/epitopes_16_kmer_expand.txt
/home/yuan/results/epitope/seq_vector_1d/epitopes_13_kmer_expand.txt
/home/yuan/results/epitope/seq_vector_1d/epitopes_9_shrink_expand.txt
/home/yuan/results/epitope/seq_vector_1d/epitopes_15_shrink_expand.txt
/home/yuan/results/epitope/seq_vector_1d/epitopes_18_shrink_expand.txt
/home/yuan/results/epitope/seq_vector_1d/epitopes_19_shrink_expand.txt
/home/yuan/results/epitope/seq_vector_1d/epitopes_11_kmer_expand.txt
/home/yuan/results/epitope/seq_vector_1d/epitopes_12_shrink_expand.txt
/home/yuan/results/epitope/seq_vector_1d/epitopes_17_kmer_expand.txt
/home/yuan/results/epitope/seq_vector_1d/epitopes_15_kmer_expand.txt
/home/yuan/results/epitope/seq_vector_1d/epitopes_13_shrink_expand.txt
/home/yuan/results/epitope/seq_vector_1d/epitopes_14_kmer_expand.txt
/home/yuan/results/epitope/seq_vector_1d/epitopes_20_shrink_expand.txt
/home/yuan/results/epitope/seq_vector_1d/epitopes_12_kmer_expand.txt
/home/yuan/results/ep

In [105]:
df.shape

(4311092, 2)

In [106]:
dfv.shape

(4311092, 101)