#### Visualization of Random Sequences in Embedding Space

In [1]:
cd ..

/home/romi/projects/preTCR


In [2]:
%load_ext autoreload
%autoreload 2

In [4]:
import os, sys
import collections
import pandas as pd
import matplotlib

# Project source directory, given relative to the current working directory
# (the first cell of this notebook runs `cd ..` before anything else).
SRC_DIR = "cvc"
assert os.path.isdir(SRC_DIR), f"Cannot find src dir: {SRC_DIR}"
sys.path.append(SRC_DIR)
from cvc import model_utils
from lab_notebooks.utils import TRANSFORMER, DEVICE, DATA_DIR
MODEL_DIR = os.path.join(SRC_DIR, "models")
sys.path.append(MODEL_DIR)

FILT_EDIT_DIST = True

# os.path.dirname("cvc") is "", so PLOT_DIR resolves to
# "plots/CDR3_data_plots" relative to the current working directory.
PLOT_DIR = os.path.join(os.path.dirname(SRC_DIR), "plots/CDR3_data_plots")
# exist_ok=True creates the directory (and parents) only when missing,
# without a separate check-then-create step.
os.makedirs(PLOT_DIR, exist_ok=True)
PLOT_DIR



'plots/CDR3_data_plots'

#### Prepare Data

In [8]:
# Path of the appearances CSV (despite its name, `data_dir` holds a file path;
# the name is kept because the loading cell reads it).
# os.path.join adds the path separator only when DATA_DIR does not already end
# with one, so the path is correct whether or not DATA_DIR has a trailing "/".
data_dir = os.path.join(DATA_DIR, "db_data_appearances.csv")

In [None]:
# Load the appearances CSV using the pyarrow parsing engine.
tcrb_data = pd.read_csv(data_dir, engine="pyarrow")

# Drop rows that are exact duplicates across all columns. The original frame
# is left untouched; the de-duplicated result is what later cells sample from.
tcrb_data_sample_from = tcrb_data.drop_duplicates()
tcrb_data_sample_from.head()

In [10]:
# Number of rows remaining after de-duplication.
len(tcrb_data_sample_from)

91758697

#### Analyze Appearances/Abundance

In [13]:
# Add a column with the length of each sequence.
# The vectorized .str.len() accessor gives the same lengths as .apply(len) for
# strings, but propagates missing values as NaN instead of raising TypeError.
tcrb_data_sample_from['Length'] = tcrb_data_sample_from['Sequences'].str.len()

In [14]:
# Number of sequences per Private_Public_label value
# (the subsampling cell treats label 1 as public and label 0 as private).
collections.Counter(tcrb_data_sample_from['Private_Public_label'])

Counter({1: 17562246, 0: 74196451})

In [16]:
# Bin sequences by publicness, i.e. by their number of appearances.
# Edges: 0, 1, 10, 25, 50, then every 50 up to 1000, then +inf.
# The final edge is infinite so that the '1000+' bin really is open-ended;
# with a finite upper edge of 2000, any Appearances value above 2000 would
# silently fall outside all bins and become NaN.
# pd.cut uses right-closed intervals by default, so e.g. '0-1' is (0, 1] and
# '1-10' is (1, 10].
publicness_edges = [0, 1, 10, 25, 50] + list(range(100, 1001, 50)) + [float("inf")]
publicness_labels = [
    f"{low}-{high}"
    for low, high in zip(publicness_edges[:-2], publicness_edges[1:-1])
] + [f"{publicness_edges[-2]}+"]
# pd.cut requires exactly one label per interval.
assert len(publicness_labels) == len(publicness_edges) - 1

tcrb_data_sample_from['publicness_bin'] = pd.cut(
    tcrb_data_sample_from.Appearances,
    bins=publicness_edges,
    labels=publicness_labels,
)
tcrb_data_sample_from.publicness_bin.value_counts()

In [None]:
# Count how many rows share each exact Appearances value (no binning);
# intended for drawing the appearance histogram.
stats = collections.Counter(tcrb_data_sample_from['Appearances']) # to draw appearance histogram (no bins)
stats

In [39]:
from collections import OrderedDict

# Same (appearances, count) pairs as `stats`, iterated in the opposite order.
# Note: this reverses the Counter's insertion order (order of first occurrence
# in the data); it does not sort by key or by count.
stats_pairs = list(stats.items())
reversed_stats = OrderedDict(stats_pairs[::-1])

#### Subsample Data

In [None]:
# Subsample the de-duplicated data with a fixed public/private ratio.
# (An earlier variant drew a plain 500,000-row sample instead:
#  tcrb_data_sample_from.sample(n=500000).)
#
# For db data we draw 15% public and 85% private sequences so the label
# distribution is roughly equal to previous samplings and plots are comparable.
TOTAL_COUNT = 1000000
PUBLIC_COUNT = int(.15 * TOTAL_COUNT)
PRIVATE_COUNT = int(.85 * TOTAL_COUNT)

# Label 1 = public, label 0 = private; both draws use a fixed random_state
# so the subsample is reproducible.
public_sample = (
    tcrb_data_sample_from
    .loc[lambda frame: frame['Private_Public_label'] == 1]
    .sample(n=PUBLIC_COUNT, random_state=10)
)
private_sample = (
    tcrb_data_sample_from
    .loc[lambda frame: frame['Private_Public_label'] == 0]
    .sample(n=PRIVATE_COUNT, random_state=10)
)

# Stack the public rows on top of the private rows.
tcrb_data_sample = pd.concat([public_sample, private_sample])

#### Generate 10,000 random sequences

In [None]:
# Generate RANDOM_SEQ_NUM random sequences of length 11-20 (inclusive), with
# every residue drawn uniformly from `amino_acids`.
from random import Random

# NOTE(review): this alphabet has 21 letters because it includes 'U'
# (selenocysteine) on top of the 20 standard amino acids - confirm that the
# model's tokenizer is expected to see 'U'.
amino_acids = ['R','H','K','D','E','S','T','N','Q','C','U','G','P','A','V','I','L','M','F','Y','W']
RANDOM_SEQ_NUM = 10000
RANDOM_SEQ_MIN_LEN = 11
RANDOM_SEQ_MAX_LEN = 20
# Fixed seed so a fresh "Restart & Run All" yields the same sequences
# (same value as the random_state used when subsampling the real data).
RANDOM_SEQ_SEED = 10


def generate_random_sequences(count, min_len, max_len, alphabet, seed=None):
    """Return `count` random strings built from `alphabet`.

    Each string's length is drawn uniformly from [min_len, max_len]
    (both inclusive) and each character is drawn uniformly, with
    replacement, from `alphabet`.

    Args:
        count: number of sequences to generate.
        min_len: smallest allowed sequence length.
        max_len: largest allowed sequence length.
        alphabet: sequence of single-character strings to draw from.
        seed: seed for a private random generator; the same seed always
            produces the same list. None seeds from system entropy.

    Returns:
        A list of `count` strings.

    Raises:
        ValueError: if min_len > max_len.
    """
    if min_len > max_len:
        raise ValueError(f"min_len ({min_len}) must not exceed max_len ({max_len})")
    # A private generator keeps the global `random` state untouched.
    rng = Random(seed)
    return [
        "".join(rng.choices(alphabet, k=rng.randint(min_len, max_len)))
        for _ in range(count)
    ]


seq_list = generate_random_sequences(
    RANDOM_SEQ_NUM,
    RANDOM_SEQ_MIN_LEN,
    RANDOM_SEQ_MAX_LEN,
    amino_acids,
    seed=RANDOM_SEQ_SEED,
)
# Guard against off-by-one errors in the generated lengths.
assert all(RANDOM_SEQ_MIN_LEN <= len(seq) <= RANDOM_SEQ_MAX_LEN for seq in seq_list)

# Create a dataframe from the sequence list. Random sequences get their own
# label (2, next to 0 = private and 1 = public) and zero appearances; the
# scalar values are broadcast to every row.
data_for_df = {'Sequences': seq_list, 'Private_Public_label': 2, 'Appearances':0}
random_seq_df = pd.DataFrame(data_for_df)

In [None]:
# Append the random sequences to the sampled public/private sequences.
# Rows already labelled 2 (the label reserved for random sequences) are
# dropped first, so re-running this cell does not append the random
# sequences a second time.
# ignore_index=True renumbers the rows 0..N-1; without it the random rows
# (indexed from 0) would duplicate index labels already present in the sample.
tcrb_data_sample_concatinated = pd.concat(
    [
        tcrb_data_sample[tcrb_data_sample['Private_Public_label'] != 2],
        random_seq_df,
    ],
    ignore_index=True,
)
tcrb_data_sample = tcrb_data_sample_concatinated
tcrb_data_sample

#### Create and Display embeddings

In [None]:
from cvc.embbeding_wrapper import EmbeddingWrapper

# Create embeddings for every row of tcrb_data_sample (real + random sequences).
# NOTE(review): method="mean" with layers=[-1] presumably mean-pools the last
# transformer layer - confirm against EmbeddingWrapper.
embed_wrap = EmbeddingWrapper(
    TRANSFORMER,
    DEVICE,
    tcrb_data_sample,
    batch_size=1024,
    method="mean",
    layers=[-1],
)
embed_wrap.embeddings.shape

In [None]:
# Create the anndata object from the embedding wrapper; it is the object
# passed to plot_embedding as `anndata=` in the plotting cell.
tcrb_embeddings_adata = embed_wrap.create_anndata()

In [None]:
# Plot embeddings, coloured by Private_Public_label.
# The palette is defined as a variable first: referring to `colors` inside the
# call would raise NameError, because a keyword argument of the call is not a
# name in scope for the other arguments.
# NOTE(review): presumably the three colours map in order to labels
# 0 (private), 1 (public) and 2 (random) - confirm against plot_embedding.
label_colors = ['gold', 'darkblue', 'maroon']
embed_wrap.plot_embedding(
    anndata=tcrb_embeddings_adata,
    color_embed='Private_Public_label',
    colors=label_colors,
    color_map=matplotlib.colors.ListedColormap(label_colors),
    title="UMAP of Embeddings with 10,000 Randomly Generated Sequences",
    legend_size=3,
    plot_pdf_path=os.path.join(PLOT_DIR, "cr_model_db_data_umap_10k_rand.pdf")
)