### ClusTCR clusters in Embedding Space
ClusTCR (https://svalkiers.github.io/clusTCR/) is used to create clusters for a set of CDR3 sequences. The embeddings of those sequences are then created with CVC and plotted, colored by their corresponding clusters.

In [1]:
cd ..

/home/romi/projects/preTCR


In [2]:
%load_ext autoreload
%autoreload 2

In [29]:
import os, sys
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

# ClusTCR
from clustcr import datasets, Clustering, read_cdr3, metarepertoire

SRC_DIR = "cvc"
assert os.path.isdir(SRC_DIR), f"Cannot find src dir: {SRC_DIR}"
sys.path.append(SRC_DIR)
from cvc import model_utils
from cvc import plot_utils
from lab_notebooks.utils import TRANSFORMER, DEVICE, DATA_DIR
MODEL_DIR = os.path.join(SRC_DIR, "models")
sys.path.append(MODEL_DIR)

FILT_EDIT_DIST = True
PLOT_DIR = os.path.join(os.path.dirname(SRC_DIR), "plots/CDR3_data_plots")
if not os.path.isdir(PLOT_DIR):
    os.makedirs(PLOT_DIR)
PLOT_DIR

'plots/CDR3_data_plots'

#### Prepare Data

In [4]:
data_dir = DATA_DIR + "db_data_nuc_vj_genes_pub_priv.csv"

In [5]:
# read data
tcrb_data = pd.read_csv(data_dir, usecols=["Sequences", "Private_Public_label"], engine="pyarrow")

# drop duplicates
tcrb_data.drop_duplicates(inplace=True)
len(tcrb_data)

91758697

#### Different Embedding Visualizations

In [6]:
training_data_dir = DATA_DIR + "db_5mil_training_data_2.5Mpub_2.5Mpriv.csv"

In [7]:
# remove sequences from tcrb_data that are in training_seqs
training_seqs = pd.read_csv(training_data_dir, usecols=["Sequences"])
tcrb_data_sample_from = tcrb_data[~tcrb_data['Sequences'].isin(training_seqs['Sequences'])]
len(tcrb_data_sample_from)

86758697

In [12]:
# drop duplicates
tcrb_data_sample_from.drop_duplicates(inplace=True)
len(tcrb_data_sample_from)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tcrb_data_sample_from.drop_duplicates(inplace=True)


86758697

In [32]:
# sample 1,000,000 rows
tcrb_data_sample = tcrb_data_sample_from.sample(n=1000000)

#### ClusTCR

In [33]:
cdr3 = tcrb_data_sample['Sequences']
# recommended to use method mcl for data sets containing < 50,000 CDR3 sequences, and two-step for all data sets with > 50,000 sequences. 
clustering = Clustering(method='two-step')
output = clustering.fit(cdr3)

Clustering 10000 TCRs using two-step approach.
Total time to run ClusTCR: 0.350s


In [34]:
# view output
a = output.summary()
# a[a.cluster_idx == 277]
a

Unnamed: 0,size,motif
0,2,CASRV[RT]GGIDTQYF
1,2,CASSLER[IK]SPLHF
2,2,CSVEEAGG[RI]YEQYF
3,2,CSARGGLAG[AG]QQFF
4,2,CASS[QM]LGPYGYTF
5,2,CASSYSIS[RV]GNTEAFF


In [None]:
# look at the sizes of the clusters
sns.histplot(
    data=output.clusters_df, x='cluster', bins=output.clusters_df.cluster.max())
len(output.clusters_df), output.clusters_df.groupby('cluster').size().max()

#### Embeedings on top 20 clusters (by size)
##### Does the embedding space preserve the distances between the CDR3 sequences used by clusTCR?

In [36]:
sorted_output = output.summary().sort_values(by='size', ascending=False)
top_20_sorted = sorted_output.head(20)

In [37]:
top_20_sorted['cluster_idx'] = top_20_sorted.index
# take the largest 20 clusters from output
largest_20_clusters = output.clusters_df[output.clusters_df.cluster.isin(top_20_sorted['cluster_idx'])]

In [38]:
# create and display embeddings #TODO
from lab_notebooks import embbeding_wrapper as EW
embedding_wrapper = EW.EmbeddingWrapper(TRANSFORMER, DEVICE, method="mean", layers=[-1])
tcrb_embeddings = embedding_wrapper(largest_20_clusters['junction_aa'])
tcrb_embeddings.shape
tcrb_embeddings_adata = embedding_wrapper.create_adata(tcrb_embeddings, largest_20_clusters)
tcrb_embeddings_adata.plot_embedding(tcrb_embeddings_adata, color_embed='cluster',
                                color_map=plt.get_cmap('tab20'),
                                title="Top 20 clusTCR Clusters in Embedding Space",
                                legend_size=3, PLOT_DIR=PLOT_DIR, plot_pdf_path="clusTCR_umap_2.5_mil_model.pdf")

AttributeError: 'numpy.float64' object has no attribute 'isspace'

In [39]:
largest_20_clusters['junction_aa']

0       CASRVRGGIDTQYF
1       CASRVTGGIDTQYF
2        CASSLERISPLHF
3        CASSLERKSPLHF
4       CSVEEAGGIYEQYF
5       CSVEEAGGRYEQYF
6       CSARGGLAGAQQFF
7       CSARGGLAGGQQFF
8        CASSMLGPYGYTF
9        CASSQLGPYGYTF
10    CASSYSISRGNTEAFF
11    CASSYSISVGNTEAFF
Name: junction_aa, dtype: object

In [None]:
from cvc.embbeding_wrapper import EmbeddingWrapper
largest_20_clusters.loc[:, 'Sequences'] = largest_20_clusters['junction_aa']

# Create embeddings
embed_wrap = EmbeddingWrapper(TRANSFORMER, DEVICE, largest_20_clusters, batch_size=1024, method="mean", layers=[-1])
embed_wrap.embeddings.shape

In [None]:
# Plot embeddings
embed_wrap.plot_embedding(
    color_embed='cluster',
    color_map=plt.get_cmap('tab20'),
    title="Top 20 clusTCR Clusters in Embedding Space",
    legend_size=3,
    plot_pdf_path=os.path.join(PLOT_DIR, "clusTCR_umap_2.5_mil_model.pdf"),
)