#### Create Embeddings for CDR3 Sequences (with specified model)
##### Edit the input and output paths. The input path should point to the CSV that contains the sequences to embed, and the output path should point to the CSV that will hold the embeddings.

In [1]:
# Enable the autoreload extension so edits to imported project modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
cd ..

/home/romi/projects/cvc


#### Set Environment

In [3]:
# Standard-library and third-party imports used throughout the notebook.
import os, sys
import pandas as pd
# Project source directory, relative to the working directory set by the
# earlier `cd ..` cell.
SRC_DIR = "cvc"
# Fail early with a readable message when the notebook is run from the wrong directory.
assert os.path.isdir(SRC_DIR), f"Cannot find src dir: {SRC_DIR}"
# Put SRC_DIR on the import path; this must happen before the project imports below.
sys.path.append(SRC_DIR)

# NOTE(review): model_utils is not referenced in the visible cells -- confirm it is still needed.
from cvc import model_utils

# Model handles for CVC (TRANSFORMER) and scCVC (SC_TRANSFORMER), plus the
# device that is passed to EmbeddingWrapper further down.
from lab_notebooks.utils import SC_TRANSFORMER, TRANSFORMER, DEVICE
# Also expose the "models" sub-directory of SRC_DIR on the import path.
MODEL_DIR = os.path.join(SRC_DIR, "models")
sys.path.append(MODEL_DIR)

# NOTE(review): FILT_EDIT_DIST is not read anywhere in the visible cells -- confirm it is still needed.
FILT_EDIT_DIST = True

#### Specify Parameters

In [4]:
# Name of the input-CSV column holding the class label; it is copied next to
# the embeddings in the output table.
label_column = "MAIT_cell"
# CSV with the sequences to embed -- they must be in a column called "Sequences".
input_path = "./CDR3_data/MAIT_cell_data_embeddings_8_datasets.csv"
# CSV that will receive the embeddings (an .h5ad file is saved alongside it).
output_path = "./CDR3_data/MAIT_cell_data_embeddings_8_datasets_embeddings.csv"

In [5]:
# Select the model used to create the embeddings:
#   TRANSFORMER    -> CVC
#   SC_TRANSFORMER -> scCVC
TRANSFORMER_TO_USE = TRANSFORMER

#### Load Data

In [9]:
# Read the input CSV (its first column becomes the index), then discard
# duplicate rows and every row that contains a missing value.
tcrb_data = (
    pd.read_csv(input_path, index_col=0)
    .drop_duplicates()
    .dropna()
)

In [10]:
# Preview the de-duplicated, NaN-free input table.
tcrb_data

Unnamed: 0,Sequences,MAIT_cell
0,CAYRSVDSNYQLIW,MAIT_cell
1,CASSDREGEVSYNSPLHF|CASSFGGQPQHF,MAIT_cell
2,CASSPREDSTDTQYF,MAIT_cell
3,CASSTGTGDGYTF|CAVDMDSNYQLIW||,MAIT_cell
4,CASSYPGPTDTQYF|CAESLSDGQKLLF|CAVRDGDYKLSF,MAIT_cell
...,...,...
6777,CAFMTNAGGTSYGKLTF|CASSQGAYGYTF|CAVAVGVSGGGADGLTF,non-MAIT_cell
6778,CASSYNEQFF|CAVETGNQFYF,non-MAIT_cell
6779,CASCSGTGYDEQYF|CASTGTSGGPTLRDEQFF,non-MAIT_cell
6780,CASSVRSSMNTEAFF|CASSVVGGAGAYQETQYF|CASSVRSSMNT...,non-MAIT_cell


In [11]:
# Make sure the sequence column is called "Sequences"; edit the 'cdr3' key to
# match the input file. When the frame has no 'cdr3' column this is a no-op.
# Reassign rather than use inplace=True so re-running the cell never mutates
# an object that other names may still refer to.
tcrb_data = tcrb_data.rename(columns={'cdr3': 'Sequences'})

#### Create embeddings

In [12]:
from cvc.embbeding_wrapper import EmbeddingWrapper

# Embed the sequences in tcrb_data with the selected model on DEVICE.
embed_wrap = EmbeddingWrapper(
    TRANSFORMER_TO_USE,
    DEVICE,
    tcrb_data,
    batch_size=128,
    method="mean",
    layers=[-1],
    pbar=True,
    max_len=120,
)
# Display the shape of the resulting embedding matrix.
embed_wrap.embeddings.shape

Some weights of the model checkpoint at /home/romi/projects/cvc/output_dir_singlecell_v2 were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/53 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 180.00 MiB (GPU 0; 15.78 GiB total capacity; 1.55 GiB already allocated; 53.75 MiB free; 1.89 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [11]:
# Wrap the embedding matrix in a DataFrame and preview it.
tcrb_embeddings_df = pd.DataFrame(embed_wrap.embeddings)
tcrb_embeddings_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.149587,-0.197047,-0.606274,0.246161,0.146642,-0.208196,0.247288,-0.079729,0.198015,-0.245093,...,0.221753,0.178292,-0.638298,0.011018,-0.319213,0.780746,-0.412318,0.522235,0.514297,0.207887
1,-0.003837,0.004939,-0.305185,-0.467737,0.554620,0.526504,0.037872,-0.058776,0.078017,-0.378901,...,-0.394406,-0.488040,0.325271,-0.620283,-0.451306,-0.204477,-0.042363,0.488556,0.090419,0.102915
2,0.027054,0.238266,0.138490,-0.477724,0.421007,0.106373,-0.682291,-0.285417,0.483916,-0.346199,...,-0.040932,-0.713956,0.507948,-0.131854,0.230915,0.139384,-0.132491,0.416187,0.232041,-0.580679
3,-0.149578,0.100374,-0.122322,-0.478098,0.454484,0.019130,0.048994,0.235329,0.132402,-0.505863,...,-0.385887,-0.625642,0.282709,0.410364,0.008110,-0.087051,-0.256943,0.420582,-0.317372,0.073733
4,-0.540540,0.291595,-0.353379,-0.261123,0.426946,0.357447,-0.488219,-0.435053,0.388957,-0.267924,...,0.051595,-0.938021,0.328469,-0.103942,0.323352,0.735837,0.272375,0.473237,0.894415,-0.243902
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5011,-0.329775,0.793744,-1.056300,0.181084,0.043187,-0.084797,0.497528,0.468565,0.060517,0.038454,...,0.523287,-0.355889,-0.269269,-0.119622,-0.516974,0.407369,0.187545,0.344213,0.109828,0.269770
5012,-0.100326,0.304981,0.190182,-0.059698,0.107292,0.121101,-0.251405,-0.306332,0.125169,-0.237075,...,0.001577,-0.410296,-0.172579,-0.399773,-0.378466,0.207969,-0.128755,0.727024,0.299680,0.027128
5013,-0.056023,-0.094397,-0.433103,-0.062313,0.069507,-0.040174,0.057963,-0.075291,0.027573,-0.167322,...,-0.025749,-0.564122,-0.157909,0.064306,-0.047353,0.050179,-0.237840,0.765852,-0.496809,0.229118
5014,-0.422617,0.676036,-1.092209,0.034845,0.449366,0.100408,0.365332,0.275192,0.020950,0.030260,...,0.410969,0.098768,-0.000810,-0.096166,-0.774795,0.810294,-0.045002,0.009765,0.297485,0.518775


In [15]:
# Build the AnnData object for the embeddings.
tcrb_embeddings_adata = embed_wrap.create_anndata()

# Attach the sequence and label columns from the AnnData observations.
# NOTE: this binds a second name to the same DataFrame (no copy is made), so
# the columns added here also appear in tcrb_embeddings_df.
df_embeddings_with_label = tcrb_embeddings_df
for obs_column in ('Sequences', label_column):
    df_embeddings_with_label[obs_column] = list(tcrb_embeddings_adata.obs[obs_column])



In [16]:
# Preview the embeddings together with their sequence and label columns.
df_embeddings_with_label

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,760,761,762,763,764,765,766,767,Sequences,MAIT_cell
0,-0.149587,-0.197047,-0.606274,0.246161,0.146642,-0.208196,0.247288,-0.079729,0.198015,-0.245093,...,-0.638298,0.011018,-0.319213,0.780746,-0.412318,0.522235,0.514297,0.207887,CASSVAGLLYEQYF,MAIT_cell
1,-0.003837,0.004939,-0.305185,-0.467737,0.554620,0.526504,0.037872,-0.058776,0.078017,-0.378901,...,0.325271,-0.620283,-0.451306,-0.204477,-0.042363,0.488556,0.090419,0.102915,CASSHPPGADLGGQPQHF,MAIT_cell
2,0.027054,0.238266,0.138490,-0.477724,0.421007,0.106373,-0.682291,-0.285417,0.483916,-0.346199,...,0.507948,-0.131854,0.230915,0.139384,-0.132491,0.416187,0.232041,-0.580679,CAWSVPPVQGDRTQHF,MAIT_cell
3,-0.149578,0.100374,-0.122322,-0.478098,0.454484,0.019130,0.048994,0.235329,0.132402,-0.505863,...,0.282709,0.410364,0.008110,-0.087051,-0.256943,0.420582,-0.317372,0.073733,CSARDLDSLTNGYTF,MAIT_cell
4,-0.540540,0.291595,-0.353379,-0.261123,0.426946,0.357447,-0.488219,-0.435053,0.388957,-0.267924,...,0.328469,-0.103942,0.323352,0.735837,0.272375,0.473237,0.894415,-0.243902,CAWSGEPSQAQYF,MAIT_cell
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5011,-0.329775,0.793744,-1.056300,0.181084,0.043187,-0.084797,0.497528,0.468565,0.060517,0.038454,...,-0.269269,-0.119622,-0.516974,0.407369,0.187545,0.344213,0.109828,0.269770,CASTGENNSPLHF,non-MAIT_cell
5012,-0.100326,0.304981,0.190182,-0.059698,0.107292,0.121101,-0.251405,-0.306332,0.125169,-0.237075,...,-0.172579,-0.399773,-0.378466,0.207969,-0.128755,0.727024,0.299680,0.027128,CASSVDWSGPGNTGELFF,non-MAIT_cell
5013,-0.056023,-0.094397,-0.433103,-0.062313,0.069507,-0.040174,0.057963,-0.075291,0.027573,-0.167322,...,-0.157909,0.064306,-0.047353,0.050179,-0.237840,0.765852,-0.496809,0.229118,CSARALAGGTNEQFF,non-MAIT_cell
5014,-0.422617,0.676036,-1.092209,0.034845,0.449366,0.100408,0.365332,0.275192,0.020950,0.030260,...,-0.000810,-0.096166,-0.774795,0.810294,-0.045002,0.009765,0.297485,0.518775,CASSFQGGDQPQHF,non-MAIT_cell


In [17]:
# Write the embeddings table, including the "Sequences" and label columns, to CSV.
# The labelled frame is referenced by name so the export does not silently
# depend on tcrb_embeddings_df being an alias of it.
df_embeddings_with_label.to_csv(output_path, index=False)
# Save the AnnData object next to the CSV. splitext swaps only the final
# extension, so a path without a ".csv" suffix can no longer make the .h5ad
# file overwrite the CSV, and ".csv" inside a directory name is left alone.
tcrb_embeddings_adata.write_h5ad(os.path.splitext(output_path)[0] + ".h5ad")

... storing 'MAIT_cell' as categorical
