#### Create Embeddings for CDR3 Sequences (with specified model)
##### Edit the input and output paths. The input path should point to the CSV that contains the sequences to embed, and the output path should point to the CSV that will hold the embeddings.

In [1]:
# Enable the autoreload extension; mode 2 reloads imported modules before
# each cell runs, so edits to the project source are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
cd ..

/home/dsi/rgoldner/projects/cvc


#### Set Environment

In [3]:
import os
import sys

import pandas as pd

# Project source directory, relative to the current working directory.
SRC_DIR = "cvc"
assert os.path.isdir(SRC_DIR), f"Cannot find src dir: {SRC_DIR}"
# Guarded so that re-running this cell does not pile up duplicate sys.path entries.
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

from cvc import model_utils

from lab_notebooks.utils import SC_TRANSFORMER, TRANSFORMER, DEVICE

# Directory with the model files; also made importable (same duplicate guard).
MODEL_DIR = os.path.join(SRC_DIR, "models")
if MODEL_DIR not in sys.path:
    sys.path.append(MODEL_DIR)

# NOTE(review): not referenced by any cell visible in this section -- confirm
# whether this flag is still needed.
FILT_EDIT_DIST = True

#### Specify Parameters

In [7]:
# --- User-editable parameters ---
# CSV holding the sequences to embed. Downstream cells expect the sequences in
# a column called "Sequences" (the rename cell below maps 'amino_acid' to it).
input_path = "/home/dsi/rgoldner/CDR3_data/thank_you_romi.csv"
# CSV that will receive the embeddings.
output_path = "/home/dsi/rgoldner/CDR3_data/thank_you_romi_embeddings.csv"
# Name of the label column that the labelling cell looks up in the AnnData
# `obs` table.
label_column = ""

In [8]:
# Choose which model produces the embeddings (both names are imported from
# lab_notebooks.utils in the environment cell):
#   CVC   -> TRANSFORMER
#   scCVC -> SC_TRANSFORMER
TRANSFORMER_TO_USE = TRANSFORMER

#### Load Data

In [9]:
# Read the input CSV (its first column becomes the index), then remove
# duplicated rows and rows containing NaN in a single chained expression.
# NOTE(review): drop_duplicates/dropna only inspect column values, never the
# index. If the sequences end up in the index (the reset_index cell below
# suggests they might), confirm duplicates/NaNs are really being filtered.
tcrb_data = (
    pd.read_csv(input_path, index_col=0)
    .drop_duplicates()
    .dropna()
)

INFO:numexpr.utils:Note: detected 128 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
INFO:numexpr.utils:Note: NumExpr detected 128 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.


In [15]:
# Wrap tcrb_data in a DataFrame so the following cells can rely on the
# DataFrame API; the "Sequences" column name itself is set by the rename cell.
tcrb_data = pd.DataFrame(tcrb_data)

In [17]:
# Move the current index into a regular column and replace it with a default
# integer index.
# NOTE: re-running this cell moves the new integer index into yet another
# column, so run it only once after loading.
tcrb_data = tcrb_data.reset_index()

In [19]:
# Standardise the sequence column name: 'amino_acid' becomes 'Sequences'.
# A frame without an 'amino_acid' column passes through unchanged.
tcrb_data = tcrb_data.rename(columns={'amino_acid': 'Sequences'})

In [20]:
# Display the prepared table to check the "Sequences" column before embedding.
tcrb_data

Unnamed: 0,Sequences
0,CASGDYGYTF
1,CASGGNYGYTF
2,CASGRETQYF
3,CASGSGNTIYF
4,CASGSTDTQYF
...,...
9995,CASSPTAYEQYF
9996,CASSLGQGANTEAFF
9997,CASSLVQGNYGYTF
9998,CASSQGLYEQYF


#### Create embeddings

In [21]:
from cvc.embbeding_wrapper import EmbeddingWrapper

# Run the selected model over tcrb_data on DEVICE. The keyword settings are
# passed straight to EmbeddingWrapper -- see its definition for their exact
# meaning (presumably mean-pooling over the last layer; TODO confirm).
embed_wrap = EmbeddingWrapper(
    TRANSFORMER_TO_USE,
    DEVICE,
    tcrb_data,
    batch_size=128,
    method="mean",
    layers=[-1],
    pbar=True,
    max_len=120,
)
# Sanity check on the result size; the recorded run shows (10000, 768).
embed_wrap.embeddings.shape

INFO:matplotlib.font_manager:Failed to extract font properties from /usr/share/fonts/google-noto-emoji/NotoColorEmoji.ttf: In FT2Font: Could not set the fontsize (error code 0x17)


  0%|          | 0/79 [00:00<?, ?it/s]

(10000, 768)

In [22]:
# Put the embedding matrix into a DataFrame (default integer row and column
# labels) and display it.
tcrb_embeddings_df = pd.DataFrame(embed_wrap.embeddings)
tcrb_embeddings_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.606887,0.777590,-1.375953,0.248092,0.477837,0.173244,0.154224,0.706639,0.127992,-0.238264,...,0.746878,0.441525,-0.368605,1.168895,0.169562,0.919588,-0.199778,-0.095737,0.614277,0.891439
1,-0.493638,0.866038,-1.423497,0.224215,0.244290,-0.007401,0.320357,0.787626,0.196858,-0.284240,...,1.134784,0.433451,-0.380624,1.065082,0.148533,1.014662,-0.130221,-0.327940,0.405078,0.890063
2,-0.732013,1.017734,-1.346555,0.011262,0.434303,0.013914,0.089565,0.230926,0.060034,-0.165503,...,1.108108,0.215105,-0.388929,1.007705,-0.024779,1.004402,0.359819,-0.177151,0.863491,0.369633
3,-0.631234,1.022021,-1.256504,-0.033699,0.380276,0.108630,0.372201,0.948205,-0.115172,0.013807,...,1.248619,0.218845,-0.469651,0.957724,-0.101322,1.216887,0.300408,-0.123666,0.496636,0.515614
4,-0.795483,1.284921,-1.299754,0.020509,0.419061,0.398061,0.266195,0.681700,0.089774,-0.049373,...,1.407869,0.321387,-0.401684,1.318620,-0.248883,1.082043,0.633624,-0.301196,0.802184,0.687073
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,-0.695072,1.037451,-1.023804,0.535071,0.411828,-0.132538,0.442870,0.208656,-0.305494,0.066448,...,1.041657,0.819425,-0.628385,0.773898,-0.309156,1.377102,0.240711,-0.156311,1.145874,0.275621
9996,-0.508786,1.100305,-1.102823,0.249247,-0.129231,0.130378,0.461171,0.320160,-0.216447,-0.054576,...,1.032337,0.510869,-0.102000,0.942525,-0.328476,1.322956,0.435212,-0.072095,0.673242,0.451141
9997,-0.238871,0.690715,-1.048388,0.333828,0.248334,-0.000915,0.330514,0.246601,0.051285,-0.156323,...,0.545221,0.505995,-0.300727,0.714871,-0.250641,0.921687,-0.177157,0.046636,0.595020,0.876425
9998,-0.718748,1.025938,-1.370056,0.477898,0.487099,-0.073461,0.355182,0.363366,-0.116337,-0.103241,...,1.020043,0.826583,-0.200132,1.063207,-0.508420,1.220864,0.067374,-0.096934,1.208098,0.510929


In [15]:
# Build an AnnData object from the embedding wrapper.
tcrb_embeddings_adata = embed_wrap.create_anndata()

# Attach the sequences (and, when configured, the labels) to the embeddings.
# NOTE: this is an alias, not a copy -- the columns added here also appear in
# tcrb_embeddings_df, which is the frame the output cell writes to CSV.
df_embeddings_with_label = tcrb_embeddings_df
df_embeddings_with_label['Sequences'] = list(tcrb_embeddings_adata.obs['Sequences'])
# label_column is "" when the input has no labels; skip the lookup in that case
# instead of failing on an empty column name.
if label_column:
    df_embeddings_with_label[label_column] = list(tcrb_embeddings_adata.obs[label_column])



In [16]:
# Display the embeddings together with the attached sequence/label columns.
df_embeddings_with_label

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,760,761,762,763,764,765,766,767,Sequences,MAIT_cell
0,-0.149587,-0.197047,-0.606274,0.246161,0.146642,-0.208196,0.247288,-0.079729,0.198015,-0.245093,...,-0.638298,0.011018,-0.319213,0.780746,-0.412318,0.522235,0.514297,0.207887,CASSVAGLLYEQYF,MAIT_cell
1,-0.003837,0.004939,-0.305185,-0.467737,0.554620,0.526504,0.037872,-0.058776,0.078017,-0.378901,...,0.325271,-0.620283,-0.451306,-0.204477,-0.042363,0.488556,0.090419,0.102915,CASSHPPGADLGGQPQHF,MAIT_cell
2,0.027054,0.238266,0.138490,-0.477724,0.421007,0.106373,-0.682291,-0.285417,0.483916,-0.346199,...,0.507948,-0.131854,0.230915,0.139384,-0.132491,0.416187,0.232041,-0.580679,CAWSVPPVQGDRTQHF,MAIT_cell
3,-0.149578,0.100374,-0.122322,-0.478098,0.454484,0.019130,0.048994,0.235329,0.132402,-0.505863,...,0.282709,0.410364,0.008110,-0.087051,-0.256943,0.420582,-0.317372,0.073733,CSARDLDSLTNGYTF,MAIT_cell
4,-0.540540,0.291595,-0.353379,-0.261123,0.426946,0.357447,-0.488219,-0.435053,0.388957,-0.267924,...,0.328469,-0.103942,0.323352,0.735837,0.272375,0.473237,0.894415,-0.243902,CAWSGEPSQAQYF,MAIT_cell
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5011,-0.329775,0.793744,-1.056300,0.181084,0.043187,-0.084797,0.497528,0.468565,0.060517,0.038454,...,-0.269269,-0.119622,-0.516974,0.407369,0.187545,0.344213,0.109828,0.269770,CASTGENNSPLHF,non-MAIT_cell
5012,-0.100326,0.304981,0.190182,-0.059698,0.107292,0.121101,-0.251405,-0.306332,0.125169,-0.237075,...,-0.172579,-0.399773,-0.378466,0.207969,-0.128755,0.727024,0.299680,0.027128,CASSVDWSGPGNTGELFF,non-MAIT_cell
5013,-0.056023,-0.094397,-0.433103,-0.062313,0.069507,-0.040174,0.057963,-0.075291,0.027573,-0.167322,...,-0.157909,0.064306,-0.047353,0.050179,-0.237840,0.765852,-0.496809,0.229118,CSARALAGGTNEQFF,non-MAIT_cell
5014,-0.422617,0.676036,-1.092209,0.034845,0.449366,0.100408,0.365332,0.275192,0.020950,0.030260,...,-0.000810,-0.096166,-0.774795,0.810294,-0.045002,0.009765,0.297485,0.518775,CASSFQGGDQPQHF,non-MAIT_cell


In [23]:
# Write the embeddings table to output_path; index=False leaves the row index
# out of the file.
# NOTE: tcrb_embeddings_df is extended in place by the labelling cell, so the
# saved columns include 'Sequences' (and the label) only if that cell was run.
tcrb_embeddings_df.to_csv(output_path, index=False)
# Optional (disabled): also save the AnnData object next to the CSV.
#tcrb_embeddings_adata.write_h5ad(output_path.replace(".csv", ".h5ad"))