#### Preprocess Single Cell Data for scCVC Training and Analysis

In [3]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (e.g. the project-local `single_cell_research` package) are reloaded
# before each cell runs and edits to them are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [4]:
cd ..

/home/romi/projects/cvc


In [5]:
import pickle
import numpy as np
import pandas as pd
import tqdm
from single_cell_research import cvc_sequencing as cs

In [6]:
# Load the pickled single-cell table into `df`.
# NOTE: unpickling can execute arbitrary code, so only open files from a trusted source.
with open('./CDR3_data/totalSCdfCleand.pkl', 'rb') as pickle_file:
    df = pickle.load(pickle_file)

In [7]:
# Show the loaded table as the cell output.
df

Unnamed: 0,barcode_unique,cdr3_nt,chain,cdr3,seriesnumber,samplenumber,clinicalcondition,treatment,sex,age,tissue,patient_id_unique,cdr3_len
0,-4745948352490398320,TGCCGCGTCTCGGGGCTGTGAGCCAAAAACATTCAGTACTTC,TRB,CRVSGL*AKNIQYF,GSE154826,343,LUSC,,,,NORMAL,-3017181870017205481,14
1,3356714683345452859,TGTGCCTGGAACCTCGGACAGGTAAACACTGAAGCTTTCTTT,TRB,CAWNLGQVNTEAFF,GSE154826,343,LUSC,,,,NORMAL,-3017181870017205481,14
2,3356714683345452859,TGTGCAGCTACCGGTAACCAGTTCTATTTT,TRA,CAATGNQFYF,GSE154826,343,LUSC,,,,NORMAL,-3017181870017205481,10
3,4037581621109621028,TGTGCTCTAGCCCGGAATTCAGGATACAGCACCCTCACCTTT,TRA,CALARNSGYSTLTF,GSE154826,343,LUSC,,,,NORMAL,-3017181870017205481,14
4,4037581621109621028,TGCAGTGCTAGTCAGGGAGGGAGCGAGCAGTACTTC,TRB,CSASQGGSEQYF,GSE154826,343,LUSC,,,,NORMAL,-3017181870017205481,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4200330,1612487042232535466,TGTGCCAGCAGTTTATCGGGACCCCAAGAGACCCAGTACTTC,TRB,CASSLSGPQETQYF,GSE176201,GSM5359856,HEALTHY,,,,PBMC,-3094572326455513286,14
4200331,4755150262647356642,TGTGCAGCAACTGACTCTGGGGCTGGGAGTTACCAACTCACTTTC,TRA,CAATDSGAGSYQLTF,GSE176201,GSM5359856,HEALTHY,,,,PBMC,-3094572326455513286,15
4200332,4755150262647356642,TGCAGTGCTCCGCTTGGGACTAGCAATGAGCAGTTCTTC,TRB,CSAPLGTSNEQFF,GSE176201,GSM5359856,HEALTHY,,,,PBMC,-3094572326455513286,13
4200333,-2066594407049752914,TGTGCCAGCAGTGAGTCGTACAGAAATGAGCAGTTCTTC,TRB,CASSESYRNEQFF,GSE176201,GSM5359856,HEALTHY,,,,PBMC,-3094572326455513286,13


#### Create df with joint representation TRA|TRB

In [10]:
# Keep only the two columns handed to the sequence-building step:
# the per-cell barcode and the CDR3 sequence.
training_columns = ['barcode_unique', 'cdr3']
df_for_training = df[training_columns]
df_for_training

Unnamed: 0,barcode_unique,cdr3
0,-4745948352490398320,CRVSGL*AKNIQYF
1,3356714683345452859,CAWNLGQVNTEAFF
2,3356714683345452859,CAATGNQFYF
3,4037581621109621028,CALARNSGYSTLTF
4,4037581621109621028,CSASQGGSEQYF
...,...,...
4200330,1612487042232535466,CASSLSGPQETQYF
4200331,4755150262647356642,CAATDSGAGSYQLTF
4200332,4755150262647356642,CSAPLGTSNEQFF
4200333,-2066594407049752914,CASSESYRNEQFF


In [12]:
# Build the training sequences with the project helper (joint TRA|TRB
# representation, per the section heading) and save them as a one-column CSV.
#
# The result is stored under its own name instead of rebinding `df`: the cells
# below still need the full single-cell table in `df` (they read its 'cdr3',
# 'chain' and 'barcode_unique' columns), and overwriting it here would break a
# top-to-bottom run of the notebook.
seqs = cs.create_tcr_seqs(tcr_seqs_df=df_for_training, max_len=120, column_to_concat='cdr3')
training_seqs_df = pd.DataFrame({'tcr_seqs': seqs})
training_seqs_df.to_csv("./CDR3_data/scDATA_ready_for_training.csv", index=False)

100%|█████████████████████████████| 4200334/4200334 [00:05<00:00, 800999.28it/s]


#### Create different sub-datasets of the data

In [6]:
# Subset consumed by the mapping cells below: the first 100,000 rows of `df`.
# Both names refer to the same object.
df_to_use = df_small = df.iloc[:100000]

##### TRA Sequences Per Unique TRB

In [8]:
# Map every unique TRB CDR3 sequence to
#   'TRB': the barcodes of the rows in which that TRB sequence appears, and
#   'TRA': the TRA CDR3 sequences found in rows sharing one of those barcodes.
trb_dict = {}

# Reverse index: barcode -> unique TRB sequences observed with that barcode.
# The inner dict is used as an insertion-ordered set. Looking a barcode up here
# replaces scanning every entry of trb_dict (and its barcode list) for every
# TRA row, which made the second pass quadratic in the number of rows.
trb_seqs_by_barcode = {}

# Iterate the three needed columns directly; this yields plain Python scalars
# and avoids building a Series per row as iterrows() does.
chain_rows = list(zip(df_to_use['cdr3'], df_to_use['barcode_unique'], df_to_use['chain']))

# First pass: register every TRB sequence together with its barcodes
for seq, barcode, chain in chain_rows:
    if chain == 'TRB':
        trb_dict.setdefault(seq, {'TRB': [], 'TRA': []})['TRB'].append(barcode)
        trb_seqs_by_barcode.setdefault(barcode, {})[seq] = None

# Second pass: attach each TRA sequence to every TRB entry that shares its barcode
# (once per TRA row and TRB entry, in row order)
for seq, barcode, chain in chain_rows:
    if chain == 'TRA':
        for trb_seq in trb_seqs_by_barcode.get(barcode, ()):
            trb_dict[trb_seq]['TRA'].append(seq)

# Convert the TRB dictionary to a dataframe with one row per unique TRB sequence
trb_df = (
    pd.DataFrame.from_dict(trb_dict, orient='index')
    .reset_index()
    .rename(columns={'index': 'TRB_unique'})
)

In [14]:
# Show the TRB -> TRA mapping table as the cell output.
trb_df

Unnamed: 0,TRB_unique,TRB,TRA
0,CRVSGL*AKNIQYF,"[-4745948352490398320, -3129492036928541149, 8...","[CWPMAGANSKLTF, CTSVL*APGDSGAGSYQLTF, CAVSDRNG..."
1,CAWNLGQVNTEAFF,[3356714683345452859],[CAATGNQFYF]
2,CSASQGGSEQYF,[4037581621109621028],[CALARNSGYSTLTF]
3,CASINRDRGYEQYF,"[8994390596421136397, 617268407404388123, 1209...","[CAVSAVKAAGNKLTF, CAEKDSGGYQKVTF, CAASKGNTPLVF..."
4,CASSQEGTEQYF,"[-7741998218701412576, 7185584915482215100]","[CAERPGGRRALTF, CAASDNYGQNFVF, CAERPGGRRALTF, ..."
...,...,...,...
31324,CASRRTGTDNYNEQFF,[149960240039357457],[]
31325,CASSWTSGGATGELFF,[-6420659858687717802],[CAGRNYGGSQGNLIF]
31326,CASSVAGTGGDTQYF,[-2335391633839996783],[CAVSAPPASGGSYIPTF]
31327,CASSRLVSGTGAGTEAFF,[7835509911367664861],"[CAVTYSGGGADGLTF, CALN*]"


In [15]:
# Write the TRB -> TRA mapping to CSV without the index column.
# NOTE: the list-valued 'TRB' and 'TRA' columns are written as their string
# representation, so they must be parsed again after reading the file back.
trb_df.to_csv("./CDR3_data/trb_to_tra_mapping_100k_scData.csv", index=False)

##### TRB Sequences Per Unique TRA


In [None]:
# Map every unique TRA CDR3 sequence to
#   'TRA': the barcodes of the rows in which that TRA sequence appears, and
#   'TRB': the TRB CDR3 sequences found in rows sharing one of those barcodes.
tra_dict = {}

# Reverse index: barcode -> unique TRA sequences observed with that barcode.
# The inner dict is used as an insertion-ordered set. Looking a barcode up here
# replaces scanning every entry of tra_dict (and its barcode list) for every
# TRB row, which made the second pass quadratic in the number of rows.
tra_seqs_by_barcode = {}

# Iterate the three needed columns directly; this yields plain Python scalars
# and avoids building a Series per row as iterrows() does.
chain_rows = list(zip(df_to_use['cdr3'], df_to_use['barcode_unique'], df_to_use['chain']))

# First pass: register every TRA sequence together with its barcodes
for seq, barcode, chain in chain_rows:
    if chain == 'TRA':
        tra_dict.setdefault(seq, {'TRA': [], 'TRB': []})['TRA'].append(barcode)
        tra_seqs_by_barcode.setdefault(barcode, {})[seq] = None

# Second pass: attach each TRB sequence to every TRA entry that shares its barcode
# (once per TRB row and TRA entry, in row order)
for seq, barcode, chain in chain_rows:
    if chain == 'TRB':
        for tra_seq in tra_seqs_by_barcode.get(barcode, ()):
            tra_dict[tra_seq]['TRB'].append(seq)

# Convert the TRA dictionary to a dataframe with one row per unique TRA sequence
tra_df = (
    pd.DataFrame.from_dict(tra_dict, orient='index')
    .reset_index()
    .rename(columns={'index': 'TRA_unique'})
)

In [None]:
# Write the TRA -> TRB mapping to CSV without the index column.
# NOTE: the list-valued 'TRA' and 'TRB' columns are written as their string
# representation, so they must be parsed again after reading the file back.
tra_df.to_csv("./CDR3_data/tra_to_trb_mapping_100k_scData.csv", index=False)