Read libraries and table with peptides from all experiments

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the processed peptide table (columns: peptide_name, peptide_seq,
# tetramer_fluorescence, no_experiment, HLA_type). The first CSV column is the
# saved row index.
st1_path = "../../../data/processed/zhang/Zhang_ST1_processed.csv.gz"
ST1 = pd.read_csv(st1_path, index_col=0)

In [3]:
ST1.head()

Unnamed: 0,peptide_name,peptide_seq,tetramer_fluorescence,no_experiment,HLA_type
0,NYESO1-V165,SLLMWITQV,PE,1,A02:01
1,ADI-SVA,SVASTITGV,PE,1,A02:01
2,BRA-AG,WLLPGTSTV,PE,1,A02:01
3,BRA-NA,WLLPGTSTL,PE,1,A02:01
4,CD1-LLG,LLGATCMFV,PE,1,A02:01


The top 5 ranked peptides by MID count that are classified as positively binding are listed. An identity of "0" indicates that no positively binding peptide was found.
A molecular identifier (MID) was included in the DNA-BC to provide absolute counting of the copy number for each species of tetramers bound to the cell.

<b>ST2 - Experiment 1</b>

In [4]:
# Load the original ST2 spreadsheet (experiment 1) as-is; header clean-up happens below.
st2_path = "../../../data/original/zhang/ST2.xlsx"
ST2 = pd.read_excel(st2_path)

In [5]:
# Flat replacement labels for the 13 spreadsheet columns: two cell descriptors,
# the five ranked peptide slots, then the TRAV/CDR3α pairs and TRBV/CDR3β.
rank_slots = [f"rank_{i}" for i in range(1, 6)]
new_columns = (
    ["Cell Name", "Sorted Population"]
    + rank_slots
    + ["TRAV1", "CDR3α,1", "TRAV2", "CDR3α,2", "TRBV", "CDR3β"]
)

In [6]:
# Overwrite the labels read from the spreadsheet with the flat names in new_columns
# (raises if the sheet does not have exactly len(new_columns) columns).
ST2.columns = new_columns

In [7]:
# Discard the first two rows and renumber the rest from 0.
# NOTE(review): presumably these rows are leftover header lines of the sheet — confirm.
# ST2 is overwritten, so re-running this cell drops two more rows each time.
ST2 = ST2.iloc[2:]
ST2 = ST2.reset_index(drop=True)

In [8]:
ST2

Unnamed: 0,Cell Name,Sorted Population,rank_1,rank_2,rank_3,rank_4,rank_5,TRAV1,"CDR3α,1",TRAV2,"CDR3α,2",TRBV,CDR3β
0,AA1,Naïve Endogenous,ZNT8-LLS,0,0,0,0,,,,,"6-2*01,6-3*01",CASSYSENEQFF
1,AA10,Naïve Endogenous,MART1-A2L,0,0,0,0,12-2*01,CGGQAGTALIF,,,6-1*01,CASRSYVASSNEQFF
2,AA11,Naïve Endogenous,MART1-A2L,0,0,0,0,12-2*01,CAVNGGNQFYF,,,28*01,CASTQWYGGGTPPYF
3,AA12,Naïve Endogenous,MART1-A2L,0,0,0,0,12-2*01,CAVGRDDKIIF,,,7-2*01,CASSLTTGVFSQPQHF
4,AA2,Naïve Endogenous,MART1-A2L,0,0,0,0,17*01,CATCMDSNYQLIW,,,15*01,CATSPYSVTTFANTIYF
...,...,...,...,...,...,...,...,...,...,...,...,...,...
170,BH4,Spike-In,HCV-K1Y,HCV-L2I,HCV-K1S,HCV-K1Y17V,HCV-KLV,38-2/DV8*01,CAYRSPPSSEKLVF,,,28*01,CASSFLGTGLNEQYF
171,BH5,Spike-In,HCV-K1Y,HCV-L2I,HCV-K1Y17V,HCV-K1S,HCV-KLV,38-2/DV8*01,CAYRSPPSSEKLVF,,,28*01,CASSFLGTGLNEQYF
172,BH6,Spike-In,HCV-K1Y,HCV-L2I,HCV-K1S,HCV-K1Y17V,HCV-KLV,38-2/DV8*01,CAYRSPPSSEKLVF,,,28*01,CASSFLGTGLNEQYF
173,BH7,Spike-In,HCV-K1Y,HCV-L2I,HCV-K1Y17V,HCV-K1S,HCV-KLV,38-2/DV8*01,CAYRSPPSSEKLVF,,,28*01,CASSFLGTGLNEQYF


Merge the peptides from the five rank columns into a single column and add a flag holding the rank number.

In [9]:
# Identifier columns for the melt: everything except the five rank slots,
# kept in their original order.
rank_slot_names = {"rank_1", "rank_2", "rank_3", "rank_4", "rank_5"}
remaining_cols = [name for name in ST2.columns if name not in rank_slot_names]
remaining_cols

['Cell Name',
 'Sorted Population',
 'TRAV1',
 'CDR3α,1',
 'TRAV2',
 'CDR3α,2',
 'TRBV',
 'CDR3β']

In [10]:
# Wide -> long: one row per (cell, rank slot). The slot label lands in the
# default 'variable' column and the slot content in 'peptide_name'.
rank_slot_cols = ["rank_1", "rank_2", "rank_3", "rank_4", "rank_5"]
ST2 = pd.melt(
    ST2,
    id_vars=remaining_cols,
    value_vars=rank_slot_cols,
    value_name='peptide_name',
)

In [11]:
# Give melt's default 'variable' column a meaningful name.
ST2 = ST2.rename({'variable': 'peptide_rank'}, axis="columns")

In [12]:
ST2.columns

Index(['Cell Name', 'Sorted Population', 'TRAV1', 'CDR3α,1', 'TRAV2',
       'CDR3α,2', 'TRBV', 'CDR3β', 'peptide_rank', 'peptide_name'],
      dtype='object')

In [13]:
# Tag every row with its experiment number (still a string here; it is cast
# to int64 in the following cell).
ST2 = ST2.assign(no_experiment="1")

In [14]:
# Cast the experiment tag to int64 so it can be used as a merge key against ST1.
ST2["no_experiment"] = ST2["no_experiment"].astype("int64")

In [15]:
# Attach peptide sequence, fluorescence and HLA type from ST1.
# Inner join: rank slots whose peptide_name has no ST1 entry for this
# experiment (e.g. the 0 placeholder for "no binding peptide") are dropped.
merge_keys = ["peptide_name", "no_experiment"]
Exp1 = pd.merge(ST2, ST1, how="inner", on=merge_keys)

In [16]:
Exp1

Unnamed: 0,Cell Name,Sorted Population,TRAV1,"CDR3α,1",TRAV2,"CDR3α,2",TRBV,CDR3β,peptide_rank,peptide_name,no_experiment,peptide_seq,tetramer_fluorescence,HLA_type
0,AA1,Naïve Endogenous,,,,,"6-2*01,6-3*01",CASSYSENEQFF,rank_1,ZNT8-LLS,1,LLSLFSLWL,PE,A02:01
1,AB12,Naïve Endogenous,12-3*01,CAAGGSYIPTF,,,28*01,CASSGTGGYSGANVLTF,rank_1,ZNT8-LLS,1,LLSLFSLWL,PE,A02:01
2,AA10,Naïve Endogenous,12-2*01,CGGQAGTALIF,,,6-1*01,CASRSYVASSNEQFF,rank_1,MART1-A2L,1,ELAGIGILTV,PE,A02:01
3,AA11,Naïve Endogenous,12-2*01,CAVNGGNQFYF,,,28*01,CASTQWYGGGTPPYF,rank_1,MART1-A2L,1,ELAGIGILTV,PE,A02:01
4,AA12,Naïve Endogenous,12-2*01,CAVGRDDKIIF,,,7-2*01,CASSLTTGVFSQPQHF,rank_1,MART1-A2L,1,ELAGIGILTV,PE,A02:01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
194,BH2,Spike-In,38-2/DV8*01,CAYRSPPSSEKLVF,,,28*01,CASSFLGTGLNEQYF,rank_4,HCV-K1S,1,SLVALGINAV,APC,A02:01
195,BH3,Spike-In,38-2/DV8*01,CAYRSPPSSEKLVF,,,28*01,CASSFLGTGLNEQYF,rank_4,HCV-K1S,1,SLVALGINAV,APC,A02:01
196,BH5,Spike-In,38-2/DV8*01,CAYRSPPSSEKLVF,,,28*01,CASSFLGTGLNEQYF,rank_4,HCV-K1S,1,SLVALGINAV,APC,A02:01
197,BH7,Spike-In,38-2/DV8*01,CAYRSPPSSEKLVF,,,28*01,CASSFLGTGLNEQYF,rank_4,HCV-K1S,1,SLVALGINAV,APC,A02:01


In [17]:
# Switch to snake_case / ASCII column names for the columns processed below.
tidy_names = {
    "Cell Name": "cell_name",
    "Sorted Population": "sorted_population",
    "CDR3α,1": "CDR3alpha_1",
    "CDR3α,2": "CDR3alpha_2",
    "CDR3β": "CDR3beta",
}
Exp1 = Exp1.rename(columns=tidy_names)

Mam dwa warianty łańcucha TRAV -> chcę zrobić z tego wiersze

In [18]:
# Identifier columns for the alpha-chain melts: everything except the four
# TRAV / CDR3alpha columns, in original order.
alpha_chain_cols = {'TRAV1', 'TRAV2', 'CDR3alpha_1', 'CDR3alpha_2'}
remaining_cols = [name for name in Exp1.columns if name not in alpha_chain_cols]
remaining_cols

['cell_name',
 'sorted_population',
 'TRBV',
 'CDR3beta',
 'peptide_rank',
 'peptide_name',
 'no_experiment',
 'peptide_seq',
 'tetramer_fluorescence',
 'HLA_type']

In [19]:
Exp1.cell_name.duplicated().sum()

27

In [20]:
# Stack TRAV1 and TRAV2 into a single TRAV column: all TRAV1 rows first,
# then all TRAV2 rows, each in the row order of Exp1.
trav_cols = ['TRAV1', 'TRAV2']
Exp1_trav = pd.melt(Exp1, id_vars=remaining_cols, value_vars=trav_cols, value_name='TRAV')
Exp1_trav

Unnamed: 0,cell_name,sorted_population,TRBV,CDR3beta,peptide_rank,peptide_name,no_experiment,peptide_seq,tetramer_fluorescence,HLA_type,variable,TRAV
0,AA1,Naïve Endogenous,"6-2*01,6-3*01",CASSYSENEQFF,rank_1,ZNT8-LLS,1,LLSLFSLWL,PE,A02:01,TRAV1,
1,AB12,Naïve Endogenous,28*01,CASSGTGGYSGANVLTF,rank_1,ZNT8-LLS,1,LLSLFSLWL,PE,A02:01,TRAV1,12-3*01
2,AA10,Naïve Endogenous,6-1*01,CASRSYVASSNEQFF,rank_1,MART1-A2L,1,ELAGIGILTV,PE,A02:01,TRAV1,12-2*01
3,AA11,Naïve Endogenous,28*01,CASTQWYGGGTPPYF,rank_1,MART1-A2L,1,ELAGIGILTV,PE,A02:01,TRAV1,12-2*01
4,AA12,Naïve Endogenous,7-2*01,CASSLTTGVFSQPQHF,rank_1,MART1-A2L,1,ELAGIGILTV,PE,A02:01,TRAV1,12-2*01
...,...,...,...,...,...,...,...,...,...,...,...,...
393,BH2,Spike-In,28*01,CASSFLGTGLNEQYF,rank_4,HCV-K1S,1,SLVALGINAV,APC,A02:01,TRAV2,
394,BH3,Spike-In,28*01,CASSFLGTGLNEQYF,rank_4,HCV-K1S,1,SLVALGINAV,APC,A02:01,TRAV2,
395,BH5,Spike-In,28*01,CASSFLGTGLNEQYF,rank_4,HCV-K1S,1,SLVALGINAV,APC,A02:01,TRAV2,
396,BH7,Spike-In,28*01,CASSFLGTGLNEQYF,rank_4,HCV-K1S,1,SLVALGINAV,APC,A02:01,TRAV2,


In [21]:
# Same stacking for the CDR3alpha sequences; the row order matches the TRAV
# melt because both start from Exp1 with the same id_vars.
cdr3alpha_cols = ['CDR3alpha_1', 'CDR3alpha_2']
Exp1_cdr3alpha = pd.melt(Exp1, id_vars=remaining_cols, value_vars=cdr3alpha_cols, value_name='CDR3alpha')
Exp1_cdr3alpha

Unnamed: 0,cell_name,sorted_population,TRBV,CDR3beta,peptide_rank,peptide_name,no_experiment,peptide_seq,tetramer_fluorescence,HLA_type,variable,CDR3alpha
0,AA1,Naïve Endogenous,"6-2*01,6-3*01",CASSYSENEQFF,rank_1,ZNT8-LLS,1,LLSLFSLWL,PE,A02:01,CDR3alpha_1,
1,AB12,Naïve Endogenous,28*01,CASSGTGGYSGANVLTF,rank_1,ZNT8-LLS,1,LLSLFSLWL,PE,A02:01,CDR3alpha_1,CAAGGSYIPTF
2,AA10,Naïve Endogenous,6-1*01,CASRSYVASSNEQFF,rank_1,MART1-A2L,1,ELAGIGILTV,PE,A02:01,CDR3alpha_1,CGGQAGTALIF
3,AA11,Naïve Endogenous,28*01,CASTQWYGGGTPPYF,rank_1,MART1-A2L,1,ELAGIGILTV,PE,A02:01,CDR3alpha_1,CAVNGGNQFYF
4,AA12,Naïve Endogenous,7-2*01,CASSLTTGVFSQPQHF,rank_1,MART1-A2L,1,ELAGIGILTV,PE,A02:01,CDR3alpha_1,CAVGRDDKIIF
...,...,...,...,...,...,...,...,...,...,...,...,...
393,BH2,Spike-In,28*01,CASSFLGTGLNEQYF,rank_4,HCV-K1S,1,SLVALGINAV,APC,A02:01,CDR3alpha_2,
394,BH3,Spike-In,28*01,CASSFLGTGLNEQYF,rank_4,HCV-K1S,1,SLVALGINAV,APC,A02:01,CDR3alpha_2,
395,BH5,Spike-In,28*01,CASSFLGTGLNEQYF,rank_4,HCV-K1S,1,SLVALGINAV,APC,A02:01,CDR3alpha_2,
396,BH7,Spike-In,28*01,CASSFLGTGLNEQYF,rank_4,HCV-K1S,1,SLVALGINAV,APC,A02:01,CDR3alpha_2,


In [22]:
# The melt bookkeeping column ('TRAV1'/'TRAV2', 'CDR3alpha_1'/'CDR3alpha_2')
# is no longer needed once the values are stacked.
Exp1_trav = Exp1_trav.drop(columns=['variable'])
Exp1_cdr3alpha = Exp1_cdr3alpha.drop(columns=['variable'])

In [23]:
# The two melts are glued together by row position in the next step, so every
# identifier column must line up row for row. Comparing cell_name alone is not
# enough: a cell occurs once per matched peptide rank, so rows of the same cell
# could be swapped without being noticed. equals() treats NaNs in the same
# position as equal.
assert Exp1_trav[remaining_cols].equals(Exp1_cdr3alpha[remaining_cols]), \
    "TRAV and CDR3alpha melts are not row-aligned"

In [24]:
# Put the stacked CDR3alpha values next to the stacked TRAV values
# (index-aligned column-wise concat).
cdr3alpha_values = Exp1_cdr3alpha['CDR3alpha']
Exp1_final = pd.concat([Exp1_trav, cdr3alpha_values], axis="columns")
Exp1_final

Unnamed: 0,cell_name,sorted_population,TRBV,CDR3beta,peptide_rank,peptide_name,no_experiment,peptide_seq,tetramer_fluorescence,HLA_type,TRAV,CDR3alpha
0,AA1,Naïve Endogenous,"6-2*01,6-3*01",CASSYSENEQFF,rank_1,ZNT8-LLS,1,LLSLFSLWL,PE,A02:01,,
1,AB12,Naïve Endogenous,28*01,CASSGTGGYSGANVLTF,rank_1,ZNT8-LLS,1,LLSLFSLWL,PE,A02:01,12-3*01,CAAGGSYIPTF
2,AA10,Naïve Endogenous,6-1*01,CASRSYVASSNEQFF,rank_1,MART1-A2L,1,ELAGIGILTV,PE,A02:01,12-2*01,CGGQAGTALIF
3,AA11,Naïve Endogenous,28*01,CASTQWYGGGTPPYF,rank_1,MART1-A2L,1,ELAGIGILTV,PE,A02:01,12-2*01,CAVNGGNQFYF
4,AA12,Naïve Endogenous,7-2*01,CASSLTTGVFSQPQHF,rank_1,MART1-A2L,1,ELAGIGILTV,PE,A02:01,12-2*01,CAVGRDDKIIF
...,...,...,...,...,...,...,...,...,...,...,...,...
393,BH2,Spike-In,28*01,CASSFLGTGLNEQYF,rank_4,HCV-K1S,1,SLVALGINAV,APC,A02:01,,
394,BH3,Spike-In,28*01,CASSFLGTGLNEQYF,rank_4,HCV-K1S,1,SLVALGINAV,APC,A02:01,,
395,BH5,Spike-In,28*01,CASSFLGTGLNEQYF,rank_4,HCV-K1S,1,SLVALGINAV,APC,A02:01,,
396,BH7,Spike-In,28*01,CASSFLGTGLNEQYF,rank_4,HCV-K1S,1,SLVALGINAV,APC,A02:01,,


Tutaj sprawdzamy w obrębie remaining_cols czy mamy jakieś duplikaty. Potem sprawdzamy czy gdzieś występuje NaN zestaw TRAV & CDR3alpha.

Interesują nas oryginały puste, oryginały pełne, oraz duplikaty pełne.

In [25]:
# Rows whose identifier columns repeat an earlier row, i.e. the second
# alpha-chain copy produced by the melt.
mask_remaining = Exp1_final.duplicated(subset=remaining_cols)
# Rows that carry no alpha-chain information at all.
mask_melted = Exp1_final[['TRAV', 'CDR3alpha']].isna().all(axis=1)

# A repeated row with an empty alpha chain adds nothing. Keep empty originals,
# filled originals and filled repeats.
mask_redundant = mask_remaining & mask_melted

# .values -> plain boolean array, so the filter is positional.
Exp1_final = Exp1_final[~mask_redundant.values]
Exp1_final

Unnamed: 0,cell_name,sorted_population,TRBV,CDR3beta,peptide_rank,peptide_name,no_experiment,peptide_seq,tetramer_fluorescence,HLA_type,TRAV,CDR3alpha
0,AA1,Naïve Endogenous,"6-2*01,6-3*01",CASSYSENEQFF,rank_1,ZNT8-LLS,1,LLSLFSLWL,PE,A02:01,,
1,AB12,Naïve Endogenous,28*01,CASSGTGGYSGANVLTF,rank_1,ZNT8-LLS,1,LLSLFSLWL,PE,A02:01,12-3*01,CAAGGSYIPTF
2,AA10,Naïve Endogenous,6-1*01,CASRSYVASSNEQFF,rank_1,MART1-A2L,1,ELAGIGILTV,PE,A02:01,12-2*01,CGGQAGTALIF
3,AA11,Naïve Endogenous,28*01,CASTQWYGGGTPPYF,rank_1,MART1-A2L,1,ELAGIGILTV,PE,A02:01,12-2*01,CAVNGGNQFYF
4,AA12,Naïve Endogenous,7-2*01,CASSLTTGVFSQPQHF,rank_1,MART1-A2L,1,ELAGIGILTV,PE,A02:01,12-2*01,CAVGRDDKIIF
...,...,...,...,...,...,...,...,...,...,...,...,...
255,BF11,Non-Naïve Endogenous,,,rank_1,MART1-A2L,1,ELAGIGILTV,PE,A02:01,38-2/DV8*01,CAYRSPPSSEKLVF
256,BF12,Non-Naïve Endogenous,,,rank_1,MART1-A2L,1,ELAGIGILTV,PE,A02:01,30*01,CGTLRNNNARLMF
269,AC2,Naïve Endogenous,18*01,CASSSRDRSSSTEAFF,rank_1,GP100-IMD,1,IMDQVPFSV,PE,A02:01,16*01,CALSPGYNFNKFYF
279,BB7,Naïve Foreign,20-1*01,CSARGAGVPYEQYF,rank_1,CMV-MLN,1,MLNIPSINV,APC,A02:01,8-4*01,CAVSSITQGGSEKLVF


In [26]:
Exp1_final["TRAV"].unique()

array([nan, '12-3*01', '12-2*01', '17*01', '23/DV6*01', '13-2*01',
       '41*01', '16*01', '13-1*01', '8-1*01', '26-1*01', '8-4*01',
       '38-2/DV8*01', '29/DV5*01', '24*01', '12-1*01', '8-3*01', '27*01',
       '30*01', '3*01', '35*01', '*'], dtype=object)

In [27]:
Exp1_final["TRBV"].unique()

array(['6-2*01,6-3*01', '28*01', '6-1*01', '7-2*01', '15*01', '4-2*01',
       '2*01', nan, '4-1*01', '27*01', '20-1*01', '19*01', '6-6*01',
       '14*01', '6-5*01', '11-1*01', '5-6*01', '6-4*01',
       '12-3*01,12-4*01', '18*01', '9*01', '4-3*01', '25-1*01', '10-3*01',
       '3-1*01', '29-1*01'], dtype=object)

In [28]:
# Remove rows whose TRAV is the bare "*" placeholder seen in the unique values
# above; rows with a missing (NaN) TRAV are kept.
has_real_trav = Exp1_final["TRAV"].ne("*")
Exp1_final = Exp1_final.loc[has_real_trav]

In [29]:
# Renumber rows 0..n-1 after the filtering above left gaps in the index.
Exp1_final = Exp1_final.reset_index(drop=True)

In [30]:
# Persist the merged experiment-1 table; compression is inferred from the
# .gz suffix and the row index is written as the first column.
exp1_out_path = "../../../data/processed/zhang/Exp1_merged.csv.gz"
Exp1_final.to_csv(exp1_out_path)

<b>ST3 - Experiment 2</b>

In [31]:
# Load the original ST3 spreadsheet (experiment 2) as-is; header clean-up happens below.
st3_path = "../../../data/original/zhang/ST3.xlsx"
ST3 = pd.read_excel(st3_path)

In [32]:
# Flat replacement labels for the 13 spreadsheet columns: two cell descriptors,
# the five ranked peptide slots, then the TRAV/CDR3α pairs and TRBV/CDR3β.
rank_slots = [f"rank_{i}" for i in range(1, 6)]
new_columns = (
    ["Cell Name", "Sorted Population"]
    + rank_slots
    + ["TRAV1", "CDR3α,1", "TRAV2", "CDR3α,2", "TRBV", "CDR3β"]
)

In [33]:
# Overwrite the labels read from the spreadsheet with the flat names in new_columns
# (raises if the sheet does not have exactly len(new_columns) columns).
ST3.columns = new_columns

In [34]:
# Discard the first two rows and renumber the rest from 0.
# NOTE(review): presumably these rows are leftover header lines of the sheet — confirm.
# ST3 is overwritten, so re-running this cell drops two more rows each time.
ST3 = ST3.iloc[2:]
ST3 = ST3.reset_index(drop=True)

In [35]:
ST3

Unnamed: 0,Cell Name,Sorted Population,rank_1,rank_2,rank_3,rank_4,rank_5,TRAV1,"CDR3α,1",TRAV2,"CDR3α,2",TRBV,CDR3β
0,WA11,Clone,HCV_K1Y,HCV_L2I,HCV_K1S,HCV_K1Y17V,HCV_KLV,38-2/DV8,CAYRSPPSSEKLVF,,,28,CASSFLGTGLNEQYF
1,WB11,Clone,HCV_K1Y,HCV_L2I,HCV_K1S,HCV_K1Y17V,HCV_KLV,38-2/DV8,CAYRSPPSSEKLVF,,,28,CASSFLGTGLNEQYF
2,WC11,Clone,HCV_K1Y,HCV_K1S,HCV_L2I,HCV_K1Y17V,HCV_KLV,38-2/DV8,CAYRSPPSSEKLVF,,,28,CASSFLGTGLNEQYF
3,WD11,Clone,0,0,0,0,0,,,,,,
4,WE11,Clone,HCV_K1Y,HCV_L2I,HCV_K1S,HCV_K1Y17V,HCV_KLV,38-2/DV8,CAYRSPPSSEKLVF,,,28,CASSFLGTGLNEQYF
...,...,...,...,...,...,...,...,...,...,...,...,...,...
371,WF9,Self_Nonnaive,MART1_A2L,0,0,0,0,14/DV4,CAMREGTGRRALTF,,,"6-2,6-3",CASSYFGGSLSEQYF
372,WG10,Self_Nonnaive,DRIP_MLY,0,0,0,0,,,,,"12-3,12-4",CASSFGRNRSQNTEAFF
373,WG9,Self_Nonnaive,MART1_A2L,0,0,0,0,14/DV4,CAMREGTGRRALTF,,,"6-2,6-3",CASSYFGGSLSEQYF
374,WH10,Self_Nonnaive,0,0,0,0,0,14/DV4,CAMREGPGGTSYGKLTF,,,"6-2,6-3",CASSYRQDSNQPQHF


In [36]:
# Identifier columns for the melt: everything except the five rank slots,
# kept in their original order.
rank_slot_names = {"rank_1", "rank_2", "rank_3", "rank_4", "rank_5"}
remaining_cols = [name for name in ST3.columns if name not in rank_slot_names]
remaining_cols

['Cell Name',
 'Sorted Population',
 'TRAV1',
 'CDR3α,1',
 'TRAV2',
 'CDR3α,2',
 'TRBV',
 'CDR3β']

In [37]:
# Wide -> long: one row per (cell, rank slot). The slot label lands in the
# default 'variable' column and the slot content in 'peptide_name'.
rank_slot_cols = ["rank_1", "rank_2", "rank_3", "rank_4", "rank_5"]
ST3 = pd.melt(
    ST3,
    id_vars=remaining_cols,
    value_vars=rank_slot_cols,
    value_name='peptide_name',
)

In [38]:
# Give melt's default 'variable' column a meaningful name.
ST3 = ST3.rename({'variable': 'peptide_rank'}, axis="columns")

In [39]:
ST3.columns

Index(['Cell Name', 'Sorted Population', 'TRAV1', 'CDR3α,1', 'TRAV2',
       'CDR3α,2', 'TRBV', 'CDR3β', 'peptide_rank', 'peptide_name'],
      dtype='object')

In [40]:
# Tag every row with its experiment number (still a string here; it is cast
# to int64 in the following cell).
ST3 = ST3.assign(no_experiment="2")

In [41]:
# Cast the experiment tag to int64 so it can be used as a merge key against ST1.
ST3["no_experiment"] = ST3["no_experiment"].astype("int64")

In [42]:
# Drop the rank slots holding the integer 0 placeholder (no binding peptide).
# Take an explicit copy so the column assignment below writes to an independent
# frame instead of a slice of the old ST3 (avoids SettingWithCopyWarning).
ST3 = ST3[ST3["peptide_name"] != 0].copy()
# Trim stray leading/trailing whitespace from the peptide names.
ST3["peptide_name"] = ST3["peptide_name"].str.strip()

In [43]:
# Peptide reference rows that belong to experiment 2 only.
is_exp2 = ST1["no_experiment"].eq(2)
Experiment2 = ST1.loc[is_exp2]

In [44]:
Experiment2

Unnamed: 0,peptide_name,peptide_seq,tetramer_fluorescence,no_experiment,HLA_type
96,NYESO1-V165,SLLMWITQV,PE,2,A02:01
97,ADI-SVA,SVASTITGV,PE,2,A02:01
98,BRA-AG,WLLPGTSTV,PE,2,A02:01
99,BRA-NA,WLLPGTSTL,PE,2,A02:01
100,CD1-LLG,LLGATCMFV,PE,2,A02:01
...,...,...,...,...,...
187,YFV-LLW,LLWNGPMAV,APC,2,A02:01
188,ALADH-VLM,VLMGGVPGVE,APC,2,A02:01
189,GLNS-GLL,GLLHHAPSL,APC,2,A02:01
190,SODA-DMW,DMWEHAFYL,APC,2,A02:01


In [45]:
ST3["peptide_name"].isin(Experiment2["peptide_name"]).value_counts()

False    410
Name: peptide_name, dtype: int64

Nie pokrywają się! W ST3 używane są "_" w nazwach, natomiast w ST1 "-". Należy zamienić w ST3 wszystkie "_" na "-".

In [46]:
# ST1 spells peptide names with "-" where ST3 uses "_"; convert so the merge
# keys match. regex=True makes replace() substitute inside each string instead
# of matching whole cell values.
ST3["peptide_name"] = ST3["peptide_name"].replace(to_replace="_", value="-", regex=True)

In [47]:
ST3["peptide_name"]

0       HCV-K1Y
1       HCV-K1Y
2       HCV-K1Y
4       HCV-K1Y
5       HCV-K1Y
         ...   
1506    HCV-KLV
1508    HCV-KLV
1509    HCV-KLV
1510    HCV-KLV
1511    HCV-KLV
Name: peptide_name, Length: 410, dtype: object

In [48]:
# Attach peptide sequence, fluorescence and HLA type from ST1 (inner join).
# NOTE(review): the displayed outputs show 410 rows going in and 401 coming
# out, so some peptide names apparently still have no ST1 match after the
# "_" -> "-" conversion — TODO confirm which ones are dropped.
merge_keys = ["peptide_name", "no_experiment"]
Exp2 = pd.merge(ST3, ST1, how="inner", on=merge_keys)

In [49]:
Exp2

Unnamed: 0,Cell Name,Sorted Population,TRAV1,"CDR3α,1",TRAV2,"CDR3α,2",TRBV,CDR3β,peptide_rank,peptide_name,no_experiment,peptide_seq,tetramer_fluorescence,HLA_type
0,WA11,Clone,38-2/DV8,CAYRSPPSSEKLVF,,,28,CASSFLGTGLNEQYF,rank_1,HCV-K1Y,2,YLVALGINAV,APC,A02:01
1,WB11,Clone,38-2/DV8,CAYRSPPSSEKLVF,,,28,CASSFLGTGLNEQYF,rank_1,HCV-K1Y,2,YLVALGINAV,APC,A02:01
2,WC11,Clone,38-2/DV8,CAYRSPPSSEKLVF,,,28,CASSFLGTGLNEQYF,rank_1,HCV-K1Y,2,YLVALGINAV,APC,A02:01
3,WE11,Clone,38-2/DV8,CAYRSPPSSEKLVF,,,28,CASSFLGTGLNEQYF,rank_1,HCV-K1Y,2,YLVALGINAV,APC,A02:01
4,WF11,Clone,38-2/DV8,CAYRSPPSSEKLVF,,,28,CASSFLGTGLNEQYF,rank_1,HCV-K1Y,2,YLVALGINAV,APC,A02:01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
397,WC11,Clone,38-2/DV8,CAYRSPPSSEKLVF,,,28,CASSFLGTGLNEQYF,rank_4,HCV-K1Y17V,2,YLVALGVNAV,APC,A02:01
398,WE11,Clone,38-2/DV8,CAYRSPPSSEKLVF,,,28,CASSFLGTGLNEQYF,rank_4,HCV-K1Y17V,2,YLVALGVNAV,APC,A02:01
399,WG11,Clone,38-2/DV8,CAYRSPPSSEKLVF,,,28,CASSFLGTGLNEQYF,rank_4,HCV-K1Y17V,2,YLVALGVNAV,APC,A02:01
400,WH11,Clone,38-2/DV8,CAYRSPPSSEKLVF,,,28,CASSFLGTGLNEQYF,rank_4,HCV-K1Y17V,2,YLVALGVNAV,APC,A02:01


In [50]:
# Switch to snake_case / ASCII column names for the columns processed below.
tidy_names = {
    "Cell Name": "cell_name",
    "Sorted Population": "sorted_population",
    "CDR3α,1": "CDR3alpha_1",
    "CDR3α,2": "CDR3alpha_2",
    "CDR3β": "CDR3beta",
}
Exp2 = Exp2.rename(columns=tidy_names)

Mam dwa warianty łańcucha TRAV -> chcę zrobić z tego wiersze

In [51]:
# Identifier columns for the alpha-chain melts: everything except the four
# TRAV / CDR3alpha columns, in original order.
alpha_chain_cols = {'TRAV1', 'TRAV2', 'CDR3alpha_1', 'CDR3alpha_2'}
remaining_cols = [name for name in Exp2.columns if name not in alpha_chain_cols]
remaining_cols

['cell_name',
 'sorted_population',
 'TRBV',
 'CDR3beta',
 'peptide_rank',
 'peptide_name',
 'no_experiment',
 'peptide_seq',
 'tetramer_fluorescence',
 'HLA_type']

In [52]:
Exp2.cell_name.duplicated().sum()

60

In [53]:
# Stack TRAV1 and TRAV2 into a single TRAV column: all TRAV1 rows first,
# then all TRAV2 rows, each in the row order of Exp2.
trav_cols = ['TRAV1', 'TRAV2']
Exp2_trav = pd.melt(Exp2, id_vars=remaining_cols, value_vars=trav_cols, value_name='TRAV')
Exp2_trav

Unnamed: 0,cell_name,sorted_population,TRBV,CDR3beta,peptide_rank,peptide_name,no_experiment,peptide_seq,tetramer_fluorescence,HLA_type,variable,TRAV
0,WA11,Clone,28,CASSFLGTGLNEQYF,rank_1,HCV-K1Y,2,YLVALGINAV,APC,A02:01,TRAV1,38-2/DV8
1,WB11,Clone,28,CASSFLGTGLNEQYF,rank_1,HCV-K1Y,2,YLVALGINAV,APC,A02:01,TRAV1,38-2/DV8
2,WC11,Clone,28,CASSFLGTGLNEQYF,rank_1,HCV-K1Y,2,YLVALGINAV,APC,A02:01,TRAV1,38-2/DV8
3,WE11,Clone,28,CASSFLGTGLNEQYF,rank_1,HCV-K1Y,2,YLVALGINAV,APC,A02:01,TRAV1,38-2/DV8
4,WF11,Clone,28,CASSFLGTGLNEQYF,rank_1,HCV-K1Y,2,YLVALGINAV,APC,A02:01,TRAV1,38-2/DV8
...,...,...,...,...,...,...,...,...,...,...,...,...
799,WC11,Clone,28,CASSFLGTGLNEQYF,rank_4,HCV-K1Y17V,2,YLVALGVNAV,APC,A02:01,TRAV2,
800,WE11,Clone,28,CASSFLGTGLNEQYF,rank_4,HCV-K1Y17V,2,YLVALGVNAV,APC,A02:01,TRAV2,
801,WG11,Clone,28,CASSFLGTGLNEQYF,rank_4,HCV-K1Y17V,2,YLVALGVNAV,APC,A02:01,TRAV2,
802,WH11,Clone,28,CASSFLGTGLNEQYF,rank_4,HCV-K1Y17V,2,YLVALGVNAV,APC,A02:01,TRAV2,


In [54]:
# Same stacking for the CDR3alpha sequences; the row order matches the TRAV
# melt because both start from Exp2 with the same id_vars.
cdr3alpha_cols = ['CDR3alpha_1', 'CDR3alpha_2']
Exp2_cdr3alpha = pd.melt(Exp2, id_vars=remaining_cols, value_vars=cdr3alpha_cols, value_name='CDR3alpha')
Exp2_cdr3alpha

Unnamed: 0,cell_name,sorted_population,TRBV,CDR3beta,peptide_rank,peptide_name,no_experiment,peptide_seq,tetramer_fluorescence,HLA_type,variable,CDR3alpha
0,WA11,Clone,28,CASSFLGTGLNEQYF,rank_1,HCV-K1Y,2,YLVALGINAV,APC,A02:01,CDR3alpha_1,CAYRSPPSSEKLVF
1,WB11,Clone,28,CASSFLGTGLNEQYF,rank_1,HCV-K1Y,2,YLVALGINAV,APC,A02:01,CDR3alpha_1,CAYRSPPSSEKLVF
2,WC11,Clone,28,CASSFLGTGLNEQYF,rank_1,HCV-K1Y,2,YLVALGINAV,APC,A02:01,CDR3alpha_1,CAYRSPPSSEKLVF
3,WE11,Clone,28,CASSFLGTGLNEQYF,rank_1,HCV-K1Y,2,YLVALGINAV,APC,A02:01,CDR3alpha_1,CAYRSPPSSEKLVF
4,WF11,Clone,28,CASSFLGTGLNEQYF,rank_1,HCV-K1Y,2,YLVALGINAV,APC,A02:01,CDR3alpha_1,CAYRSPPSSEKLVF
...,...,...,...,...,...,...,...,...,...,...,...,...
799,WC11,Clone,28,CASSFLGTGLNEQYF,rank_4,HCV-K1Y17V,2,YLVALGVNAV,APC,A02:01,CDR3alpha_2,
800,WE11,Clone,28,CASSFLGTGLNEQYF,rank_4,HCV-K1Y17V,2,YLVALGVNAV,APC,A02:01,CDR3alpha_2,
801,WG11,Clone,28,CASSFLGTGLNEQYF,rank_4,HCV-K1Y17V,2,YLVALGVNAV,APC,A02:01,CDR3alpha_2,
802,WH11,Clone,28,CASSFLGTGLNEQYF,rank_4,HCV-K1Y17V,2,YLVALGVNAV,APC,A02:01,CDR3alpha_2,


In [55]:
# The melt bookkeeping column ('TRAV1'/'TRAV2', 'CDR3alpha_1'/'CDR3alpha_2')
# is no longer needed once the values are stacked.
Exp2_trav = Exp2_trav.drop(columns=['variable'])
Exp2_cdr3alpha = Exp2_cdr3alpha.drop(columns=['variable'])

In [56]:
# The two melts are glued together by row position in the next step, so every
# identifier column must line up row for row. Comparing cell_name alone is not
# enough: a cell occurs once per matched peptide rank, so rows of the same cell
# could be swapped without being noticed. equals() treats NaNs in the same
# position as equal.
assert Exp2_trav[remaining_cols].equals(Exp2_cdr3alpha[remaining_cols]), \
    "TRAV and CDR3alpha melts are not row-aligned"

In [57]:
# Put the stacked CDR3alpha values next to the stacked TRAV values
# (index-aligned column-wise concat).
cdr3alpha_values = Exp2_cdr3alpha['CDR3alpha']
Exp2_final = pd.concat([Exp2_trav, cdr3alpha_values], axis="columns")
Exp2_final

Unnamed: 0,cell_name,sorted_population,TRBV,CDR3beta,peptide_rank,peptide_name,no_experiment,peptide_seq,tetramer_fluorescence,HLA_type,TRAV,CDR3alpha
0,WA11,Clone,28,CASSFLGTGLNEQYF,rank_1,HCV-K1Y,2,YLVALGINAV,APC,A02:01,38-2/DV8,CAYRSPPSSEKLVF
1,WB11,Clone,28,CASSFLGTGLNEQYF,rank_1,HCV-K1Y,2,YLVALGINAV,APC,A02:01,38-2/DV8,CAYRSPPSSEKLVF
2,WC11,Clone,28,CASSFLGTGLNEQYF,rank_1,HCV-K1Y,2,YLVALGINAV,APC,A02:01,38-2/DV8,CAYRSPPSSEKLVF
3,WE11,Clone,28,CASSFLGTGLNEQYF,rank_1,HCV-K1Y,2,YLVALGINAV,APC,A02:01,38-2/DV8,CAYRSPPSSEKLVF
4,WF11,Clone,28,CASSFLGTGLNEQYF,rank_1,HCV-K1Y,2,YLVALGINAV,APC,A02:01,38-2/DV8,CAYRSPPSSEKLVF
...,...,...,...,...,...,...,...,...,...,...,...,...
799,WC11,Clone,28,CASSFLGTGLNEQYF,rank_4,HCV-K1Y17V,2,YLVALGVNAV,APC,A02:01,,
800,WE11,Clone,28,CASSFLGTGLNEQYF,rank_4,HCV-K1Y17V,2,YLVALGVNAV,APC,A02:01,,
801,WG11,Clone,28,CASSFLGTGLNEQYF,rank_4,HCV-K1Y17V,2,YLVALGVNAV,APC,A02:01,,
802,WH11,Clone,28,CASSFLGTGLNEQYF,rank_4,HCV-K1Y17V,2,YLVALGVNAV,APC,A02:01,,


Tutaj sprawdzamy w obrębie remaining_cols czy mamy jakieś duplikaty. Potem sprawdzamy czy gdzieś występuje NaN zestaw TRAV & CDR3alpha.

In [58]:
# Rows whose identifier columns repeat an earlier row, i.e. the second
# alpha-chain copy produced by the melt.
mask_remaining = Exp2_final.duplicated(subset=remaining_cols)
# Rows that carry no alpha-chain information at all.
mask_melted = Exp2_final[['TRAV', 'CDR3alpha']].isna().all(axis=1)

# A repeated row with an empty alpha chain adds nothing. Keep empty originals,
# filled originals and filled repeats.
mask_redundant = mask_remaining & mask_melted

# .values -> plain boolean array, so the filter is positional.
Exp2_final = Exp2_final[~mask_redundant.values]
Exp2_final

Unnamed: 0,cell_name,sorted_population,TRBV,CDR3beta,peptide_rank,peptide_name,no_experiment,peptide_seq,tetramer_fluorescence,HLA_type,TRAV,CDR3alpha
0,WA11,Clone,28,CASSFLGTGLNEQYF,rank_1,HCV-K1Y,2,YLVALGINAV,APC,A02:01,38-2/DV8,CAYRSPPSSEKLVF
1,WB11,Clone,28,CASSFLGTGLNEQYF,rank_1,HCV-K1Y,2,YLVALGINAV,APC,A02:01,38-2/DV8,CAYRSPPSSEKLVF
2,WC11,Clone,28,CASSFLGTGLNEQYF,rank_1,HCV-K1Y,2,YLVALGINAV,APC,A02:01,38-2/DV8,CAYRSPPSSEKLVF
3,WE11,Clone,28,CASSFLGTGLNEQYF,rank_1,HCV-K1Y,2,YLVALGINAV,APC,A02:01,38-2/DV8,CAYRSPPSSEKLVF
4,WF11,Clone,28,CASSFLGTGLNEQYF,rank_1,HCV-K1Y,2,YLVALGINAV,APC,A02:01,38-2/DV8,CAYRSPPSSEKLVF
...,...,...,...,...,...,...,...,...,...,...,...,...
613,UC6,Self_Naive,6-5,CASSYSQGVYTGELFF,rank_1,MART1-A2L,2,ELAGIGILTV,PE,A02:01,25,CAGYKLVF
734,WB5,Self_Naive,19,CASSTTEAYEQYF,rank_1,GP100-IMD,2,IMDQVPFSV,PE,A02:01,8-3,CAGGPYNTDKLIF
745,WC6,Self_Naive,9,CASSALAGGQADTQYF,rank_1,ZNT8-LLS,2,LLSLFSLWL,PE,A02:01,26-1,CIVRVECMYSGGGADGLTF
764,TG1,Foreign_Naive,,,rank_2,HAFP-FMN,2,FMNKFIYEI,PE,A02:01,12-2,CAVSNQGGKLIF


Interesują nas oryginały puste, oryginały pełne, oraz duplikaty pełne.

In [59]:
Exp2_final["TRAV"].unique()

array(['38-2/DV8', '12-2', nan, '12-1', '10', '39', '24', '14/DV4', '27',
       '25', '29/DV5', '17', '8-3', '4', '8-2', '20', '19', '3', '26-2',
       '35', '38-1', '12-3', '8-4', '9-2', '41', '16', '1-2', '8-1', '5',
       '26-1'], dtype=object)

In [60]:
Exp2_final["TRBV"].unique()

array(['28', '10-3', '15', '6-1', '4-1', nan, '27', '20-1', '11-2', '9',
       '13', '5-5', '19', '25-1', '3-1', '2', '6-5', '4-2', '6-2,6-3',
       '5-1', '29-1', '10-2', '7-8', '7-2', '30', '12-3,12-4', '24-1',
       '11-1', '4-3', '5-6', '14', '7-9', '6-4', '18', '5-4', '10-1',
       '12-5', '6-6'], dtype=object)

Czyli tutaj brakuje końcówek *01

In [61]:
# Renumber rows 0..n-1 after the filtering above left gaps in the index.
Exp2_final = Exp2_final.reset_index(drop=True)

In [62]:
# Persist the merged experiment-2 table; compression is inferred from the
# .gz suffix and the row index is written as the first column.
exp2_out_path = "../../../data/processed/zhang/Exp2_merged.csv.gz"
Exp2_final.to_csv(exp2_out_path)

<b>ST5 - Experiment 3</b>

In [63]:
# Load the original ST5 spreadsheet (experiment 3) as-is; header clean-up happens below.
st5_path = "../../../data/original/zhang/ST5.xlsx"
ST5 = pd.read_excel(st5_path)

In [64]:
# Flat replacement labels for the 13 spreadsheet columns: two cell descriptors,
# the five ranked peptide slots, then the TRAV/CDR3α pairs and TRBV/CDR3β.
rank_slots = [f"rank_{i}" for i in range(1, 6)]
new_columns = (
    ["Cell Name", "Sorted Population"]
    + rank_slots
    + ["TRAV1", "CDR3α,1", "TRAV2", "CDR3α,2", "TRBV", "CDR3β"]
)

In [65]:
# Overwrite the labels read from the spreadsheet with the flat names in new_columns
# (raises if the sheet does not have exactly len(new_columns) columns).
ST5.columns = new_columns

In [66]:
# Discard the first two rows and renumber the rest from 0.
# NOTE(review): presumably these rows are leftover header lines of the sheet — confirm.
# ST5 is overwritten, so re-running this cell drops two more rows each time.
ST5 = ST5.iloc[2:]
ST5 = ST5.reset_index(drop=True)

In [67]:
ST5

Unnamed: 0,Cell Name,Sorted Population,rank_1,rank_2,rank_3,rank_4,rank_5,TRAV1,"CDR3α,1",TRAV2,"CDR3α,2",TRBV,CDR3β
0,BA1,Neo+WT+,GANAB,GANAB-S5F,0,0,0,,,,,29-1*01,CSVPEGNTGELFF
1,BA10,Neo+WT+,HCV-KLV,0,0,0,0,,,,,7-9*01,CASSLEGEQYF
2,BA11,Neo+WT+,NSDHL-A9V,NSDHL,0,0,0,14/DV4*01,CAMRESNTGGFKTIF,,,,
3,BA2,Neo+WT+,SMARCD3,SMARCD3-H8Y,0,0,0,5*01,CAVYNTDKLIF,,,4-1*01,CASSQGALGYTF
4,BA3,Neo+WT+,USP28,0,0,0,0,13-2*01/13-2*02,GGTSYGKLTF,,,"12-3*01,12-4*01",CASSFPDRGQGVYGYTF
...,...,...,...,...,...,...,...,...,...,...,...,...,...
268,CC8,Neo-WT+,SEC24A,0,0,0,0,,,,,,
269,CD6,Neo-WT+,GANAB,0,0,0,0,12-2*01,CAVNNARLMF,,,4-3*01,CASSQGGGGTDTQYF
270,CH11,Neo-WT+,GANAB,0,0,0,0,12-2*01,CAVNNARLMF,,,4-3*01,CASSQGGGGTDTQYF
271,CH4,Neo-WT+,SEC24A,0,0,0,0,14/DV4*01,CAMREFYSGGGADGLTF,,,2*01,CASSEDRGNSPLHF


In [68]:
# Identifier columns for the melt: everything except the five rank slots,
# kept in their original order.
rank_slot_names = {"rank_1", "rank_2", "rank_3", "rank_4", "rank_5"}
remaining_cols = [name for name in ST5.columns if name not in rank_slot_names]
remaining_cols

['Cell Name',
 'Sorted Population',
 'TRAV1',
 'CDR3α,1',
 'TRAV2',
 'CDR3α,2',
 'TRBV',
 'CDR3β']

In [69]:
# Wide -> long: one row per (cell, rank slot). The slot label lands in the
# default 'variable' column and the slot content in 'peptide_name'.
rank_slot_cols = ["rank_1", "rank_2", "rank_3", "rank_4", "rank_5"]
ST5 = pd.melt(
    ST5,
    id_vars=remaining_cols,
    value_vars=rank_slot_cols,
    value_name='peptide_name',
)

In [70]:
# Give melt's default 'variable' column a meaningful name.
ST5 = ST5.rename({'variable': 'peptide_rank'}, axis="columns")

In [71]:
ST5.columns

Index(['Cell Name', 'Sorted Population', 'TRAV1', 'CDR3α,1', 'TRAV2',
       'CDR3α,2', 'TRBV', 'CDR3β', 'peptide_rank', 'peptide_name'],
      dtype='object')

In [72]:
# Tag every row with its experiment number (still a string here; it is cast
# to int64 in the following cell).
ST5 = ST5.assign(no_experiment="3")

In [73]:
# Cast the experiment tag to int64 so it can be used as a merge key against ST1.
ST5["no_experiment"] = ST5["no_experiment"].astype("int64")

In [74]:
# Attach peptide sequence, fluorescence and HLA type from ST1.
# Inner join: rank slots whose peptide_name has no ST1 entry for this
# experiment (e.g. the 0 placeholder for "no binding peptide") are dropped.
merge_keys = ["peptide_name", "no_experiment"]
Exp3 = pd.merge(ST5, ST1, how="inner", on=merge_keys)

In [75]:
Exp3

Unnamed: 0,Cell Name,Sorted Population,TRAV1,"CDR3α,1",TRAV2,"CDR3α,2",TRBV,CDR3β,peptide_rank,peptide_name,no_experiment,peptide_seq,tetramer_fluorescence,HLA_type
0,BA1,Neo+WT+,,,,,29-1*01,CSVPEGNTGELFF,rank_1,GANAB,3,ALYGSVPVL,PE,A02:01
1,BA4,Neo-WT+,12-2*01,CAVNNARLMF,,,4-3*01,CASSQGGGGTDTQYF,rank_1,GANAB,3,ALYGSVPVL,PE,A02:01
2,BA7,Neo-WT+,12-2*01,CAIEGGKLIF,,,2*01,CASSDWGGETQYF,rank_1,GANAB,3,ALYGSVPVL,PE,A02:01
3,BB2,Neo-WT+,12-2*01,CAVNNARLMF,,,4-3*01,CASSQGGGGTDTQYF,rank_1,GANAB,3,ALYGSVPVL,PE,A02:01
4,BB3,Neo-WT+,12-3*01,CAMKDFGNEKLTF,,,2*01,CSWDFQETQYF,rank_1,GANAB,3,ALYGSVPVL,PE,A02:01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,BB11,Neo+WT+,25*01,CAGRKTSYDKVIF,,,4-3*01,CASSYASTGTLNYGYTF,rank_2,SEC24A,3,FLYNPLTRV,PE,A02:01
98,BG4,Neo-WT+,23/DV6*01,CAASLNTNAGKSTF,,,30*01,CAWSVGNYGYTF,rank_1,COL18A1,3,VLLGVKLSGV,PE,A02:01
99,BH3,Neo-WT+,13-2*01,CAENKDDYKLSF,,,7-8*01,CASSFSATGELFF,rank_1,SNX24,3,KLSHQPVLL,PE,A02:01
100,CH6,Neo+WT+,12-1*01,CVVSPYNQGGKLIF,,,7-9*01,CASSLDIGDQPQHF,rank_2,GNL3L,3,NLNRCSVPV,PE,A02:01


In [76]:
# Switch to snake_case / ASCII column names for the columns processed below.
tidy_names = {
    "Cell Name": "cell_name",
    "Sorted Population": "sorted_population",
    "CDR3α,1": "CDR3alpha_1",
    "CDR3α,2": "CDR3alpha_2",
    "CDR3β": "CDR3beta",
}
Exp3 = Exp3.rename(columns=tidy_names)

Mam dwa warianty łańcucha TRAV -> chcę zrobić z tego wiersze

In [77]:
# Identifier columns for the alpha-chain melts: everything except the four
# TRAV / CDR3alpha columns, in original order.
alpha_chain_cols = {'TRAV1', 'TRAV2', 'CDR3alpha_1', 'CDR3alpha_2'}
remaining_cols = [name for name in Exp3.columns if name not in alpha_chain_cols]
remaining_cols

['cell_name',
 'sorted_population',
 'TRBV',
 'CDR3beta',
 'peptide_rank',
 'peptide_name',
 'no_experiment',
 'peptide_seq',
 'tetramer_fluorescence',
 'HLA_type']

In [78]:
Exp3.cell_name.duplicated().sum()

3

In [79]:
# Stack TRAV1 and TRAV2 into a single TRAV column: all TRAV1 rows first,
# then all TRAV2 rows, each in the row order of Exp3.
trav_cols = ['TRAV1', 'TRAV2']
Exp3_trav = pd.melt(Exp3, id_vars=remaining_cols, value_vars=trav_cols, value_name='TRAV')
Exp3_trav

Unnamed: 0,cell_name,sorted_population,TRBV,CDR3beta,peptide_rank,peptide_name,no_experiment,peptide_seq,tetramer_fluorescence,HLA_type,variable,TRAV
0,BA1,Neo+WT+,29-1*01,CSVPEGNTGELFF,rank_1,GANAB,3,ALYGSVPVL,PE,A02:01,TRAV1,
1,BA4,Neo-WT+,4-3*01,CASSQGGGGTDTQYF,rank_1,GANAB,3,ALYGSVPVL,PE,A02:01,TRAV1,12-2*01
2,BA7,Neo-WT+,2*01,CASSDWGGETQYF,rank_1,GANAB,3,ALYGSVPVL,PE,A02:01,TRAV1,12-2*01
3,BB2,Neo-WT+,4-3*01,CASSQGGGGTDTQYF,rank_1,GANAB,3,ALYGSVPVL,PE,A02:01,TRAV1,12-2*01
4,BB3,Neo-WT+,2*01,CSWDFQETQYF,rank_1,GANAB,3,ALYGSVPVL,PE,A02:01,TRAV1,12-3*01
...,...,...,...,...,...,...,...,...,...,...,...,...
199,BB11,Neo+WT+,4-3*01,CASSYASTGTLNYGYTF,rank_2,SEC24A,3,FLYNPLTRV,PE,A02:01,TRAV2,
200,BG4,Neo-WT+,30*01,CAWSVGNYGYTF,rank_1,COL18A1,3,VLLGVKLSGV,PE,A02:01,TRAV2,
201,BH3,Neo-WT+,7-8*01,CASSFSATGELFF,rank_1,SNX24,3,KLSHQPVLL,PE,A02:01,TRAV2,
202,CH6,Neo+WT+,7-9*01,CASSLDIGDQPQHF,rank_2,GNL3L,3,NLNRCSVPV,PE,A02:01,TRAV2,


In [80]:
# Same stacking for the CDR3alpha sequences; the row order matches the TRAV
# melt because both start from Exp3 with the same id_vars.
cdr3alpha_cols = ['CDR3alpha_1', 'CDR3alpha_2']
Exp3_cdr3alpha = pd.melt(Exp3, id_vars=remaining_cols, value_vars=cdr3alpha_cols, value_name='CDR3alpha')
Exp3_cdr3alpha

Unnamed: 0,cell_name,sorted_population,TRBV,CDR3beta,peptide_rank,peptide_name,no_experiment,peptide_seq,tetramer_fluorescence,HLA_type,variable,CDR3alpha
0,BA1,Neo+WT+,29-1*01,CSVPEGNTGELFF,rank_1,GANAB,3,ALYGSVPVL,PE,A02:01,CDR3alpha_1,
1,BA4,Neo-WT+,4-3*01,CASSQGGGGTDTQYF,rank_1,GANAB,3,ALYGSVPVL,PE,A02:01,CDR3alpha_1,CAVNNARLMF
2,BA7,Neo-WT+,2*01,CASSDWGGETQYF,rank_1,GANAB,3,ALYGSVPVL,PE,A02:01,CDR3alpha_1,CAIEGGKLIF
3,BB2,Neo-WT+,4-3*01,CASSQGGGGTDTQYF,rank_1,GANAB,3,ALYGSVPVL,PE,A02:01,CDR3alpha_1,CAVNNARLMF
4,BB3,Neo-WT+,2*01,CSWDFQETQYF,rank_1,GANAB,3,ALYGSVPVL,PE,A02:01,CDR3alpha_1,CAMKDFGNEKLTF
...,...,...,...,...,...,...,...,...,...,...,...,...
199,BB11,Neo+WT+,4-3*01,CASSYASTGTLNYGYTF,rank_2,SEC24A,3,FLYNPLTRV,PE,A02:01,CDR3alpha_2,
200,BG4,Neo-WT+,30*01,CAWSVGNYGYTF,rank_1,COL18A1,3,VLLGVKLSGV,PE,A02:01,CDR3alpha_2,
201,BH3,Neo-WT+,7-8*01,CASSFSATGELFF,rank_1,SNX24,3,KLSHQPVLL,PE,A02:01,CDR3alpha_2,
202,CH6,Neo+WT+,7-9*01,CASSLDIGDQPQHF,rank_2,GNL3L,3,NLNRCSVPV,PE,A02:01,CDR3alpha_2,


In [81]:
# The melt bookkeeping column ('TRAV1'/'TRAV2', 'CDR3alpha_1'/'CDR3alpha_2')
# is no longer needed once the values are stacked.
Exp3_trav = Exp3_trav.drop(columns=['variable'])
Exp3_cdr3alpha = Exp3_cdr3alpha.drop(columns=['variable'])

In [82]:
# The two melts are glued together by row position in the next step, so every
# identifier column must line up row for row. Comparing cell_name alone is not
# enough: a cell occurs once per matched peptide rank, so rows of the same cell
# could be swapped without being noticed. equals() treats NaNs in the same
# position as equal.
assert Exp3_trav[remaining_cols].equals(Exp3_cdr3alpha[remaining_cols]), \
    "TRAV and CDR3alpha melts are not row-aligned"

In [83]:
# Put the stacked CDR3alpha values next to the stacked TRAV values
# (index-aligned column-wise concat).
cdr3alpha_values = Exp3_cdr3alpha['CDR3alpha']
Exp3_final = pd.concat([Exp3_trav, cdr3alpha_values], axis="columns")
Exp3_final

Unnamed: 0,cell_name,sorted_population,TRBV,CDR3beta,peptide_rank,peptide_name,no_experiment,peptide_seq,tetramer_fluorescence,HLA_type,TRAV,CDR3alpha
0,BA1,Neo+WT+,29-1*01,CSVPEGNTGELFF,rank_1,GANAB,3,ALYGSVPVL,PE,A02:01,,
1,BA4,Neo-WT+,4-3*01,CASSQGGGGTDTQYF,rank_1,GANAB,3,ALYGSVPVL,PE,A02:01,12-2*01,CAVNNARLMF
2,BA7,Neo-WT+,2*01,CASSDWGGETQYF,rank_1,GANAB,3,ALYGSVPVL,PE,A02:01,12-2*01,CAIEGGKLIF
3,BB2,Neo-WT+,4-3*01,CASSQGGGGTDTQYF,rank_1,GANAB,3,ALYGSVPVL,PE,A02:01,12-2*01,CAVNNARLMF
4,BB3,Neo-WT+,2*01,CSWDFQETQYF,rank_1,GANAB,3,ALYGSVPVL,PE,A02:01,12-3*01,CAMKDFGNEKLTF
...,...,...,...,...,...,...,...,...,...,...,...,...
199,BB11,Neo+WT+,4-3*01,CASSYASTGTLNYGYTF,rank_2,SEC24A,3,FLYNPLTRV,PE,A02:01,,
200,BG4,Neo-WT+,30*01,CAWSVGNYGYTF,rank_1,COL18A1,3,VLLGVKLSGV,PE,A02:01,,
201,BH3,Neo-WT+,7-8*01,CASSFSATGELFF,rank_1,SNX24,3,KLSHQPVLL,PE,A02:01,,
202,CH6,Neo+WT+,7-9*01,CASSLDIGDQPQHF,rank_2,GNL3L,3,NLNRCSVPV,PE,A02:01,,


Tutaj sprawdzamy w obrębie remaining_cols czy mamy jakieś duplikaty. Potem sprawdzamy czy gdzieś występuje NaN zestaw TRAV & CDR3alpha.

In [84]:
# Rows whose identifier columns repeat an earlier row, i.e. the second
# alpha-chain copy produced by the melt.
mask_remaining = Exp3_final.duplicated(subset=remaining_cols)
# Rows that carry no alpha-chain information at all.
mask_melted = Exp3_final[['TRAV', 'CDR3alpha']].isna().all(axis=1)

# A repeated row with an empty alpha chain adds nothing. Keep empty originals,
# filled originals and filled repeats.
mask_redundant = mask_remaining & mask_melted

# .values -> plain boolean array, so the filter is positional.
Exp3_final = Exp3_final[~mask_redundant.values]
Exp3_final

Unnamed: 0,cell_name,sorted_population,TRBV,CDR3beta,peptide_rank,peptide_name,no_experiment,peptide_seq,tetramer_fluorescence,HLA_type,TRAV,CDR3alpha
0,BA1,Neo+WT+,29-1*01,CSVPEGNTGELFF,rank_1,GANAB,3,ALYGSVPVL,PE,A02:01,,
1,BA4,Neo-WT+,4-3*01,CASSQGGGGTDTQYF,rank_1,GANAB,3,ALYGSVPVL,PE,A02:01,12-2*01,CAVNNARLMF
2,BA7,Neo-WT+,2*01,CASSDWGGETQYF,rank_1,GANAB,3,ALYGSVPVL,PE,A02:01,12-2*01,CAIEGGKLIF
3,BB2,Neo-WT+,4-3*01,CASSQGGGGTDTQYF,rank_1,GANAB,3,ALYGSVPVL,PE,A02:01,12-2*01,CAVNNARLMF
4,BB3,Neo-WT+,2*01,CSWDFQETQYF,rank_1,GANAB,3,ALYGSVPVL,PE,A02:01,12-3*01,CAMKDFGNEKLTF
...,...,...,...,...,...,...,...,...,...,...,...,...
156,BC6,Neo-WT+,9*01,CASSVETGGLDTQYF,rank_1,FNDC3B,3,VVLSWAPPV,PE,A02:01,12-1*01,CVVISTDSWGKFQF
168,CC6,Neo+WT+,9*01,CASSVETGGLDTQYF,rank_2,MLL2,3,ALSPVIPLI,PE,A02:01,17*01,CATASLRNYGQNFVF
180,BG1,Neo+WT+,"12-3*01,12-4*01",CASSPDPYEQYF,rank_1,MRM1,3,LLFGMTPCL,PE,A02:01,13-2*01,CAVVMEYGNKLVF
185,CB9,Neo-WT+,3-1*01,CASSLAYEQYF,rank_1,PGM5,3,AVGSHVYSV,PE,A02:01,9-2*01,CAPPIEGGSEKLVF


Interesują nas oryginały puste, oryginały pełne, oraz duplikaty pełne.

In [85]:
Exp3_final["TRAV"].unique()

array([nan, '12-2*01', '12-3*01', '5*01', '8-1*01', '13-2*01/13-2*02',
       '8-3*01', '38-2/DV8*01', '9-2*01', '14/DV4*01', '38-1*01', '19*01',
       '3*01', '29/DV5*01', '17*01', '41*01 F', '1-2*01', '24*01',
       '21*01', '10*01', '8-4*01', '41*01', '35*01', '12-1*01', '22*01',
       '22*01 F', '25*01', '23/DV6*01', '13-2*01'], dtype=object)

In [86]:
Exp3_final["TRBV"].unique()

array(['29-1*01', '4-3*01', '2*01', nan, '6-2*01,6-3*01', '4-1*01',
       '27*01', '12-3*01,12-4*01', '7-2*01', '20-1*01', '7-9*01',
       '5-4*01', '6-6*01', '15*01', '9*01', '7-8*01', '10-3*01', '13*01',
       '6-5*01', '28*01', '19*01', '7-7*01', '5-8*01', '5-5*01', '6-1*01',
       '7-6*01', '4-2*01', '25-1*01', '3-1*01', '18*01', '11-2*01',
       '5-6*01', '30*01'], dtype=object)

In [87]:
# Renumber rows 0..n-1 after the filtering above left gaps in the index.
Exp3_final = Exp3_final.reset_index(drop=True)

In [88]:
# Persist the merged experiment-3 table; compression is inferred from the
# .gz suffix and the row index is written as the first column.
exp3_out_path = "../../../data/processed/zhang/Exp3_merged.csv.gz"
Exp3_final.to_csv(exp3_out_path)

<b>ST6 - Experiment 4</b>

In [89]:
# Load the original ST6 spreadsheet (experiment 4) as-is; header clean-up happens below.
st6_path = "../../../data/original/zhang/ST6.xlsx"
ST6 = pd.read_excel(st6_path)

In [90]:
# Flat replacement labels for the 13 spreadsheet columns: two cell descriptors,
# the five ranked peptide slots, then the TRAV/CDR3α pairs and TRBV/CDR3β.
rank_slots = [f"rank_{i}" for i in range(1, 6)]
new_columns = (
    ["Cell Name", "Sorted Population"]
    + rank_slots
    + ["TRAV1", "CDR3α,1", "TRAV2", "CDR3α,2", "TRBV", "CDR3β"]
)

In [91]:
# Overwrite the labels read from the spreadsheet with the flat names in new_columns
# (raises if the sheet does not have exactly len(new_columns) columns).
ST6.columns = new_columns

In [92]:
# Discard the first two rows and renumber the rest from 0.
# NOTE(review): presumably these rows are leftover header lines of the sheet — confirm.
# ST6 is overwritten, so re-running this cell drops two more rows each time.
ST6 = ST6.iloc[2:]
ST6 = ST6.reset_index(drop=True)

In [93]:
ST6

Unnamed: 0,Cell Name,Sorted Population,rank_1,rank_2,rank_3,rank_4,rank_5,TRAV1,"CDR3α,1",TRAV2,"CDR3α,2",TRBV,CDR3β
0,G23,Neo+WT+,0,0,0,0,0,,,,,,
1,G6,Neo+WT+,0,0,0,0,0,,,,,15*01,CATSQMGDTQYF
2,H10,Neo+WT+,0,0,0,0,0,3*01,CAVGFYGNNRLAF,,,,
3,H9,Neo+WT+,0,0,0,0,0,,,,,"6-2*01,6-3*01",CASSPFGDMLYNEQFF
4,I12,Neo+WT+,0,0,0,0,0,12-2*01,CAVRNNDMRF,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
291,K18,Neo-WT+,SNX24,0,0,0,0,,,,,,
292,H18,Neo-WT+,TEAD1_(VLE),0,0,0,0,24*01,CAFSQYGNKLVF,,,,
293,J3,Neo-WT+,USP28,0,0,0,0,,,,,,
294,H13,Neo-WT+,WDR46,0,0,0,0,,,,,,


In [94]:
# Identifier columns: every column except the five ranked-peptide slots.
remaining_cols = ST6.columns.drop([f"rank_{i}" for i in range(1, 6)]).tolist()
remaining_cols

['Cell Name',
 'Sorted Population',
 'TRAV1',
 'CDR3α,1',
 'TRAV2',
 'CDR3α,2',
 'TRBV',
 'CDR3β']

In [95]:
# Wide -> long: one row per (cell, rank slot). The slot label goes to melt's
# default "variable" column and the peptide name to "peptide_name".
ST6 = pd.melt(
    ST6,
    id_vars=remaining_cols,
    value_vars=[f"rank_{i}" for i in range(1, 6)],
    value_name='peptide_name',
)

In [96]:
ST6 = ST6.rename(columns={'variable':'peptide_rank'})

In [97]:
ST6.columns

Index(['Cell Name', 'Sorted Population', 'TRAV1', 'CDR3α,1', 'TRAV2',
       'CDR3α,2', 'TRBV', 'CDR3β', 'peptide_rank', 'peptide_name'],
      dtype='object')

In [98]:
ST6["no_experiment"] = "4"

In [99]:
ST6 = ST6.astype({"no_experiment": "int64"})

In [100]:
# Attach the ST1 peptide metadata (sequence, tetramer fluorescence, HLA type),
# matching on peptide name and experiment number.
# merge() defaults to an inner join, so rows whose peptide_name has no ST1 entry
# for this experiment are dropped (presumably including the "0" placeholders
# used for empty rank slots - TODO confirm).
Exp4 = ST6.merge(ST1, on = ["peptide_name", "no_experiment"])

In [101]:
Exp4

Unnamed: 0,Cell Name,Sorted Population,TRAV1,"CDR3α,1",TRAV2,"CDR3α,2",TRBV,CDR3β,peptide_rank,peptide_name,no_experiment,peptide_seq,tetramer_fluorescence,HLA_type
0,H23,Neo+WT+,,,,,,,rank_1,AKAP13,4,KLMNIQQQL,PE,A02:01
1,H4,Neo+WT+,,,,,,,rank_1,AKAP13,4,KLMNIQQQL,PE,A02:01
2,K2,Neo+WT+,,,,,29-1*01,CSVEGLRGGNEQFF,rank_1,AKAP13,4,KLMNIQQQL,PE,A02:01
3,G5,Neo-WT+,,,,,,,rank_1,AKAP13,4,KLMNIQQQL,PE,A02:01
4,H21,Neo-WT+,38-2/DV8*01,CAYSPPLVF,,,"6-2*01,6-3*01",CASRGGDGETQYF,rank_1,AKAP13,4,KLMNIQQQL,PE,A02:01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138,J22,Neo-WT+,,,,,,,rank_1,WDR46,4,FLTYLDVSV,PE,A02:01
139,K14,Neo-WT+,,,,,19*01,CATWDSGNIQYF,rank_1,MAGEA3_KVAE,4,KVAELVHFL,PE,A02:01
140,J3,Neo-WT+,,,,,,,rank_1,USP28,4,LIIPCIHLI,PE,A02:01
141,G20,Neo+WT+,,,,,,,rank_4,USP28,4,LIIPCIHLI,PE,A02:01


In [102]:
Exp4 = Exp4.rename(columns={"Cell Name": "cell_name", "Sorted Population": "sorted_population", "CDR3α,1": "CDR3alpha_1", "CDR3α,2":"CDR3alpha_2", "CDR3β": "CDR3beta"})

Each cell can have two TRAV (alpha-chain) variants; we want to turn these columns into rows.

In [103]:
# Identifier columns for the alpha-chain melts: everything except the two
# TRAV / CDR3alpha slots.
remaining_cols = Exp4.columns.drop(['TRAV1', 'TRAV2', 'CDR3alpha_1', 'CDR3alpha_2']).tolist()
remaining_cols

['cell_name',
 'sorted_population',
 'TRBV',
 'CDR3beta',
 'peptide_rank',
 'peptide_name',
 'no_experiment',
 'peptide_seq',
 'tetramer_fluorescence',
 'HLA_type']

In [104]:
Exp4.cell_name.duplicated().sum()

6

In [105]:
Exp4_trav = Exp4.melt(id_vars=remaining_cols, value_vars=['TRAV1', 'TRAV2'], value_name='TRAV')
Exp4_trav

Unnamed: 0,cell_name,sorted_population,TRBV,CDR3beta,peptide_rank,peptide_name,no_experiment,peptide_seq,tetramer_fluorescence,HLA_type,variable,TRAV
0,H23,Neo+WT+,,,rank_1,AKAP13,4,KLMNIQQQL,PE,A02:01,TRAV1,
1,H4,Neo+WT+,,,rank_1,AKAP13,4,KLMNIQQQL,PE,A02:01,TRAV1,
2,K2,Neo+WT+,29-1*01,CSVEGLRGGNEQFF,rank_1,AKAP13,4,KLMNIQQQL,PE,A02:01,TRAV1,
3,G5,Neo-WT+,,,rank_1,AKAP13,4,KLMNIQQQL,PE,A02:01,TRAV1,
4,H21,Neo-WT+,"6-2*01,6-3*01",CASRGGDGETQYF,rank_1,AKAP13,4,KLMNIQQQL,PE,A02:01,TRAV1,38-2/DV8*01
...,...,...,...,...,...,...,...,...,...,...,...,...
281,J22,Neo-WT+,,,rank_1,WDR46,4,FLTYLDVSV,PE,A02:01,TRAV2,
282,K14,Neo-WT+,19*01,CATWDSGNIQYF,rank_1,MAGEA3_KVAE,4,KVAELVHFL,PE,A02:01,TRAV2,
283,J3,Neo-WT+,,,rank_1,USP28,4,LIIPCIHLI,PE,A02:01,TRAV2,
284,G20,Neo+WT+,,,rank_4,USP28,4,LIIPCIHLI,PE,A02:01,TRAV2,


In [106]:
Exp4_cdr3alpha = Exp4.melt(id_vars=remaining_cols, value_vars=['CDR3alpha_1', 'CDR3alpha_2'], value_name='CDR3alpha')
Exp4_cdr3alpha

Unnamed: 0,cell_name,sorted_population,TRBV,CDR3beta,peptide_rank,peptide_name,no_experiment,peptide_seq,tetramer_fluorescence,HLA_type,variable,CDR3alpha
0,H23,Neo+WT+,,,rank_1,AKAP13,4,KLMNIQQQL,PE,A02:01,CDR3alpha_1,
1,H4,Neo+WT+,,,rank_1,AKAP13,4,KLMNIQQQL,PE,A02:01,CDR3alpha_1,
2,K2,Neo+WT+,29-1*01,CSVEGLRGGNEQFF,rank_1,AKAP13,4,KLMNIQQQL,PE,A02:01,CDR3alpha_1,
3,G5,Neo-WT+,,,rank_1,AKAP13,4,KLMNIQQQL,PE,A02:01,CDR3alpha_1,
4,H21,Neo-WT+,"6-2*01,6-3*01",CASRGGDGETQYF,rank_1,AKAP13,4,KLMNIQQQL,PE,A02:01,CDR3alpha_1,CAYSPPLVF
...,...,...,...,...,...,...,...,...,...,...,...,...
281,J22,Neo-WT+,,,rank_1,WDR46,4,FLTYLDVSV,PE,A02:01,CDR3alpha_2,
282,K14,Neo-WT+,19*01,CATWDSGNIQYF,rank_1,MAGEA3_KVAE,4,KVAELVHFL,PE,A02:01,CDR3alpha_2,
283,J3,Neo-WT+,,,rank_1,USP28,4,LIIPCIHLI,PE,A02:01,CDR3alpha_2,
284,G20,Neo+WT+,,,rank_4,USP28,4,LIIPCIHLI,PE,A02:01,CDR3alpha_2,


In [107]:
# The melt bookkeeping column is not needed once the values are stacked.
Exp4_trav = Exp4_trav.drop(columns=['variable'])
Exp4_cdr3alpha = Exp4_cdr3alpha.drop(columns=['variable'])

In [108]:
# Guard for the column-wise concat that follows: both melts must list the cells
# in exactly the same row order, otherwise TRAV and CDR3alpha would be mismatched.
assert (Exp4_trav.cell_name == Exp4_cdr3alpha.cell_name).all()

In [109]:
# Place the melted CDR3alpha values next to the melted TRAV values.
# concat(axis=1) aligns on the row index; the first half of the rows holds the
# TRAV1/CDR3alpha_1 slot and the second half the TRAV2/CDR3alpha_2 slot.
Exp4_final = pd.concat([Exp4_trav,Exp4_cdr3alpha['CDR3alpha']], axis=1)
Exp4_final

Unnamed: 0,cell_name,sorted_population,TRBV,CDR3beta,peptide_rank,peptide_name,no_experiment,peptide_seq,tetramer_fluorescence,HLA_type,TRAV,CDR3alpha
0,H23,Neo+WT+,,,rank_1,AKAP13,4,KLMNIQQQL,PE,A02:01,,
1,H4,Neo+WT+,,,rank_1,AKAP13,4,KLMNIQQQL,PE,A02:01,,
2,K2,Neo+WT+,29-1*01,CSVEGLRGGNEQFF,rank_1,AKAP13,4,KLMNIQQQL,PE,A02:01,,
3,G5,Neo-WT+,,,rank_1,AKAP13,4,KLMNIQQQL,PE,A02:01,,
4,H21,Neo-WT+,"6-2*01,6-3*01",CASRGGDGETQYF,rank_1,AKAP13,4,KLMNIQQQL,PE,A02:01,38-2/DV8*01,CAYSPPLVF
...,...,...,...,...,...,...,...,...,...,...,...,...
281,J22,Neo-WT+,,,rank_1,WDR46,4,FLTYLDVSV,PE,A02:01,,
282,K14,Neo-WT+,19*01,CATWDSGNIQYF,rank_1,MAGEA3_KVAE,4,KVAELVHFL,PE,A02:01,,
283,J3,Neo-WT+,,,rank_1,USP28,4,LIIPCIHLI,PE,A02:01,,
284,G20,Neo+WT+,,,rank_4,USP28,4,LIIPCIHLI,PE,A02:01,,


Here we check for duplicates within remaining_cols. Then we check which rows have both TRAV and CDR3alpha missing (NaN).

In [110]:
# Rows that repeat an earlier row on every identifier column (e.g. the second
# alpha-chain slot of a cell repeats the identifiers of its first slot).
mask_remaining = Exp4_final.duplicated(subset=remaining_cols)
# Rows that carry no alpha-chain information at all.
mask_melted = Exp4_final[['TRAV', 'CDR3alpha']].isna().all(axis=1)

# Redundant = a repeated row that adds no chain information.
mask_redundant = mask_remaining & mask_melted

# Reuse the mask computed above instead of recomputing it, and take an explicit
# copy so later column assignments on Exp4_final cannot raise
# SettingWithCopyWarning.
Exp4_final = Exp4_final.loc[~mask_redundant].copy()
Exp4_final

Unnamed: 0,cell_name,sorted_population,TRBV,CDR3beta,peptide_rank,peptide_name,no_experiment,peptide_seq,tetramer_fluorescence,HLA_type,TRAV,CDR3alpha
0,H23,Neo+WT+,,,rank_1,AKAP13,4,KLMNIQQQL,PE,A02:01,,
1,H4,Neo+WT+,,,rank_1,AKAP13,4,KLMNIQQQL,PE,A02:01,,
2,K2,Neo+WT+,29-1*01,CSVEGLRGGNEQFF,rank_1,AKAP13,4,KLMNIQQQL,PE,A02:01,,
3,G5,Neo-WT+,,,rank_1,AKAP13,4,KLMNIQQQL,PE,A02:01,,
4,H21,Neo-WT+,"6-2*01,6-3*01",CASRGGDGETQYF,rank_1,AKAP13,4,KLMNIQQQL,PE,A02:01,38-2/DV8*01,CAYSPPLVF
...,...,...,...,...,...,...,...,...,...,...,...,...
139,K14,Neo-WT+,19*01,CATWDSGNIQYF,rank_1,MAGEA3_KVAE,4,KVAELVHFL,PE,A02:01,,
140,J3,Neo-WT+,,,rank_1,USP28,4,LIIPCIHLI,PE,A02:01,,
141,G20,Neo+WT+,,,rank_4,USP28,4,LIIPCIHLI,PE,A02:01,,
142,H5,Neo+WT+,,,rank_2,MAGEA12_KMAE,4,KMAELVHFL,APC,A02:01,,


We want to keep empty originals, filled originals, and filled duplicates (only empty duplicates are removed).

In [111]:
Exp4_final["TRAV"].unique()

array([nan, '38-2/DV8*01', '38-1*01', '16*01', '14/DV4*01', '19*01',
       '12-2*01', '29/DV5*01', '12-1*01', '5*01', '13-2*01', '12-3*01',
       '3*01', '9-2*01', '1-2*01', '10*01', '24*01', '17*01', '35*01',
       '22*01', '8-3*01'], dtype=object)

In [112]:
Exp4_final["TRBV"].unique()

array([nan, '29-1*01', '6-2*01,6-3*01', '9*01', '19*01', '13*01', '30*01',
       '6-5*01', '5-4*01', '20-1*01', '3-1*01', '4-2*01', '4-3*01',
       '7-9*01', '4-1*01', '12-3*01,12-4*01', '28*01', '27*01'],
      dtype=object)

In [113]:
Exp4_final = Exp4_final.reset_index(drop=True)

In [114]:
Exp4_final.to_csv("../../../data/processed/zhang/Exp4_merged.csv.gz")

<b>ST8 - Experiment 5</b>

In [115]:
ST8 = pd.read_excel("../../../data/original/zhang/ST8.xlsx")

In [116]:
new_columns = ["Cell Name", "Sorted Population", "rank_1", "rank_2", "rank_3", "rank_4", "rank_5","TRAV1", "CDR3α,1", "TRAV2", "CDR3α,2", "TRBV1", "CDR3β,1", "TRBV2", "CDR3β,2"]

In [117]:
ST8.columns = new_columns

In [118]:
# Discard the first two rows of the sheet and renumber the remaining rows from 0.
# NOTE(review): presumably these rows are leftover header lines from the spreadsheet - confirm against ST8.xlsx.
ST8 = ST8.iloc[2:, :].reset_index(drop=True)

In [119]:
ST8

Unnamed: 0,Cell Name,Sorted Population,rank_1,rank_2,rank_3,rank_4,rank_5,TRAV1,"CDR3α,1",TRAV2,"CDR3α,2",TRBV1,"CDR3β,1",TRBV2,"CDR3β,2"
0,SA1,Clone,HCV-KLV(PE),HCV-KLV(APC),0,0,0,38-2/DV8*01,CAYRSPPSSEKLVF,,,28*01,CASSFLGTGLNEQYF,,
1,SB1,Clone,HCV-KLV(APC),HCV-KLV(PE),0,0,0,38-2/DV8*01,CAYRSPPSSEKLVF,,,28*01,CASSFLGTGLNEQYF,,
2,SC1,Clone,HCV-KLV(APC),HCV-KLV(PE),0,0,0,38-2/DV8*01,CAYRSPPSSEKLVF,,,28*01,CASSFLGTGLNEQYF,,
3,SD1,Clone,0,0,0,0,0,,,,,,,,
4,SE1,Clone,HCV-KLV(APC),HCV-KLV(PE),0,0,0,38-2/DV8*01,CAYRSPPSSEKLVF,,,28*01,CASSFLGTGLNEQYF,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
791,SH10,Neo-WT+,0,0,0,0,0,,,,,,,,
792,SH11,Neo-WT+,0,0,0,0,0,,,,,,,,
793,SH3,Neo-WT+,0,0,0,0,0,,,,,,,,
794,SH5,Neo-WT+,ITIH6-RLG,0,0,0,0,19*01,CALSEDQFYF,,,6-1*01,CASRPGGGSYNEQFF,,


In [120]:
# Identifier columns: every column except the five ranked-peptide slots.
remaining_cols = ST8.columns.drop([f"rank_{i}" for i in range(1, 6)]).tolist()
remaining_cols

['Cell Name',
 'Sorted Population',
 'TRAV1',
 'CDR3α,1',
 'TRAV2',
 'CDR3α,2',
 'TRBV1',
 'CDR3β,1',
 'TRBV2',
 'CDR3β,2']

In [121]:
# Wide -> long: one row per (cell, rank slot). The slot label goes to melt's
# default "variable" column and the peptide name to "peptide_name".
ST8 = pd.melt(
    ST8,
    id_vars=remaining_cols,
    value_vars=[f"rank_{i}" for i in range(1, 6)],
    value_name='peptide_name',
)

In [122]:
ST8 = ST8.rename(columns={'variable':'peptide_rank'})

In [123]:
ST8.columns

Index(['Cell Name', 'Sorted Population', 'TRAV1', 'CDR3α,1', 'TRAV2',
       'CDR3α,2', 'TRBV1', 'CDR3β,1', 'TRBV2', 'CDR3β,2', 'peptide_rank',
       'peptide_name'],
      dtype='object')

In [124]:
ST8["no_experiment"] = "5"

In [125]:
ST8 = ST8.astype({"no_experiment": "int64"})

In [126]:
# Attach the ST1 peptide metadata (sequence, tetramer fluorescence, HLA type),
# matching on peptide name and experiment number.
# merge() defaults to an inner join, so rows whose peptide_name has no ST1 entry
# for this experiment are dropped (presumably including the "0" placeholders
# used for empty rank slots - TODO confirm).
Exp5 = ST8.merge(ST1, on = ["peptide_name", "no_experiment"])

In [127]:
Exp5

Unnamed: 0,Cell Name,Sorted Population,TRAV1,"CDR3α,1",TRAV2,"CDR3α,2",TRBV1,"CDR3β,1",TRBV2,"CDR3β,2",peptide_rank,peptide_name,no_experiment,peptide_seq,tetramer_fluorescence,HLA_type
0,GA10,Neo+WT+,8-3*01,CAVGAEDSNYQLIW,,,"6-2*01,6-3*01",CASSYSWGEQFF,,,rank_1,FNDC3B-VVL_L3M,5,VVMSWAPPV,PE,A02:01
1,GC5,Neo+WT+,29/DV5*01,CAASATGGTSYGKLTF,,,,,,,rank_1,FNDC3B-VVL_L3M,5,VVMSWAPPV,PE,A02:01
2,GE11,Neo+WT+,29/DV5*01,CAASHGSSNTGKLIF,,,,,,,rank_1,FNDC3B-VVL_L3M,5,VVMSWAPPV,PE,A02:01
3,JA4,Neo+WT+,20*01,CAVLTSGYSTLTF,,,13*01,CASSPMTGAEQFF,,,rank_1,FNDC3B-VVL_L3M,5,VVMSWAPPV,PE,A02:01
4,LE10,Neo+WT+,8-6*01,CAVTDNNAGNMLTF,,,7-3*01,CASSFGPGYEQYF,,,rank_1,FNDC3B-VVL_L3M,5,VVMSWAPPV,PE,A02:01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
770,JG8,Neo+WT+,10*01,CVVRGGYNKLIF,,,5-4*01,CASSSDRGEQFF,,,rank_3,C17orf75-ALS,5,ALSYTPVEV,APC,A02:01
771,LC5,Neo+WT+,5*01,CAETLYNQGGKLIF,,,9*01,CASSGRQGIDTEAFF,,,rank_3,TRIM58-YMV,5,YMVLASPSV,APC,A02:01
772,LH2,Neo+WT+,,,,,,,,,rank_3,ZDHHC17-LLL_T4I,5,LLLIFNVSV,PE,A02:01
773,MA3,Neo+WT+,,,,,7-9*01,CASSLAYRPYEQYF,,,rank_3,EXOC3L4-ILL_V9I,5,ILLDWAANI,PE,A02:01


In [128]:
Exp5 = Exp5.rename(columns={"Cell Name": "cell_name", "Sorted Population": "sorted_population", "CDR3α,1": "CDR3alpha_1", "CDR3α,2":"CDR3alpha_2", "CDR3β,1": "CDR3beta_1", "CDR3β,2": "CDR3beta_2"})

In [129]:
# Identifier columns for the chain melts: everything except the two slots of
# each alpha/beta V-gene and CDR3 column.
remaining_cols = Exp5.columns.drop(
    ['TRAV1', 'TRAV2', 'TRBV1', 'TRBV2', 'CDR3alpha_1', 'CDR3alpha_2', 'CDR3beta_1', 'CDR3beta_2']
).tolist()
remaining_cols

['cell_name',
 'sorted_population',
 'peptide_rank',
 'peptide_name',
 'no_experiment',
 'peptide_seq',
 'tetramer_fluorescence',
 'HLA_type']

In [130]:
Exp5.cell_name.duplicated().sum()

208

In [131]:
# Keep a single row per cell: drop_duplicates retains the first occurrence of
# each cell_name and removes the rest (208 rows according to the count above).
# NOTE(review): this discards any further peptides merged for the same cell; the
# Experiment 4 and Experiment 6 sections do not apply this step - confirm that
# dropping them is intended here.
Exp5 = Exp5.drop_duplicates(subset='cell_name')

In [132]:
Exp5_trav = Exp5.melt(id_vars=remaining_cols, value_vars=['TRAV1', 'TRAV2'], value_name='TRAV')
Exp5_trav

Unnamed: 0,cell_name,sorted_population,peptide_rank,peptide_name,no_experiment,peptide_seq,tetramer_fluorescence,HLA_type,variable,TRAV
0,GA10,Neo+WT+,rank_1,FNDC3B-VVL_L3M,5,VVMSWAPPV,PE,A02:01,TRAV1,8-3*01
1,GC5,Neo+WT+,rank_1,FNDC3B-VVL_L3M,5,VVMSWAPPV,PE,A02:01,TRAV1,29/DV5*01
2,GE11,Neo+WT+,rank_1,FNDC3B-VVL_L3M,5,VVMSWAPPV,PE,A02:01,TRAV1,29/DV5*01
3,JA4,Neo+WT+,rank_1,FNDC3B-VVL_L3M,5,VVMSWAPPV,PE,A02:01,TRAV1,20*01
4,LE10,Neo+WT+,rank_1,FNDC3B-VVL_L3M,5,VVMSWAPPV,PE,A02:01,TRAV1,8-6*01
...,...,...,...,...,...,...,...,...,...,...
1129,NE3,Neo-WT+,rank_1,GABRG3-TAM,5,TAMDLFVTV,APC,A02:01,TRAV2,
1130,OE9,Neo-WT+,rank_1,PELP1-LVL,5,LVLPLVMGV,APC,A02:01,TRAV2,
1131,OH7,Neo-WT+,rank_1,ST6GALNAC2-LLF,5,LLFALYFSA,APC,A02:01,TRAV2,
1132,SA3,Neo-WT+,rank_1,ERBB2-ALI,5,ALIHHNTHL,APC,A02:01,TRAV2,


In [133]:
Exp5_cdr3alpha = Exp5.melt(id_vars=remaining_cols, value_vars=['CDR3alpha_1', 'CDR3alpha_2'], value_name='CDR3alpha')
Exp5_cdr3alpha

Unnamed: 0,cell_name,sorted_population,peptide_rank,peptide_name,no_experiment,peptide_seq,tetramer_fluorescence,HLA_type,variable,CDR3alpha
0,GA10,Neo+WT+,rank_1,FNDC3B-VVL_L3M,5,VVMSWAPPV,PE,A02:01,CDR3alpha_1,CAVGAEDSNYQLIW
1,GC5,Neo+WT+,rank_1,FNDC3B-VVL_L3M,5,VVMSWAPPV,PE,A02:01,CDR3alpha_1,CAASATGGTSYGKLTF
2,GE11,Neo+WT+,rank_1,FNDC3B-VVL_L3M,5,VVMSWAPPV,PE,A02:01,CDR3alpha_1,CAASHGSSNTGKLIF
3,JA4,Neo+WT+,rank_1,FNDC3B-VVL_L3M,5,VVMSWAPPV,PE,A02:01,CDR3alpha_1,CAVLTSGYSTLTF
4,LE10,Neo+WT+,rank_1,FNDC3B-VVL_L3M,5,VVMSWAPPV,PE,A02:01,CDR3alpha_1,CAVTDNNAGNMLTF
...,...,...,...,...,...,...,...,...,...,...
1129,NE3,Neo-WT+,rank_1,GABRG3-TAM,5,TAMDLFVTV,APC,A02:01,CDR3alpha_2,
1130,OE9,Neo-WT+,rank_1,PELP1-LVL,5,LVLPLVMGV,APC,A02:01,CDR3alpha_2,
1131,OH7,Neo-WT+,rank_1,ST6GALNAC2-LLF,5,LLFALYFSA,APC,A02:01,CDR3alpha_2,
1132,SA3,Neo-WT+,rank_1,ERBB2-ALI,5,ALIHHNTHL,APC,A02:01,CDR3alpha_2,


In [134]:
Exp5_trbv = Exp5.melt(id_vars=remaining_cols, value_vars=['TRBV1', 'TRBV2'], value_name='TRBV')
Exp5_trbv

Unnamed: 0,cell_name,sorted_population,peptide_rank,peptide_name,no_experiment,peptide_seq,tetramer_fluorescence,HLA_type,variable,TRBV
0,GA10,Neo+WT+,rank_1,FNDC3B-VVL_L3M,5,VVMSWAPPV,PE,A02:01,TRBV1,"6-2*01,6-3*01"
1,GC5,Neo+WT+,rank_1,FNDC3B-VVL_L3M,5,VVMSWAPPV,PE,A02:01,TRBV1,
2,GE11,Neo+WT+,rank_1,FNDC3B-VVL_L3M,5,VVMSWAPPV,PE,A02:01,TRBV1,
3,JA4,Neo+WT+,rank_1,FNDC3B-VVL_L3M,5,VVMSWAPPV,PE,A02:01,TRBV1,13*01
4,LE10,Neo+WT+,rank_1,FNDC3B-VVL_L3M,5,VVMSWAPPV,PE,A02:01,TRBV1,7-3*01
...,...,...,...,...,...,...,...,...,...,...
1129,NE3,Neo-WT+,rank_1,GABRG3-TAM,5,TAMDLFVTV,APC,A02:01,TRBV2,
1130,OE9,Neo-WT+,rank_1,PELP1-LVL,5,LVLPLVMGV,APC,A02:01,TRBV2,
1131,OH7,Neo-WT+,rank_1,ST6GALNAC2-LLF,5,LLFALYFSA,APC,A02:01,TRBV2,
1132,SA3,Neo-WT+,rank_1,ERBB2-ALI,5,ALIHHNTHL,APC,A02:01,TRBV2,


In [135]:
Exp5_cdr3beta = Exp5.melt(id_vars=remaining_cols, value_vars=['CDR3beta_1', 'CDR3beta_2'], value_name='CDR3beta')
Exp5_cdr3beta

Unnamed: 0,cell_name,sorted_population,peptide_rank,peptide_name,no_experiment,peptide_seq,tetramer_fluorescence,HLA_type,variable,CDR3beta
0,GA10,Neo+WT+,rank_1,FNDC3B-VVL_L3M,5,VVMSWAPPV,PE,A02:01,CDR3beta_1,CASSYSWGEQFF
1,GC5,Neo+WT+,rank_1,FNDC3B-VVL_L3M,5,VVMSWAPPV,PE,A02:01,CDR3beta_1,
2,GE11,Neo+WT+,rank_1,FNDC3B-VVL_L3M,5,VVMSWAPPV,PE,A02:01,CDR3beta_1,
3,JA4,Neo+WT+,rank_1,FNDC3B-VVL_L3M,5,VVMSWAPPV,PE,A02:01,CDR3beta_1,CASSPMTGAEQFF
4,LE10,Neo+WT+,rank_1,FNDC3B-VVL_L3M,5,VVMSWAPPV,PE,A02:01,CDR3beta_1,CASSFGPGYEQYF
...,...,...,...,...,...,...,...,...,...,...
1129,NE3,Neo-WT+,rank_1,GABRG3-TAM,5,TAMDLFVTV,APC,A02:01,CDR3beta_2,
1130,OE9,Neo-WT+,rank_1,PELP1-LVL,5,LVLPLVMGV,APC,A02:01,CDR3beta_2,
1131,OH7,Neo-WT+,rank_1,ST6GALNAC2-LLF,5,LLFALYFSA,APC,A02:01,CDR3beta_2,
1132,SA3,Neo-WT+,rank_1,ERBB2-ALI,5,ALIHHNTHL,APC,A02:01,CDR3beta_2,


In [136]:
# The melt bookkeeping column is not needed once the values are stacked.
Exp5_trav = Exp5_trav.drop(columns=['variable'])
Exp5_cdr3alpha = Exp5_cdr3alpha.drop(columns=['variable'])
Exp5_trbv = Exp5_trbv.drop(columns=['variable'])
Exp5_cdr3beta = Exp5_cdr3beta.drop(columns=['variable'])

In [137]:
# Guard for the column-wise concat that follows. All four melted frames are
# combined by row position, so each of them (not only the CDR3alpha frame) must
# list the cells in the same order as Exp5_trav.
assert all(
    (Exp5_trav.cell_name == melted.cell_name).all()
    for melted in (Exp5_cdr3alpha, Exp5_trbv, Exp5_cdr3beta)
)

In [138]:
# Column-wise concat of the four melts, aligned on their shared row index: the
# first half of the rows carries the first slot (TRAV1, CDR3alpha_1, TRBV1,
# CDR3beta_1) and the second half the second slot.
# NOTE(review): alpha slot k is paired only with beta slot k; other alpha/beta
# combinations are not generated - confirm that this is intended.
Exp5_final = pd.concat([Exp5_trav,Exp5_cdr3alpha['CDR3alpha'],Exp5_trbv['TRBV'],Exp5_cdr3beta['CDR3beta']], axis=1)
Exp5_final

Unnamed: 0,cell_name,sorted_population,peptide_rank,peptide_name,no_experiment,peptide_seq,tetramer_fluorescence,HLA_type,TRAV,CDR3alpha,TRBV,CDR3beta
0,GA10,Neo+WT+,rank_1,FNDC3B-VVL_L3M,5,VVMSWAPPV,PE,A02:01,8-3*01,CAVGAEDSNYQLIW,"6-2*01,6-3*01",CASSYSWGEQFF
1,GC5,Neo+WT+,rank_1,FNDC3B-VVL_L3M,5,VVMSWAPPV,PE,A02:01,29/DV5*01,CAASATGGTSYGKLTF,,
2,GE11,Neo+WT+,rank_1,FNDC3B-VVL_L3M,5,VVMSWAPPV,PE,A02:01,29/DV5*01,CAASHGSSNTGKLIF,,
3,JA4,Neo+WT+,rank_1,FNDC3B-VVL_L3M,5,VVMSWAPPV,PE,A02:01,20*01,CAVLTSGYSTLTF,13*01,CASSPMTGAEQFF
4,LE10,Neo+WT+,rank_1,FNDC3B-VVL_L3M,5,VVMSWAPPV,PE,A02:01,8-6*01,CAVTDNNAGNMLTF,7-3*01,CASSFGPGYEQYF
...,...,...,...,...,...,...,...,...,...,...,...,...
1129,NE3,Neo-WT+,rank_1,GABRG3-TAM,5,TAMDLFVTV,APC,A02:01,,,,
1130,OE9,Neo-WT+,rank_1,PELP1-LVL,5,LVLPLVMGV,APC,A02:01,,,,
1131,OH7,Neo-WT+,rank_1,ST6GALNAC2-LLF,5,LLFALYFSA,APC,A02:01,,,,
1132,SA3,Neo-WT+,rank_1,ERBB2-ALI,5,ALIHHNTHL,APC,A02:01,,,,


Here we check for duplicates within remaining_cols. Then we check which rows have TRAV, CDR3alpha, TRBV and CDR3beta all missing (NaN).

We want to keep empty originals, filled originals, and filled duplicates (only empty duplicates are removed).

In [139]:
# Rows that repeat an earlier row on every identifier column (e.g. the second
# chain slot of a cell repeats the identifiers of its first slot).
mask_remaining = Exp5_final.duplicated(subset=remaining_cols)
# Rows that carry no alpha- or beta-chain information at all.
mask_melted = Exp5_final[['TRAV', 'CDR3alpha', 'TRBV', 'CDR3beta']].isna().all(axis=1)

# Redundant = a repeated row that adds no chain information.
mask_redundant = mask_remaining & mask_melted

# Reuse the mask computed above instead of recomputing it, and take an explicit
# copy so the later TRAV clean-up assignment on Exp5_final does not raise
# SettingWithCopyWarning.
Exp5_final = Exp5_final.loc[~mask_redundant].copy()
Exp5_final

Unnamed: 0,cell_name,sorted_population,peptide_rank,peptide_name,no_experiment,peptide_seq,tetramer_fluorescence,HLA_type,TRAV,CDR3alpha,TRBV,CDR3beta
0,GA10,Neo+WT+,rank_1,FNDC3B-VVL_L3M,5,VVMSWAPPV,PE,A02:01,8-3*01,CAVGAEDSNYQLIW,"6-2*01,6-3*01",CASSYSWGEQFF
1,GC5,Neo+WT+,rank_1,FNDC3B-VVL_L3M,5,VVMSWAPPV,PE,A02:01,29/DV5*01,CAASATGGTSYGKLTF,,
2,GE11,Neo+WT+,rank_1,FNDC3B-VVL_L3M,5,VVMSWAPPV,PE,A02:01,29/DV5*01,CAASHGSSNTGKLIF,,
3,JA4,Neo+WT+,rank_1,FNDC3B-VVL_L3M,5,VVMSWAPPV,PE,A02:01,20*01,CAVLTSGYSTLTF,13*01,CASSPMTGAEQFF
4,LE10,Neo+WT+,rank_1,FNDC3B-VVL_L3M,5,VVMSWAPPV,PE,A02:01,8-6*01,CAVTDNNAGNMLTF,7-3*01,CASSFGPGYEQYF
...,...,...,...,...,...,...,...,...,...,...,...,...
1059,MA12,Neo-WT+,rank_1,SHROOM2-KLL,5,KLLAGDEIV,APC,A02:01,25*01,CAGYDYKLSF,,
1064,LA4,Neo-WT+,rank_1,LCP1-NLF,5,NLFNRYPAL,APC,A02:01,26-2*01,CILRGIPRDSSYKLIF,,
1065,LC2,Neo-WT+,rank_1,LCP1-NLF,5,NLFNRYPAL,APC,A02:01,19*01,CALSRGGGADGLTF,,
1071,KA3,Neo-WT+,rank_1,HAUS3-ILN,5,ILNAMITKI,APC,A02:01,19*01,CALSEGERDDKIIF,,


In [140]:
Exp5_final["TRAV"].unique()

array(['8-3*01', '29/DV5*01', '20*01', '8-6*01', nan, '41*01', '17*01',
       '3*01', '39*01', '22*01', '14/DV4*01', '21*01', '24*01', '12-1*01',
       '12-2*01', '23/DV6*01', '8-4*01', '27*01', '22*01 F',
       '38-2/DV8*01', '26-2*01', '19*01', '9-2*01', '10*01', '5*01',
       '1-2*01', '12-2', '12-3*01', '4*01', '8-1*01', '34*01', '8-1*01 F',
       '26-1*01', '1-1*01', '8-2*01', '25*01 F', '3*01 F', '2*01', '6*01',
       '9-2', '35*01', '25*01', '16*01', '29/DV5*01 F', '30*01',
       '1-2*01\xa0', '38-1*01', '40*01', '21', '13-1*01', '12-1',
       '36/DV7*01'], dtype=object)

In [141]:
# Trim stray leading/trailing whitespace from the TRAV labels (e.g. the
# non-breaking space in '1-2*01\xa0').
# assign() builds a new frame instead of writing into the filtered slice, which
# avoids the SettingWithCopyWarning the in-place column assignment produced.
Exp5_final = Exp5_final.assign(TRAV=Exp5_final["TRAV"].str.strip())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Exp5_final["TRAV"] = Exp5_final["TRAV"].str.strip()


In [142]:
Exp5_final["TRAV"].unique()

array(['8-3*01', '29/DV5*01', '20*01', '8-6*01', nan, '41*01', '17*01',
       '3*01', '39*01', '22*01', '14/DV4*01', '21*01', '24*01', '12-1*01',
       '12-2*01', '23/DV6*01', '8-4*01', '27*01', '22*01 F',
       '38-2/DV8*01', '26-2*01', '19*01', '9-2*01', '10*01', '5*01',
       '1-2*01', '12-2', '12-3*01', '4*01', '8-1*01', '34*01', '8-1*01 F',
       '26-1*01', '1-1*01', '8-2*01', '25*01 F', '3*01 F', '2*01', '6*01',
       '9-2', '35*01', '25*01', '16*01', '29/DV5*01 F', '30*01',
       '38-1*01', '40*01', '21', '13-1*01', '12-1', '36/DV7*01'],
      dtype=object)

In [143]:
Exp5_final['TRBV'].unique()

array(['6-2*01,6-3*01', nan, '13*01', '7-3*01', '15*01', '3-1*01',
       '6-5*01', '29-1*01', '5-4*01', '6-4*01', '27*01', '7-9*01',
       '10-3*01', '5-1*01', '6-1*01', '7-8*01', '11-2*01', '28*01',
       '5-6*01', '30*01', '19*01', '20-1*01', '2*01', '9*01', '4-1*01',
       '10-2*01', '25-1*01', '12-3*01,12-4*01', '4-2*01', '6-6*01',
       '7-6*01', '10-1*01', '11-1*01', '7-2*01', '11-3*01', '14*01',
       '18*01', '7-4*01', '12-5*01', '4-3*01'], dtype=object)

In [144]:
Exp5_final = Exp5_final.reset_index(drop=True)

In [145]:
Exp5_final.to_csv("../../../data/processed/zhang/Exp5_merged.csv.gz")

<b>ST9 - Experiment 6</b>

In [146]:
ST9 = pd.read_excel("../../../data/original/zhang/ST9.xlsx")

In [147]:
new_columns = ["Cell Name", "Sorted Population", "rank_1", "rank_2", "rank_3", "rank_4", "rank_5","TRAV1", "CDR3α,1", "TRAV2", "CDR3α,2", "TRBV1", "CDR3β,1", "TRBV2", "CDR3β,2"]

In [148]:
ST9.columns = new_columns

In [149]:
# Discard the first two rows of the sheet and renumber the remaining rows from 0.
# NOTE(review): presumably these rows are leftover header lines from the spreadsheet - confirm against ST9.xlsx.
ST9 = ST9.iloc[2:, :].reset_index(drop=True)

In [150]:
ST9

Unnamed: 0,Cell Name,Sorted Population,rank_1,rank_2,rank_3,rank_4,rank_5,TRAV1,"CDR3α,1",TRAV2,"CDR3α,2",TRBV1,"CDR3β,1",TRBV2,"CDR3β,2"
0,AD5,Neo+WT+,0,0,0,0,0,17*01,CATDAAGSTLGRLYF,3*01,CAVRDSDYGQNFVF,"12-3*01,12-4*01",CASSPPHNDRDILNTDTQYF,,
1,AD9,Neo+WT+,0,0,0,0,0,1-2*01,CAVRVLSYNFNKFYF,,,20-1*01,CSARTDGSYEQYF,,
2,BC5,Neo+WT+,0,0,0,0,0,24*01,CALNSGGGADGLTF,,,5-1*01,CASSWGGYEQYF,,
3,BC8,Neo+WT+,0,0,0,0,0,16*01,CAPPLNQAGTALIF,,,"12-3*01,12-4*01",CASSPGAGSTEAFF,,
4,BD8,Neo+WT+,0,0,0,0,0,,,,,4-3*01,CASSQDPRTGANYGYTF,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
269,BF4,Neo-WT+,TRPV4-FMI,0,0,0,0,,,,,9*01,CASSDRQGIATGELFF,,
270,AB9,Neo-WT+,TTLL12-KLP,0,0,0,0,5*01,CAESSSMDTGRRALTF,,,5-5*01,CASSLNPGTSYEQYF,,
271,AC3,Neo-WT+,WDR46-FLT,0,0,0,0,,,,,9*01,CASNSGGAYNEQFF,,
272,AG2,Neo-WT+,ZDHHC7-SLL,0,0,0,0,14/DV4*01,CAMREFPQGGSEKLVF,,,4-2*01,CASSQGQGLQETQYF,,


In [151]:
# Identifier columns: every column except the five ranked-peptide slots.
remaining_cols = ST9.columns.drop([f"rank_{i}" for i in range(1, 6)]).tolist()
remaining_cols

['Cell Name',
 'Sorted Population',
 'TRAV1',
 'CDR3α,1',
 'TRAV2',
 'CDR3α,2',
 'TRBV1',
 'CDR3β,1',
 'TRBV2',
 'CDR3β,2']

In [152]:
# Wide -> long: one row per (cell, rank slot). The slot label goes to melt's
# default "variable" column and the peptide name to "peptide_name".
ST9 = pd.melt(
    ST9,
    id_vars=remaining_cols,
    value_vars=[f"rank_{i}" for i in range(1, 6)],
    value_name='peptide_name',
)

In [153]:
ST9 = ST9.rename(columns={'variable':'peptide_rank'})

In [154]:
ST9.columns

Index(['Cell Name', 'Sorted Population', 'TRAV1', 'CDR3α,1', 'TRAV2',
       'CDR3α,2', 'TRBV1', 'CDR3β,1', 'TRBV2', 'CDR3β,2', 'peptide_rank',
       'peptide_name'],
      dtype='object')

In [155]:
ST9["no_experiment"] = "6"

In [156]:
ST9 = ST9.astype({"no_experiment": "int64"})

In [157]:
# Attach the ST1 peptide metadata (sequence, tetramer fluorescence, HLA type),
# matching on peptide name and experiment number.
# merge() defaults to an inner join, so rows whose peptide_name has no ST1 entry
# for this experiment are dropped (presumably including the "0" placeholders
# used for empty rank slots - TODO confirm).
Exp6 = ST9.merge(ST1, on = ["peptide_name", "no_experiment"])

In [158]:
Exp6

Unnamed: 0,Cell Name,Sorted Population,TRAV1,"CDR3α,1",TRAV2,"CDR3α,2",TRBV1,"CDR3β,1",TRBV2,"CDR3β,2",peptide_rank,peptide_name,no_experiment,peptide_seq,tetramer_fluorescence,HLA_type
0,BA7,Neo+WT+,8-3*01,CAVAPSNDMRF,,,,,,,rank_1,AKAP13-KLM,6,KLMNIQQQL,APC,A02:01
1,BE6,Neo-WT+,38-2/DV8*01,CAYTDKLIF,,,6-6*01,CASGDLSYEQYF,,,rank_1,AKAP13-KLM,6,KLMNIQQQL,APC,A02:01
2,FF1,Neo+WT+,5*01,CAEIYNQGGKLIF,,,19*01,CASSTPGGWNTEAFF,,,rank_1,APBB2-VQY_L7F,6,VQYLGMFPV,PE,A02:01
3,FH12,Neo+WT+,,,,,,,,,rank_1,APBB2-VQY_L7F,6,VQYLGMFPV,PE,A02:01
4,AD7,Neo+WT-,24*01,CALIYNQGGKLIF,,,2*01,CASNSGPNEKLFF,,,rank_1,APBB2-VQY_L7F,6,VQYLGMFPV,PE,A02:01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364,BA7,Neo+WT+,8-3*01,CAVAPSNDMRF,,,,,,,rank_3,SMOX-KLA_KN,6,KLANPLPYT,PE,A02:01
365,AE3,Neo+WT+,,,,,5-6*01,CASSPVLGTYGYTF,,,rank_3,MLL2-ALS_L8H,6,ALSPVIPHI,PE,A02:01
366,FB6,Neo+WT+,5*01,CAECGGNNDMRF,12-2*01,CAVPYSGAGSYQLTF,28*01,CASSLNTEAFF,,,rank_3,SREBF1-YLQ,6,YLQDSLATT,APC,A02:01
367,BE5,Neo-WT+,24*01,CAFSRYSTLTF,,,19*01,CASSIGDYEQYF,,,rank_3,MYPN-RVI,6,RVIGMPPPV,APC,A02:01


In [159]:
Exp6 = Exp6.rename(columns={"Cell Name": "cell_name", "Sorted Population": "sorted_population", "CDR3α,1": "CDR3alpha_1", "CDR3α,2":"CDR3alpha_2", "CDR3β,1": "CDR3beta_1", "CDR3β,2": "CDR3beta_2"})

In [160]:
# Identifier columns for the chain melts: everything except the two slots of
# each alpha/beta V-gene and CDR3 column.
remaining_cols = Exp6.columns.drop(
    ['TRAV1', 'TRAV2', 'TRBV1', 'TRBV2', 'CDR3alpha_1', 'CDR3alpha_2', 'CDR3beta_1', 'CDR3beta_2']
).tolist()
remaining_cols

['cell_name',
 'sorted_population',
 'peptide_rank',
 'peptide_name',
 'no_experiment',
 'peptide_seq',
 'tetramer_fluorescence',
 'HLA_type']

In [161]:
Exp6.cell_name.duplicated().sum()

114

In [162]:
Exp6_trav = Exp6.melt(id_vars=remaining_cols, value_vars=['TRAV1', 'TRAV2'], value_name='TRAV')
Exp6_trav

Unnamed: 0,cell_name,sorted_population,peptide_rank,peptide_name,no_experiment,peptide_seq,tetramer_fluorescence,HLA_type,variable,TRAV
0,BA7,Neo+WT+,rank_1,AKAP13-KLM,6,KLMNIQQQL,APC,A02:01,TRAV1,8-3*01
1,BE6,Neo-WT+,rank_1,AKAP13-KLM,6,KLMNIQQQL,APC,A02:01,TRAV1,38-2/DV8*01
2,FF1,Neo+WT+,rank_1,APBB2-VQY_L7F,6,VQYLGMFPV,PE,A02:01,TRAV1,5*01
3,FH12,Neo+WT+,rank_1,APBB2-VQY_L7F,6,VQYLGMFPV,PE,A02:01,TRAV1,
4,AD7,Neo+WT-,rank_1,APBB2-VQY_L7F,6,VQYLGMFPV,PE,A02:01,TRAV1,24*01
...,...,...,...,...,...,...,...,...,...,...
733,BA7,Neo+WT+,rank_3,SMOX-KLA_KN,6,KLANPLPYT,PE,A02:01,TRAV2,
734,AE3,Neo+WT+,rank_3,MLL2-ALS_L8H,6,ALSPVIPHI,PE,A02:01,TRAV2,
735,FB6,Neo+WT+,rank_3,SREBF1-YLQ,6,YLQDSLATT,APC,A02:01,TRAV2,12-2*01
736,BE5,Neo-WT+,rank_3,MYPN-RVI,6,RVIGMPPPV,APC,A02:01,TRAV2,


In [163]:
Exp6_cdr3alpha = Exp6.melt(id_vars=remaining_cols, value_vars=['CDR3alpha_1', 'CDR3alpha_2'], value_name='CDR3alpha')
Exp6_cdr3alpha

Unnamed: 0,cell_name,sorted_population,peptide_rank,peptide_name,no_experiment,peptide_seq,tetramer_fluorescence,HLA_type,variable,CDR3alpha
0,BA7,Neo+WT+,rank_1,AKAP13-KLM,6,KLMNIQQQL,APC,A02:01,CDR3alpha_1,CAVAPSNDMRF
1,BE6,Neo-WT+,rank_1,AKAP13-KLM,6,KLMNIQQQL,APC,A02:01,CDR3alpha_1,CAYTDKLIF
2,FF1,Neo+WT+,rank_1,APBB2-VQY_L7F,6,VQYLGMFPV,PE,A02:01,CDR3alpha_1,CAEIYNQGGKLIF
3,FH12,Neo+WT+,rank_1,APBB2-VQY_L7F,6,VQYLGMFPV,PE,A02:01,CDR3alpha_1,
4,AD7,Neo+WT-,rank_1,APBB2-VQY_L7F,6,VQYLGMFPV,PE,A02:01,CDR3alpha_1,CALIYNQGGKLIF
...,...,...,...,...,...,...,...,...,...,...
733,BA7,Neo+WT+,rank_3,SMOX-KLA_KN,6,KLANPLPYT,PE,A02:01,CDR3alpha_2,
734,AE3,Neo+WT+,rank_3,MLL2-ALS_L8H,6,ALSPVIPHI,PE,A02:01,CDR3alpha_2,
735,FB6,Neo+WT+,rank_3,SREBF1-YLQ,6,YLQDSLATT,APC,A02:01,CDR3alpha_2,CAVPYSGAGSYQLTF
736,BE5,Neo-WT+,rank_3,MYPN-RVI,6,RVIGMPPPV,APC,A02:01,CDR3alpha_2,


In [164]:
Exp6_trbv = Exp6.melt(id_vars=remaining_cols, value_vars=['TRBV1', 'TRBV2'], value_name='TRBV')
Exp6_trbv

Unnamed: 0,cell_name,sorted_population,peptide_rank,peptide_name,no_experiment,peptide_seq,tetramer_fluorescence,HLA_type,variable,TRBV
0,BA7,Neo+WT+,rank_1,AKAP13-KLM,6,KLMNIQQQL,APC,A02:01,TRBV1,
1,BE6,Neo-WT+,rank_1,AKAP13-KLM,6,KLMNIQQQL,APC,A02:01,TRBV1,6-6*01
2,FF1,Neo+WT+,rank_1,APBB2-VQY_L7F,6,VQYLGMFPV,PE,A02:01,TRBV1,19*01
3,FH12,Neo+WT+,rank_1,APBB2-VQY_L7F,6,VQYLGMFPV,PE,A02:01,TRBV1,
4,AD7,Neo+WT-,rank_1,APBB2-VQY_L7F,6,VQYLGMFPV,PE,A02:01,TRBV1,2*01
...,...,...,...,...,...,...,...,...,...,...
733,BA7,Neo+WT+,rank_3,SMOX-KLA_KN,6,KLANPLPYT,PE,A02:01,TRBV2,
734,AE3,Neo+WT+,rank_3,MLL2-ALS_L8H,6,ALSPVIPHI,PE,A02:01,TRBV2,
735,FB6,Neo+WT+,rank_3,SREBF1-YLQ,6,YLQDSLATT,APC,A02:01,TRBV2,
736,BE5,Neo-WT+,rank_3,MYPN-RVI,6,RVIGMPPPV,APC,A02:01,TRBV2,


In [165]:
Exp6_cdr3beta = Exp6.melt(id_vars=remaining_cols, value_vars=['CDR3beta_1', 'CDR3beta_2'], value_name='CDR3beta')
Exp6_cdr3beta

Unnamed: 0,cell_name,sorted_population,peptide_rank,peptide_name,no_experiment,peptide_seq,tetramer_fluorescence,HLA_type,variable,CDR3beta
0,BA7,Neo+WT+,rank_1,AKAP13-KLM,6,KLMNIQQQL,APC,A02:01,CDR3beta_1,
1,BE6,Neo-WT+,rank_1,AKAP13-KLM,6,KLMNIQQQL,APC,A02:01,CDR3beta_1,CASGDLSYEQYF
2,FF1,Neo+WT+,rank_1,APBB2-VQY_L7F,6,VQYLGMFPV,PE,A02:01,CDR3beta_1,CASSTPGGWNTEAFF
3,FH12,Neo+WT+,rank_1,APBB2-VQY_L7F,6,VQYLGMFPV,PE,A02:01,CDR3beta_1,
4,AD7,Neo+WT-,rank_1,APBB2-VQY_L7F,6,VQYLGMFPV,PE,A02:01,CDR3beta_1,CASNSGPNEKLFF
...,...,...,...,...,...,...,...,...,...,...
733,BA7,Neo+WT+,rank_3,SMOX-KLA_KN,6,KLANPLPYT,PE,A02:01,CDR3beta_2,
734,AE3,Neo+WT+,rank_3,MLL2-ALS_L8H,6,ALSPVIPHI,PE,A02:01,CDR3beta_2,
735,FB6,Neo+WT+,rank_3,SREBF1-YLQ,6,YLQDSLATT,APC,A02:01,CDR3beta_2,
736,BE5,Neo-WT+,rank_3,MYPN-RVI,6,RVIGMPPPV,APC,A02:01,CDR3beta_2,


In [166]:
# The melt bookkeeping column is not needed once the values are stacked.
Exp6_trav = Exp6_trav.drop(columns=['variable'])
Exp6_cdr3alpha = Exp6_cdr3alpha.drop(columns=['variable'])
Exp6_trbv = Exp6_trbv.drop(columns=['variable'])
Exp6_cdr3beta = Exp6_cdr3beta.drop(columns=['variable'])

In [167]:
# Guard for the column-wise concat that follows. All four melted frames are
# combined by row position, so each of them (not only the CDR3alpha frame) must
# list the cells in the same order as Exp6_trav.
assert all(
    (Exp6_trav.cell_name == melted.cell_name).all()
    for melted in (Exp6_cdr3alpha, Exp6_trbv, Exp6_cdr3beta)
)

In [168]:
# Column-wise concat of the four melts, aligned on their shared row index: the
# first half of the rows carries the first slot (TRAV1, CDR3alpha_1, TRBV1,
# CDR3beta_1) and the second half the second slot.
# NOTE(review): alpha slot k is paired only with beta slot k; other alpha/beta
# combinations are not generated - confirm that this is intended.
Exp6_final = pd.concat([Exp6_trav,Exp6_cdr3alpha['CDR3alpha'],Exp6_trbv['TRBV'],Exp6_cdr3beta['CDR3beta']], axis=1)
Exp6_final

Unnamed: 0,cell_name,sorted_population,peptide_rank,peptide_name,no_experiment,peptide_seq,tetramer_fluorescence,HLA_type,TRAV,CDR3alpha,TRBV,CDR3beta
0,BA7,Neo+WT+,rank_1,AKAP13-KLM,6,KLMNIQQQL,APC,A02:01,8-3*01,CAVAPSNDMRF,,
1,BE6,Neo-WT+,rank_1,AKAP13-KLM,6,KLMNIQQQL,APC,A02:01,38-2/DV8*01,CAYTDKLIF,6-6*01,CASGDLSYEQYF
2,FF1,Neo+WT+,rank_1,APBB2-VQY_L7F,6,VQYLGMFPV,PE,A02:01,5*01,CAEIYNQGGKLIF,19*01,CASSTPGGWNTEAFF
3,FH12,Neo+WT+,rank_1,APBB2-VQY_L7F,6,VQYLGMFPV,PE,A02:01,,,,
4,AD7,Neo+WT-,rank_1,APBB2-VQY_L7F,6,VQYLGMFPV,PE,A02:01,24*01,CALIYNQGGKLIF,2*01,CASNSGPNEKLFF
...,...,...,...,...,...,...,...,...,...,...,...,...
733,BA7,Neo+WT+,rank_3,SMOX-KLA_KN,6,KLANPLPYT,PE,A02:01,,,,
734,AE3,Neo+WT+,rank_3,MLL2-ALS_L8H,6,ALSPVIPHI,PE,A02:01,,,,
735,FB6,Neo+WT+,rank_3,SREBF1-YLQ,6,YLQDSLATT,APC,A02:01,12-2*01,CAVPYSGAGSYQLTF,,
736,BE5,Neo-WT+,rank_3,MYPN-RVI,6,RVIGMPPPV,APC,A02:01,,,,


Here we check for duplicates within remaining_cols. Then we check which rows have TRAV, CDR3alpha, TRBV and CDR3beta all missing (NaN).

We want to keep empty originals, filled originals, and filled duplicates (only empty duplicates are removed).

In [169]:
# Rows that repeat an earlier row on every identifier column (e.g. the second
# chain slot of a cell repeats the identifiers of its first slot).
mask_remaining = Exp6_final.duplicated(subset=remaining_cols)
# Rows that carry no alpha- or beta-chain information at all.
mask_melted = Exp6_final[['TRAV', 'CDR3alpha', 'TRBV', 'CDR3beta']].isna().all(axis=1)

# Redundant = a repeated row that adds no chain information.
mask_redundant = mask_remaining & mask_melted

# Reuse the mask computed above instead of recomputing it, and take an explicit
# copy so later column assignments on Exp6_final cannot raise
# SettingWithCopyWarning.
Exp6_final = Exp6_final.loc[~mask_redundant].copy()
Exp6_final

Unnamed: 0,cell_name,sorted_population,peptide_rank,peptide_name,no_experiment,peptide_seq,tetramer_fluorescence,HLA_type,TRAV,CDR3alpha,TRBV,CDR3beta
0,BA7,Neo+WT+,rank_1,AKAP13-KLM,6,KLMNIQQQL,APC,A02:01,8-3*01,CAVAPSNDMRF,,
1,BE6,Neo-WT+,rank_1,AKAP13-KLM,6,KLMNIQQQL,APC,A02:01,38-2/DV8*01,CAYTDKLIF,6-6*01,CASGDLSYEQYF
2,FF1,Neo+WT+,rank_1,APBB2-VQY_L7F,6,VQYLGMFPV,PE,A02:01,5*01,CAEIYNQGGKLIF,19*01,CASSTPGGWNTEAFF
3,FH12,Neo+WT+,rank_1,APBB2-VQY_L7F,6,VQYLGMFPV,PE,A02:01,,,,
4,AD7,Neo+WT-,rank_1,APBB2-VQY_L7F,6,VQYLGMFPV,PE,A02:01,24*01,CALIYNQGGKLIF,2*01,CASNSGPNEKLFF
...,...,...,...,...,...,...,...,...,...,...,...,...
696,AC6,Neo+WT+,rank_2,DHX33-LLA_M4I,6,LLAIKVPNV,PE,A02:01,26-2*01,CILREGSNYQLIW,,
701,AD11,Neo+WT+,rank_2,FNDC3B-VVL,6,VVLSWAPPV,APC,A02:01,24*01,CAPNNAGNMLTF,,
724,FB6,Neo+WT+,rank_2,SSPN-9,6,FLMASISSS,APC,A02:01,12-2*01,CAVPYSGAGSYQLTF,,
726,BF8,Neo+WT+,rank_2,VN1R5-MII,6,MIISHLSLI,APC,A02:01,38-2/DV8*01,CACNNAGGTSYGKLTF,,


In [170]:
Exp6_final["TRAV"].unique()

array(['8-3*01', '38-2/DV8*01', '5*01', nan, '24*01', '12-2*01', '17*01',
       '3*01', '41*01', '14/DV4*01', '4*01', '26-2*01', '12-3*01',
       '20*01', '27*01', '12-1*01', '21*01', '23/DV6*01', '16*01',
       '30*01', '8-1*01', '1-2*01 F', '39*01', '38-1*01', '9-2*01',
       '19*01', '41*01 F', '1-2*01', '8-6*01', '26-1*01', '22*01',
       '8-2*01', '25*01', '10*01', '29/DV5*01', '36/DV7', '35*01',
       '14/DV4*01 F', '40*01', '35', '6*01'], dtype=object)

In [171]:
Exp6_final["TRBV"].unique()

array([nan, '6-6*01', '19*01', '2*01', '30*01', '27*01', '29-1*01',
       '5-6*01', '4-3*01', '6-2*01,6-3*01', '6-5*01', '5-1*01', '13*01',
       '20-1*01', '3-1*01', '14*01', '9*01', '10-2*01', '7-9*01',
       '7-8*01', '12-3*01,12-4*01', '4-1*01', '7-2*01', '4-2*01',
       '7-3*01', '25-1*01', '5-4*01', '5-8*01', '15*01', '6-1*01',
       '28*01', '11-2*01', '6-4*01', '7-6*01', '10-3*01', '11-3*01',
       '5-5*01'], dtype=object)

In [172]:
Exp6_final = Exp6_final.reset_index(drop=True)

In [173]:
Exp6_final.to_csv("../../../data/processed/zhang/Exp6_merged.csv.gz")