Import libraries and read the experiment datasets with additional information

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Paths of the six per-experiment tables, in experiment order.
exp_paths = (
    "../../../data/processed/zhang/Exp1_add.csv.gz",
    "../../../data/processed/zhang/Exp2_add.csv.gz",
    "../../../data/processed/zhang/Exp3_add.csv.gz",
    "../../../data/processed/zhang/Exp4_add.csv.gz",
    "../../../data/processed/zhang/Exp5_add.csv.gz",
    "../../../data/processed/zhang/Exp6_add.csv.gz",
)

# Read every table the same way: the first CSV column becomes the index.
Exp1, Exp2, Exp3, Exp4, Exp5, Exp6 = (
    pd.read_csv(path, index_col=0) for path in exp_paths
)

Merge datasets and add common info

In [3]:
# Frames to stack, kept in experiment order (Exp1 first, Exp6 last).
df_to_merge = [
    Exp1,
    Exp2,
    Exp3,
    Exp4,
    Exp5,
    Exp6,
]

In [4]:
# Stack the experiment tables row-wise, replace the per-file indices with a
# fresh 0..n-1 index, and append three columns whose value is the same for
# every row of this dataset.
result = (
    pd.concat(df_to_merge)
    .reset_index(drop=True)
    .assign(
        tcr_source_organism="human",
        dataset="zhang2018",
        experiment_type="tetramer staining",
    )
)

In [5]:
# Preview the merged table (pandas truncates the display to the first and last rows).
result

Unnamed: 0,cell_name,sorted_population,TRBV,CDR3beta,peptide_rank,peptide_name,no_experiment,peptide_seq,tetramer_fluorescence,HLA_type,TRAV,CDR3alpha,peptide_source,peptide_reference,wildtype,tcr_source_organism,dataset,experiment_type
0,AA1,Naïve Endogenous,"6-2*01,6-3*01",CASSYSENEQFF,rank_1,ZNT8-LLS,1,LLSLFSLWL,PE,A02:01,,,,,,human,zhang2018,tetramer staining
1,AB12,Naïve Endogenous,28*01,CASSGTGGYSGANVLTF,rank_1,ZNT8-LLS,1,LLSLFSLWL,PE,A02:01,12-3*01,CAAGGSYIPTF,,,,human,zhang2018,tetramer staining
2,AA10,Naïve Endogenous,6-1*01,CASRSYVASSNEQFF,rank_1,MART1-A2L,1,ELAGIGILTV,PE,A02:01,12-2*01,CGGQAGTALIF,,,,human,zhang2018,tetramer staining
3,AA11,Naïve Endogenous,28*01,CASTQWYGGGTPPYF,rank_1,MART1-A2L,1,ELAGIGILTV,PE,A02:01,12-2*01,CAVNGGNQFYF,,,,human,zhang2018,tetramer staining
4,AA12,Naïve Endogenous,7-2*01,CASSLTTGVFSQPQHF,rank_1,MART1-A2L,1,ELAGIGILTV,PE,A02:01,12-2*01,CAVGRDDKIIF,,,,human,zhang2018,tetramer staining
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1896,AC6,Neo+WT+,,,rank_2,DHX33-LLA_M4I,6,LLAIKVPNV,PE,A02:01,26-2*01,CILREGSNYQLIW,NSCLC,"Rizvi, N.A. et al. Mutational landscape determ...",no,human,zhang2018,tetramer staining
1897,AD11,Neo+WT+,,,rank_2,FNDC3B-VVL,6,VVLSWAPPV,APC,A02:01,24*01,CAPNNAGNMLTF,CLL,"Rajasagi, M. et al. Systematic identification ...",yes,human,zhang2018,tetramer staining
1898,FB6,Neo+WT+,,,rank_2,SSPN-9,6,FLMASISSS,APC,A02:01,12-2*01,CAVPYSGAGSYQLTF,SKCM,"Strønen, E. et al. Targeting of cancer neoanti...",yes,human,zhang2018,tetramer staining
1899,BF8,Neo+WT+,,,rank_2,VN1R5-MII,6,MIISHLSLI,APC,A02:01,38-2/DV8*01,CACNNAGGTSYGKLTF,NSCLC,"Rizvi, N.A. et al. Mutational landscape determ...",yes,human,zhang2018,tetramer staining


In [6]:
# List the column names of the merged table before renaming and reordering them.
result.columns

Index(['cell_name', 'sorted_population', 'TRBV', 'CDR3beta', 'peptide_rank',
       'peptide_name', 'no_experiment', 'peptide_seq', 'tetramer_fluorescence',
       'HLA_type', 'TRAV', 'CDR3alpha', 'peptide_source', 'peptide_reference',
       'wildtype', 'tcr_source_organism', 'dataset', 'experiment_type'],
      dtype='object')

Rename columns and order them

In [7]:
# Shorten the CDR3 column names: "alpha"/"beta" become the suffixes "a"/"b".
cdr3_renames = {
    'CDR3alpha': 'CDR3a',
    'CDR3beta': 'CDR3b',
}
result = result.rename(columns=cdr3_renames)

In [8]:
# Target column layout of the output table, grouped by theme.
column_names = [
    # peptide-HLA and CDR3 sequences
    "peptide_seq", "HLA_type", "CDR3a", "CDR3b",
    # peptide annotation
    "peptide_name", "wildtype", "peptide_rank",
    "peptide_source", "peptide_reference",
    # V genes and TCR origin
    "TRAV", "TRBV", "tcr_source_organism",
    # cell and experiment metadata
    "cell_name", "sorted_population", "experiment_type",
    "tetramer_fluorescence", "no_experiment", "dataset",
]

# `reindex` returns the columns in exactly this order; a name that is absent
# from the frame would be created as an all-NaN column instead of raising.
result = result.reindex(columns=column_names)

In [9]:
# Display-only check: the string-cast copy is shown but never assigned, so
# `result` itself is unchanged. Missing references render as the text "nan".
result["peptide_reference"].astype('str')

0                                                     nan
1                                                     nan
2                                                     nan
3                                                     nan
4                                                     nan
                              ...                        
1896    Rizvi, N.A. et al. Mutational landscape determ...
1897    Rajasagi, M. et al. Systematic identification ...
1898    Strønen, E. et al. Targeting of cancer neoanti...
1899    Rizvi, N.A. et al. Mutational landscape determ...
1900    Rizvi, N.A. et al. Mutational landscape determ...
Name: peptide_reference, Length: 1901, dtype: object

In [10]:
# Keep only the text before the first comma of each reference (e.g. "Rizvi"
# from "Rizvi, N.A. et al. ..."); missing references stay NaN.
# `n` is passed by keyword: every argument of `Series.str.split` except `pat`
# is keyword-only in pandas 2.0, so the positional form `split(",", 1)`
# warns on pandas 1.4+ and raises a TypeError on 2.0+.
result["peptide_reference"] = (
    result["peptide_reference"].str.split(",", n=1).str.get(0)
)

Remove rows in which TRAV, CDR3a, TRBV and CDR3b are all missing (no CDR3 sequence or V gene for either chain)

In [11]:
# A row carries no TCR information when all four of these fields are missing.
tcr_columns = ['TRAV', 'CDR3a', 'TRBV', 'CDR3b']
mask = result[tcr_columns].isna().all(axis=1)

# Keep every row that has at least one of the four fields filled in.
result = result.loc[~mask]

Check for duplicates: some pHLA:TCR combinations are identical across different cell names

In [12]:
# Two rows count as the same observation when they agree on all of these columns.
duplicate_key = ["peptide_seq", "CDR3a", "CDR3b", "TRAV", "TRBV", "sorted_population"]

# True marks a row that repeats an earlier row on the key (the first occurrence is False).
is_repeat = result.duplicated(subset=duplicate_key)
is_repeat.value_counts()

False    1360
True      237
dtype: int64

In [13]:
# Show the rows flagged as duplicates, i.e. every occurrence after the first
# of a given peptide / CDR3 / V-gene / sorted-population combination.
repeated_rows = result.duplicated(
    subset=["peptide_seq", "CDR3a", "CDR3b", "TRAV", "TRBV", "sorted_population"]
)
result.loc[repeated_rows]

Unnamed: 0,peptide_seq,HLA_type,CDR3a,CDR3b,peptide_name,wildtype,peptide_rank,peptide_source,peptide_reference,TRAV,TRBV,tcr_source_organism,cell_name,sorted_population,experiment_type,tetramer_fluorescence,no_experiment,dataset
51,ELAGIGILTV,A02:01,,CASSFAGTTEAFF,MART1-A2L,,rank_1,,,,27*01,human,BE7,Non-Naïve Endogenous,tetramer staining,PE,1,zhang2018
53,ELAGIGILTV,A02:01,CAVTAGGTSYGKLTF,,MART1-A2L,,rank_1,,,12-2*01,,human,BE9,Non-Naïve Endogenous,tetramer staining,PE,1,zhang2018
54,ELAGIGILTV,A02:01,,CASSFAGTTEAFF,MART1-A2L,,rank_1,,,,27*01,human,BF1,Non-Naïve Endogenous,tetramer staining,PE,1,zhang2018
56,ELAGIGILTV,A02:01,CAVTAGGTSYGKLTF,,MART1-A2L,,rank_1,,,12-2*01,,human,BF11,Non-Naïve Endogenous,tetramer staining,PE,1,zhang2018
57,ELAGIGILTV,A02:01,CAVTAGGTSYGKLTF,,MART1-A2L,,rank_1,,,12-2*01,,human,BF12,Non-Naïve Endogenous,tetramer staining,PE,1,zhang2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1530,LLAMKVPNV,A02:01,,CASSLGQGDFGETQYF,DHX33-LLA,yes,rank_3,NSCLC,Rizvi,,5-1*01,human,FF8,Neo+WT+,tetramer staining,APC,6,zhang2018
1688,KLWASPLHV,A02:01,CAMSTEDDKIIF,CASAPGPVYEQYF,ATP6AP1-KLG_G3W,no,rank_1,NSCLC,Rizvi,12-3*01,4-1*01,human,FG4,Neo+WT-,tetramer staining,PE,6,zhang2018
1870,LLAMKVPNV,A02:01,CILREGSNYQLIW,,DHX33-LLA,yes,rank_1,NSCLC,Rizvi,26-2*01,,human,AC6,Neo+WT+,tetramer staining,APC,6,zhang2018
1871,LLAMKVPNV,A02:01,CILREGSNYQLIW,,DHX33-LLA,yes,rank_1,NSCLC,Rizvi,26-2*01,,human,AC6,Neo+WT+,tetramer staining,APC,6,zhang2018


In [14]:
# Keep the first occurrence of each peptide / CDR3 / V-gene / sorted-population
# combination and renumber the rows 0..n-1.
# The frame is rebound instead of modified with `inplace=True`: this matches
# the `result = result....` pattern of the other cells and avoids mutating a
# frame produced by boolean filtering in place (which pandas may flag with a
# SettingWithCopyWarning).
result = result.drop_duplicates(
    subset=["peptide_seq", "CDR3a", "CDR3b", "TRAV", "TRBV", "sorted_population"],
    ignore_index=True,
)

In [15]:
# Spot-check ten random rows of the final table. The fixed seed makes the
# preview identical on every Restart & Run All instead of changing per run.
result.sample(10, random_state=0)

Unnamed: 0,peptide_seq,HLA_type,CDR3a,CDR3b,peptide_name,wildtype,peptide_rank,peptide_source,peptide_reference,TRAV,TRBV,tcr_source_organism,cell_name,sorted_population,experiment_type,tetramer_fluorescence,no_experiment,dataset
1008,SLAPLSPRV,A02:01,CAVNMGGNEKLTF,CASSFGQEEGQPQHF,CNKSR1-SLA_A9V,no,rank_2,SKCM,Robbins,12-2*01,13*01,human,FD2,Neo+WT-,tetramer staining,PE,6,zhang2018
418,FLTYLDVSV,A02:01,CAMRESKAAGNKLTF,CASSLWGQGWTGELFF,WDR46,yes,rank_1,SKCM,Cohen,14/DV4*01,7-2*01,human,BG3,Neo-WT+,tetramer staining,PE,3,zhang2018
772,LLAMTVPNV,A02:01,,CASSLDFQGPRDF,DHX33-LLA_K5T,no,rank_1,NSCLC,Rizvi,,11-2*01,human,SD5,Neo+WT-,tetramer staining,PE,5,zhang2018
497,ILTGLNYEA,A02:01,CAPPGAQKLVF,,NSDHL,yes,rank_2,SKCM,Cohen,24*01,,human,J23,Neo+WT+,tetramer staining,PE,4,zhang2018
1297,ILNAMITKI,A02:01,CAADSGGGADGLTF,CAIQGFSGPLHF,HAUS3-ILN,yes,rank_2,SKCM,Robbins,12-2*01,6-5*01,human,AA6,Neo+WT+,tetramer staining,APC,6,zhang2018
538,LLAMKVPNV,A02:01,CASEVGGYALNF,,DHX33-LLA,yes,rank_1,NSCLC,Rizvi,12-2*01,,human,GD1,Neo+WT+,tetramer staining,APC,5,zhang2018
824,VLFHRAFLV,A02:01,CAVSEWDDMRF,CASSDGRADTQYF,INTS1-VLL_L3F,no,rank_1,NSCLC,Rizvi,8-4*01,10-2*01,human,JB10,Neo+WT-,tetramer staining,PE,5,zhang2018
1111,FILDAVQRV,A02:01,CALSEAYNYGQNFVF,CASSEIASYNEQFF,PXDNL-SIL_S1F,no,rank_1,SKCM,Robbins,19*01,6-1*01,human,BE8,Neo+WT+,tetramer staining,PE,6,zhang2018
296,YLEPGPVTA,A02:01,CAPGIAGGTSYGKLTF,CASSLAYSYEQYF,GP100-YLE,,rank_1,,,17,27,human,VE6,Self_Naive,tetramer staining,PE,2,zhang2018
512,AVGSHVYSV,A02:01,CALTTDSNSGYALNF,,PGM5,yes,rank_1,SKCM,Strønen,19*01,,human,G8,Neo-WT+,tetramer staining,PE,4,zhang2018


In [16]:
# Write the cleaned table without the row index; pandas infers gzip
# compression from the ".gz" suffix of the path.
output_path = "../../../data/processed/zhang/zhang_input.csv.gz"
result.to_csv(output_path, index=False)