In [1]:
import pandas as pd
from tempfile import NamedTemporaryFile
from epytope.Core.AntigenImmuneReceptor import AntigenImmuneReceptor
from epytope.Core.TCREpitope import TCREpitope
from epytope.Core.ImmuneReceptorChain import ImmuneReceptorChain
from epytope.TCRSpecificityPrediction import TCRSpecificityPredictorFactory, ML
import os
import scirpy as ir
from epytope.IO.FileReader import process_dataset_TCR

In [2]:
# --- Example receptors and epitopes ------------------------------------------
# Two TCRs, each assembled from one alpha (TRA) and one beta (TRB) chain.
TRA1 = ImmuneReceptorChain(chain_type="TRA", v_gene="TRAV12-1*01", d_gene="", j_gene="TRAJ23*01",
                           cdr3="VVRAGKLI")
TRB1 = ImmuneReceptorChain(chain_type="TRB", v_gene="TRBV6-3*01", d_gene="", j_gene="TRBJ2-4*01",
                           cdr3="ASGQGNFDIQY")
TRA2 = ImmuneReceptorChain(chain_type="TRA", v_gene="TRAV9-2*01", d_gene="", j_gene="TRAJ43*01",
                           cdr3="ALSDPVNDMR")
TRB2 = ImmuneReceptorChain(chain_type="TRB", v_gene="TRBV11-2*01", d_gene="", j_gene="TRBJ1-5*01",
                           cdr3="ASSLRGRGDQPQH")
epitope1 = TCREpitope("FLRGRAYGL", mhc="HLA-B*08:01")
epitope2 = TCREpitope("HSKRKCDEL", mhc="HLA-B*08:01")
TCR1 = AntigenImmuneReceptor(receptor_id="1", chains=[TRA1, TRB1], cell_type="CD8")
TCR2 = AntigenImmuneReceptor(receptor_id="2", chains=[TRA2, TRB2], cell_type="CD8")
TCRs = [TCR1, TCR2]
epitopes = [epitope1, epitope2]

# NOTE(review): `peptide` and `TCR` are not used by any cell visible in this
# notebook; they are kept in case code outside this view relies on them.
peptide = []
TCR = ""

# --- Predefined single-row data set ------------------------------------------
# Passed as `df=dataset` to `predict_from_dataset` in the
# "Testing on predefined dataset" cell.
dataset = pd.DataFrame({"Receptor_ID": 1, "TRA": "CAVSAASGGSYIPTF", "TRB": "CASSFSGNTGELFF", "TRAV": "TRAV3",
                        "TRAJ": "TRAJ6", "TRBV": "TRBV12-3", "TRBJ": "TRBJ2-2", "T-Cell-Type": "CD8",
                        "Peptide": "RAKFKQLL", "MHC": "HLA-B*08", "Species": "", "Antigen.species": "",
                        "Tissue": ""}, index=[0])

# --- Local resources ---------------------------------------------------------
# Each location can be overridden with an environment variable so the notebook
# can run on another machine without editing this cell. The defaults reproduce
# the paths that were originally hard-coded here.
DOWNLOADS_DIR = os.environ.get("EPYTOPE_DOWNLOADS_DIR", "/home/mahmoud/Downloads")
MODELS_DIR = os.environ.get("EPYTOPE_MODELS_DIR", "/home/mahmoud/Documents/BA")

# Database exports read by the vdjdb / McPAS / IEDB cells.
vdjdb = os.path.join(DOWNLOADS_DIR, "vdjdb", "vdjdb_full.txt")
McPAS = os.path.join(DOWNLOADS_DIR, "McPAS-TCR.csv")
IEDB = os.path.join(DOWNLOADS_DIR, "tcell_receptor_table_export_1660640162.csv")

# Directory of each external predictor. The prediction cells look entries up
# with `repository[mo.name]`, so the keys must equal the predictors' `name`.
repository = {"ERGO-II": os.path.join(MODELS_DIR, "ERGOII", "ERGO-II"),
              "TITAN": os.path.join(MODELS_DIR, "TITAN", "TITAN"),
              "ImRex": os.path.join(MODELS_DIR, "IMRex", "ImRex"),
              "NetTCR2": os.path.join(MODELS_DIR, "test", "NetTCR-2.0"),
              "pMTnet": os.path.join(MODELS_DIR, "test", "pMTnet"),
              "ATM_TCR": os.path.join(MODELS_DIR, "test", "ATM-TCR")}

# Python executable handed to the predictors as `pMTnet_interpreter`.
pMTnet_interpreter = os.environ.get("PMTNET_INTERPRETER",
                                    "/home/mahmoud/anaconda3/envs/pmtnet/bin/python")

## Available methods

In [3]:
# Print one line per registered predictor: its name followed by the
# comma-separated versions reported by the factory.
methods = TCRSpecificityPredictorFactory.available_methods()
for method_name, versions in methods.items():
    version_list = ",".join(versions)
    print(method_name, version_list)

ergo-ii  
titan  
imrex  
nettcr2 2.0
pmtnet  
atm_tcr  


## Test binding specificity for each TCR to each epitope

In [4]:
# Run every available predictor with all=True (the "each TCR to each epitope"
# mode, per the heading printed below).

# Keyword arguments that do not change between predictors. The whole set is
# handed to every predictor, exactly as before.
shared_options = dict(
    trained_on="vdjdb",
    # Derived from the TITAN repository entry rather than repeating its
    # absolute path, so the two cannot drift apart.
    trained_model=os.path.join(repository["TITAN"], "public", "trained_model"),
    nettcr_chain="ab",
    pMTnet_interpreter=pMTnet_interpreter,
)

outputs = []
for method_name in TCRSpecificityPredictorFactory.available_methods():
    predictor = TCRSpecificityPredictorFactory(method_name)
    scores = predictor.predict(peptides=epitopes,
                               TCRs=TCRs,
                               repository=repository[predictor.name],
                               all=True,
                               **shared_options)
    outputs.append(scores)
print("Test binding specificity for each TCR to each epitope\n")
# Place the per-predictor results side by side.
pd.concat(outputs, axis=1)

Test binding specificity for each TCR to each epitope



Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,ERGO-II,TITAN,ImRex,NetTCR2,pMTnet,ATM_TCR
Receptor_ID,TRA,TRB,Peptide,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,VVRAGKLI,ASGQGNFDIQY,FLRGRAYGL,0.2161,0.1875,0.7602,0.0007,0.61,1.0
1,VVRAGKLI,ASGQGNFDIQY,HSKRKCDEL,0.051,0.1793,0.3393,0.0126,0.234,0.9238
2,ALSDPVNDMR,ASSLRGRGDQPQH,FLRGRAYGL,0.132,0.162,0.5253,0.0008,0.953,0.9969
2,ALSDPVNDMR,ASSLRGRGDQPQH,HSKRKCDEL,0.0217,0.3874,0.9271,0.0049,0.708,1.0


## Test binding specificity of each TCR to its corresponding epitope (paired in the order passed)

In [8]:
# Run every available predictor with all=False (TCRs and epitopes are matched
# up in the order they were passed, per the heading printed below).
print("Test binding specificity for TCRs to the corresponding epitopes in the same passed order\n\n")

# Keyword arguments that do not change between predictors. The whole set is
# handed to every predictor, exactly as before.
shared_options = dict(
    trained_on="vdjdb",
    # Derived from the TITAN repository entry rather than repeating its
    # absolute path, so the two cannot drift apart.
    trained_model=os.path.join(repository["TITAN"], "public", "trained_model"),
    nettcr_chain="ab",
    pMTnet_interpreter=pMTnet_interpreter,
)

outputs = []
for method_name in TCRSpecificityPredictorFactory.available_methods():
    predictor = TCRSpecificityPredictorFactory(method_name)
    scores = predictor.predict(peptides=epitopes,
                               TCRs=TCRs,
                               repository=repository[predictor.name],
                               all=False,
                               **shared_options)
    outputs.append(scores)
# Place the per-predictor results side by side.
pd.concat(outputs, axis=1)

Test binding specificity for TCRs to the corresponding epitopes in the same passed order




Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,ERGO-II,TITAN,ImRex,NetTCR2,pMTnet,ATM_TCR
Receptor_ID,TRA,TRB,Peptide,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,VVRAGKLI,ASGQGNFDIQY,FLRGRAYGL,0.2161,0.1875,0.7602,0.0006,0.61,1.0
2,ALSDPVNDMR,ASSLRGRGDQPQH,HSKRKCDEL,0.0217,0.3874,0.9271,0.0006,0.708,1.0


## Testing on a predefined dataset

In [9]:
# Run every available predictor on the in-memory `dataset` frame.
print("Testing on predefined dataset\n")

# Keyword arguments that do not change between predictors. The whole set is
# handed to every predictor, exactly as before.
shared_options = dict(
    trained_on="vdjdb",
    # Derived from the TITAN repository entry rather than repeating its
    # absolute path, so the two cannot drift apart.
    trained_model=os.path.join(repository["TITAN"], "public", "trained_model"),
    # NOTE(review): this is the only cell that passes "b"; the others pass
    # "ab". Confirm that this is intended.
    nettcr_chain="b",
    pMTnet_interpreter=pMTnet_interpreter,
)

outputs = []
for method_name in TCRSpecificityPredictorFactory.available_methods():
    predictor = TCRSpecificityPredictorFactory(method_name)
    scores = predictor.predict_from_dataset(df=dataset,
                                            repository=repository[predictor.name],
                                            **shared_options)
    outputs.append(scores)
# Place the per-predictor results side by side.
pd.concat(outputs, axis=1)

Testing on predefined dataset



Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,ERGO-II,TITAN,ImRex,NetTCR2,pMTnet,ATM_TCR
Receptor_ID,TRA,TRB,Peptide,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,CAVSAASGGSYIPTF,CASSFSGNTGELFF,RAKFKQLL,0.7402,0.9192,0.5539,0.0002,0.023,1.0


## Testing on vdjdb

In [5]:
# Run every available predictor on the vdjdb export located at `vdjdb`.
print("Testing on vdjdb\n")

# Keyword arguments that do not change between predictors. The whole set is
# handed to every predictor, exactly as before.
shared_options = dict(
    trained_on="vdjdb",
    # Derived from the TITAN repository entry rather than repeating its
    # absolute path, so the two cannot drift apart.
    trained_model=os.path.join(repository["TITAN"], "public", "trained_model"),
    nettcr_chain="ab",
    pMTnet_interpreter=pMTnet_interpreter,
)

outputs_vdj = []
for method_name in TCRSpecificityPredictorFactory.available_methods():
    predictor = TCRSpecificityPredictorFactory(method_name)
    scores = predictor.predict_from_dataset(path=vdjdb,
                                            source="vdjdb",
                                            repository=repository[predictor.name],
                                            **shared_options)
    outputs_vdj.append(scores)
# Place the per-predictor results side by side.
pd.concat(outputs_vdj, axis=1)

Testing on vdjdb

TITAN's trained model can not make predictions for those samples, which their v- or j-regions are not included in the human v- or j-regions given by IMGT. Therefore the prediction score for these samples will be -1.
ImRex's trained model could not make predictions for some samples, which have either cdr3-beta-seqs, that are not 10-20 aas long or epitopes, that are not 8-11 aas long. These samples have prediction score of -1
NetTCR-2's trained model could not make predictions for some samples, which have either cdr3-(beta, alpha)-seqs, that are longer than 30 aas or epitopes, that are longer than 9 aas. These samples, have prediction score of -1
155 Antigens are longer than 15 aas, thus the corresponding samples will have prediction score of -1. All samples with HLA, that is not in HLA_seq_lib, will have score -1 too.
Mission loading.
Processing: /tmp/tmpiuoqa__g
drop A*08:01
drop DPA1*02:01
drop 2Db
drop 2Eb1
drop DRB1*03:01
drop DRA*01:02:03
drop DPA1*01:03
drop DQA1

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,ERGO-II,TITAN,ImRex,NetTCR2,pMTnet,ATM_TCR
Receptor_ID,TRA,TRB,Peptide,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
,,CAAADEEIGNQPQHF,ATDALMTGY,0.8973,0.7855,0.9570,-1.0000,0.0056,1.0000
,,CAAGGQFYGYTF,KAFSPEVIPMF,0.6970,0.9640,0.8186,-1.0000,0.0021,1.0000
,,CACLLPYEQYF,GPGHKARVL,0.3762,0.7867,0.8902,-1.0000,0.0270,1.0000
,,CAEGGRDYGYTF,KAFSPEVIPMF,0.6210,0.9582,0.9857,-1.0000,0.0035,1.0000
,,CAFLGGSGANVLTF,RLQSLQTYV,0.5525,0.2128,0.2206,-1.0000,0.4600,1.0000
...,...,...,...,...,...,...,...,...,...
ex,,CASSSRTSGGTDTQYF,FRDYVDRFYKTLRAEQASQE,0.6345,0.9884,-1.0000,-1.0000,-1.0000,0.9612
lung10_12.clone,CAEGARDSNYQLIW,CASSAQANQPQHF,GILGFVFTL,0.2824,0.6982,0.8543,0.9918,0.2830,1.0000
lung10_15.clone,CATDGGGGSQGNLIF,CASSFRSTDTQYF,GILGFVFTL,0.8949,0.9902,0.9552,0.9999,0.0012,1.0000
lung10_17.clone,CAGAYGGSQGNLIF,CASSSRSSGEQYF,GILGFVFTL,0.8740,0.9910,0.9905,0.2549,0.0048,1.0000


## Testing on McPAS

In [6]:
# Run every available predictor on the McPAS export located at `McPAS`.

# Keyword arguments that do not change between predictors. The whole set is
# handed to every predictor, exactly as before.
shared_options = dict(
    trained_on="vdjdb",
    # Derived from the TITAN repository entry rather than repeating its
    # absolute path, so the two cannot drift apart.
    trained_model=os.path.join(repository["TITAN"], "public", "trained_model"),
    nettcr_chain="ab",
    pMTnet_interpreter=pMTnet_interpreter,
)

outputs_McPAS = []
for method_name in TCRSpecificityPredictorFactory.available_methods():
    predictor = TCRSpecificityPredictorFactory(method_name)
    scores = predictor.predict_from_dataset(path=McPAS,
                                            source="mcpas",
                                            repository=repository[predictor.name],
                                            **shared_options)
    outputs_McPAS.append(scores)
# Place the per-predictor results side by side.
pd.concat(outputs_McPAS, axis=1)

TITAN's trained model can not make predictions for those samples, which their v- or j-regions are not included in the human v- or j-regions given by IMGT. Therefore the prediction score for these samples will be -1.
ImRex's trained model could not make predictions for some samples, which have either cdr3-beta-seqs, that are not 10-20 aas long or epitopes, that are not 8-11 aas long. These samples have prediction score of -1
NetTCR-2's trained model could not make predictions for some samples, which have either cdr3-(beta, alpha)-seqs, that are longer than 30 aas or epitopes, that are longer than 9 aas. These samples, have prediction score of -1
472 Antigens are longer than 15 aas, thus the corresponding samples will have prediction score of -1. All samples with HLA, that is not in HLA_seq_lib, will have score -1 too.
Mission loading.
Processing: /tmp/tmpx4aobjkr
drop 2s
drop 2Kd
drop 2q
drop 2u
drop 2b
drop DQ8
drop 2db
drop DPB1*02:01
drop DR1
drop 2k
drop DRB1*15:03
drop 2kb
drop DQ2

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,ERGO-II,TITAN,ImRex,NetTCR2,pMTnet,ATM_TCR
Receptor_ID,TRA,TRB,Peptide,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,,CASSDAGANTEVF,IKAVYNFATCG,0.0621,-1.0000,0.5774,-1.0000,-1.0,0.9606
1,,CASSDAGAYAEQF,IKAVYNFATCG,0.1989,-1.0000,0.9060,-1.0000,-1.0,0.0085
2,,CASSDAGGAAEVF,IKAVYNFATCG,0.0869,-1.0000,0.4252,-1.0000,-1.0,0.7705
3,,CASSDAGHSPLYF,IKAVYNFATCG,0.0372,-1.0000,0.3055,-1.0000,-1.0,0.7951
4,,CASSDAWGGAEQYF,IKAVYNFATCG,0.0293,-1.0000,0.1793,-1.0000,-1.0,1.0000
...,...,...,...,...,...,...,...,...,...
39029,CATDAEGNNRLAF,CASSIFGGGLGEQFF,FLCMKALLL,0.0695,0.2267,0.2555,0.0000,-1.0,0.0605
39030,CGAVGYQKVTF,CALNGEISYNEQFF,FLCMKALLL,0.0218,0.3146,0.8019,0.0001,-1.0,0.9998
39031,CAVIWYNNNDMRF,CASSQGVNTGELFF,FLCMKALLL,0.0261,0.2451,0.3594,0.0269,-1.0,1.0000
39031,CAVIWYNNNDMRF,CASSQGVNTGELFF,LPRRSGAAGA,,,,,,1.0000


## Testing on IEDB

In [5]:
# Run every available predictor on the IEDB export located at `IEDB`.

# Keyword arguments that do not change between predictors. The whole set is
# handed to every predictor, exactly as before.
shared_options = dict(
    trained_on="vdjdb",
    # Derived from the TITAN repository entry rather than repeating its
    # absolute path, so the two cannot drift apart.
    trained_model=os.path.join(repository["TITAN"], "public", "trained_model"),
    nettcr_chain="ab",
    pMTnet_interpreter=pMTnet_interpreter,
)

outputs_IEDB = []
for method_name in TCRSpecificityPredictorFactory.available_methods():
    predictor = TCRSpecificityPredictorFactory(method_name)
    scores = predictor.predict_from_dataset(path=IEDB,
                                            source="IEDB",
                                            repository=repository[predictor.name],
                                            **shared_options)
    outputs_IEDB.append(scores)
# Place the per-predictor results side by side.
pd.concat(outputs_IEDB, axis=1)

TITAN's trained model can not make predictions for those samples, which their v- or j-regions are not included in the human v- or j-regions given by IMGT. Therefore the prediction score for these samples will be -1.
ImRex's trained model could not make predictions for some samples, which have either cdr3-beta-seqs, that are not 10-20 aas long or epitopes, that are not 8-11 aas long. These samples have prediction score of -1
NetTCR-2's trained model could not make predictions for some samples, which have either cdr3-(beta, alpha)-seqs, that are longer than 30 aas or epitopes, that are longer than 9 aas. These samples, have prediction score of -1
27531 Antigens are longer than 15 aas, thus the corresponding samples will have prediction score of -1. All samples with HLA, that is not in HLA_seq_lib, will have score -1 too.
Mission loading.
Processing: /tmp/tmplqg2srlk
drop DRB1*15:01
drop DR2
drop DRB1*04:01
drop DRA*01:01
drop DQA1*01:02
drop DQA1*02:01
drop IAd
drop A2
drop DR
drop DPA1*

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,ERGO-II,TITAN,ImRex,NetTCR2,pMTnet,ATM_TCR
Receptor_ID,TRA,TRB,Peptide,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
57,IVVRSSNTGKLI,ASSQDRDTQY,VMAPRTLIL,0.3773,0.4718,0.8735,0.0003,0.5060,0.0020
58,,ASSLGQAYEQY,EEYLKAWTF,0.4917,0.3044,0.8231,-1.0000,0.9280,1.0000
58,,ASSLGQAYEQY,EEYLQAFTY,0.1529,0.3118,0.9604,-1.0000,0.7760,1.0000
58,,ASSLGQAYEQY,FLRGRAYGL,0.0712,0.2311,0.8581,-1.0000,-1.0000,1.0000
58,,ASSLGQAYEQY,FLRGRFYGL,0.2526,0.2816,0.7596,-1.0000,-1.0000,1.0000
...,...,...,...,...,...,...,...,...,...
202530,AVDNFNKFY,ASSSQGGYGYT,KVDPIGHVY,0.1664,0.1823,0.5976,0.0466,0.7590,0.9579
202531,AGSGSRLT,ASSFDRGYGYT,KVDPIGHVY,0.0303,0.2057,0.4328,0.1702,0.1800,0.0392
202532,AFTELNSGGSNYKLT,ASSLSGGLLRTGELF,FVVPYMIYLL,0.6925,0.2270,0.9137,-1.0000,0.0008,0.9999
202533,ASSGGNTPLV,ASSFGGAYEQY,VQIISCQY,0.6046,0.0478,0.1503,0.1789,0.8710,0.0360


## Testing on a scirpy dataset (`ir.datasets.wu2020`)

In [10]:
# Run every available predictor on the TCRs of scirpy's wu2020 data set, each
# combined with every epitope in `epitopes`.
print("Testing scirpy\n")

# Receptor columns kept from the converted scirpy table.
tcr_columns = ["Receptor_ID", 'TRA', 'TRB', "TRAV", "TRAJ", "TRBV", "TRBJ", "T-Cell-Type", "Species",
               "Antigen.species", "Tissue"]
# Column order of the final frame passed to `predict_from_dataset`.
dataset_columns = ["Receptor_ID", 'TRA', 'TRB', "TRAV", "TRAJ", "TRBV", "TRBJ", "T-Cell-Type", "Peptide", "MHC",
                   "Species", "Antigen.species", "Tissue"]

# Load the scirpy observations and convert them with the epytope reader.
scirpy_obs = ir.datasets.wu2020().obs
scirpy_tcrs = process_dataset_TCR(df=scirpy_obs, source="scirpy")[tcr_columns]

# One row per epitope: its string form and its MHC.
df2 = pd.DataFrame({"Peptide": [str(pep) for pep in epitopes],
                    "MHC": [pep.mhc for pep in epitopes]})

# The cross join pairs every TCR row with every epitope row.
df = pd.merge(scirpy_tcrs, df2, how='cross')[dataset_columns]

# Keyword arguments that do not change between predictors. The whole set is
# handed to every predictor, exactly as before.
shared_options = dict(
    trained_on="vdjdb",
    # Derived from the TITAN repository entry rather than repeating its
    # absolute path, so the two cannot drift apart.
    trained_model=os.path.join(repository["TITAN"], "public", "trained_model"),
    nettcr_chain="ab",
    pMTnet_interpreter=pMTnet_interpreter,
)

outputs_scirpy = []
for method_name in TCRSpecificityPredictorFactory.available_methods():
    predictor = TCRSpecificityPredictorFactory(method_name)
    scores = predictor.predict_from_dataset(df=df,
                                            repository=repository[predictor.name],
                                            **shared_options)
    outputs_scirpy.append(scores)
# Place the per-predictor results side by side.
pd.concat(outputs_scirpy, axis=1)

Testing scirpy



Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,ERGO-II
Receptor_ID,TRA,TRB,Peptide,Unnamed: 4_level_1
0,CALSDQVDDKLIF,CASSGGYYNEQFF,FLKEKGGL,0.2079
0,CALSDQVDDKLIF,CASSGGYYNEQFF,SQLLNAKYL,0.0748
1,,CASSPVSVLASSYEQYF,FLKEKGGL,0.6005
1,,CASSPVSVLASSYEQYF,SQLLNAKYL,0.5994
3,CALDTGGGNKLTF,CASSESQGQEKLFF,FLKEKGGL,0.1881
...,...,...,...,...
141618,CAASPAGSARQLTF,CASSEYKRHTDTQYF,SQLLNAKYL,0.0019
141620,,CASSIGLRDIQYF,FLKEKGGL,0.6148
141620,,CASSIGLRDIQYF,SQLLNAKYL,0.6638
141621,CAVNMGDMRF,CASSPRGGGPNEQYF,FLKEKGGL,0.0174
