# Preprocess data

We need to load the data and keep only the columns we need. We'll then try to create separate tables (one per antigen species) to see how many records we have for each epitope. This way we can use them for different classifiers, or combine them into one table for a single multi-class classifier.

In [13]:
import pandas as pd
# Load the tab-separated file into a DataFrame.
df = pd.read_csv(
    './data/vdjdb.txt',
    sep="\t",
)

In [14]:
## Keep only the columns listed below; every other column of df is left out
selected_features = df[[
    # columns describing the record itself
    'gene', 'cdr3', 'v.segm', 'j.segm', 'species',
    # mhc.* columns
    'mhc.a', 'mhc.b', 'mhc.class',
    # antigen.* columns
    'antigen.epitope', 'antigen.species',
    # score column
    'vdjdb.score',
]]

In [15]:
## Build the human table in one chain:
##   1. keep rows where species is 'HomoSapiens' and vdjdb.score is above 0
##   2. drop duplicate rows
##   3. delete rows with null values
human_data = (
    selected_features
    .loc[
        selected_features['species'].eq('HomoSapiens')
        & selected_features['vdjdb.score'].gt(0)
    ]
    .drop_duplicates()
    .dropna()
)

# Create a table for covid data

In [16]:
## Covid table: rows of human_data whose antigen species is 'SARS-CoV-2'
covid_19_table = human_data.loc[
    human_data['antigen.species'].eq('SARS-CoV-2')
]
# Bare expression so the notebook renders the table as the cell output
covid_19_table

Unnamed: 0,gene,cdr3,v.segm,j.segm,species,mhc.a,mhc.b,mhc.class,antigen.epitope,antigen.species,vdjdb.score
419,TRB,CASSQTTKDEQYF,TRBV4-2*01,TRBJ2-7*01,HomoSapiens,HLA-A*03:01,B2M,MHCI,KTFPPTEPK,SARS-CoV-2,1
420,TRB,CASSQGQKDEQYF,TRBV4-2*01,TRBJ2-7*01,HomoSapiens,HLA-A*03:01,B2M,MHCI,KTFPPTEPK,SARS-CoV-2,1
421,TRB,CASSQFTGRKEKLFF,TRBV16*01,TRBJ1-4*01,HomoSapiens,HLA-A*03:01,B2M,MHCI,KTFPPTEPK,SARS-CoV-2,1
422,TRB,CASSLTSGGPTSDTQYF,TRBV5-1*01,TRBJ2-3*01,HomoSapiens,HLA-A*02:01,B2M,MHCI,LLYDANYFL,SARS-CoV-2,1
423,TRB,CASTTPVTERYNEQFF,TRBV28*01,TRBJ2-1*01,HomoSapiens,HLA-A*02:01,B2M,MHCI,LLYDANYFL,SARS-CoV-2,1
...,...,...,...,...,...,...,...,...,...,...,...
90030,TRB,CASSSYTQRGLDYTGELFF,TRBV27*01,TRBJ2-2*01,HomoSapiens,HLA-A*24:01,B2M,MHCI,QYIKWPWYI,SARS-CoV-2,1
90031,TRB,CSVEDPMGVGTEAFF,TRBV29-1*01,TRBJ1-1*01,HomoSapiens,HLA-A*24:01,B2M,MHCI,QYIKWPWYI,SARS-CoV-2,1
90032,TRA,CLVGENSGGFKTIF,TRAV4*01,TRAJ9*01,HomoSapiens,HLA-A*24:01,B2M,MHCI,QYIKWPWYI,SARS-CoV-2,1
90033,TRA,CAMREGQGNAGNMLTF,TRAV14/DV4*01,TRAJ39*01,HomoSapiens,HLA-A*24:01,B2M,MHCI,QYIKWPWYI,SARS-CoV-2,1


# Create a table for flu data

In [17]:
## Flu table: rows of human_data whose antigen species is 'InfluenzaA'
flu_table = human_data.loc[
    human_data['antigen.species'].eq('InfluenzaA')
]
# Bare expression so the notebook renders the table as the cell output
flu_table

Unnamed: 0,gene,cdr3,v.segm,j.segm,species,mhc.a,mhc.b,mhc.class,antigen.epitope,antigen.species,vdjdb.score
1273,TRA,CAVSESPFGNEKLTF,TRAV8-4*01,TRAJ48*01,HomoSapiens,HLA-DRA*01:02:03,HLA-DRB1*01:01:01,MHCII,PKYVKQNTLKLAT,InfluenzaA,3
1274,TRB,CASSSTGLPYGYTF,TRBV28*01,TRBJ1-2*01,HomoSapiens,HLA-DRA*01:02:03,HLA-DRB1*01:01:01,MHCII,PKYVKQNTLKLAT,InfluenzaA,3
1277,TRA,CAVSESPFGNEKLTF,TRAV8-4*01,TRAJ48*01,HomoSapiens,HLA-DRA*01:02:03,HLA-DRB1*04:01:01,MHCII,PKYVKQNTLKLAT,InfluenzaA,3
1278,TRB,CASSSTGLPYGYTF,TRBV28*01,TRBJ1-2*01,HomoSapiens,HLA-DRA*01:02:03,HLA-DRB1*04:01:01,MHCII,PKYVKQNTLKLAT,InfluenzaA,3
1287,TRA,CAGAGSQGNLIF,TRAV27*01,TRAJ42*01,HomoSapiens,HLA-A*02:01:48,B2M,MHCI,GILGFVFTL,InfluenzaA,3
...,...,...,...,...,...,...,...,...,...,...,...
82102,TRB,CSAGGWDRVNQPQHF,TRBV20-1*01,TRBJ1-5*01,HomoSapiens,HLA-DRA*01,HLA-DRB1*01,MHCII,PKYVKQNTLKLAT,InfluenzaA,3
82103,TRB,CASSESQTGDYEQYF,TRBV25-1*01,TRBJ2-7*01,HomoSapiens,HLA-DRA*01,HLA-DRB1*01,MHCII,PKYVKQNTLKLAT,InfluenzaA,3
82104,TRB,CATSDSTSGGTDTQYF,TRBV24-1*01,TRBJ2-3*01,HomoSapiens,HLA-DRA*01,HLA-DRB1*01,MHCII,PKYVKQNTLKLAT,InfluenzaA,3
82105,TRB,CASSFSAEATGELFF,TRBV9*01,TRBJ2-2*01,HomoSapiens,HLA-DRA*01,HLA-DRB1*01,MHCII,PKYVKQNTLKLAT,InfluenzaA,1


# Create a table for cancer data

In [18]:
## Cancer table: rows of human_data whose antigen species is 'HomoSapiens'
# NOTE(review): this filter matches every record with antigen species
# 'HomoSapiens'; whether all of those are cancer antigens is not
# established here -- confirm before using the table as a "cancer" class.
cancer_table = human_data.loc[
    human_data['antigen.species'].eq('HomoSapiens')
]

In [19]:
# Bare expression so the notebook renders cancer_table as the cell output
cancer_table

Unnamed: 0,gene,cdr3,v.segm,j.segm,species,mhc.a,mhc.b,mhc.class,antigen.epitope,antigen.species,vdjdb.score
682,TRA,CAVAGYGGSQGNLIF,TRAV12-2*01,TRAJ42*01,HomoSapiens,HLA-A*02,B2M,MHCI,ELAGIGILTV,HomoSapiens,1
683,TRB,CASSPQGLGTEAFF,TRBV28*01,TRBJ1-1*01,HomoSapiens,HLA-A*02,B2M,MHCI,ELAGIGILTV,HomoSapiens,1
1297,TRA,CALSGGDSSYKLIF,TRAV9-2*01,TRAJ12*01,HomoSapiens,HLA-DRA*01:02:03,HLA-DRB5*01:01:01,MHCII,VHFFKNIVTPRTPG,HomoSapiens,3
1298,TRB,CASSLADRVNTEAFF,TRBV5-1*01,TRBJ1-1*01,HomoSapiens,HLA-DRA*01:02:03,HLA-DRB5*01:01:01,MHCII,VHFFKNIVTPRTPG,HomoSapiens,3
1301,TRA,CAVRPTSGGSYIPTF,TRAV21*01,TRAJ6*01,HomoSapiens,HLA-A*02:01:48,B2M,MHCI,SLLMWITQV,HomoSapiens,3
...,...,...,...,...,...,...,...,...,...,...,...
84140,TRB,CASSIQRNYGYTF,TRBV6-5*01,TRBJ1-2*01,HomoSapiens,HLA-A*02,B2M,MHCI,NLSALGIFST,HomoSapiens,1
84141,TRB,CASSATFNDNEKLFF,TRBV10-1*01,TRBJ1-4*01,HomoSapiens,HLA-A*02,B2M,MHCI,NLSALGIFST,HomoSapiens,1
84142,TRB,CATSDFGTEAFF,TRBV24-1*01,TRBJ1-1*01,HomoSapiens,HLA-A*02,B2M,MHCI,NLSALGIFST,HomoSapiens,1
84144,TRB,CASSQGQLAGGLVF,TRBV4-1*01,TRBJ2-4*01,HomoSapiens,HLA-A*02,B2M,MHCI,NLSALGIFST,HomoSapiens,1


# Create a table for CMV data

In [20]:
## CMV table: rows of human_data whose antigen species is 'CMV'
cmv_table = human_data.loc[
    human_data['antigen.species'].eq('CMV')
]

In [21]:
# Bare expression so the notebook renders cmv_table as the cell output
cmv_table

Unnamed: 0,gene,cdr3,v.segm,j.segm,species,mhc.a,mhc.b,mhc.class,antigen.epitope,antigen.species,vdjdb.score
963,TRB,CASSYQTGAAYGYTF,TRBV6-5*01,TRBJ1-2*01,HomoSapiens,HLA-A*02:01,B2M,MHCI,NLVPMVATV,CMV,3
964,TRB,CASRSAQRGFTETQYF,TRBV28*01,TRBJ2-5*01,HomoSapiens,HLA-A*02:01,B2M,MHCI,NLVPMVATV,CMV,1
965,TRB,CASSSSTGTGRCLLPNNTEAFF,TRBV11-3*01,TRBJ1-1*01,HomoSapiens,HLA-A*02:01,B2M,MHCI,NLVPMVATV,CMV,1
966,TRB,CASSPVTGQGFYGYTF,TRBV6-5*01,TRBJ1-2*01,HomoSapiens,HLA-A*02:01,B2M,MHCI,NLVPMVATV,CMV,1
968,TRB,CASSEEPSGGAYEQYF,TRBV6-1*01,TRBJ2-7*01,HomoSapiens,HLA-A*02:01,B2M,MHCI,NLVPMVATV,CMV,1
...,...,...,...,...,...,...,...,...,...,...,...
83846,TRB,CASSWAPNTGELFF,TRBV12-3*01,TRBJ2-2*01,HomoSapiens,HLA-B*07:02,B2M,MHCI,RPHERNGFTVL,CMV,1
83847,TRB,CSATPARASNTEAFF,TRBV20-1*01,TRBJ1-1*01,HomoSapiens,HLA-B*07:02,B2M,MHCI,RPHERNGFTVL,CMV,1
83848,TRB,CSARDPQGVTEAFF,TRBV20-1*01,TRBJ1-1*01,HomoSapiens,HLA-B*07:02,B2M,MHCI,RPHERNGFTVL,CMV,1
83849,TRB,CSARDSSGGAKNIQYF,TRBV20-1*01,TRBJ2-4*01,HomoSapiens,HLA-B*07:02,B2M,MHCI,RPHERNGFTVL,CMV,1
