# Incidence of CMV-associated TCRs

**This notebook computes the incidence of the CMV-associated TCRs in the two classes separately, to check whether the TCRs are defined correctly.** 

In [2]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd
import csv


# visualization
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = 10,8
plt.style.use('ggplot')
%matplotlib inline

import os
import pickle
import sys
# print(sys.path)
sys.path.insert(0,'../../')
from utils import data_path,raw_data_path,raw_train_data_path,raw_test_data_path

# from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

## Processing CMV-associated TCRs

### Load raw data

In [4]:
# Read the table of CMV-associated TCRs from the Excel workbook in the raw-data directory.
cmv_table_path = '../' + raw_data_path + 'CMV-associated.xlsx'
CMV_associated = pd.read_excel(cmv_table_path)

# Preview the first rows of the loaded table.
CMV_associated.head()

Unnamed: 0,V gene,CDR3,J gene,Incidence in CMV+ subjects,Incidence in CMV- subjects,P-value,HLA restriction
0,TCRBV09-01,CASSGQGAYEQYF,TCRBJ02-07*01,61,11,2.16e-13,
1,TCRBV19-01,CASSIGPLEHNEQFF,TCRBJ02-01*01,30,0,1.75e-11,A1
2,TCRBV05-01*01,CASSPDRVGQETQYF,TCRBJ02-05*01,33,1,2.66e-11,
3,TCRBV07-02*01,CASSLEAEYEQYF,TCRBJ02-07*01,30,1,3.2e-10,B8
4,TCRBV28-01*01,CASSIEGNQPQHF,TCRBJ01-05*01,26,0,5.31e-10,


**A 'V gene' / 'J gene' value containing the '\*' symbol carries allele information. The string before '-' is the family of the gene, while the number following '\*' represents the allele.**

**To match the representation used in the dataset, the alleles should be extracted into new columns and removed from the gene names.**

### Get V family, J family

In [3]:
def get_family(row,gene_col):
    """Return the gene-family part of the gene name stored in ``row[gene_col]``.

    The family is the text before the first '-' (for example 'TCRBV05' for
    'TCRBV05-01*01'). If the name has no '-', the whole name is the family,
    with any '*allele' suffix removed.

    Parameters
    ----------
    row : mapping-like object (e.g. a DataFrame row) indexable by ``gene_col``.
    gene_col : key of the entry holding the gene name (a string).

    Returns
    -------
    str
        The family prefix of the gene name.
    """
    gene_name = row[gene_col]

    # Strip the allele first so that a family-only name such as 'TCRBV12*01'
    # does not carry the '*01' suffix into the family.
    without_allele = gene_name.split('*', 1)[0]

    # partition() splits at the first '-' only and never raises, whereas a
    # two-way unpacking of split('-') fails on names with more than one '-'.
    family, _, _ = without_allele.partition('-')
    return family

In [4]:
# Apply get_family row by row; the column to read is passed through `args`.
V_family = CMV_associated.apply(get_family, axis=1, args=('V gene',))
J_family = CMV_associated.apply(get_family, axis=1, args=('J gene',))

In [5]:
# Add the family columns to the table. The two inserts are order-dependent:
# 'V family' goes to position 0 first, which shifts every existing column one
# place to the right before 'J family' is inserted at position 3.
# NOTE(review): position 3 puts 'J family' right before 'J gene' only if the
# loaded table starts with 'V gene', 'CDR3', 'J gene' — confirm against the file.
# DataFrame.insert raises if the column already exists, so re-running this cell
# on the same frame fails.
CMV_associated.insert(0,'V family',V_family)
CMV_associated.insert(3,'J family',J_family)

### Get V allele, J allele

In [6]:
def get_allele(row,gene_col):
    """Extract the allele number from the gene name stored in ``row[gene_col]``.

    The allele is the text after the '*' separator. It is returned as
    ``np.float64``; a single leading '0' is removed from multi-character
    alleles first ('01' -> 1.0). Names without '*' yield ``np.nan``.
    """
    raw_gene = row[gene_col]

    # Guard clause: no '*' means the name carries no allele information.
    if raw_gene.find('*') == -1:
        return np.nan

    _gene_part, allele_text = raw_gene.split('*')

    # Match the allele format of the dataset: drop one leading '0' before
    # converting, but keep a lone '0' as it is.
    has_leading_zero = len(allele_text) > 1 and allele_text[0] == '0'
    return np.float64(allele_text[1:] if has_leading_zero else allele_text)

In [7]:
# Apply get_allele row by row; the column to read is passed through `args`.
V_allele = CMV_associated.apply(get_allele, axis=1, args=('V gene',))
J_allele = CMV_associated.apply(get_allele, axis=1, args=('J gene',))

In [8]:
# Add the allele columns to the table. The two inserts are order-dependent:
# 'V allele' is inserted at position 2 first, which shifts the later columns
# one place to the right before 'J allele' is inserted at position 6.
# NOTE(review): these positions put each allele right after its gene column
# only if the frame currently starts with
# 'V family', 'V gene', 'CDR3', 'J family', 'J gene' — confirm.
# DataFrame.insert raises if the column already exists, so re-running this cell
# on the same frame fails.
CMV_associated.insert(2,'V allele',V_allele)
CMV_associated.insert(6,'J allele',J_allele)

### Modify V gene, J gene

Delete the allele information from the raw V gene and J gene values. Note that a gene is set to 'unresolved' if the gene number is not provided (no '-' symbol in the raw gene name).

In [9]:
def modify_gene(row,gene_col):
    """Return the gene name in ``row[gene_col]`` without its allele suffix.

    Everything from the '*' separator onwards is dropped. If the remaining
    name has no '-' (no gene number after the family), the string
    'unresolved' is returned instead.
    """
    raw_gene = row[gene_col]

    if raw_gene.find('*') == -1:
        # No allele suffix: keep the name unchanged.
        gene = raw_gene
    else:
        # Keep only the part before the '*' separator.
        gene, _allele = raw_gene.split('*')

    # A name without '-' provides no gene number.
    return 'unresolved' if gene.find('-') == -1 else gene

In [10]:
# Apply modify_gene row by row; the column to read is passed through `args`.
V_gene = CMV_associated.apply(modify_gene, axis=1, args=('V gene',))
J_gene = CMV_associated.apply(modify_gene, axis=1, args=('J gene',))

In [11]:
# Overwrite the raw gene columns in place with the allele-free names computed
# by modify_gene (names without a gene number become 'unresolved').
# After this cell the original raw gene strings are no longer in the frame.
CMV_associated['V gene'] = V_gene
CMV_associated['J gene'] = J_gene

### Get TCRs

In [53]:
# Replace every missing value in the table with the placeholder -1 (in place).
CMV_associated.fillna(-1,inplace=True)

# Each TCR is stored as one tuple per row, built from these columns in this order.
tcr_cols = ['V family','V gene','J family','J gene','CDR3']
CMV_associated['TCR'] = list(zip(*(CMV_associated[col] for col in tcr_cols)))

# grouped = CMV_associated.fillna(-1).groupby(['V family','V gene','V allele',
#                         'CDR3','V family','V gene','V allele'])

# TCRs = np.array(list(grouped.groups.keys()))

**Save the new table**

In [7]:
# CMV_associated.to_csv(data_path+'CMV-associated_TCRs.csv',index=False)

# Persist the processed table (including the 'TCR' tuple column) as a pickle
# under the 'MAP_estimator/' sub-directory of data_path.
with open(data_path+'MAP_estimator/'+'inc_paper.pkl','wb') as f:
    pickle.dump(CMV_associated, f, pickle.HIGHEST_PROTOCOL)

## Glimpse raw data

**Have a look at a training file**

### Preprocessing data

In [3]:
# Get list of the names of the training files
train_files = []
for file in os.listdir(raw_train_data_path):
    if ('.tsv' in file) and ('.tsv#' not in file):
        train_files.append(file)
print(len(train_files))

666


In [16]:
# The first file in the data path
train_0 = pd.read_csv(raw_train_data_path+train_files[0], delimiter='\t')

# Look at the original data
train_0.head()

Unnamed: 0,rearrangement,amino_acid,frame_type,rearrangement_type,templates,reads,frequency,productive_frequency,cdr3_length,v_family,...,max_productive_frequency,max_frequency,counting_method,primer_set,release_date,sample_tags,fraction_productive,order_name,kit_id,total_t_cells
0,ATCCAGCCCTCAGAACCCAGGGACTCAGCTGTGTACTTCTGTGCCA...,CASSPGDSNQPQHF,In,VDJ,6,406,5.2e-05,6.2e-05,42,TCRBV12,...,0.002984,0.018284,v2,Human-TCRB-PD1x,2013-12-13 22:22:21.367,"Age:46 Years,Biological Sex:Male,Cohort:Cohort...",0.835373,,,0
1,GAGTCGGCTGCTCCCTCCCAGACATCTGTGTACTTCTGTGCCAGCA...,CASNEDSFSGNTIYF,In,VDJ,5,404,5.2e-05,6.2e-05,45,TCRBV06,...,0.002984,0.018284,v2,Human-TCRB-PD1x,2013-12-13 22:22:21.367,"Age:46 Years,Biological Sex:Male,Cohort:Cohort...",0.835373,,,0
2,CCGCTCAGGCTGGAGTTGGCTGCTCCCTCCCAGACATCTGTGTACT...,CASSYNSEQFF,In,VDJ,5,404,5.2e-05,6.2e-05,33,TCRBV06,...,0.002984,0.018284,v2,Human-TCRB-PD1x,2013-12-13 22:22:21.367,"Age:46 Years,Biological Sex:Male,Cohort:Cohort...",0.835373,,,0
3,ACCAGTGCCCATCCTGAAGACAGCAGCTTCTACATCTGCAGTGCTA...,CSARDPVGGVNTIYF,In,VDJ,5,404,5.2e-05,6.2e-05,45,TCRBV20,...,0.002984,0.018284,v2,Human-TCRB-PD1x,2013-12-13 22:22:21.367,"Age:46 Years,Biological Sex:Male,Cohort:Cohort...",0.835373,,,0
4,AAGATCCAGCCCTCAGAACCCAGGGACTCAGCTGTGTACTTCTGTG...,CASSLSLSYEQYF,In,VDJ,5,404,5.2e-05,6.2e-05,39,TCRBV12,...,0.002984,0.018284,v2,Human-TCRB-PD1x,2013-12-13 22:22:21.367,"Age:46 Years,Biological Sex:Male,Cohort:Cohort...",0.835373,,,0


There are 81 columns in a sample; we only need some of them.

In [17]:
# Get index and data types of needed columns
cols = train_0.columns.values.tolist()

needed_cols = ['amino_acid','frame_type','v_family','v_gene','v_allele','j_family',
               'j_gene','j_allele','sample_name','productive_rearrangements']

for col in needed_cols:    
    print(col+':'.ljust(10),'index:',cols.index(col),'data type:',train_0.dtypes[col])

amino_acid:          index: 1 data type: object
frame_type:          index: 2 data type: object
v_family:          index: 9 data type: object
v_gene:          index: 10 data type: object
v_allele:          index: 11 data type: float64
j_family:          index: 15 data type: object
j_gene:          index: 16 data type: object
j_allele:          index: 17 data type: float64
sample_name:          index: 42 data type: object
productive_rearrangements:          index: 53 data type: int64


In [19]:
# Keep only the needed columns and only the rows whose frame_type is 'In'.
train_0 = train_0.loc[train_0['frame_type'] == 'In', needed_cols]

# Take the sample-level values from the first remaining row.
# Use positional .iloc[0] here: after the row filter the index label 0 may no
# longer exist, in which case a label lookup with [0] raises KeyError.
sample_name = train_0['sample_name'].iloc[0]
# NOTE: the variable name keeps its existing (misspelled) form so that any
# other cell referring to it keeps working.
producitve_rearrangements = train_0['productive_rearrangements'].iloc[0]

# Keep only the seven columns that describe the TCR (CDR3 amino acid plus
# V/J family, gene and allele), mark missing values with -1 and renumber rows.
# Chained, non-inplace calls avoid modifying a filtered slice in place.
train_0 = (
    train_0
    .drop(columns=['frame_type','sample_name','productive_rearrangements'])
    .fillna(-1)
    .reset_index(drop=True)
)

train_0.head()

Unnamed: 0,amino_acid,v_family,v_gene,v_allele,j_family,j_gene,j_allele
0,CASSPGDSNQPQHF,TCRBV12,unresolved,-1.0,TCRBJ01,TCRBJ01-05,1.0
1,CASNEDSFSGNTIYF,TCRBV06,TCRBV06-01,1.0,TCRBJ01,TCRBJ01-03,1.0
2,CASSYNSEQFF,TCRBV06,TCRBV06-06,-1.0,TCRBJ02,TCRBJ02-01,1.0
3,CSARDPVGGVNTIYF,TCRBV20,unresolved,-1.0,TCRBJ01,TCRBJ01-03,1.0
4,CASSLSLSYEQYF,TCRBV12,unresolved,-1.0,TCRBJ02,TCRBJ02-07,1.0


### Counting by groups

#### by family

In [20]:
# Number of in-frame rows per V family, most frequent first (value_counts
# sorts in descending order by default). A -1 entry counts rows whose
# v_family was missing and was replaced by the fillna(-1) placeholder.
train_0['v_family'].value_counts()

TCRBV06    16102
TCRBV05    14835
TCRBV07    14725
TCRBV20     9573
TCRBV28     8783
TCRBV12     8070
TCRBV19     7664
TCRBV02     5739
TCRBV03     5149
TCRBV04     4788
TCRBV18     4286
TCRBV09     3874
TCRBV11     3851
TCRBV27     3705
TCRBV10     3451
TCRBV29     3348
TCRBV30     2008
TCRBV24     1735
TCRBV15     1441
TCRBV14     1400
TCRBV25     1182
TCRBV13      634
TCRBV21      516
TCRBV23      164
TCRBV16      127
-1           117
TCRBV01       58
TCRBVA         1
Name: v_family, dtype: int64

In [21]:
# Number of in-frame rows per J family, most frequent first.
train_0['j_family'].value_counts()

TCRBJ02    79018
TCRBJ01    48308
Name: j_family, dtype: int64

#### by gene

In [22]:
# Number of in-frame rows per V gene, most frequent first.
# 'unresolved' appears in the data as a value of its own and is counted like
# any other gene name.
train_0['v_gene'].value_counts()

unresolved        26262
TCRBV28-01         8783
TCRBV05-01         8774
TCRBV19-01         7664
TCRBV07-09         6025
TCRBV02-01         5739
TCRBV06-01         5141
TCRBV06-05         4884
TCRBV18-01         4286
TCRBV09-01         3874
TCRBV27-01         3705
TCRBV29-01         3348
TCRBV07-02         3012
TCRBV04-01         2671
TCRBV11-02         2567
TCRBV10-03         2397
TCRBV07-03         2202
TCRBV05-04         2180
TCRBV06-06         2137
TCRBV04-02         2099
TCRBV30-01         2008
TCRBV05-06         1965
TCRBV07-08         1912
TCRBV15-01         1441
TCRBV14-01         1400
TCRBV25-01         1158
TCRBV07-06         1069
TCRBV05-05         1059
TCRBV12-05          881
TCRBV06-04          855
TCRBV11-03          758
TCRBV05-08          698
TCRBV13-01          634
TCRBV10-01          559
TCRBV11-01          519
TCRBV21-01          516
TCRBV20-01          512
TCRBV10-02          495
TCRBV07-07          470
TCRBV23-01          164
TCRBV16-01          127
TCRBV05-03      

In [23]:
# Number of in-frame rows per J gene, most frequent first.
# 'unresolved' appears in the data as a value of its own and is counted like
# any other gene name.
train_0['j_gene'].value_counts()

TCRBJ02-07    20960
TCRBJ02-01    20065
TCRBJ01-01    14536
TCRBJ02-03    11773
TCRBJ02-05    11674
TCRBJ01-02    10635
TCRBJ01-05    10415
TCRBJ02-02     9137
TCRBJ01-04     5470
TCRBJ01-03     4305
TCRBJ02-06     3092
TCRBJ01-06     2947
TCRBJ02-04     2225
unresolved       92
Name: j_gene, dtype: int64

#### by CDR3 amino acid

In [24]:
# Number of in-frame rows sharing each CDR3 amino-acid sequence, most frequent
# first. The same sequence can occur on several rows of this sample.
train_0['amino_acid'].value_counts()

CASSLGYEQYF           15
CASSYSNGGEAFF         13
CASSPSYEQYF           12
CASSLSYEQYF           11
CASSLGGNQPQHF         10
CASSYRETDTQYF         10
CASSLYNEQFF            9
CASSQGYEQYF            9
CASSLAGTDTQYF          9
CASSLGQSQPQHF          9
CASSLAVAVYEQYF         9
CASSSSYEQYF            8
CASSLSTDTQYF           8
CASSLSGSSYEQYF         8
CASSLGGNTEAFF          8
CASSRDSYEQYF           8
CASSLGGSYEQYF          8
CASSLQGDYGYTF          8
CASSQEVTSYEQYF         8
CASSLGQNTEAFF          7
CASSSSTDTQYF           7
CASSLDSYGYTF           7
CASSLGQNYGYTF          7
CASSPRGYNEQFF          7
CASSLQETQYF            7
CASSPDSNQPQHF          7
CASSLGPNTEAFF          7
CASSSSSGRSYNEQFF       7
CASSETGTTYEQYF         7
CASSSLDNQPQHF          6
                      ..
CASKGSTGLTDTQYF        1
CASTLLAGGGNEQFF        1
CASSYPDGSEQYF          1
CASSSGTGGALGEKLFF      1
CAITQGGTRADTQYF        1
CASSLQTGNYGYTF         1
CASSPRGISTDTQYF        1
CSARGGRGNTEAFF         1
CASSPLSVSSYEQYF        1
