# Part 1: Preprocessing of the data

In this part of the tutorial, you will be guided through the parsing steps needed to obtain high-quality, consistently formatted data. <br>
<br>
This includes: 
- Loading the data
- Selecting the informative columns
- Removing entries with missing data
- Removing CDR3β sequences containing non-amino acid characters
- Removing CDR3β sequences that do not start with the conserved cysteine (C) and end with a phenylalanine (F) residue
- Merging duplicate sequences and excluding them from the data


In [20]:
#######################
### Import packages ###
#######################
import warnings
warnings.filterwarnings('ignore')

from pathlib import Path

import pandas as pd

from pyteomics import parser

In [21]:
#####################
### Load the data ###
#####################

# Define the working directory as the folder the notebook is run from, so the
# tutorial is not tied to one machine. It must contain a 'Data' folder with the
# input files; point `cwd` elsewhere if your data lives in another location.
cwd = Path.cwd()
data_dir = cwd / 'Data'

# Read in the pre- and post-treatment tumor repertoire of each patient
Patient_1_pre = pd.read_csv(data_dir / 'HN005_TUMOR_PRE.tsv', sep='\t')
Patient_1_post = pd.read_csv(data_dir / 'HN005_TUMOR_POST.tsv', sep='\t')

Patient_2_pre = pd.read_csv(data_dir / 'HN006_TUMOR_PRE.tsv', sep='\t')
Patient_2_post = pd.read_csv(data_dir / 'HN006_TUMOR_POST.tsv', sep='\t')

Patient_3_pre = pd.read_csv(data_dir / 'HN009_TUMOR_PRE.tsv', sep='\t')
Patient_3_post = pd.read_csv(data_dir / 'HN009_TUMOR_POST.tsv', sep='\t')

In [22]:
################################
### Define parsing functions ###
################################

# Extract informative columns from large dataframes
def get_columns(data):
    """Select and rename the informative columns of a raw repertoire table.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw table that contains at least the columns 'aminoAcid',
        'vFamilyName', 'jFamilyName' and 'count (templates/reads)'.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns 'junction_aa', 'TRBV_gene', 'TRBJ_gene',
        'Frequency_count' and 'Full_CDR3' (sequence, V and J joined by '_').
        Rows for which 'Full_CDR3' is missing, i.e. rows lacking the sequence
        or one of the genes, are dropped. `data` itself is not modified.
    """
    # rename() returns a new frame, so nothing below writes into a slice of
    # `data` (the original in-place edits triggered SettingWithCopyWarning).
    result = data[['aminoAcid', 'vFamilyName', 'jFamilyName', 'count (templates/reads)']].rename(
        columns={
            'aminoAcid': 'junction_aa',
            'count (templates/reads)': 'Frequency_count',
            'vFamilyName': 'TRBV_gene',
            'jFamilyName': 'TRBJ_gene',
        }
    )
    result = result.assign(
        Full_CDR3=result['junction_aa'] + '_' + result['TRBV_gene'] + '_' + result['TRBJ_gene']
    )
    return result[result['Full_CDR3'].notnull()]


# Check whether a CDR3 sequence consists only of standard amino acids
def is_amino_acid(cdr3):
    """Return True when every character of `cdr3` is a standard amino acid.

    The allowed characters are those listed in pyteomics'
    `parser.std_amino_acids`. An empty sequence yields True, because there is
    no offending character.
    """
    allowed_residues = parser.std_amino_acids
    for residue in cdr3:
        if residue not in allowed_residues:
            return False
    return True


# Parse data based on CDR3 standards
def parse_data(dataframe):
    """Filter a column-selected repertoire on CDR3 standards and merge duplicates.

    Steps, in order:
      1. drop rows without a 'junction_aa' sequence;
      2. keep sequences for which `is_amino_acid` returns True;
      3. keep sequences starting with 'C' and ending with 'F';
      4. sum 'Frequency_count' per (junction_aa, TRBV_gene, TRBJ_gene)
         combination into 'Total_freqeuncy' and keep the first row of each
         combination.

    Returns a frame with the columns 'junction_aa', 'TRBV_gene', 'TRBJ_gene',
    'Full_CDR3' and 'Total_freqeuncy'. The column name keeps its historical
    spelling because other functions look it up under that name.
    """
    clonotype_keys = ['junction_aa', 'TRBV_gene', 'TRBJ_gene']

    # 1 + 2: a sequence must be present and built from valid residues only
    with_sequence = dataframe[dataframe['junction_aa'].notnull()]
    valid_residues = with_sequence[with_sequence['junction_aa'].apply(is_amino_acid)]

    # 3: conserved start (C) and end (F) of the CDR3
    sequences = valid_residues['junction_aa']
    conserved_ends = sequences.str.startswith('C', na=False) & sequences.str.endswith('F', na=False)
    kept = valid_residues[conserved_ends].reset_index(drop=True)

    # 4: every row first receives the summed count of its clonotype, after
    # which only one row per clonotype is retained (index is not renumbered)
    kept['Total_freqeuncy'] = kept.groupby(clonotype_keys)['Frequency_count'].transform('sum')
    output_columns = ['junction_aa', 'TRBV_gene', 'TRBJ_gene', 'Full_CDR3', 'Total_freqeuncy']
    return kept[output_columns].drop_duplicates(subset=clonotype_keys)


# Apply column extraction and data parsing on dataframe
def full_parsing(data, patient):
    """Run column selection and CDR3 parsing on one dataset and report the counts.

    `data` is the raw repertoire table and `patient` the label used in the
    printed report. The 'removed' count is the difference in row count between
    the column-selected table and the parsed table, so it also includes
    duplicate clonotypes that were merged. Returns the parsed dataframe.
    """
    selected = get_columns(data)
    parsed = parse_data(selected)

    n_before = len(selected)
    n_after = len(parsed)

    print(f'Parsing: {patient}')
    print(f'The number of TCRs before parsing is: {n_before}')
    print(f'The number of TCRs after parsing is: {n_after}')
    print(f'The number of TCRs removed is : {n_before - n_after} \n')
    return parsed


# Merge the pre and post samples of a patient and report the number of unique TCRs
def get_patient_total(data_pre, data_post, Patient):
    """Combine the pre and post repertoire of one patient into a table of unique TCRs.

    Parameters
    ----------
    data_pre, data_post : pandas.DataFrame
        Parsed repertoires containing 'Full_CDR3' and 'Total_freqeuncy'.
        NOTE: both frames are modified in place; their 'Total_freqeuncy'
        column is renamed to 'Frequency_pre' / 'Frequency_post'. This side
        effect is kept on purpose because the frames are displayed and
        exported under the new column names afterwards.
    Patient : str
        Label used in the printed report.

    Returns
    -------
    pandas.DataFrame
        One row per unique 'Full_CDR3' (rows from `data_pre` first, followed by
        the sequences found only in `data_post`). 'Frequency_pre' and
        'Frequency_post' are NaN when the sequence is absent from that sample.
    """
    data_pre.rename(columns={'Total_freqeuncy': 'Frequency_pre'}, inplace=True)
    data_post.rename(columns={'Total_freqeuncy': 'Frequency_post'}, inplace=True)
    merged = (
        pd.concat([data_pre, data_post])
        .drop_duplicates(subset='Full_CDR3')
        .reset_index(drop=True)
    )

    # drop_duplicates keeps the pre row of a sequence present in both samples,
    # which would leave its post count as NaN. Restore it from data_post.
    if {'Full_CDR3', 'Frequency_post'} <= set(data_post.columns):
        post_counts = (
            data_post.drop_duplicates(subset='Full_CDR3')
            .set_index('Full_CDR3')['Frequency_post']
        )
        merged['Frequency_post'] = merged['Full_CDR3'].map(post_counts)

    print('The total number of unique sequences for', Patient, 'is:', len(merged))
    return merged

### Apply the parsing functions defined above to the TCR data

In [23]:
##################
### Parse data ###
##################

# Run the complete parsing pipeline on every dataset, paired with its label
Data = [Patient_1_pre, Patient_1_post, Patient_2_pre, Patient_2_post, Patient_3_pre, Patient_3_post]
Data_names = ['Patient_1_pre', 'Patient_1_post', 'Patient_2_pre', 'Patient_2_post', 'Patient_3_pre', 'Patient_3_post']
Parsed_frames = [full_parsing(frame, label) for frame, label in zip(Data, Data_names)]

# Give each parsed dataset its own name
P1_pre, P1_post, P2_pre, P2_post, P3_pre, P3_post = map(pd.DataFrame, Parsed_frames)

# Merge the pre and post sample of each patient and report the number of unique sequences
Patient1_total = get_patient_total(P1_pre, P1_post, 'Patient 1')
Patient2_total = get_patient_total(P2_pre, P2_post, 'Patient 2')
Patient3_total = get_patient_total(P3_pre, P3_post, 'Patient 3')

Parsing: Patient_1_pre
The number of TCRs before parsing is: 16184
The number of TCRs after parsing is: 15074
The number of TCRs removed is : 1110 

Parsing: Patient_1_post
The number of TCRs before parsing is: 2976
The number of TCRs after parsing is: 2720
The number of TCRs removed is : 256 

Parsing: Patient_2_pre
The number of TCRs before parsing is: 19795
The number of TCRs after parsing is: 17976
The number of TCRs removed is : 1819 

Parsing: Patient_2_post
The number of TCRs before parsing is: 43866
The number of TCRs after parsing is: 39808
The number of TCRs removed is : 4058 

Parsing: Patient_3_pre
The number of TCRs before parsing is: 12050
The number of TCRs after parsing is: 11584
The number of TCRs removed is : 466 

Parsing: Patient_3_post
The number of TCRs before parsing is: 48822
The number of TCRs after parsing is: 46629
The number of TCRs removed is : 2193 

The total number of unique sequences for Patient 1 is: 16228
The total number of unique sequences for Patie

In [24]:
######################################
### Example of resulting dataframe ###
######################################

# Show the parsed pre-treatment repertoire of patient 1. The frequency column is
# named 'Frequency_pre' here because get_patient_total renamed it in place.
P1_pre

Unnamed: 0,junction_aa,TRBV_gene,TRBJ_gene,Full_CDR3,Frequency_pre
0,CSARDTVHYNEQFF,TCRBV20,TCRBJ02,CSARDTVHYNEQFF_TCRBV20_TCRBJ02,1769
1,CASRSRESNTIYF,TCRBV06,TCRBJ01,CASRSRESNTIYF_TCRBV06_TCRBJ01,1975
2,CASSPRDWTYEQYF,TCRBV07,TCRBJ02,CASSPRDWTYEQYF_TCRBV07_TCRBJ02,1253
3,CASRTSGSTDTQYF,TCRBV06,TCRBJ02,CASRTSGSTDTQYF_TCRBV06_TCRBJ02,1125
4,CASTLAGPTPVEQYF,TCRBV03,TCRBJ02,CASTLAGPTPVEQYF_TCRBV03_TCRBJ02,839
...,...,...,...,...,...
15849,CASSGLRGTEAFF,TCRBV02,TCRBJ01,CASSGLRGTEAFF_TCRBV02_TCRBJ01,1
15851,CAISEVLDSTEAFF,TCRBV10,TCRBJ01,CAISEVLDSTEAFF_TCRBV10_TCRBJ01,1
15854,CASSLDAVNTEAFF,TCRBV07,TCRBJ01,CASSLDAVNTEAFF_TCRBV07_TCRBJ01,1
15855,CASTEGALGNTEAFF,TCRBV05,TCRBJ01,CASTEGALGNTEAFF_TCRBV05_TCRBJ01,1


In [25]:
##################################
### Export parsed data to file ###
##################################

# Export the parsed dataframes to files for further analysis
Dataframes = [P1_pre, P1_post, P2_pre, P2_post, P3_pre, P3_post, Patient1_total, Patient2_total, Patient3_total]
Names = ['P1_pre', 'P1_post', 'P2_pre', 'P2_post', 'P3_pre', 'P3_post', 'Patient1_total', 'Patient2_total', 'Patient3_total']

# to_csv does not create missing folders, so make sure the output folder exists
output_dir = Path(cwd) / 'Processed_data'
output_dir.mkdir(parents=True, exist_ok=True)

# Write every dataframe to '<name>_data.tsv' (tab-separated, without the index)
for frame, name in zip(Dataframes, Names):
    frame.to_csv(output_dir / f'{name}_data.tsv', sep='\t', index=False)