# Part 1: Data Parsing

In this part of the tutorial, you will be guided through the parsing steps needed to obtain high-quality data in a consistent format. <br>
<br>
This includes: 
- Loading the data
- Column selection
- Remove entries with missing data
- Remove CDR3β sequences containing non-amino acid characters
- Remove CDR3β sequences that do not start with the conserved cysteine (C) or do not end with a phenylalanine (F) residue
- Merging and excluding duplicate sequences from the data


In [10]:
# Import packages

import os
import warnings
warnings.filterwarnings('ignore')

import pandas as pd

from pyteomics import parser

In [11]:
# Load the data

# Define your current working directory.
# Replace this placeholder with the folder that contains the Data/ subfolder;
# the export cells further down also write to Processed_data/ under this path.
cwd = '/your/working/directory'

# Read in the correct data files.
# Each file is a tab-separated table; every patient ID (HN005, HN006, HN009)
# has one TUMOR_PRE and one TUMOR_POST file.
Patient_1_pre = pd.read_csv(f'{cwd}/Data/HN005_TUMOR_PRE.tsv', sep='\t')
Patient_1_post = pd.read_csv(f'{cwd}/Data/HN005_TUMOR_POST.tsv', sep='\t')

Patient_2_pre = pd.read_csv(f'{cwd}/Data/HN006_TUMOR_PRE.tsv', sep='\t')
Patient_2_post = pd.read_csv(f'{cwd}/Data/HN006_TUMOR_POST.tsv', sep='\t')

Patient_3_pre = pd.read_csv(f'{cwd}/Data/HN009_TUMOR_PRE.tsv', sep='\t')
Patient_3_post = pd.read_csv(f'{cwd}/Data/HN009_TUMOR_POST.tsv', sep='\t')

In [12]:
# Inspect one of the raw tables to see which columns are available before parsing
Patient_3_pre

Unnamed: 0,nucleotide,aminoAcid,count (templates/reads),frequencyCount (%),cdr3Length,vMaxResolved,vFamilyName,vGeneName,vGeneAllele,vFamilyTies,...,jOrphon,vFunction,dFunction,jFunction,fractionNucleated,vAlignLength,vAlignSubstitutionCount,vAlignSubstitutionIndexes,vAlignSubstitutionGeneThreePrimeIndexes,vSeqWithMutations
0,AGTGCCCATCCTGAAGACAGCAGCTTCTACATCTGCAGTGCTAATC...,CSANR*D*RGYNEQFF,358,0.011601,48,TCRBV20,TCRBV20,,,,...,,,,,,,,,,
1,AGTGCCCATCCTGAAGACAGCAGCTTCTACATCTGCAGTGCTAGTC...,CSASRADRTPIYGYTF,283,0.009171,48,TCRBV20,TCRBV20,,,,...,,,,,,,,,,
2,CACGCCCTGCAGCCAGAAGACTCAGCCCTGTATCTCTGCGCCAGCA...,CASSQARGINQPQHF,325,0.010532,45,TCRBV04-01*01,TCRBV04,TCRBV04-01,1.0,,...,,,,,,,,,,
3,CCAGGCCTGGGGGACGCAGCCATGTACCTGTGTGCCACCAGCAGAG...,CATSRESPGQGIDEQFF,228,0.007388,51,TCRBV15-01*01,TCRBV15,TCRBV15-01,1.0,,...,,,,,,,,,,
4,AGCTCCTTCTCAGTGACTCTGGCTTCTATCTCTGTGCCTGGAGCTG...,,148,0.004796,49,TCRBV30-01*01,TCRBV30,TCRBV30-01,1.0,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14560,CCTGAGCTCTCTGAGCTGGGGGACTCAGCTTTGTATTTCTGTGCCA...,CASSVGQANTEAFF,1,0.000032,42,TCRBV09-01,TCRBV09,TCRBV09-01,,,...,,,,,,,,,,
14561,GAGTTCTAAGAAGCTCCTTCTCAGTGACTCTGGCTTCTATCTCTGT...,,1,0.000032,38,TCRBV30-01*01,TCRBV30,TCRBV30-01,1.0,,...,,,,,,,,,,
14562,CTTGTCCACTCTGACAGTGACCAGTGCCATCCTGAAGACAGCAGCT...,CSARDEAFF,1,0.000032,27,TCRBV20,TCRBV20,,,,...,,,,,,,,,,
14563,GAGCACCTTGGAGCTGGGGGACTCGGCCCTTTATCTTTGCGCCAGC...,,1,0.000032,44,TCRBV05-01*01,TCRBV05,TCRBV05-01,1.0,,...,,,,,,,,,,


In [13]:
# Define parsing functions

def get_columns(data):
    """Extract the informative columns of a raw table and keep productive TCRs.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw table. It must contain the columns 'aminoAcid', 'vFamilyName',
        'jFamilyName', 'count (templates/reads)', 'frequencyCount (%)' and
        'sequenceStatus'; a missing column raises KeyError.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left untouched) holding only the rows whose
        'sequenceStatus' equals 'In', with the columns renamed to
        'junction_aa', 'v_call', 'j_call', 'Clone_count' and 'Clone_frequency',
        'sequenceStatus' kept as is, and an extra 'Full_CDR3' column built as
        '<junction_aa>_<v_call>_<j_call>'. Rows with any missing value are
        dropped. The original row index is preserved.
    """
    informative_columns = [
        'aminoAcid', 'vFamilyName', 'jFamilyName',
        'count (templates/reads)', 'frequencyCount (%)', 'sequenceStatus',
    ]
    new_names = {
        'aminoAcid': 'junction_aa',
        'count (templates/reads)': 'Clone_count',
        'frequencyCount (%)': 'Clone_frequency',
        'vFamilyName': 'v_call',
        'jFamilyName': 'j_call',
    }

    # Include only the productive TCRs as described by Immuneaccess.
    # Row filter and column selection happen in a single .loc call, and every
    # following step returns a new frame, so nothing is modified in place on a
    # slice of the input (the pattern behind SettingWithCopyWarning).
    productive = data.loc[data['sequenceStatus'] == 'In', informative_columns]
    result = productive.rename(columns=new_names)

    # Add a column describing the full sequence (V- and J-family included)
    result = result.assign(
        Full_CDR3=result['junction_aa'] + '_' + result['v_call'] + '_' + result['j_call']
    )

    # A missing sequence, V-family or J-family also makes Full_CDR3 missing,
    # so a single dropna removes every incomplete entry
    return result.dropna()


# Check whether a CDR3 sequence consists only of standard amino acids
def is_amino_acid(cdr3):
    """Return True if every residue of `cdr3` is a standard amino acid.

    Parameters
    ----------
    cdr3 : str
        Amino acid sequence in one-letter code.

    Returns
    -------
    bool
        True when all characters are found in pyteomics'
        `parser.std_amino_acids`; False otherwise. Input that cannot be
        iterated (for example a float NaN or None standing in for a missing
        sequence) returns False instead of raising TypeError, so the function
        can be applied safely to a column that still contains missing values.
    """
    try:
        residues = iter(cdr3)
    except TypeError:
        # Missing values are not iterable and can never be a valid sequence
        return False

    return all(aa in parser.std_amino_acids for aa in residues)


def parse_data(dataframe):
    """Filter CDR3 sequences and merge duplicate TCRs into single entries.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with the columns 'junction_aa', 'v_call', 'j_call', 'Full_CDR3',
        'Clone_count' and 'Clone_frequency'.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns 'junction_aa', 'v_call', 'j_call',
        'Full_CDR3', 'Total_count' and 'Total_frequency'. It holds one row per
        unique (junction_aa, v_call, j_call) combination; 'Total_count' and
        'Total_frequency' are the sums of 'Clone_count' and 'Clone_frequency'
        over all rows sharing that combination. The first occurrence of each
        combination is kept, and the index is not renumbered after the
        duplicates are dropped.
    """
    tcr_keys = ['junction_aa', 'v_call', 'j_call']
    output_columns = ['junction_aa', 'v_call', 'j_call', 'Full_CDR3', 'Total_count', 'Total_frequency']

    # Select only the sequences with known amino acids.
    # astype(bool) keeps the mask boolean even when apply has nothing to infer
    # a dtype from (e.g. an empty input frame).
    known_residues = dataframe['junction_aa'].apply(is_amino_acid).astype(bool)
    Step1 = dataframe[known_residues]

    # Make sure each sequence starts with C and ends with F
    start_c = Step1['junction_aa'].str.startswith('C', na=False)
    end_f = Step1['junction_aa'].str.endswith('F', na=False)
    filtered = Step1[start_c & end_f].reset_index(drop=True)

    # Sum the clone counts and frequencies for duplicate TCRs; one groupby
    # serves both columns, and transform keeps the result aligned row by row
    totals = filtered.groupby(tcr_keys)[['Clone_count', 'Clone_frequency']].transform('sum')

    # Merge duplicates into one entry. Every step returns a new frame, so no
    # derived slice is modified in place.
    parse_result = (
        filtered
        .assign(Total_count=totals['Clone_count'], Total_frequency=totals['Clone_frequency'])
        [output_columns]
        .drop_duplicates(subset=tcr_keys)
    )

    return parse_result


# Apply column extraction and data parsing on dataframe
def full_parsing(data, patient):
    """Run column extraction and sequence parsing on one raw table.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw table, passed to `get_columns`.
    patient : str
        Label used in the printed report.

    Returns
    -------
    pandas.DataFrame
        The output of `parse_data` applied to the output of `get_columns`.

    Prints the number of entries returned by `get_columns` ("before parsing"),
    the number returned by `parse_data` ("after parsing") and their difference.
    """
    selected = get_columns(data)
    cleaned = parse_data(selected)

    n_before = len(selected)
    n_after = len(cleaned)

    print(f'Parsing: {patient}')
    print(f'The number of TCRs before parsing is: {n_before}')
    print(f'The number of TCRs after parsing is: {n_after}')
    print(f'The number of TCRs removed is : {n_before - n_after} \n')

    return cleaned


# Get total number of TCRs per patient
def get_patient_total(data_pre, data_post, Patient):
    """Collect the unique 'Full_CDR3' values found in two parsed samples.

    Parameters
    ----------
    data_pre, data_post : pandas.DataFrame
        Parsed frames that each contain a 'Full_CDR3' column.
    Patient : str
        Label used in the printed summary.

    Returns
    -------
    pandas.DataFrame
        Single-column frame ('Full_CDR3') with the values of both inputs
        stacked (data_pre first), duplicates removed and the index renumbered
        from 0. The number of rows is also printed.
    """
    # Keep only the Full_CDR3 column of each sample, as one-column frames
    cdr3_frames = [sample['Full_CDR3'].to_frame() for sample in (data_pre, data_post)]

    merged = pd.concat(cdr3_frames)
    merged = merged.drop_duplicates()
    merged = merged.reset_index(drop=True)

    print('The total number of unique sequences for', Patient, 'is:', len(merged))

    return merged

### Apply the parsing functions defined above to the TCR data

In [14]:
# Parse data

# Run the full parsing pipeline on every sample; the labels are only used in the printed report
Data = [Patient_1_pre, Patient_1_post, Patient_2_pre, Patient_2_post, Patient_3_pre, Patient_3_post]
Data_names = ['Patient_1_pre', 'Patient_1_post', 'Patient_2_pre', 'Patient_2_post', 'Patient_3_pre', 'Patient_3_post']
Parsed_frames = [full_parsing(raw_table, label) for raw_table, label in zip(Data, Data_names)]

# Unpack the parsed frames in the same order as the inputs above
P1_pre, P1_post, P2_pre, P2_post, P3_pre, P3_post = (pd.DataFrame(parsed) for parsed in Parsed_frames)

# Combine the pre and post samples of each patient into one list of unique sequences
Patient1_total = get_patient_total(P1_pre, P1_post, 'Patient 1')
Patient2_total = get_patient_total(P2_pre, P2_post, 'Patient 2')
Patient3_total = get_patient_total(P3_pre, P3_post, 'Patient 3')

Parsing: Patient_1_pre
The number of TCRs before parsing is: 15881
The number of TCRs after parsing is: 15074
The number of TCRs removed is : 807 

Parsing: Patient_1_post
The number of TCRs before parsing is: 2929
The number of TCRs after parsing is: 2720
The number of TCRs removed is : 209 

Parsing: Patient_2_pre
The number of TCRs before parsing is: 19456
The number of TCRs after parsing is: 17976
The number of TCRs removed is : 1480 

Parsing: Patient_2_post
The number of TCRs before parsing is: 43222
The number of TCRs after parsing is: 39808
The number of TCRs removed is : 3414 

Parsing: Patient_3_pre
The number of TCRs before parsing is: 11836
The number of TCRs after parsing is: 11584
The number of TCRs removed is : 252 

Parsing: Patient_3_post
The number of TCRs before parsing is: 48045
The number of TCRs after parsing is: 46629
The number of TCRs removed is : 1416 

The total number of unique sequences for Patient 1 is: 16228
The total number of unique sequences for Patien

In [15]:
# Example of resulting dataframe
# Note: the row index has gaps because duplicate TCRs are dropped without renumbering the index

P1_pre

Unnamed: 0,junction_aa,v_call,j_call,Full_CDR3,Total_count,Total_frequency
0,CSARDTVHYNEQFF,TCRBV20,TCRBJ02,CSARDTVHYNEQFF_TCRBV20_TCRBJ02,1769,0.020675
1,CASRSRESNTIYF,TCRBV06,TCRBJ01,CASRSRESNTIYF_TCRBV06_TCRBJ01,1975,0.023082
2,CASSPRDWTYEQYF,TCRBV07,TCRBJ02,CASSPRDWTYEQYF_TCRBV07_TCRBJ02,1253,0.014644
3,CASRTSGSTDTQYF,TCRBV06,TCRBJ02,CASRTSGSTDTQYF_TCRBV06_TCRBJ02,1125,0.013148
4,CASTLAGPTPVEQYF,TCRBV03,TCRBJ02,CASTLAGPTPVEQYF_TCRBV03_TCRBJ02,839,0.009806
...,...,...,...,...,...,...
15849,CASSGLRGTEAFF,TCRBV02,TCRBJ01,CASSGLRGTEAFF_TCRBV02_TCRBJ01,1,0.000012
15851,CAISEVLDSTEAFF,TCRBV10,TCRBJ01,CAISEVLDSTEAFF_TCRBV10_TCRBJ01,1,0.000012
15854,CASSLDAVNTEAFF,TCRBV07,TCRBJ01,CASSLDAVNTEAFF_TCRBV07_TCRBJ01,1,0.000012
15855,CASTEGALGNTEAFF,TCRBV05,TCRBJ01,CASTEGALGNTEAFF_TCRBV05_TCRBJ01,1,0.000012


In [16]:
# Export parsed data to file

# Make sure the output folder exists: to_csv does not create missing directories
output_dir = f'{cwd}/Processed_data'
os.makedirs(output_dir, exist_ok=True)

Dataframes = [P1_pre, P1_post, P2_pre, P2_post, P3_pre, P3_post, Patient1_total, Patient2_total, Patient3_total]
Names=['P1_pre', 'P1_post', 'P2_pre', 'P2_post', 'P3_pre', 'P3_post', 'Patient1_total', 'Patient2_total', 'Patient3_total']

# Write every frame as a tab-separated file without the row index
for frame, name in zip(Dataframes, Names):
    frame.to_csv(f'{output_dir}/{name}_data.tsv', sep='\t', index=False)

In [18]:
# Export part of parsed data to files for clustering

Dataframes = [P1_pre, P1_post, P2_pre, P2_post, P3_pre, P3_post]
Names=['P1_pre', 'P1_post', 'P2_pre', 'P2_post', 'P3_pre', 'P3_post']

# Only the sequence and its V/J calls are written to the clustering files
cluster_columns = ['junction_aa', 'v_call', 'j_call']

for frame, name in zip(Dataframes, Names):
    frame[cluster_columns].to_csv(f'{cwd}/Processed_data/{name}_cluster_data.tsv', sep='\t', index=False)