# Get distance matrices

We need to get different distance matrices (alpha, beta, and combined).

In [1]:
from tcrdist.repertoire import TCRrep
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import umap

In [2]:
import pandas as pd
df = pd.read_csv('./data/vdjdb.txt', sep="\t")

In [3]:
# Select the columns we need
selected_features = df[['complex.id', 'gene','cdr3','v.segm','j.segm','species','mhc.a','mhc.b','mhc.class','antigen.epitope','antigen.species','vdjdb.score']]

In [4]:
# Select all human data
human_data = selected_features[(selected_features['species'] == 'HomoSapiens') & (selected_features['vdjdb.score'] > 0)]

# Drop duplicate rows
human_data = human_data.drop_duplicates()

# Delete rows with null values
human_data  = human_data.dropna()
# Print all data
human_data.head()

Unnamed: 0,complex.id,gene,cdr3,v.segm,j.segm,species,mhc.a,mhc.b,mhc.class,antigen.epitope,antigen.species,vdjdb.score
0,1,TRA,CIVRAPGRADMRF,TRAV26-1*01,TRAJ43*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,HIV-1,2
1,1,TRB,CASSYLPGQGDHYSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,HIV-1,2
2,0,TRB,CASSFEAGQGFFSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,HIV-1,2
3,2,TRA,CAVPSGAGSYQLTF,TRAV20*01,TRAJ28*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,HIV-1,2
4,2,TRB,CASSFEPGQGFYSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,HIV-1,2


# Get alpha chains

In [5]:
# Do the same as above but for alpha chains

TRA = human_data[human_data['gene'] =='TRA']
alpha_chains = TRA[['cdr3', 'v.segm', 'j.segm','antigen.epitope']]
alpha_chains.rename(columns={'cdr3':'cdr3_a_aa','v.segm':'v_a_gene', 'j.segm':'j_a_gene','antigen.epitope':'epitope'}, inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  alpha_chains.rename(columns={'cdr3':'cdr3_a_aa','v.segm':'v_a_gene', 'j.segm':'j_a_gene','antigen.epitope':'epitope'}, inplace=True)


# Get beta chains

In [6]:
# get beta chains only
TRB = human_data[human_data['gene'] =='TRB']
# rename the columns for our beta chain matrix calculation
beta_chains = TRB[['cdr3', 'v.segm', 'j.segm','antigen.epitope']]
beta_chains.rename(columns={'cdr3':'cdr3_b_aa','v.segm':'v_b_gene', 'j.segm':'j_b_gene','antigen.epitope':'epitope'}, inplace=True)
beta_chains

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  beta_chains.rename(columns={'cdr3':'cdr3_b_aa','v.segm':'v_b_gene', 'j.segm':'j_b_gene','antigen.epitope':'epitope'}, inplace=True)


Unnamed: 0,cdr3_b_aa,v_b_gene,j_b_gene,epitope
1,CASSYLPGQGDHYSNQPQHF,TRBV13*01,TRBJ1-5*01,FLKEKGGL
2,CASSFEAGQGFFSNQPQHF,TRBV13*01,TRBJ1-5*01,FLKEKGGL
4,CASSFEPGQGFYSNQPQHF,TRBV13*01,TRBJ1-5*01,FLKEKGGL
6,CASSYEPGQVSHYSNQPQHF,TRBV13*01,TRBJ1-5*01,FLKEKGGL
8,CASSALASLNEQFF,TRBV14*01,TRBJ2-1*01,FLKEKGGL
...,...,...,...,...
92682,CASSLRATDTQYF,TRBV7-2*01,TRBJ2-3*01,PQPELPYPQPQL
92684,CATSRAGGGGEKLFF,TRBV15*01,TRBJ1-4*01,FPQPEQPFPWQP
92686,CASSQGSGGNEQFF,TRBV4-3*01,TRBJ2-1*01,FPQPEQPFPWQP
92690,CASSIVGSGGYNEQFF,TRBV19*01,TRBJ2-1*01,QLQPFPQPELPY


# Get paired alpha and beta pairs

In [7]:
# Let's get all the IDs for the TCRs (A & B pairs should have the same ID)
_ids = human_data['complex.id']
_ids

0            1
1            1
2            0
3            2
4            2
         ...  
92686    30552
92689    30554
92690    30554
92767    30593
92768    30593
Name: complex.id, Length: 7799, dtype: int64

In [8]:
list_to_combine = []
checked_ids = []
def process_row(row):
    # check the complex id not already checked
    if row['complex.id'] not in checked_ids:
        # find matching rows
        matched_rows = human_data[human_data['complex.id'] == row['complex.id']]
        # should be two (some rows have only 1 match)
        if len(matched_rows) == 2:
            # get the tcra row
            tra_row = matched_rows.iloc[0]
            # get the tcrb row
            trb_row = matched_rows.iloc[1]
            # add to list as a combined row
            list_to_combine.append({'tcr_id_a':tra_row['complex.id'], 'tcr_id_b':trb_row['complex.id'], 
                                    'cdr3_a_aa': tra_row['cdr3'], 'cdr3_b_aa': trb_row['cdr3'],
                                    'v_b_gene' :trb_row['v.segm'],
                                    'j_b_gene':trb_row['j.segm'],
                                    'v_a_gene':tra_row['v.segm'],
                                    'j_a_gene':tra_row['j.segm'],
                                    'epitope':tra_row['antigen.epitope'] ,
                                   })
            # we've checked this id now, so we need to make sure we don't have to check it again.
            checked_ids.append(row['complex.id'])
human_data.apply(process_row, axis=1)


KeyboardInterrupt



In [None]:
# Create table representing paired tcr rows
paired_chains = pd.DataFrame(list_to_combine)
paired_chains

# Testing KNN as benchmark for predictions

In [None]:
chains_to_use = ['alpha','beta'] # specify which chains we want to use
data_to_use = paired_chains # specify the data that represents these chains

def set_up_data(chains_to_use, data_to_use):

    # We only want to use the top 5 antigens with the most tcrs
    value_counts_antigen = data_to_use['epitope'].value_counts()
    filtered_value_counts = value_counts_antigen[value_counts_antigen >= 70].index
    matrix_filtered = data_to_use[data_to_use['epitope'].isin(filtered_value_counts)]
    matrix_filtered.reset_index(drop=True, inplace=True)
    data_to_use = matrix_filtered
    
    train_df, test_df = train_test_split(data_to_use, test_size=0.2, random_state=42) # split our data into training/test sets
    train_df.head()
    train_df.rename(columns={'antigen.epitope': 'epitope'}, inplace=True) # we need to rename this column so it matches what is expected.
    number_tcr_originally = train_df.shape[0] # let's figure out how many tcrs we have 
    print(f'Our dataset has {train_df.shape[0]} TCRs for {len(train_df["epitope"].unique())} different epitopes\n') # let's see how manyt epitopes
    # Specify columns we need to drop from.
    cols_to_drop = ['epitope']

    # we need to add alpha chain cdr3_a_aa column if we want to use alpha chains
    if 'alpha' in chains_to_use:
        cols_to_drop += ['cdr3_a_aa']

    # we need to add beta chain cdr3_b_aa column if we want to use beta chains
    if 'beta' in chains_to_use:
        cols_to_drop += ['cdr3_b_aa']
    
    train_df = train_df.drop_duplicates(subset=cols_to_drop, keep='first').copy() # removing duplicates
    train_df.reset_index(drop=True, inplace=True)

    print(f'Removed {number_tcr_originally - train_df.shape[0]} duplicate TCRs by columns {cols_to_drop}.')
    print(f'The new number of TCRs for each epitope:')
    print(train_df['epitope'].value_counts())
    
    return [train_df, test_df]

## Get distances for training set

In [None]:
def get_distances_training_set(train_df,chains_to_use):
    
    tr = TCRrep(cell_df=train_df,
                chains = chains_to_use,
                organism='human', 
                deduplicate=True)
    # we need to create different X data frame depending on what chains we want to use..
    if chains_to_use==['alpha']:
        X = pd.DataFrame(tr.pw_cdr3_a_aa) # for alpha chains only
    if chains_to_use==['beta']: 
        X = pd.DataFrame(tr.pw_cdr3_b_aa) # for beta chains only
    if chains_to_use==['alpha', 'beta']:
        X = pd.DataFrame(tr.pw_cdr3_a_aa + tr.pw_cdr3_b_aa) # for combined chains

    Y = pd.DataFrame(tr.clone_df['epitope']) # training labels for our tcrs

    # let's project data using UMAP
    training_reduced_embeddings = umap.UMAP(n_components = 30, n_neighbors = 5, random_state=42).fit_transform(X)
    train_reduced_df = pd.DataFrame(training_reduced_embeddings)
    train_reduced_df.columns = train_reduced_df.columns.astype(str)
    
    return [train_reduced_df,Y, tr]

## Train our model

In [None]:
def train_model(train_reduced_df,Y):
    ## Train knn model
    # we train the model on our whole train set
    knn = KNeighborsClassifier()

    # We use grid search for cross validation to get the best K.
    param_grid = {'n_neighbors': range(2, 100)} 

    # Perform grid search using cross-validation
    grid_search = GridSearchCV(knn, param_grid, cv=10)  #  Let's use 5 fold.

    # We fit to our training data
    grid_search.fit(train_reduced_df, Y.values.ravel())

    # set the model to be the best performing one.
    model = grid_search.best_estimator_
    
    return model


In [None]:
def format_test_set(test_df, train_df):
    
    print('Our test set has {} TCRs'.format(test_df.shape[0]))
    # change column name
    test_df.rename(columns={'antigen.epitope': 'epitope'}, inplace=True)
    return test_df


# Get distances matrix (call it X_test) between test and train TCRs

In [None]:
def get_test_distances_predictions(test_df,chains_to_use, tr, model):
    test_tcrs = TCRrep(cell_df=test_df,
                         organism='human', 
                         chains=chains_to_use,
                         deduplicate=True)

    test_tcrs.compute_rect_distances(df=test_tcrs.clone_df, 
                                        df2=tr.clone_df)
    if chains_to_use==['alpha']:
        X_test = pd.DataFrame(test_tcrs.rw_cdr3_a_aa)
    if chains_to_use==['beta']:
        X_test = pd.DataFrame(test_tcrs.rw_cdr3_b_aa)
    if chains_to_use ==['alpha', 'beta']:
        X_test = pd.DataFrame(test_tcrs.rw_cdr3_a_aa + test_tcrs.rw_cdr3_b_aa)

    test_reduced_embeddings = umap.UMAP(n_components = 30, n_neighbors = 5).fit_transform(X_test)
    test_reduced_df = pd.DataFrame(test_reduced_embeddings)
    test_reduced_df.columns = test_reduced_df.columns.astype(str)
    X_test = test_reduced_embeddings
    # get predictions
    prediction_labels = model.predict(X_test)

    # Let's fetch test labels
    Y_test = test_tcrs.clone_df['epitope']
    return [Y_test, prediction_labels]



# Getting performance metrics

In [None]:
def get_performance_metrics(Y_test,prediction_labels, model):
    # create metrics dfs so we can plot matrices.
    metrics_holder = {}
    metrics_holder['accuracy'] = accuracy_score(Y_test, prediction_labels)
    metrics_holder['actual_vs_labeled_matrix'] = pd.DataFrame(confusion_matrix(Y_test, prediction_labels), columns=model.classes_, index=model.classes_)
    metrics_table_df = pd.DataFrame(columns=model.classes_)
    metrics_table_df.loc['f1'] = f1_score(Y_test, prediction_labels, average=None)
    metrics_table_df.loc['precision'] = precision_score(Y_test, prediction_labels, average=None)
    metrics_table_df.loc['recall'] = recall_score(Y_test, prediction_labels, average=None)
    metrics_table_df = metrics_table_df.loc[:, (metrics_table_df != 0).any(axis=0)]
    metrics_holder['actual_vs_labeled_matrix'] = metrics_holder['actual_vs_labeled_matrix'].loc[:, (metrics_holder['actual_vs_labeled_matrix'] != 0).any(axis=0)]
    return [metrics_holder,metrics_table_df]



# Plot confusion matrices

In [None]:
def set_m_color(chains_to_use):
    # change colour depending on type of chain plotted in matrices
    if chains_to_use == ['alpha']:
            m_color = 'Blues'
    if chains_to_use == ['beta']:
            m_color = 'Reds'
    if chains_to_use == ['alpha','beta']:
            m_color = 'Purples'
    return m_color

In [None]:
def plot_confusion_matrix(metrics_holder, chains_to_use):
    # plotting confusion matrix
    m_color = set_m_color(chains_to_use)
    figure, axis = plt.subplots(nrows=1,figsize=(14,7))
    sns.heatmap(metrics_holder['actual_vs_labeled_matrix'], annot=True, cmap=m_color, fmt='', square=True, annot_kws={"size": 12})
    axis.set_xlabel('Predicted', fontsize=13)
    axis.set_ylabel('Actual', fontsize=13)
    axis.set_title(f'Predicted vs actual epitopes for {chains_to_use} chains', fontsize=17)
    for tick in axis.get_xticklabels():
            tick.set_rotation(90)



# Plot of metrics for each epitope

In [None]:
def plot_metrics_for_each_epitope(metrics_table_df,metrics_holder, chains_to_use):
    #plotting matrix to show metric performance for each epitope tested.
    m_color = set_m_color(chains_to_use)
    plt.figure(figsize=(15,4))
    axis = sns.heatmap(metrics_table_df, annot=True, cmap=m_color )
    axis.set_ylabel('Metrics', fontsize=15)
    axis.set_xlabel('Epitopes', fontsize=15)
    axis.set_title(f"Average accuracy: {metrics_holder['accuracy']:.3f} for {chains_to_use} chains", fontsize=15)

In [None]:
#chains_to_use = # specify which chains we want to use
#data_to_use = # specify the data that represents these chains

def measure_performance(chains_to_use, data_to_use):
    # let's now run all the functions so we can create our model, train it, and test its performance.
    
    train_df, test_df = set_up_data(chains_to_use, data_to_use)
    train_reduced_df, Y, tr = get_distances_training_set(train_df,chains_to_use)
    model = train_model(train_reduced_df,Y)
    test_df = format_test_set(test_df, train_df)
    Y_test, prediction_labels = get_test_distances_predictions(test_df,chains_to_use, tr, model)
    metrics_holder,metrics_table_df = get_performance_metrics(Y_test,prediction_labels, model)
    plot_confusion_matrix(metrics_holder, chains_to_use)
    plot_metrics_for_each_epitope(metrics_table_df,metrics_holder, chains_to_use)




In [None]:
# Let's see the performance for alpha chains
measure_performance(chains_to_use = ['alpha'],data_to_use = alpha_chains)

In [None]:
#Let's see the performance for beta chains
measure_performance(chains_to_use = ['beta'],data_to_use = beta_chains)

In [None]:
# Let's see the performance for paired chains
measure_performance(chains_to_use = ['alpha','beta'],data_to_use = paired_chains)