# Part 4: Clustering of the TCR repertoire with ClusTCR

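In this section of the tutorial you will learn how to use the ClusTCR tool to cluster the TCR sequences within each repertoire based on sequence similarity, and how to compare the resulting clusters between the pre- and post-treatment repertoires of the same individual.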

In [1]:
# Import packages

# ClusTCR package and standard-library helpers
import clustcr
import os
import warnings
warnings.filterwarnings('ignore')

import pandas as pd

from clustcr import read_cdr3, Clustering

In [2]:
# Define the project directory; the notebook reads from `Processed_data/` and
# writes to `Cluster_results/` underneath it.
# Set the TCR_TUTORIAL_DIR environment variable to run the notebook on another
# machine; the original author's path is kept as the fallback so existing
# setups behave exactly as before.
cwd = os.environ.get(
    'TCR_TUTORIAL_DIR',
    '/Users/romivandoren/Desktop/Romi/Adrem/PhD/MCB_chapter/Chapter_final',
)

## Clustering of a single repertoire

In [3]:
# Clustering one repertoire
import clustcr
import pandas as pd
from clustcr import read_cdr3, Clustering

# Build the ClusTCR clustering object with its default settings
clustering = Clustering()

# Load the P2 pre-treatment repertoire (read with data_format='tcrex')
data = read_cdr3(
    file=f'{cwd}/Processed_data/P2_pre_data.tsv',
    data_format='tcrex',
)
print(f'\nNumber of TCRs in input data is: {len(data)}')

# Run the clustering; `result` holds the clustering output
result = clustering.fit(data)


Number of TCRs in input data is: 17644
Clustering 17644 TCRs using two-step approach.
Total time to run ClusTCR: 2.066s


## Clustering of multiple repertoires

In [4]:
# Define clustering function and export result to files
def get_clusters(file_dir, name, Cluster_dir, n_cpus=8):
    """Cluster one repertoire with ClusTCR and export the results as CSV files.

    Reads ``{file_dir}/{name}_data.tsv`` via ``read_cdr3(..., data_format='tcrex')``,
    fits a ``Clustering`` object on it and writes four files into ``Cluster_dir``:

    * ``{name}_clusters.csv``   -- ``result.clusters_df``
    * ``{name}_summary.csv``    -- ``result.summary()``
    * ``{name}_features.csv``   -- ``result.compute_features()``
    * ``{name}_cl_content.csv`` -- ``result.cluster_contents()`` wrapped in a DataFrame

    Parameters
    ----------
    file_dir : str
        Directory containing the ``{name}_data.tsv`` input file.
    name : str
        Sample name, used as prefix for both the input and the output files.
    Cluster_dir : str
        Output directory; it is created if it does not exist yet.
    n_cpus : int, optional
        Passed to ``Clustering(n_cpus=...)``. Defaults to 8, the value that was
        previously hard-coded.

    Returns
    -------
    None
        The function only prints progress and writes files.
    """
    Data = read_cdr3(file=f'{file_dir}/{name}_data.tsv', data_format='tcrex')
    print('\nNumber of TCRs in input data: %s' % (len(Data)))

    # Fit data to clustering object
    clustering = Clustering(n_cpus=n_cpus)
    result = clustering.fit(Data)

    Clusters = result.clusters_df
    Summary = result.summary()
    print('The number of clusters for', name, 'is:', len(Summary))
    features = result.compute_features()
    Cl_content = pd.DataFrame(result.cluster_contents())

    # DataFrame.to_csv does not create missing directories, so make sure the
    # output folder exists before writing.
    os.makedirs(Cluster_dir, exist_ok=True)

    Clusters.to_csv(f'{Cluster_dir}/{name}_clusters.csv', index=False)
    # NOTE(review): the summary is written without its index, so after reading
    # it back the row position is the only cluster identifier available.
    # get_cluster_contents compares that row position with the `cluster`
    # column of the clusters file -- confirm that the two really coincide.
    Summary.to_csv(f'{Cluster_dir}/{name}_summary.csv', index=False)
    features.to_csv(f'{Cluster_dir}/{name}_features.csv', index=False)
    Cl_content.to_csv(f'{Cluster_dir}/{name}_cl_content.csv', index=False)

In [5]:
# Input repertoires are read from Processed_data; clustering output goes to Cluster_results
File_dir = f'{cwd}/Processed_data'
Cluster_dir = f'{cwd}/Cluster_results'

# One sample per patient (P1-P3) and timepoint (pre, post), in that order
Names = [
    f'{patient}_{timepoint}'
    for patient in ('P1', 'P2', 'P3')
    for timepoint in ('pre', 'post')
]

# Cluster every repertoire and export its result files
for sample_name in Names:
    get_clusters(File_dir, sample_name, Cluster_dir)


Number of TCRs in input data: 14858
Clustering 14858 TCRs using two-step approach.
Total time to run ClusTCR: 7.704s
The number of clusters for P1_pre is: 639

Number of TCRs in input data: 2695
Clustering 2695 TCRs using two-step approach.
Total time to run ClusTCR: 5.805s
The number of clusters for P1_post is: 81

Number of TCRs in input data: 17644
Clustering 17644 TCRs using two-step approach.
Total time to run ClusTCR: 7.351s
The number of clusters for P2_pre is: 733

Number of TCRs in input data: 38872
Clustering 38872 TCRs using two-step approach.
Total time to run ClusTCR: 10.504s
The number of clusters for P2_post is: 1768

Number of TCRs in input data: 11527
Clustering 11527 TCRs using two-step approach.
Total time to run ClusTCR: 6.790s
The number of clusters for P3_pre is: 455

Number of TCRs in input data: 45790
Clustering 45790 TCRs using two-step approach.
Total time to run ClusTCR: 13.690s
The number of clusters for P3_post is: 2112


In [6]:
# Define the function to extract clusters that are shared between the pre- and post-treatment repertoires in an individual
def get_shared_clusters(Clusdir, name):
    """Return the cluster motifs present in both the pre and post summary of one individual.

    Reads ``{Clusdir}/{name}_pre_summary.csv`` and ``{Clusdir}/{name}_post_summary.csv``,
    keeps the rows whose ``motif`` occurs in both files and joins them on ``motif``.

    Parameters
    ----------
    Clusdir : str
        Directory holding the ``*_summary.csv`` files.
    name : str
        Individual identifier used as file prefix (e.g. ``'P2'``).

    Returns
    -------
    pandas.DataFrame
        One row per shared motif (per matching pre/post row pair), with the
        ``size`` column of each file renamed to ``size_pre`` / ``size_post``,
        sorted by ``size_post`` in descending order. The index of the merged
        frame is kept, so it is not consecutive after sorting.
    """
    Summary_pre = pd.read_csv(f'{Clusdir}/{name}_pre_summary.csv')
    Summary_post = pd.read_csv(f'{Clusdir}/{name}_post_summary.csv')

    # Motifs that occur in both timepoints
    Shared = set(Summary_pre['motif']) & set(Summary_post['motif'])

    # rename() returns a new frame here instead of mutating a `.loc` slice in
    # place, which is what used to trigger pandas' SettingWithCopyWarning.
    Pre_shared = (
        Summary_pre
        .loc[Summary_pre['motif'].isin(Shared)]
        .rename(columns={'size': 'size_pre'})
    )
    Post_shared = (
        Summary_post
        .loc[Summary_post['motif'].isin(Shared)]
        .rename(columns={'size': 'size_post'})
    )

    Result = (
        Pre_shared
        .merge(Post_shared, on='motif')
        .sort_values(by=['size_post'], ascending=False)
    )
    return Result

In [7]:
# Location of the clustering output and the individuals to compare
Clusdir = f'{cwd}/Cluster_results'
Names = ['P1', 'P2', 'P3']

# Shared pre/post clusters, one DataFrame per individual (same order as Names)
All_shared = [get_shared_clusters(Clusdir, patient) for patient in Names]

Patient1_shared, Patient2_shared, Patient3_shared = map(pd.DataFrame, All_shared)

# Report how many clusters each patient shares between the two timepoints
for patient_number, shared_table in enumerate(
        (Patient1_shared, Patient2_shared, Patient3_shared), start=1):
    print(f'The number of shared clusters in patient {patient_number} is:', len(shared_table))

The number of shared clusters in patient 1 is: 11
The number of shared clusters in patient 2 is: 40
The number of shared clusters in patient 3 is: 26


In [8]:
# Display the shared clusters of patient 2 (get_shared_clusters sorts them by size_post, largest first)
Patient2_shared

Unnamed: 0,size_pre,motif,size_post
0,120,CASS...nTEAFF,192
3,28,CASS..YEQYF,74
2,21,CASS..TGELFF,72
1,9,CASS..GNTIYF,22
4,4,CAISGVSYNEQFF,4
31,4,CASSYRTGGTEAFF,4
30,2,CSAT[GT]TGSTDTQYF,2
24,2,CASSL[NL]GPATNEKLFF,2
25,2,CASSRTV[GT]SYEQYF,2
26,2,CASSPAG[DW]TGELFF,2


## Find significantly increased TCRs in expanded clusters

In [9]:
def get_Fisher_CDR3s(fisher_dir, fisher_name):
    """Load a Fisher test result table and add a ``CDR3`` column.

    Reads ``{fisher_dir}/{fisher_name}.csv`` and fills ``CDR3`` with the part
    of ``junction_aa`` that starts with ``C`` and precedes ``_TCRBV``. Rows
    whose ``junction_aa`` does not match the pattern get a missing value.

    Returns the loaded DataFrame with the extra column.
    """
    fisher_path = f'{fisher_dir}/{fisher_name}.csv'
    fisher_table = pd.read_csv(fisher_path)

    # Capture group = the CDR3 sequence in front of the V gene annotation
    cdr3_pattern = r'(^C.*)_TCRBV.*'
    fisher_table['CDR3'] = fisher_table['junction_aa'].str.extract(cdr3_pattern)

    return fisher_table

def _cluster_members(Clusdir, patient, timepoint, motif):
    """Return the set of sequences in the cluster labelled `motif` for one timepoint."""
    summary = pd.read_csv(f'{Clusdir}/{patient}_{timepoint}_summary.csv')
    clusters = pd.read_csv(f'{Clusdir}/{patient}_{timepoint}_clusters.csv')

    matches = summary.index[summary['motif'] == motif]
    if len(matches) == 0:
        # Still an IndexError (as before), but with a message that says what is missing.
        raise IndexError(
            f'motif {motif!r} not found in {patient}_{timepoint}_summary.csv'
        )

    # NOTE(review): the row position of the first matching summary row is used
    # as the cluster id -- this assumes the summary rows are ordered by cluster
    # id; confirm against how the summary file was written.
    members = clusters.loc[clusters['cluster'] == matches[0], 'junction_aa']
    return set(members)


def get_cluster_contents(Clusdir, patient, motif, Fisher_data):
    """Find the Fisher-table TCRs that belong to one cluster, before and after treatment.

    For both the ``pre`` and the ``post`` timepoint of `patient`, the cluster
    whose summary ``motif`` equals `motif` is looked up and its ``junction_aa``
    sequences are intersected with ``Fisher_data['CDR3']``.

    Parameters
    ----------
    Clusdir : str
        Directory holding ``{patient}_{pre|post}_summary.csv`` and
        ``{patient}_{pre|post}_clusters.csv``.
    patient : str
        Individual identifier used as file prefix (e.g. ``'P2'``).
    motif : str
        Motif string identifying the cluster in the summary files.
    Fisher_data : pandas.DataFrame
        Table with a ``CDR3`` column (see ``get_Fisher_CDR3s``).

    Returns
    -------
    tuple of (set, set)
        ``(Enriched_TCRs_pre, Enriched_TCRs_post)``.

    Raises
    ------
    IndexError
        If `motif` does not occur in one of the two summary files.
    """
    set_fisher = set(Fisher_data['CDR3'])

    Enriched_TCRs_pre = set_fisher & _cluster_members(Clusdir, patient, 'pre', motif)
    Enriched_TCRs_post = set_fisher & _cluster_members(Clusdir, patient, 'post', motif)

    print('The enriched TCRs for', patient, 'pre are:', len(Enriched_TCRs_pre))
    print('The enriched TCRs for', patient, 'post are:', len(Enriched_TCRs_post))

    return Enriched_TCRs_pre, Enriched_TCRs_post

def find_enriched_TCRs(fisher_dir, fisher_name, Clusdir, patient, motif):
    """List the Fisher-table TCRs found in one cluster at either timepoint.

    Loads the Fisher table with ``get_Fisher_CDR3s``, asks
    ``get_cluster_contents`` for the enriched TCRs of the `motif` cluster in
    the pre and post repertoire of `patient`, and combines both sets without
    duplicates: first every pre TCR, then the TCRs only seen post.

    Returns the combined TCRs as a list and prints how many there are.
    """
    fisher_table = get_Fisher_CDR3s(fisher_dir, fisher_name)
    enriched_pre, enriched_post = get_cluster_contents(Clusdir, patient, motif, fisher_table)

    # TCRs that only show up in the post-treatment cluster
    post_only = enriched_post - enriched_pre
    combined = [*enriched_pre, *post_only]
    print('The number of enriched TCRs for cluster', motif, 'in', patient, 'is:', len(combined))

    return combined

In [10]:
# Motifs of the four clusters with the largest size_post among the shared clusters of patient 2
Motifs = ['CASS...nTEAFF', 'CASS..YEQYF', 'CASS..TGELFF', 'CASS..GNTIYF']

# These settings are the same for every motif, so they are defined once
# instead of being reassigned on each loop iteration.
Fisher_dir = f'{cwd}/Processed_data'
Fisher_name = 'Fisher_p2'
Clusdir = f'{cwd}/Cluster_results'
Patient = 'P2'

# Print the enriched TCRs of each selected cluster
for motif in Motifs:
    Full_enriched_clusters = find_enriched_TCRs(Fisher_dir, Fisher_name, Clusdir, Patient, motif)
    print(Full_enriched_clusters)

The enriched TCRs for P2 pre are: 9
The enriched TCRs for P2 post are: 9
The number of enriched TCRs for cluster CASS...nTEAFF in P2 is: 13
['CASSLALNTEAFF', 'CASSFVGGTEAFF', 'CASSLHTNTEAFF', 'CASRWTGGTEAFF', 'CASSRGRDTEAFF', 'CASSLALDTEAFF', 'CASSSRGNTEAFF', 'CASSLEGATEAFF', 'CASSLWRNTEAFF', 'CASSPDRNTEAFF', 'CASSFRGDTEAFF', 'CASRPQLNTEAFF', 'CASSFRTDTEAFF']
The enriched TCRs for P2 pre are: 4
The enriched TCRs for P2 post are: 5
The number of enriched TCRs for cluster CASS..YEQYF in P2 is: 5
['CASSSSYEQYF', 'CASSLSYEQYF', 'CASSTSYEQYF', 'CASSDGYEQYF', 'CASRATYEQYF']
The enriched TCRs for P2 pre are: 2
The enriched TCRs for P2 post are: 2
The number of enriched TCRs for cluster CASS..TGELFF in P2 is: 2
['CASSLPTGELFF', 'CASSARTGELFF']
The enriched TCRs for P2 pre are: 1
The enriched TCRs for P2 post are: 1
The number of enriched TCRs for cluster CASS..GNTIYF in P2 is: 1
['CASSLSGNTIYF']


In [11]:
# Reuse get_Fisher_CDR3s (defined earlier in this notebook) so the CDR3 column is
# derived exactly as in the enrichment analysis instead of duplicating the regex.
Data = get_Fisher_CDR3s(f'{cwd}/Processed_data', 'Fisher_p2')

# Look up the Fisher test result of one enriched TCR
Data.loc[Data['CDR3'] == 'CASSLWRNTEAFF']

Unnamed: 0,junction_aa,p_value,Odds_Ratio,BH_p_values,CDR3
13,CASSLWRNTEAFF_TCRBV07_TCRBJ01,1.244064e-149,88.720819,7.122269e-147,CASSLWRNTEAFF
