In [15]:
import os
import time

import pandas as pd
import numpy as np

from datetime import datetime
from sklearn.cluster import DBSCAN
from scipy.sparse import csr_matrix
from tcrdist.repertoire import TCRrep
from tcrdist.rep_funcs import compute_pw_sparse_out_of_memory2

import general_data_preprocessing
import metrics

In [37]:
# create TCRDist3 class
class TCRDist3:
    def __init__(self, input_data: pd.DataFrame) -> None:
        self._input_data = input_data
        self._processed_data = None
        self._clustered_data = None
        self._save_path = os.path.join(os.getcwd(), 'results')
        self._settings = {
            'Datetime': datetime.now().strftime('%Y%m%d_%H%M%S'),
            'Model': 'TCRDist3',
            }
        
    def preprocess_input_data(self, species: str, antigen_species: str, chain_selection: str, min_vdj_score: int) -> None:
        data = self._input_data
        chain = chain_selection.lower()
        self._settings['Species'] = species
        self._settings['Antigen_species'] = antigen_species
        self._settings['Chain'] = chain
        self._settings['Minimum_VDJ_score'] = min_vdj_score
        gene = "TR" + chain[0].upper()
        cdr3_col_name = "cdr3." + chain
        variable_col_name = "v." + chain
        joining_col_name = "j." + chain

        # filter dataframe
        data = data[(data["species"] == species) & (data["vdjdb.score"] >= min_vdj_score) & (data["gene"] == gene) & (data["antigen.species"] == species)]

        # drop duplicates and null values
        data = data.drop_duplicates()
        data = data.dropna()

        data = data.rename(columns={'cdr3': cdr3_col_name, 'v.segm': variable_col_name, 'j.segm': joining_col_name})

        data = general_data_preprocessing.get_bio(data, chain_selection, True)
        data["count"] = [1] * len(data)

        # filter input_data to only show genes belonging to passed chain
        
        if chain == "alpha":
            data = data.rename(columns={cdr3_col_name: 'cdr3_a_aa', variable_col_name:'j_a_gene', joining_col_name:'j_a_gene', 'antigen.epitope': 'epitope'})
            self._processed_data = data[['cdr3_a_aa', 'v_a_gene', 'j_a_gene', 'bio', 'epitope', 'count']]
        else:
            data = data.rename(columns={cdr3_col_name: 'cdr3_b_aa', variable_col_name:'v_b_gene', joining_col_name:'j_b_gene', 'antigen.epitope': 'epitope'})
            self._processed_data = data[['cdr3_b_aa', 'v_b_gene', 'j_b_gene', 'bio', 'epitope', 'count']]
    
    def run_tcrdist(self, data: pd.DataFrame) -> None:
        tr_data = data.drop(columns=['epitope'], axis=1).reset_index(drop=True)
        tr = TCRrep(
            cell_df=tr_data,
            organism='human',
            chains=[self._settings['Chain']],
            compute_distances=False,
        )
        out, _ = compute_pw_sparse_out_of_memory2(tr=tr, row_size=50, pm_processes=10)
        out = out[self._settings['Chain']]
        col_name_conversion = {
            'cdr3_a_aa': 'cdr3.alpha',
            'j_a_gene':'v.alpha',
            'j_a_gene':'j.alpha',
            'cdr3_b_aa': 'cdr3.beta',
            'v_b_gene': 'v.beta',
            'j_b_gene': 'j.beta',
        }
        # Record results
        start_time = time.time()

        # Cluster tcrdist matrix
        clustering = DBSCAN(eps=0.5, min_samples=2, n_jobs=1).fit(out)
        labels = clustering.labels_
        mapper = {seq: label for seq, label in zip(tr_data['bio'].values,labels) if label!=-1}
        end_time = time.time()
        self._settings['time_to_run'] = end_time - start_time

        # map cluster back to sequence based on bioidentity
        data.loc[:, 'cluster'] = data['bio'].map(mapper)
        self._clustered_data = data.rename(columns = col_name_conversion)
    
    def record_performance(self, data: pd.DataFrame) -> None:
        df = data.copy()
        counts = [l for l, n in df['cluster'].value_counts().reset_index()[['cluster','count']].values.tolist() if n<=1]
        df.loc[:, 'cluster'] = df.loc[:, 'cluster'].replace(counts, np.nan)

        # save clusters
        if not os.path.exists(self._save_path):
            os.mkdir(self._save_path)

        df.to_csv(f'{self._save_path}/tcrdist3_{self._settings["Chain"]}_clusters.csv')

        # record performance
        self._settings['n_clusters'] = len(df['cluster'].dropna().unique())

        clusterscores = pd.DataFrame()
        statistics = pd.DataFrame()
        c_scores, stats = metrics.score(df, self._settings)
        clusterscores = pd.concat([clusterscores, c_scores])
        statistics = pd.concat([statistics, stats])

        clusterscores.to_csv(f'{self._save_path}/tcrdist3_total.csv')
        statistics.to_csv(f'{self._save_path}/tcrdist3_stats.csv')

        print('tcrdist3 results saved')
        

In [38]:
df = pd.read_csv('./data/vdjdb.txt', sep="\t")

In [39]:
tcrdist = TCRDist3(df)

In [40]:
tcrdist.preprocess_input_data("HomoSapiens", "SARS-CoV-2", "beta", 1)

Getting Bio-ID on V, CDR3, J


In [41]:
tcrdist.run_tcrdist(tcrdist._processed_data)

CREATED /6e39326ff88f/ FOR HOLDING DISTANCE OUT OF MEMORY


100%|██████████| 11/11 [00:05<00:00,  2.14it/s]


RETURNING scipy.sparse csr_matrix w/dims (519, 519)
CLEANING UP 6e39326ff88f


In [42]:
tcrdist.record_performance(tcrdist._clustered_data)

No predictions made for:  {'VVVGADGVGK', 'ALWGFFPVL', 'MTEYKLVVVGARGVGKSALTIQLI', 'TMETIDWKV', 'NLNCCSVPV', 'QPLALEGSLQKRG', 'SLLMWITQV', 'TEDEHFEFY', 'VVGAVGVGK', 'ELLVRINRL', 'ELAGIGILTV', 'KLFEFLVYGV', 'VHFFKNIVTPRTPG', 'GDFGLATEKSRWSGS', 'RHDLPPYRVYL', 'VLDLFQGQL', 'QTNPVTLQY', 'LQPLALEGSLQKRG', 'ALHGGWTTK', 'HRRGSRSYV', 'VVRHCPHHERCSDSD', 'FMPDFDLHL', 'GARGVGKSAL', 'RVRFFFPSL', 'RMFPNAPYL', 'KLSHQLVLL', 'EEAAGIGILTVI', 'VLEEVDWLI', 'ILCETCLIV', 'FIASNGVKLV', 'NSKEETGHLENGN', 'ALLPGLPAA', 'FNNFTVSFWLRVPKVSASHLE', 'ILAKFLHWL', 'GYNSYSVSNSEKHIM', 'SLLMWITQC', 'RYGSFSVTL', 'RLPGVLPRA', 'VVVGAVGVGK', 'MEVDPIGHLY', 'APARLERRHSA', 'KLTESLHKV', 'EEYLQAFTY', 'MTEYKLVVVGAVGVGKSALTIQLI', 'YLYDRLLRV', 'EMLFSHGLVK', 'RLLQCTQQAV', 'QLCDVMFYL', 'NLSALGIFST', 'RLARLALVL', 'ALTPVVVTL', 'MTEYKLVVVGAGDVGKSALTIQLI', 'EAAGIGILTV', 'GQVELGGGNAVEVCK', 'AAGIGILTV', 'VVVGAVGVGKSALTIQLIQN', 'ESITGSLGPLL', 'AVGSYVYSV', 'GELIGTLNAAKVPAD', 'RNTFRHSVVVPCE', 'YLAMPFATPMEAELARRSLA', 'LYPEFIASI', 'DFHFEVFNFVPCSI'

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [43]:
tcrdist._clustered_data

Unnamed: 0,cdr3.beta,v.beta,j.beta,bio,epitope,count,cluster
683,CASSPQGLGTEAFF,TRBV28*01,TRBJ1-1*01,TRBV28*01-CASSPQGLGTEAFF-TRBJ1-1*01,ELAGIGILTV,1,
1298,CASSLADRVNTEAFF,TRBV5-1*01,TRBJ1-1*01,TRBV5-1*01-CASSLADRVNTEAFF-TRBJ1-1*01,VHFFKNIVTPRTPG,1,
1302,CASSYVGNTGELFF,TRBV6-5*01,TRBJ2-2*01,TRBV6-5*01-CASSYVGNTGELFF-TRBJ2-2*01,SLLMWITQV,1,
1304,CASSYVGNTGELFF,TRBV6-5*01,TRBJ2-2*01,TRBV6-5*01-CASSYVGNTGELFF-TRBJ2-2*01,SLLMWITQC,1,
1310,CASSYVGNTGELFF,TRBV6-5*01,TRBJ2-2*01,TRBV6-5*01-CASSYVGNTGELFF-TRBJ2-2*01,SLLMWITQC,1,
...,...,...,...,...,...,...,...
84146,CASSLEVTYEQYF,TRBV7-9*01,TRBJ2-7*01,TRBV7-9*01-CASSLEVTYEQYF-TRBJ2-7*01,NLSALGIFST,1,
84147,CASSLAGEGGNTIYF,TRBV7-9*01,TRBJ1-3*01,TRBV7-9*01-CASSLAGEGGNTIYF-TRBJ1-3*01,NLSALGIFST,1,
84148,CASSEGQNYGYTF,TRBV27*01,TRBJ1-2*01,TRBV27*01-CASSEGQNYGYTF-TRBJ1-2*01,NLSALGIFST,1,
84153,CASGLDIHAFF,TRBV12-5*01,TRBJ1-1*01,TRBV12-5*01-CASGLDIHAFF-TRBJ1-1*01,NLSALGIFST,1,
