In [1]:
# Libraries
import os
import pandas as pd
import numpy as np

import time
from datetime import datetime

import metrics

In [38]:
# build GIANA class
class GIANA:
    def __init__(self, input_data: pd.DataFrame) -> None:
        self._input_data = input_data
        self._processed_data = None
        self._start_dir = os.getcwd()
        self._giana_path = os.path.join(self._start_dir, 'functions/GIANA/')
        self._save_path = os.path.join(self._start_dir, 'results')
        self._cluster_map = None
        self._settings = {
            'Datetime': datetime.now().strftime('%Y%m%d_%H%M%S'),
            'Model': 'GIANA',
            }

    def preprocess_data(self, species: str, antigen_species: str, chain_selection: str, min_vdj_score: int) -> None:
        data = self._input_data
        chain = chain_selection.lower()
        self._settings['Species'] = species
        self._settings['Antigen_species'] = antigen_species
        self._settings['Chain'] = chain
        self._settings['Minimum_VDJ_score'] = min_vdj_score
        gene = "TR" + chain[0].upper()
        cdr3_col_name = "cdr3." + chain
        variable_col_name = "v." + chain
        out = {}

        # filter dataframe
        data = data[(data["species"] == species) & (data["vdjdb.score"] >= min_vdj_score) & (data["gene"] == gene)]
        if antigen_species is not None:
            data = data[data["antigen.species"] == species]

        # drop duplicates and null values
        data = data.drop_duplicates()
        data = data.dropna()

        # filter input_data to only show genes belonging to passed chain
        data = data.loc[data["gene"] == gene]

        out[cdr3_col_name] = data["cdr3"].values
        out[variable_col_name] = data["v.segm"].values
        out["epitope"] = data["antigen.epitope"].values

        self._processed_data = pd.DataFrame.from_dict(out)
    
    def get_bio(self, data: pd.DataFrame) -> pd.DataFrame:
        print('Getting Bio-ID on V, CDR3')
        df = data.copy()
        cols = [f'{x}.{self._settings["Chain"]}' for x in ['v','cdr3']]
        df.loc[:,'bio']=['-'.join(x) for x in df[cols].values.tolist()]
        return df
    
    def run_giana(self, data: pd.DataFrame) -> None:
        self._start_dir = os.getcwd()
        os.chdir(self._giana_path)
        cols = [f'{x}.{self._settings["Chain"]}' for x in ['cdr3', 'v']]
        df = data.copy()
        df = df[cols]
        df.columns = ['CDR3', 'V']
        df.to_csv('input.txt', index=False, header=False, sep='\t')
        print(f'Clustering {len(df)} sequences with GIANA.')

        # start timing process
        start_time = time.time()
        # call GIANA clustering script
        %run GIANA4.1.py -f input.txt -O input_clustered.txt -v True
        end_time = time.time()
        self._settings['time_to_run'] = end_time - start_time
        print(f'Elapsed time: {self._settings["time_to_run"]} seconds.')

        with open(os.path.join(self._giana_path, 'input_clustered.txt'), 'r') as f:
            clusters = f.read().splitlines()[3:]
            clusters = pd.DataFrame([x.split('\t') for x in clusters], columns=['CDR3', 'cluster', 'V'])

        # use bioidentities to map from output to input sequences
        clusters = self.get_bio(clusters.rename(columns={'CDR3': f'cdr3.{self._settings["Chain"]}', 'V': f'v.{self._settings["Chain"]}'}))
        self._cluster_map = {bio: cluster for bio, cluster in clusters[['bio', 'cluster']].values.tolist()}

        # reset working directory
        os.chdir(self._start_dir)
    
    def record_performance(self, data: pd.DataFrame)-> None:
        df = data.copy()
        df.loc[:, 'cluster'] = df['bio'].map(self._cluster_map)
        counts = [l for l, n in df['cluster'].value_counts().reset_index()[['cluster','count']].values.tolist() if n<=1]
        df.loc[:, 'cluster'] = df.loc[:, 'cluster'].replace(counts, np.nan)

        # save clusters
        if not os.path.exists(self._save_path):
            os.mkdir(self._save_path)

        df.to_csv(f'{self._save_path}/GIANA_{self._settings["Chain"]}_clusters.csv')

        # record performance
        self._settings['n_clusters'] = len(df['cluster'].dropna().unique())

        clusterscores = pd.DataFrame()
        statistics = pd.DataFrame()
        c_scores, stats = metrics.score(df, self._settings)
        clusterscores = pd.concat([clusterscores, c_scores])
        statistics = pd.concat([statistics, stats])

        clusterscores.to_csv(f'{self._save_path}/GIANA_total.csv')
        statistics.to_csv(f'{self._save_path}/GIANA_stats.csv')

        print('GIANA results saved')
    

In [32]:
df = pd.read_csv('./data/vdjdb.txt', sep="\t")

In [33]:
giana = GIANA(df)

In [34]:
giana.preprocess_data("HomoSapiens", "SARS-CoV-2", "beta", 1)

In [35]:
bio_data = giana.get_bio(giana._processed_data)

Getting Bio-ID on V, CDR3


In [36]:
giana.run_giana(bio_data)

Clustering 634 sequences with GIANA.





GIANA: Geometric Isometry based ANtigen-specific tcr Alignment
Ultrafast short peptide alignment exclusively designed for large-scale adaptome analysis

Input columns:
1. CDR3 amino acid sequence (Starting from C, ending with the first F/L in motif [FL]G.G)
2. Variable gene name in Imgt format: TRBVXX-XX*XX
3. Joining gene name (optional)
4. Frequency (optional)
5. Other information (optional)

!!! ALL amino acid letters must be CAPITAL !!!


Processing input.txt
Total time elapsed: 0.072672
Maximum memory usage: 518.160384 MB
Elapsed time: 0.9703741073608398 seconds.
Getting Bio-ID on V, CDR3


In [37]:
giana.record_performance(bio_data)

No predictions made for:  {'GYNSYSVSNSEKHIM', 'FQDYIKSYL', 'YLSFYTAEQL', 'RMFPNAPYL', 'LLIRWQHFL', 'ESITGSLGPLL', 'LGYGFVNYI', 'GLLDEDFYA', 'GQVELGGGNAVEVCK', 'MEVDPIGHLY', 'SYMIMEIE', 'DPPALASTNAEVT', 'FRNEGIHL', 'MTEYKLVVVGARGVGKSALTIQLI', 'AVGVGKSAL', 'KASEKIFYV', 'FGDHPGHSY', 'RLLQCTQQAV', 'ENPVVHFFKNIVTP', 'AALPILFQV', 'TEDEHFEFY', 'MTEYKLVVVGAGDVGKSALTIQLI', 'GQVELGGGNAVEVCKGS', 'SLSKILDTV', 'VVMSWAPPV', 'VHFFKNIVTPRTPG', 'EVLPFFLFF', 'RLPGVLPRA', 'RVSTLRVSL', 'GAVGVGKSAL', 'MLIGIPVYV', 'RHDLPPYRVYL', 'YLSNIIPAL', 'KLTESLHKV', 'RLARLALVL', 'NHVVDISKSGLITIA', 'FGDVGSTLF', 'HYNYMCNSSCMGSMN', 'RLGEVRHPV', 'VYFFLPDHL', 'FLIYLDVSV', 'FYGKTILWF', 'SSCMGGMNWR', 'MVNTVAGAMK', 'MLFSHGLVK', 'MPYGYVLNEF', 'VLNGTVHPV', 'TLYSLTLLY', 'LLQCTQQAV', 'ALYGFVPVL', 'WMRLLPLL', 'MTLHGFMMY', 'TLREIRRYQK', 'SCMGGMNWRPILTII', 'CISSCNPNL', 'EEAAGIGIL', 'KALARALKEGRIR', 'NSKEETGHLENGN', 'ALLLQLFTL', 'VVRHCPHHERCSDSD', 'YLAMPFATPMEAELARRSLA', 'RILLVAASY', 'TYDTVHRHL', 'KIFGSLAFL', 'QAFWIDLFETIG', 'GDFGLATE

In [25]:
giana._processed_data

Unnamed: 0,cdr3.beta,v.beta,epitope
0,CASSPQGLGTEAFF,TRBV28*01,ELAGIGILTV
1,CASSLADRVNTEAFF,TRBV5-1*01,VHFFKNIVTPRTPG
2,CASSYVGNTGELFF,TRBV6-5*01,SLLMWITQV
3,CASSYVGNTGELFF,TRBV6-5*01,SLLMWITQC
4,CASSYVGNTGELFF,TRBV6-5*01,SLLMWITQC
...,...,...,...
629,CASSLEVTYEQYF,TRBV7-9*01,NLSALGIFST
630,CASSLAGEGGNTIYF,TRBV7-9*01,NLSALGIFST
631,CASSEGQNYGYTF,TRBV27*01,NLSALGIFST
632,CASGLDIHAFF,TRBV12-5*01,NLSALGIFST


In [10]:
giana._input_data

Unnamed: 0,complex.id,gene,cdr3,v.segm,j.segm,species,mhc.a,mhc.b,mhc.class,antigen.epitope,...,antigen.species,reference.id,method,meta,cdr3fix,vdjdb.score,web.method,web.method.seq,web.cdr3fix.nc,web.cdr3fix.unmp
0,1,TRA,CIVRAPGRADMRF,TRAV26-1*01,TRAJ43*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,...,HIV-1,PMID:15596521,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CIVRAPGRADMRF"", ""cdr3_old"": ""CIVRAPG...",2,sort,sanger,no,no
1,1,TRB,CASSYLPGQGDHYSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,...,HIV-1,PMID:15596521,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CASSYLPGQGDHYSNQPQHF"", ""cdr3_old"": ""...",2,sort,sanger,no,no
2,0,TRB,CASSFEAGQGFFSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,...,HIV-1,PMID:15596521,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CASSFEAGQGFFSNQPQHF"", ""cdr3_old"": ""C...",2,sort,sanger,no,no
3,2,TRA,CAVPSGAGSYQLTF,TRAV20*01,TRAJ28*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,...,HIV-1,PMID:15596521,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CAVPSGAGSYQLTF"", ""cdr3_old"": ""CAVPSG...",2,sort,sanger,no,no
4,2,TRB,CASSFEPGQGFYSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,...,HIV-1,PMID:15596521,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CASSFEPGQGFYSNQPQHF"", ""cdr3_old"": ""C...",2,sort,sanger,no,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92766,30592,TRB,CASSPGQGGDNEQFF,TRBV7-3*01,TRBJ2-1*01,HomoSapiens,HLA-DQA1*05:01,HLA-DQB1*02:01,MHCII,PQQPFPQPEQPFP,...,Wheat,PMID:33927715,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD4+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CASSPGQGGDNEQFF"", ""cdr3_old"": ""CASSP...",0,sort,singlecell,no,no
92767,30593,TRA,CAPQGATNKLIF,TRAV12-2*01,TRAJ32*01,HomoSapiens,HLA-DQA1*05:01,HLA-DQB1*02:01,MHCII,PQQPFPQPEQPFP,...,Wheat,PMID:33927715,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD4+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CAPQGATNKLIF"", ""cdr3_old"": ""CAPQGATN...",2,sort,singlecell,no,no
92768,30593,TRB,CASSLGAGGQETQYF,TRBV5-1*01,TRBJ2-5*01,HomoSapiens,HLA-DQA1*05:01,HLA-DQB1*02:01,MHCII,PQQPFPQPEQPFP,...,Wheat,PMID:33927715,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD4+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CASSLGAGGQETQYF"", ""cdr3_old"": ""CASSL...",2,sort,singlecell,no,no
92769,30594,TRA,CLVGGSGGYNKLIF,TRAV4*01,TRAJ4*01,HomoSapiens,HLA-DQA1*05:01,HLA-DQB1*02:01,MHCII,PQQPFPQPEQPFP,...,Wheat,PMID:33927715,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD4+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CLVGGSGGYNKLIF"", ""cdr3_old"": ""CLVGGS...",0,sort,singlecell,no,no


In [29]:
bio_data

Unnamed: 0,cdr3.beta,v.beta,epitope,bio
0,CASSPQGLGTEAFF,TRBV28*01,ELAGIGILTV,TRBV28*01-CASSPQGLGTEAFF
1,CASSLADRVNTEAFF,TRBV5-1*01,VHFFKNIVTPRTPG,TRBV5-1*01-CASSLADRVNTEAFF
2,CASSYVGNTGELFF,TRBV6-5*01,SLLMWITQV,TRBV6-5*01-CASSYVGNTGELFF
3,CASSYVGNTGELFF,TRBV6-5*01,SLLMWITQC,TRBV6-5*01-CASSYVGNTGELFF
4,CASSYVGNTGELFF,TRBV6-5*01,SLLMWITQC,TRBV6-5*01-CASSYVGNTGELFF
...,...,...,...,...
629,CASSLEVTYEQYF,TRBV7-9*01,NLSALGIFST,TRBV7-9*01-CASSLEVTYEQYF
630,CASSLAGEGGNTIYF,TRBV7-9*01,NLSALGIFST,TRBV7-9*01-CASSLAGEGGNTIYF
631,CASSEGQNYGYTF,TRBV27*01,NLSALGIFST,TRBV27*01-CASSEGQNYGYTF
632,CASGLDIHAFF,TRBV12-5*01,NLSALGIFST,TRBV12-5*01-CASGLDIHAFF
