In [1]:
# Libraries
import os
import pandas as pd
import numpy as np

import time
from datetime import datetime

import metrics

In [24]:
# build GIANA class
class GIANA:
    def __init__(self, input_data: pd.DataFrame) -> None:
        self._input_data = input_data
        self._processed_data = None
        self._start_dir = os.getcwd()
        self._giana_path = os.path.join(self._start_dir, 'functions/GIANA/')
        self._results_path = os.path.join(self._start_dir, 'results')
        self._cluster_map = None
        self._settings = {
            'Datetime': datetime.now().strftime('%Y%m%d_%H%M%S'),
            'Model': 'GIANA',
            }
        self._save_path = os.path.join(self._results_path, self._settings["Datetime"])

    def preprocess_data(self, species: str, antigen_species: str, chain_selection: str, min_vdj_score: int) -> None:
        data = self._input_data
        chain = chain_selection.lower()
        self._settings['Species'] = species
        self._settings['Antigen_species'] = antigen_species
        self._settings['Chain'] = chain
        self._settings['Minimum_VDJ_score'] = min_vdj_score
        gene = "TR" + chain[0].upper()
        cdr3_col_name = "cdr3." + chain
        variable_col_name = "v." + chain
        out = {}

        # filter dataframe
        data = data[(data["species"] == species) & (data["vdjdb.score"] >= min_vdj_score) & (data["gene"] == gene)]
        if antigen_species is not None:
            data = data[data["antigen.species"] == species]

        # drop duplicates and null values
        data = data.drop_duplicates()
        data = data.dropna()

        # filter input_data to only show genes belonging to passed chain
        data = data.loc[data["gene"] == gene]

        out[cdr3_col_name] = data["cdr3"].values
        out[variable_col_name] = data["v.segm"].values
        out["epitope"] = data["antigen.epitope"].values

        self._processed_data = pd.DataFrame.from_dict(out)
    
    def pad_sequences(self, data: pd.DataFrame) -> None:
        df = data.copy()
        pad = 50
        col = f'cdr3.{self._settings["Chain"]}'
        sequence = df[col].values
        pad_sequence = []
        for seq in sequence:
            pad_len = 0 if len(seq) > pad else pad - len(seq)
            pad_sequence.append(seq[:pad] + 'X' * pad_len)
        self._processed_data[col] = pad_sequence


    def get_bio(self, data: pd.DataFrame) -> pd.DataFrame:
        print('Getting Bio-ID on V, CDR3')
        df = data.copy()
        cols = [f'{x}.{self._settings["Chain"]}' for x in ['v','cdr3']]
        df.loc[:,'bio']=['-'.join(x) for x in df[cols].values.tolist()]
        return df
    
    def run_giana(self, data: pd.DataFrame) -> None:
        self._start_dir = os.getcwd()
        os.chdir(self._giana_path)
        cols = [f'{x}.{self._settings["Chain"]}' for x in ['cdr3', 'v']]
        df = data.copy()
        df = df[cols]
        df.columns = ['CDR3', 'V']
        df.to_csv('input.txt', index=False, header=False, sep='\t')
        print(f'Clustering {len(df)} sequences with GIANA.')

        # start timing process
        start_time = time.time()
        # call GIANA clustering script
        %run GIANA4.1.py -f input.txt -O input_clustered.txt -v True
        end_time = time.time()
        self._settings['time_to_run'] = end_time - start_time
        print(f'Elapsed time: {self._settings["time_to_run"]} seconds.')

        with open(os.path.join(self._giana_path, 'input_clustered.txt'), 'r') as f:
            clusters = f.read().splitlines()[3:]
            clusters = pd.DataFrame([x.split('\t') for x in clusters], columns=['CDR3', 'cluster', 'V'])

        # use bioidentities to map from output to input sequences
        clusters = self.get_bio(clusters.rename(columns={'CDR3': f'cdr3.{self._settings["Chain"]}', 'V': f'v.{self._settings["Chain"]}'}))
        self._cluster_map = {bio: cluster for bio, cluster in clusters[['bio', 'cluster']].values.tolist()}

        # reset working directory
        os.chdir(self._start_dir)
    
    def record_performance(self, data: pd.DataFrame) -> None:
        df = data.copy()
        df.loc[:, 'cluster'] = df['bio'].map(self._cluster_map)
        counts = [l for l, n in df['cluster'].value_counts().reset_index()[['cluster','count']].values.tolist() if n<=1]
        df.loc[:, 'cluster'] = df.loc[:, 'cluster'].replace(counts, np.nan)

        # save clusters
        if not os.path.exists(self._save_path):
            os.mkdir(self._save_path)

        df.to_csv(f'{self._save_path}/GIANA_{self._settings["Chain"]}_clusters.csv')

        # record performance
        self._settings['n_clusters'] = len(df['cluster'].dropna().unique())

        clusterscores = pd.DataFrame()
        statistics = pd.DataFrame()
        c_scores, stats = metrics.score(df, self._settings)
        clusterscores = pd.concat([clusterscores, c_scores])
        statistics = pd.concat([statistics, stats])

        clusterscores.to_csv(f'{self._save_path}/GIANA_{self._settings["Chain"]}_total.csv')
        statistics.to_csv(f'{self._save_path}/GIANA_{self._settings["Chain"]}_stats.csv')

        print('GIANA results saved')
    

In [38]:
df = pd.read_csv('./data/vdjdb.txt', sep="\t")

In [39]:
giana = GIANA(df)

In [40]:
giana.preprocess_data("HomoSapiens", "SARS-CoV-2", "beta", 1)

In [28]:
giana.pad_sequences(giana._processed_data)

In [41]:
bio_data = giana.get_bio(giana._processed_data)

Getting Bio-ID on V, CDR3


In [42]:
giana.run_giana(bio_data)

Clustering 634 sequences with GIANA.





GIANA: Geometric Isometry based ANtigen-specific tcr Alignment
Ultrafast short peptide alignment exclusively designed for large-scale adaptome analysis

Input columns:
1. CDR3 amino acid sequence (Starting from C, ending with the first F/L in motif [FL]G.G)
2. Variable gene name in Imgt format: TRBVXX-XX*XX
3. Joining gene name (optional)
4. Frequency (optional)
5. Other information (optional)

!!! ALL amino acid letters must be CAPITAL !!!


Processing input.txt
Total time elapsed: 0.073766
Maximum memory usage: 506.953728 MB
Elapsed time: 0.9905788898468018 seconds.
Getting Bio-ID on V, CDR3


In [43]:
giana.record_performance(bio_data)

No predictions made for:  {'EMLFSHGLVK', 'AVGVGKSAL', 'SYMIMEIE', 'RLPGVLPRA', 'KLTESLHKV', 'EEAAGIGIL', 'VHFFKNIVTPRTPG', 'RLGEVRHPV', 'YVMAYVMAGVGS', 'RLLQCTQQAV', 'NPVVHFFKNIVTPR', 'MVNTVAGAMK', 'LLIRWQHFL', 'KASEKIFYV', 'YLSFYTAEQL', 'ENPVVHFFKNIVTP', 'HRRGSRSYV', 'GVYDGREHTV', 'QPLALEGSLQKRG', 'MTLHGFMMY', 'ESITGSLGPLL', 'TEDEHFEFY', 'RHDLPPYRVYL', 'VMAPRTLVL', 'EHEGSGPEL', 'RMFPNAPYL', 'QTNPVTLQY', 'RVRFFFPSL', 'KALARALKEGRIR', 'VLLGVKLFGV', 'RNTFRHSVVVPCE', 'ALYGFVPVL', 'QAFWIDLFETIG', 'GLLDEDFYA', 'MTEYKLVVVGAGDVGKSALTIQLI', 'ATGFKQSSKALQRPVAS', 'YLSHLPLTCKF', 'MLIGIPVYV', 'QLCDVMFYL', 'NHVVDISKSGLITIA', 'LQPLALEGSLQKRG', 'YLAMPFATPMEAELARRSLA', 'HQNPVTGLLL', 'LLQCTQQAV', 'ALHGGWTTK', 'LYPEFIASI', 'EVLPFFLFF', 'RLARLALVL', 'FSWGAEGQRPGF', 'SCMGGMNWRPILTII', 'ALLPGLPAA', 'FQDYIKSYL', 'LGYGFVNYI', 'WLIRETQPITK', 'EFTVSGNIL', 'AALPILFQV', 'DPPALASTNAEVT', 'MPYGYVLNEF', 'ILCETCLIV', 'MTEYKLVVVGAVGVGKSALTIQLI', 'HYNYMCNSSCMGSMN', 'GQVELGGGNAVEVCKGS', 'TYDTVHRHL', 'AVGSYVYSV', 'TLREI

In [12]:
giana._processed_data

Unnamed: 0,cdr3.alpha,v.alpha,epitope
0,CAVAGYGGSQGNLIFXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX...,TRAV12-2*01,ELAGIGILTV
1,CALSGGDSSYKLIFXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX...,TRAV9-2*01,VHFFKNIVTPRTPG
2,CAVRPTSGGSYIPTFXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX...,TRAV21*01,SLLMWITQV
3,CAVRPTSGGSYIPTFXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX...,TRAV21*01,SLLMWITQC
4,CAVRPTSGGSYIPTFXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX...,TRAV21*01,SLLMWITQC
...,...,...,...
399,CAVNPPDTGFQKLVFXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX...,TRAV12-2*01,VVVGAVGVGK
400,CLVGDFNSNSGYALNFXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX...,TRAV4*01,GARGVGKSAL
401,CAVNDQGGGADGLTFXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX...,TRAV12-2*01,RMFPNAPYL
402,CAVSEGGDYKLSFXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX...,TRAV8-4*01,RMFPNAPYL
