In [1]:
import pandas as pd 
import numpy as np


path2 =  "/Users/tomelder/Downloads/vdjdb-2023-06-01/vdjdb_full.txt"
data = pd.read_csv(path2, sep='\t')
df = pd.DataFrame(data)

df = df[(df['vdjdb.score'] == 2) | (df['vdjdb.score'] == 3) | (df['vdjdb.score'] == 4)  | (df['vdjdb.score'] == 1) ]

relevant_columns = [
    'cdr3.alpha', 'v.alpha', 'j.alpha', 'cdr3.beta', 'v.beta', 'd.beta', 'j.beta',
    'species', 'mhc.a', 'mhc.b', 'antigen.gene','antigen.epitope', 'vdjdb.score', 'mhc.class'
]
# For the purpose of this example, we won't filter out lower-quality scores to avoid data imbalance,
# but in practice, you may want to consider this based on the dataset's size and distribution.
filtered_data = df[relevant_columns]
# Displaying the first few rows of the filtered dataset to verify the selection
filtered_data
# Checking for missing values in crucial columns
missing_values = filtered_data.isnull().sum()

# Examining the number of unique values in categorical columns for potential encoding strategies
unique_values = filtered_data.nunique()

# Removing rows with missing cdr3 sequences
df_cleaned = filtered_data.dropna(subset=['cdr3.alpha','cdr3.beta'])
df_cleaned.reset_index(drop=True, inplace=True)

missing_values_df_cleaned = df_cleaned.isnull().sum()
report = {
    "Missing Values": missing_values,
    "Missing Values df_cleaned": missing_values_df_cleaned
}

report

  data = pd.read_csv(path2, sep='\t')


{'Missing Values': cdr3.alpha         5559
 v.alpha            5585
 j.alpha            5740
 cdr3.beta           925
 v.beta              921
 d.beta             8374
 j.beta              996
 species               0
 mhc.a                 0
 mhc.b                 0
 antigen.gene         31
 antigen.epitope       0
 vdjdb.score           0
 mhc.class             0
 dtype: int64,
 'Missing Values df_cleaned': cdr3.alpha            0
 v.alpha               0
 j.alpha             122
 cdr3.beta             0
 v.beta                0
 d.beta             2656
 j.beta               74
 species               0
 mhc.a                 0
 mhc.b                 0
 antigen.gene         31
 antigen.epitope       0
 vdjdb.score           0
 mhc.class             0
 dtype: int64}

In [2]:
def filter_by_length_range(df, column_name):
    """
    Asks the user for length bounds and filters the DataFrame to include rows where the length of
    the specified column's sequence falls within the provided bounds.

    Parameters:
    df (DataFrame): The data frame to filter.
    column_name (str): The name of the sequence column to check (e.g., 'cdr3.alpha').

    Returns:
    DataFrame: A DataFrame filtered by the specified length range.
    """
    # Ask the user for length bounds
    lower_bound = int(input(f"Enter lower bound for {column_name} length: "))
    upper_bound = int(input(f"Enter upper bound for {column_name} length: "))
    
    # Calculate the sequence lengths
    df[column_name + '.length'] = df[column_name].apply(len)
    
    # Filter based on the length range
    return df[(df[column_name + '.length'] >= lower_bound) & (df[column_name + '.length'] <= upper_bound)]

def filter_by_species(df):
    """
    Asks the user for species to filter by and filters the DataFrame to include rows where the
    species column matches any of the species provided.

    Parameters:
    df (DataFrame): The data frame to filter.

    Returns:
    DataFrame: A DataFrame filtered by the specified species.
    """
    # Ask the user for species to filter by
    input_species = input("Enter the species to filter by (separated by commas): ")
    species_to_filter = [species.strip() for species in input_species.split(',')]
    
    return df[df['species'].isin(species_to_filter)]


def filter_by_minimum_score(df, column='vdjdb.score'):
    """
    Filters the DataFrame based on a minimum score inputted by the user for a specified column.
    
    Parameters:
    df (DataFrame): The data frame to filter.
    column (str): The name of the column to apply the filter on. Defaults to 'vdjdb.score'.
    
    Returns:
    DataFrame: A DataFrame filtered based on the user-specified minimum score.
    """
    # Prompting user for minimum score
    min_score = input(f"Enter the minimum score (inclusive) for {column}: ")
    
    # Validating user input
    try:
        min_score = int(min_score)
        if min_score < 0 or min_score > 3:
            print("Score out of range. Please enter a value between 0 and 3.")
            return df
    except ValueError:
        print("Invalid input. Please enter an integer value.")
        return df
    
    # Filtering the DataFrame
    filtered_df = df[df[column] >= min_score]
    
    return filtered_df

def filter_by_mhc_class(df, column='mhc.class'):
    """
    Filters the DataFrame based on a user-specified MHC class ('MHCI' or 'MHCII').

    Parameters:
    df (DataFrame): The data frame to filter.
    column (str): The name of the column to apply the filter on. Defaults to 'mhc.class'.

    Returns:
    DataFrame: A DataFrame filtered based on the user-specified MHC class.
    """
    # Prompting user for MHC class
    mhc_class = input(f"Enter the MHC class to keep ('MHCI' or 'MHCII'): ").strip()

    # Validating user input
    if mhc_class not in ['MHCI', 'MHCII']:
        print("Invalid input. Please enter 'MHCI' or 'MHCII'.")
        return df
    
    # Filtering the DataFrame
    filtered_df = df[df[column] == mhc_class]
    
    return filtered_df




# Assuming df is your DataFrame
df = df_cleaned
# Apply length range filters
df_filtered_alpha = filter_by_length_range(df, 'cdr3.alpha')
df_filtered_beta = filter_by_length_range(df, 'cdr3.beta')
df_filtered_epitope = filter_by_length_range(df, 'antigen.epitope')
df_filtered_min_score = filter_by_minimum_score(df)
# Intersect the filtered DataFrames to get only rows that meet all criteria
df_length_filtered = df_filtered_alpha.merge(df_filtered_beta).merge(df_filtered_epitope)

# Further filter by species
df_final_filtered = filter_by_species(df_length_filtered)

# df_final_filtered is now your preprocessed DataFrame



Enter lower bound for cdr3.alpha length: 12
Enter upper bound for cdr3.alpha length: 14


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_name + '.length'] = df[column_name].apply(len)


Enter lower bound for cdr3.beta length: 12
Enter upper bound for cdr3.beta length: 14


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_name + '.length'] = df[column_name].apply(len)


Enter lower bound for antigen.epitope length: 0
Enter upper bound for antigen.epitope length: 100


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_name + '.length'] = df[column_name].apply(len)


Enter the minimum score (inclusive) for vdjdb.score: 1
Enter the species to filter by (separated by commas): HomoSapiens


In [7]:
df_final_filtered

# df_final_filtered.drop(['d.beta'], axis=1, inplace=True)

df_final_filtered.drop(['j.beta'], axis=1, inplace=True)

print(df_final_filtered.isnull().sum())

cdr3.alpha                0
v.alpha                   0
cdr3.beta                 0
v.beta                    0
species                   0
mhc.a                     0
mhc.b                     0
antigen.gene              8
antigen.epitope           0
vdjdb.score               0
mhc.class                 0
cdr3.alpha.length         0
cdr3.beta.length          0
antigen.epitope.length    0
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final_filtered.drop(['j.beta'], axis=1, inplace=True)


# CDR3 seq TCRdist

In [5]:
from tcrsampler.setup_db import install_all_next_gen
install_all_next_gen(dry_run = False)
from tcrdist.rep_funcs import _pws, _pw  
from tcrdist.repertoire import TCRrep

RUNNING: curl -o /Users/tomelder/opt/anaconda3/lib/python3.9/site-packages/tcrsampler/db/wiraninha_sampler.zip https://www.dropbox.com/s/ily0td3tn1uc7bi/wiraninha_sampler.zip?dl=1 -L


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100    66    0    66    0     0    207      0 --:--:-- --:--:-- --:--:--   207
100   320  100   320    0     0    407      0 --:--:-- --:--:-- --:--:--     0
100 6882k  100 6882k    0     0  2412k      0  0:00:02  0:00:02 --:--:-- 6554k
replace /Users/tomelder/opt/anaconda3/lib/python3.9/site-packages/tcrsampler/db/wirasinha_mouse_alpha_g8a.tsv.sampler.tsv? [y]es, [n]o, [A]ll, [N]one, [r]ename:  NULL
(EOF or read error, treating as "[N]one" ...)


Archive:  /Users/tomelder/opt/anaconda3/lib/python3.9/site-packages/tcrsampler/db/wiraninha_sampler.zip
RUNNING: curl -o /Users/tomelder/opt/anaconda3/lib/python3.9/site-packages/tcrsampler/db/ravens_samplers.zip https://www.dropbox.com/s/bahxa6x86drq0n5/ravens_samplers.zip?dl=1 -L


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100    64    0    64    0     0    129      0 --:--:-- --:--:-- --:--:--   130
100   320  100   320    0     0    250      0  0:00:01  0:00:01 --:--:--     0
100  313k  100  313k    0     0   181k      0  0:00:01  0:00:01 --:--:--  181k
replace /Users/tomelder/opt/anaconda3/lib/python3.9/site-packages/tcrsampler/db/ravens_human_gamma_t.sampler.tsv? [y]es, [n]o, [A]ll, [N]one, [r]ename:  NULL
(EOF or read error, treating as "[N]one" ...)
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0

Archive:  /Users/tomelder/opt/anaconda3/lib/python3.9/site-packages/tcrsampler/db/ravens_samplers.zip
RUNNING: curl -o /Users/tomelder/opt/anaconda3/lib/python3.9/site-packages/tcrsampler/db/olga_sampler.zip https://www.dropbox.com/s/qlsxvst8bn04l0n/olga_sampler.zip?dl=1 -L


100    61    0    61    0     0    109      0 --:--:-- --:--:-- --:--:--   108
100   320  100   320    0     0    285      0  0:00:01  0:00:01 --:--:--     0
100 23.5M  100 23.5M    0     0  5612k      0  0:00:04  0:00:04 --:--:-- 9529k
replace /Users/tomelder/opt/anaconda3/lib/python3.9/site-packages/tcrsampler/db/olga_human_beta_t.sampler.tsv? [y]es, [n]o, [A]ll, [N]one, [r]ename:  NULL
(EOF or read error, treating as "[N]one" ...)
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0

Archive:  /Users/tomelder/opt/anaconda3/lib/python3.9/site-packages/tcrsampler/db/olga_sampler.zip
RUNNING: curl -o /Users/tomelder/opt/anaconda3/lib/python3.9/site-packages/tcrsampler/db/ruggiero_mouse_sampler.zip https://www.dropbox.com/s/yz8v1c1gf2eyzxk/ruggiero_mouse_sampler.zip?dl=1 -L


100    71    0    71    0     0    220      0 --:--:-- --:--:-- --:--:--   220
100   320  100   320    0     0    386      0 --:--:-- --:--:-- --:--:--     0
100  210k  100  210k    0     0   158k      0  0:00:01  0:00:01 --:--:--  158k
replace /Users/tomelder/opt/anaconda3/lib/python3.9/site-packages/tcrsampler/db/ruggiero_mouse_alpha_t.tsv.sampler.tsv? [y]es, [n]o, [A]ll, [N]one, [r]ename:  NULL
(EOF or read error, treating as "[N]one" ...)
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0

Archive:  /Users/tomelder/opt/anaconda3/lib/python3.9/site-packages/tcrsampler/db/ruggiero_mouse_sampler.zip
RUNNING: curl -o /Users/tomelder/opt/anaconda3/lib/python3.9/site-packages/tcrsampler/db/ruggiero_human_sampler.zip https://www.dropbox.com/s/jda6qtemk65zlfk/ruggiero_human_sampler.zip?dl=1 -L


100    71    0    71    0     0    226      0 --:--:-- --:--:-- --:--:--   226
100   320  100   320    0     0    109      0  0:00:02  0:00:02 --:--:--   199
100  599k  100  599k    0     0   173k      0  0:00:03  0:00:03 --:--:-- 6867k
replace /Users/tomelder/opt/anaconda3/lib/python3.9/site-packages/tcrsampler/db/ruggiero_human_alpha_t.tsv.sampler.tsv? [y]es, [n]o, [A]ll, [N]one, [r]ename:  NULL
(EOF or read error, treating as "[N]one" ...)
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0

Archive:  /Users/tomelder/opt/anaconda3/lib/python3.9/site-packages/tcrsampler/db/ruggiero_human_sampler.zip
RUNNING: curl -o /Users/tomelder/opt/anaconda3/lib/python3.9/site-packages/tcrsampler/db/emerson_human_beta_t_cmvneg.tsv.sampler.tsv.zip https://www.dropbox.com/s/04mxrzw7f5wkg1x/emerson_human_beta_t_cmvneg.tsv.sampler.tsv.zip?dl=1 -L


100    92    0    92    0     0    193      0 --:--:-- --:--:-- --:--:--   193
100   320  100   320    0     0    293      0  0:00:01  0:00:01 --:--:--     0
100 11.3M  100 11.3M    0     0  4318k      0  0:00:02  0:00:02 --:--:-- 12.7M
replace /Users/tomelder/opt/anaconda3/lib/python3.9/site-packages/tcrsampler/db/emerson_human_beta_t_cmvneg.tsv.sampler.tsv? [y]es, [n]o, [A]ll, [N]one, [r]ename:  NULL
(EOF or read error, treating as "[N]one" ...)
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0

Archive:  /Users/tomelder/opt/anaconda3/lib/python3.9/site-packages/tcrsampler/db/emerson_human_beta_t_cmvneg.tsv.sampler.tsv.zip
RUNNING: curl -o /Users/tomelder/opt/anaconda3/lib/python3.9/site-packages/tcrsampler/db/britanova_human_beta_t_cb.tsv.sampler.tsv.zip https://www.dropbox.com/s/87n5v2by80xhy1q/britanova_human_beta_t_cb.tsv.sampler.tsv.zip?dl=1 -L


100    90    0    90    0     0    325      0 --:--:-- --:--:-- --:--:--   326
100   320  100   320    0     0    366      0 --:--:-- --:--:-- --:--:--   366
100 28.3M  100 28.3M    0     0  6234k      0  0:00:04  0:00:04 --:--:-- 8181k
replace /Users/tomelder/opt/anaconda3/lib/python3.9/site-packages/tcrsampler/db/britanova_human_beta_t_cb.tsv.sampler.tsv? [y]es, [n]o, [A]ll, [N]one, [r]ename:  NULL
(EOF or read error, treating as "[N]one" ...)


Archive:  /Users/tomelder/opt/anaconda3/lib/python3.9/site-packages/tcrsampler/db/britanova_human_beta_t_cb.tsv.sampler.tsv.zip


  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,


In [6]:
df_final_filtered.rename(columns={'cdr3.alpha': 'cdr3_a_aa', 'cdr3.beta': 'cdr3_b_aa', 'v.alpha':'v_a_gene','j.alpha': 'j_a_gene','v.beta': 'v_b_gene','j.beta': 'j_b_gene','antigen.epitope':'epitope'}, inplace=True)

print(np.shape(df_final_filtered))
def cdr3plot2(df_):
    # Assuming TCRrep is correctly set up to work with the provided dataframe
    tr_vdjdb = TCRrep(cell_df=df_, 
                      organism='human',
                      chains=['beta', 'alpha'],
                      deduplicate=False,
                      compute_distances=True )
    
    tcrdist_matrix = tr_vdjdb.pw_cdr3_b_aa
    return tcrdist_matrix

tcrdist_matrix =  cdr3plot2(df_final_filtered)
print(tcrdist_matrix)
print(np.shape(tcrdist_matrix))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final_filtered.rename(columns={'cdr3.alpha': 'cdr3_a_aa', 'cdr3.beta': 'cdr3_b_aa', 'v.alpha':'v_a_gene','j.alpha': 'j_a_gene','v.beta': 'v_b_gene','j.beta': 'j_b_gene','antigen.epitope':'epitope'}, inplace=True)

  self._validate_cell_df()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.cell_df['count'] = 1


(8816, 13)
[[ 0 24 24 ... 24 31 32]
 [24  0  0 ... 20 27 23]
 [24  0  0 ... 20 27 23]
 ...
 [24 20 20 ...  0 23 20]
 [31 27 27 ... 23  0 24]
 [32 23 23 ... 20 24  0]]
(8816, 8816)


Below code accounts for large datasets ( >10000 ), but the tcrdist only provides sparse matrix for this number of datapoints. Dont know how to label with a sparse dataset. 

In [9]:
def cdr3plot2(df_):
    # Assuming TCRrep is correctly set up to work with the provided dataframe
    tr_vdjdb = TCRrep(cell_df=df_, 
                      organism='human',
                      chains=['beta', 'alpha'],
                      deduplicate=False)

    # Check if the size exceeds 10,000, to manage memory usage more efficiently
    if len(df_) > 10000:
        # Set the number of CPUs to use for parallel computation
        tr_vdjdb.cpus = 4  # Adjust this number based on your system's capabilities

        # Compute sparse rectangular distances for large datasets
        # The radius parameter can be adjusted based on your specific requirements
        tr_vdjdb.compute_sparse_rect_distances(radius=50, chunk_size=100)

        # Accessing the sparse matrix of distances stored in rw_beta
        tcrdist_matrix = tr_vdjdb.rw_beta
        
        from scipy.sparse import csr_matrix

        # Assuming `tcrdist_matrix` is your sparse matrix
        tcrdist_matrix_csr = csr_matrix(tcrdist_matrix)
    else:
        # For smaller datasets, proceed with the standard distance computation
        tr_vdjdb.compute_distances()
        tcrdist_matrix = tr_vdjdb.pw_cdr3_b_aa

    return tcrdist_matrix
tcrdist_matrix =  cdr3plot2(df_final_filtered)
print(tcrdist_matrix)
print(np.shape(tcrdist_matrix))



When TCRrep.<clone_df> size 11078 > 10,000.
	TCRrep.compute_distances() may be called explicitly by a user
	with knowledge of system memory availability.
	However, it's HIGHLY unlikely that you want to compute such
	a large numpy array. INSTEAD, if you want all pairwise distance,
	you will likely want to set an appropriate number of cpus with TCRrep.cpus = x,
	and then generate a scipy.sparse csr matrix of distances with:
	TCRrep.compute_sparse_rect_distances(radius=50, chunk_size=100), leaving df and df2 arguments blank.
	When you do this the results will be stored as TCRrep.rw_beta instead of TCRrep.pw_beta.
	This function is highly useful for comparing a smaller number of sequences against a bulk set
	In such a case, you can specify df and df2 arguments to create a non-square matrix of distances.
	See https://tcrdist3.readthedocs.io/en/latest/sparsity.html?highlight=sparse for more info.



  0%|          | 0/111 [00:00<?, ?it/s]

  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,


  0%|          | 0/111 [00:00<?, ?it/s]

  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,


  (0, 0)	-1
  (1, 1)	-1
  (1, 1332)	42
  (1, 10800)	48
  (1, 10841)	48
  (2, 2)	-1
  (2, 4)	30
  (3, 3)	-1
  (3, 244)	48
  (3, 10956)	48
  (3, 10976)	36
  (3, 10987)	48
  (4, 2)	30
  (4, 4)	-1
  (4, 1397)	48
  (4, 9652)	48
  (4, 11034)	48
  (5, 5)	-1
  (5, 6)	36
  (6, 5)	36
  (6, 6)	-1
  (6, 7)	36
  (6, 710)	48
  (7, 6)	36
  (7, 7)	-1
  :	:
  (11067, 1057)	48
  (11067, 1058)	48
  (11067, 9678)	12
  (11067, 11067)	-1
  (11068, 11068)	-1
  (11069, 11069)	-1
  (11070, 284)	36
  (11070, 285)	36
  (11070, 10911)	48
  (11070, 11070)	-1
  (11071, 11071)	-1
  (11072, 11072)	-1
  (11073, 9452)	48
  (11073, 9710)	36
  (11073, 10105)	48
  (11073, 11073)	-1
  (11074, 11074)	-1
  (11075, 281)	30
  (11075, 11031)	48
  (11075, 11075)	-1
  (11076, 10760)	42
  (11076, 10761)	42
  (11076, 10820)	21
  (11076, 11076)	-1
  (11077, 11077)	-1
(11078, 11078)


### DBscan

In [7]:
from sklearn.cluster import DBSCAN

dbscan = DBSCAN(metric='precomputed', eps=0.2, min_samples=2)  # Example: eps value here is hypothetical
clusters = dbscan.fit_predict(tcrdist_matrix)

# Adding cluster labels to your data (assuming you have a DataFrame 'df' with your TCR sequence data)
df_final_filtered['cluster'] = clusters

df_final_filtered

# Exclude outlier points and count unique clusters
n_clusters = len(set(clusters)) - (1 if -1 in clusters else 0)

print(f"Number of DBscan clusters: {n_clusters}")

print(f"Number of Epitopes: {len(df_final_filtered['epitope'].unique())}")


Number of DBscan clusters: 31
Number of Epitopes: 71


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final_filtered['cluster'] = clusters


### Random classifier for baseline

Random Clustering:

The model randomly assigns TCR sequences to a predetermined number of clusters, mirroring the number your actual clustering algorithm produces. This process simulates a scenario where clusters are formed without any underlying biological or data-driven rationale.
Calculating Purity and Consistency:

Purity: For each randomly formed cluster, the model calculates the proportion of the most common epitope in that cluster. The idea is to see, on average, how often random clustering accidentally groups sequences by epitope. High purity in random clustering indicates that the dataset's imbalance might significantly influence purity scores.
Consistency: This measures how often sequences that bind to the same epitope end up in the same cluster, based on the random assignments. It assesses if the random clusters align sequences by epitope as effectively as the actual method.
Simulation Over Multiple Iterations:

The process of random clustering and subsequent evaluation of purity and consistency is repeated multiple times (e.g., 50). This repetition helps smooth out variances due to the randomness, providing a more reliable average baseline for these metrics.

In [8]:
def random_clustering(df, n_clusters):
    """Assigns each row in the dataframe to a random cluster."""
    random_clusters = np.random.randint(0, n_clusters, len(df))
    df['random_cluster'] = random_clusters
    return df

def calculate_random_purity(df):
    purity_sum = 0
    for cluster in df['random_cluster'].unique():
        cluster_df = df[df['random_cluster'] == cluster]
        most_common_epitope = cluster_df['epitope'].value_counts().idxmax()
        purity_sum += cluster_df['epitope'].value_counts().max() / len(cluster_df)
    purity = purity_sum / len(df['random_cluster'].unique())
    return purity

def calculate_random_consistency(df, epitope_clusters):
    correct_assignments = 0
    for epitope, cluster in epitope_clusters.items():
        correct_assignments += len(df[(df['epitope'] == epitope) & (df['random_cluster'] == cluster)])
    consistency = correct_assignments / len(df)
    return consistency

def simulate_baseline(df, n_clusters, iterations=50):
    purity_scores = []
    consistency_scores = []
    def get_epitope_clusters(df):
        epitope_clusters = {}
        for epitope in df['epitope'].unique():
            epitope_df = df[df['epitope'] == epitope]
            most_common_cluster = epitope_df['cluster'].value_counts().idxmax()
            epitope_clusters[epitope] = most_common_cluster
        return epitope_clusters

    # Assuming df_final_filtered has 'cluster' column from DBSCAN and 'epitope' information
    epitope_clusters = get_epitope_clusters(df_final_filtered)

  # This should come from your actual clustering method

    for _ in range(iterations):
        df_random = random_clustering(df.copy(), n_clusters)
        purity = calculate_random_purity(df_random)
        consistency = calculate_random_consistency(df_random, epitope_clusters)
        
        purity_scores.append(purity)
        consistency_scores.append(consistency)
    
    return np.mean(purity_scores), np.mean(consistency_scores)

pur_base, consist_base = simulate_baseline(df_final_filtered, n_clusters)

### Evaluate DBscan clusters

In [9]:
def calculate_retention(df):
    # Retention is the fraction of TCR sequences assigned to any cluster
    assigned = df[df['cluster'] != -1]  # DBSCAN labels outliers as -1
    retention = len(assigned) / len(df)
    return retention

def calculate_purity(df):
    # Purity is defined for each cluster, then averaged across clusters
    purity_sum = 0
    for cluster in df['cluster'].unique():
        if cluster == -1:
            continue  # Skip noise points
        cluster_df = df[df['cluster'] == cluster]
        most_common_epitope = cluster_df['epitope'].value_counts().idxmax()
        purity_sum += cluster_df['epitope'].value_counts().max() / len(cluster_df)
    purity = purity_sum / (len(df['cluster'].unique()) - (1 if -1 in df['cluster'].unique() else 0))
    return purity

def calculate_consistency(df):
    # Consistency is calculated based on the assignment of TCR sequences to the "true" cluster for their epitope
    epitope_clusters = {}
    for epitope in df['epitope'].unique():
        epitope_df = df[df['epitope'] == epitope]
        most_common_cluster = epitope_df['cluster'].value_counts().idxmax()
        epitope_clusters[epitope] = most_common_cluster

    correct_assignments = 0
    for epitope, cluster in epitope_clusters.items():
        correct_assignments += len(df[(df['epitope'] == epitope) & (df['cluster'] == cluster)])

    consistency = correct_assignments / len(df[df['cluster'] != -1])
    return consistency



retention = calculate_retention(df_final_filtered)
purity = calculate_purity(df_final_filtered)
consistency = calculate_consistency(df_final_filtered)

print(f"Retention: {retention}")
print(f"Purity: {purity}")
print(f"Consistency: {consistency}")

print(f"Purity baseline: {pur_base}")
print(f"Consistency baseline: {consist_base}")

Retention: 0.9878629764065335
Purity: 0.9114656735829082
Consistency: 0.9799058445286485
Purity baseline: 0.969157044703137
Consistency baseline: 0.031801270417422874


## Supervised KNN

In [7]:
from tcrdist.repertoire import TCRrep
df_final_filtered.rename(columns={'cdr3.alpha': 'cdr3_a_aa', 'cdr3.beta': 'cdr3_b_aa', 'v.alpha':'v_a_gene','j.alpha': 'j_a_gene','v.beta': 'v_b_gene','j.beta': 'j_b_gene','antigen.epitope':'epitope'}, inplace=True)

# Initialize TCRrep object
tcr_rep = TCRrep(
    cell_df = df_final_filtered,
    organism = 'human', # Adjust based on your data, or dynamically set based on the 'species' column
    chains = ['alpha', 'beta'],
    compute_distances = True,
    deduplicate = False 
)

from sklearn.neighbors import KNeighborsClassifier

# Assume X (distance matrix) is derived from tcr_rep object
# X = pd.DataFrame(tcr_rep.pw_alpha + tcr_rep.pw_beta)
X = pd.DataFrame(tcr_rep.pw_beta)
Y = df_final_filtered['epitope'] # Or any other column you wish to predict

# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Train the KNN model
knn_model = KNeighborsClassifier(n_neighbors=5, metric='precomputed')
knn_model.fit(X_train, Y_train)

# Make predictions
predictions = knn_model.predict(X_test)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report

# Assuming predictions and Y_test are already defined from the model testing
# Note: Adapt the metrics calculation as needed based on your specific task (binary vs. multiclass classification)

# Calculate and print common evaluation metrics
accuracy = accuracy_score(Y_test, predictions)
precision = precision_score(Y_test, predictions, average='macro')  # Use 'binary' for binary classification
recall = recall_score(Y_test, predictions, average='macro')  # Use 'binary' for binary classification
f1 = f1_score(Y_test, predictions, average='macro')  # Use 'binary' for binary classification

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# For binary classification, you may also want to compute the AUC-ROC
# Ensure your problem is binary and y_true and y_scores are appropriately defined for this metric
# For multi-class scenarios, consider a one-vs-rest approach to compute AUC-ROC
if len(Y_test.unique()) == 2:  # Checking if it's a binary classification
    # Assuming `prob_predictions` contain probability scores of the positive class
    prob_predictions = knn_model.predict_proba(X_test)[:, 1]  # Adapt this indexing based on your classifier output
    auc_roc = roc_auc_score(Y_test, prob_predictions)
    print(f"AUC-ROC: {auc_roc:.4f}")

# For a detailed classification report (precision, recall, f1-score per class)
print("\nDetailed Classification Report:")
print(classification_report(Y_test, predictions))


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final_filtered.rename(columns={'cdr3.alpha': 'cdr3_a_aa', 'cdr3.beta': 'cdr3_b_aa', 'v.alpha':'v_a_gene','j.alpha': 'j_a_gene','v.beta': 'v_b_gene','j.beta': 'j_b_gene','antigen.epitope':'epitope'}, inplace=True)


ValueError: Precomputed matrix must be a square matrix. Input is a 7866x9833 matrix.