In [3]:
import pandas as pd

# Load the dataset (adjust the file path as needed)
# If you have the data in a file, use this line:
df = pd.read_csv('metadata_new.csv')

# Step 1: Count unique explicit subjects (where Subject != "no")
explicit_subjects = df[df['Subject'] != 'no']['Subject'].nunique()
print(f"Number of unique explicit subjects (Subject ≠ 'no'): {explicit_subjects}")

# Step 2: Handle rows where Subject = "no" by creating a composite key
# Select relevant columns for the composite key
composite_cols = ['Author', 'Disease', 'Age', 'TSource']

# Filter rows where Subject = "no"
no_subject_df = df[df['Subject'] == 'no']

# Create a composite key by concatenating the selected columns
# Convert all columns to strings to avoid type issues
no_subject_df['Composite_Key'] = no_subject_df[composite_cols].astype(str).agg('_'.join, axis=1)

# Count unique composite keys
unique_composite_keys = no_subject_df['Composite_Key'].nunique()
print(f"Number of unique inferred subjects (Subject = 'no'): {unique_composite_keys}")

# Step 3: Total unique individuals
total_unique_individuals = explicit_subjects + unique_composite_keys
print(f"Total estimated unique individuals: {total_unique_individuals}")

# Optional: Display some details for verification
print("\nSample of explicit subjects:")
print(df[df['Subject'] != 'no'][['Subject', 'Author', 'Disease', 'Age', 'TSource']].drop_duplicates().head())
print("\nSample of composite keys for 'no' entries:")
print(no_subject_df[['Composite_Key', 'Author', 'Disease', 'Age', 'TSource']].drop_duplicates().head())

Number of unique explicit subjects (Subject ≠ 'no'): 128
Number of unique inferred subjects (Subject = 'no'): 185
Total estimated unique individuals: 313

Sample of explicit subjects:
                               Subject               Author  Disease  \
2                          4.1_4.2_4.3        Garner et al.  healthy   
16  agc-028;_agc-029;_agc-047;_agc-051           Kim et al.   cancer   
20           6_healthy_donors_(pooled)  Cordes et al., 2022  healthy   
27                            msk_1263     Pai et al., 2023   cancer   
28                               su001    Yost et al., 2019   cancer   

               Age                 TSource  
2               no                    pbmc  
16  78;_61;_40;_68                    pbmc  
20              no  human_postnatal_thymus  
27              no                   tumor  
28              no                   tumor  

Sample of composite keys for 'no' entries:
                                       Composite_Key                A

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  no_subject_df['Composite_Key'] = no_subject_df[composite_cols].astype(str).agg('_'.join, axis=1)


# Meta Disease, Age, T

In [6]:
import pandas as pd

# Load the dataset
df = pd.read_csv('metadata_new.csv')

# 1. Count occurrences of each disease
disease_counts = df['Disease'].value_counts()
print("Disease Counts:")
print(disease_counts)
print(f"Total unique diseases: {disease_counts.shape[0]}\n")

# 2. Age group analysis
df['Age'] = df['Age'].replace('no', pd.NA).str.replace(r'[^0-9;]', '', regex=True)
df['Age'] = df['Age'].str.split(';').str[0]
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
bins = [0, 20, 40, 60, 80, 100, float('inf')]
labels = ['0-20', '21-40', '41-60', '61-80', '81-100', '100+']
age_groups = pd.cut(df['Age'], bins=bins, labels=labels, right=False)
age_group_counts = age_groups.value_counts().sort_index()
no_age_count = df['Age'].isna().sum()

print("Age Group Counts:")
print(age_group_counts)
print(f"Entries with no age or invalid age: {no_age_count}\n")

# 3. Count cd3+, cd4+, cd8+ in TType (not TSubtype)
# Ensure TType is string type and handle NaN
df['TType'] = df['TType'].fillna('').astype(str).str.lower()

# Count occurrences
cd3_count = df['TType'].str.contains('cd3+', na=False).sum()
cd4_count = df['TType'].str.contains('cd4+', na=False).sum()
cd8_count = df['TType'].str.contains('cd8+', na=False).sum()

print("TType Counts:")
print(f"cd3+: {cd3_count}")
print(f"cd4+: {cd4_count}")
print(f"cd8+: {cd8_count}")

# Debug: Show sample rows
print("\nSample rows with cd3+:")
print(df[df['TType'].str.contains('cd3+', na=False)][['Run', 'TType', 'TSubtype']].head())
print("\nSample rows with cd4+:")
print(df[df['TType'].str.contains('cd4+', na=False)][['Run', 'TType', 'TSubtype']].head())
print("\nSample rows with cd8+:")
print(df[df['TType'].str.contains('cd8+', na=False)][['Run', 'TType', 'TSubtype']].head())

Disease Counts:
Disease
covid                          894
cancer                         526
healthy                        210
allergic_rhinitis               26
herpes_simplex_virus_type_2     20
hiv                             17
lewy_body_dementia              16
old_age_and_frailty             14
mci/ad                          14
mda5+dermatomyositis            12
myocarditis                     11
cmv                              8
influenza                        8
bacterial_pneumonia              8
hiv_suppressed                   6
hiv_viremic                      4
autoimmunity                     4
alcoholic_liver_disease          2
covid_xla                        2
obesity                          2
Name: count, dtype: int64
Total unique diseases: 20

Age Group Counts:
Age
0-20        3
21-40      46
41-60      83
61-80     147
81-100     13
100+        0
Name: count, dtype: int64
Entries with no age or invalid age: 1512

TType Counts:
cd3+: 495
cd4+: 183
cd8+: 279

Samp

In [7]:
import pandas as pd

df = pd.read_csv('metadata_new.csv')
print(f"Total number of rows: {df.shape[0]}")
print(f"Total number of columns: {df.shape[1]}")

Total number of rows: 1804
Total number of columns: 15


In [8]:
import pandas as pd

# Load the dataset
df = pd.read_csv('metadata_new.csv')

# Get total rows and columns
total_rows, total_columns = df.shape
print(f"Total number of rows: {total_rows}")
print(f"Total number of columns: {total_columns}\n")

# Find completely identical rows
duplicate_rows = df[df.duplicated(keep=False)]  # keep=False shows all instances of duplicates
duplicate_count = df.duplicated().sum()  # Number of duplicate rows (excluding first occurrence)

print(f"Number of duplicate rows (excluding first occurrence): {duplicate_count}")
if not duplicate_rows.empty:
    print("Completely identical rows:")
    print(duplicate_rows)
else:
    print("No completely identical rows found.")

Total number of rows: 1804
Total number of columns: 15

Number of duplicate rows (excluding first occurrence): 0
No completely identical rows found.


In [9]:
import pandas as pd

# Load the dataset
df = pd.read_csv('metadata_new.csv')

# Filter rows where Disease is "cancer"
cancer_df = df[df['Disease'] == 'cancer']

# Get unique cancer types from the CancerType column
unique_cancer_types = cancer_df['CancerType'].unique()
cancer_type_counts = cancer_df['CancerType'].value_counts()

# Print results
print(f"Total rows with Disease 'cancer': {len(cancer_df)}")
print(f"Total unique cancer types: {len(unique_cancer_types)}\n")

print("All Cancer Types (unique values in CancerType):")
for cancer_type in sorted(unique_cancer_types):
    print(cancer_type)

print("\nCancer Type Counts:")
print(cancer_type_counts)

Total rows with Disease 'cancer': 526
Total unique cancer types: 20

All Cancer Types (unique values in CancerType):
T-cell_large_granular_lymphocyte_leukemia
acute_myeloid_leukemia
advanced_biliary_tract
angiomyolipoma_associated_with_tuberous_sclerosis_complex
breast
breast_invasive_ductal_carcinoma
breast_or_ovarian
cervical
clear_cell_renal_cell_carcinoma
colon
colorectal
cutaneous_t_cell_lymphoma
gastric
hpv+_hnscc
multiple_myeloma
nasopharyngeal_carcinoma
non_small_cell_lung
pancreatic
prostate
squamous_or_basal_cell_carcinoma

Cancer Type Counts:
CancerType
non_small_cell_lung                                          94
T-cell_large_granular_lymphocyte_leukemia                    64
hpv+_hnscc                                                   51
multiple_myeloma                                             43
clear_cell_renal_cell_carcinoma                              35
squamous_or_basal_cell_carcinoma                             34
cervical                                     

In [15]:
import pandas as pd

# Load the dataset
df = pd.read_csv('metadata_new.csv')

# Ensure TType is string type and handle NaN
df['TType'] = df['TType'].fillna('').astype(str).str.lower()

# Filter rows where TType contains 'cd3+'
cd3_df = df[df['TType'].str.contains('cd3+', na=False)]

# Get total rows and unique diseases associated with cd3+
total_cd3_rows = len(cd3_df)
unique_cd3_diseases = cd3_df['Disease'].unique()
cd3_disease_counts = cd3_df['Disease'].value_counts()

# Print results
print(f"Total rows with TType 'cd3+': {total_cd3_rows}")
print(f"Total unique diseases associated with cd3+: {len(unique_cd3_diseases)}\n")

print("Diseases associated with cd3+:")
for disease in sorted(unique_cd3_diseases):
    print(disease)

print("\nDisease Counts for cd3+:")
print(cd3_disease_counts)

Total rows with TType 'cd3+': 495
Total unique diseases associated with cd3+: 8

Diseases associated with cd3+:
alcoholic_liver_disease
bacterial_pneumonia
cancer
covid
covid_xla
healthy
mda5+dermatomyositis
myocarditis

Disease Counts for cd3+:
Disease
cancer                     289
covid                      121
healthy                     57
mda5+dermatomyositis        12
bacterial_pneumonia          8
myocarditis                  4
covid_xla                    2
alcoholic_liver_disease      2
Name: count, dtype: int64


In [11]:
import pandas as pd

# Load the dataset
df = pd.read_csv('metadata_new.csv')

# Ensure TType is string type and handle NaN
df['TType'] = df['TType'].fillna('').astype(str).str.lower()

# Get all unique TType values
unique_ttypes = df['TType'].unique()
print(f"Total unique TType values: {len(unique_ttypes)}")
print("Unique TType values:", sorted(unique_ttypes))
print()

# Function to get diseases for a given TType
def get_disease_associations(ttype_value):
    ttype_df = df[df['TType'].str.contains(ttype_value, na=False)]
    diseases = ttype_df['Disease'].unique()
    disease_counts = ttype_df['Disease'].value_counts()
    return diseases, disease_counts, len(ttype_df)

# Analyze cd3+, cd4+, cd8+, and others
ttype_categories = ['cd3+', 'cd4+', 'cd8+'] + [t for t in unique_ttypes if t not in ['cd3+', 'cd4+', 'cd8+'] and t != 'unsorted' and t != 'live']

for ttype in ttype_categories:
    diseases, counts, total_rows = get_disease_associations(ttype)
    print(f"Diseases associated with '{ttype}' (Total rows: {total_rows}):")
    print(f"  Unique diseases: {len(diseases)}")
    print("  Disease list:", sorted(diseases))
    print("  Counts:\n", counts)
    print()

# Check for one-to-one relationship
disease_ttype_map = df.groupby('Disease')['TType'].unique().apply(list)
print("Disease to TType Mapping (checking exclusivity):")
for disease, ttypes in disease_ttype_map.items():
    print(f"{disease}: {ttypes}")
print()

# Summarize one-to-one relationships
one_to_one = sum(1 for ttypes in disease_ttype_map if len([t for t in ttypes if t]) == 1)
print(f"Number of diseases with a one-to-one TType relationship: {one_to_one} out of {len(disease_ttype_map)}")

# Optional: Detailed overlap analysis
overlap_df = df.pivot_table(index='Disease', columns='TType', aggfunc='size', fill_value=0)
print("\nDisease-TType Overlap Table (counts of rows):")
print(overlap_df)

Total unique TType values: 15
Unique TType values: ['cd3+', 'cd3+cd4-cd8+_thymocytes', 'cd3-cd4+cd8-_thymocytes', 'cd34-cd3+cd4+cd8+_thymocytes', 'cd34-cd3-cd4+cd8+_thymocytes', 'cd34-cd3lowcd4+_thymocytes', 'cd4+', 'cd45+', 'cd8+', 'gamma-delta', 'lin-cd34+cd38+cd1a-_thymocytes', 'lin-cd34+cd38-cd1a+_thymocytes', 'lin-cd34+cd38-cd1a-_thymocytes', 'live', 'unsorted']

Diseases associated with 'cd3+' (Total rows: 495):
  Unique diseases: 8
  Disease list: ['alcoholic_liver_disease', 'bacterial_pneumonia', 'cancer', 'covid', 'covid_xla', 'healthy', 'mda5+dermatomyositis', 'myocarditis']
  Counts:
 Disease
cancer                     289
covid                      121
healthy                     57
mda5+dermatomyositis        12
bacterial_pneumonia          8
myocarditis                  4
covid_xla                    2
alcoholic_liver_disease      2
Name: count, dtype: int64

Diseases associated with 'cd4+' (Total rows: 183):
  Unique diseases: 10
  Disease list: ['allergic_rhinitis', 'au

In [12]:
import pandas as pd

# Load the dataset
df = pd.read_csv('metadata_new.csv')

# Ensure TType is string type and handle NaN
df['TType'] = df['TType'].fillna('').astype(str).str.lower()

# Filter rows where TType is 'cd3+'
cd3_df = df[df['TType'] == 'cd3+']  # Exact match for 'cd3+' only

# Total rows and disease counts for cd3+
total_cd3_rows = len(cd3_df)
cd3_disease_counts = cd3_df['Disease'].value_counts()
unique_cd3_diseases = cd3_df['Disease'].unique()

# Overall disease counts for comparison
overall_disease_counts = df['Disease'].value_counts()

# Calculate proportions
cd3_proportions = cd3_disease_counts / total_cd3_rows
overall_proportions = overall_disease_counts / df.shape[0]
enrichment = cd3_disease_counts / overall_disease_counts

# Combine into a summary DataFrame
summary_df = pd.DataFrame({
    'cd3+_Count': cd3_disease_counts,
    'cd3+_Proportion': cd3_proportions,
    'Overall_Count': overall_disease_counts,
    'Overall_Proportion': overall_proportions,
    'Enrichment_Ratio': enrichment
}).fillna(0)

# Print results
print(f"Total rows in dataset: {df.shape[0]}")
print(f"Total rows with TType 'cd3+': {total_cd3_rows}")
print(f"Unique diseases with cd3+: {len(unique_cd3_diseases)}\n")

print("Disease Association Analysis with cd3+:")
print(summary_df.sort_values(by='cd3+_Count', ascending=False))
print()

# Highlight significant associations
significant = summary_df[summary_df['Enrichment_Ratio'] > 1.5]  # Enrichment > 1.5x overall
print("Diseases significantly enriched for cd3+ (Enrichment Ratio > 1.5):")
print(significant[['cd3+_Count', 'Enrichment_Ratio']])

# Optional: Compare with cd4+ and cd8+
cd4_df = df[df['TType'] == 'cd4+']
cd8_df = df[df['TType'] == 'cd8+']
print(f"\nTotal rows with TType 'cd4+': {len(cd4_df)}")
print("Top 5 diseases for cd4+:")
print(cd4_df['Disease'].value_counts().head())
print(f"\nTotal rows with TType 'cd8+': {len(cd8_df)}")
print("Top 5 diseases for cd8+:")
print(cd8_df['Disease'].value_counts().head())

Total rows in dataset: 1804
Total rows with TType 'cd3+': 487
Unique diseases with cd3+: 8

Disease Association Analysis with cd3+:
                             cd3+_Count  cd3+_Proportion  Overall_Count  \
Disease                                                                   
cancer                            289.0         0.593429            526   
covid                             121.0         0.248460            894   
healthy                            49.0         0.100616            210   
mda5+dermatomyositis               12.0         0.024641             12   
bacterial_pneumonia                 8.0         0.016427              8   
myocarditis                         4.0         0.008214             11   
alcoholic_liver_disease             2.0         0.004107              2   
covid_xla                           2.0         0.004107              2   
influenza                           0.0         0.000000              8   
obesity                             0.0    

In [13]:
import pandas as pd

# Load the new dataset
df = pd.read_csv('processed_data.csv')

# Ensure TType is string type and handle NaN
df['TType'] = df['TType'].fillna('').astype(str).str.lower()

# Filter rows where TType is 'cd3+'
cd3_df = df[df['TType'] == 'cd3+']  # Exact match for 'cd3+' only

# Total rows and disease counts for cd3+
total_cd3_rows = len(cd3_df)
cd3_disease_counts = cd3_df['Disease'].value_counts()
unique_cd3_diseases = cd3_df['Disease'].unique()

# Overall disease counts for comparison
overall_disease_counts = df['Disease'].value_counts()

# Calculate proportions
cd3_proportions = cd3_disease_counts / total_cd3_rows
overall_proportions = overall_disease_counts / df.shape[0]
enrichment = cd3_disease_counts / overall_disease_counts

# Combine into a summary DataFrame
summary_df = pd.DataFrame({
    'cd3+_Count': cd3_disease_counts,
    'cd3+_Proportion': cd3_proportions,
    'Overall_Count': overall_disease_counts,
    'Overall_Proportion': overall_proportions,
    'Enrichment_Ratio': enrichment
}).fillna(0)

# Print results
print(f"Total rows in dataset: {df.shape[0]}")
print(f"Total rows with TType 'cd3+': {total_cd3_rows}")
print(f"Unique diseases with cd3+: {len(unique_cd3_diseases)}\n")

print("Disease Association Analysis with cd3+:")
print(summary_df.sort_values(by='cd3+_Count', ascending=False))
print()

# Highlight significant associations
significant = summary_df[summary_df['Enrichment_Ratio'] > 1.5]  # Enrichment > 1.5x overall
print("Diseases significantly enriched for cd3+ (Enrichment Ratio > 1.5):")
print(significant[['cd3+_Count', 'Enrichment_Ratio']])

# Optional: Compare with cd4+ and cd8+
cd4_df = df[df['TType'] == 'cd4+']
cd8_df = df[df['TType'] == 'cd8+']
print(f"\nTotal rows with TType 'cd4+': {len(cd4_df)}")
print("Top 5 diseases for cd4+:")
print(cd4_df['Disease'].value_counts().head())
print(f"\nTotal rows with TType 'cd8+': {len(cd8_df)}")
print("Top 5 diseases for cd8+:")
print(cd8_df['Disease'].value_counts().head())

  df = pd.read_csv('processed_data.csv')


Total rows in dataset: 6436001
Total rows with TType 'cd3+': 1645198
Unique diseases with cd3+: 8

Disease Association Analysis with cd3+:
                                  cd3+_Count  cd3+_Proportion  Overall_Count  \
Disease                                                                        
cancer                             1015656.0         0.617346        1762749   
covid                               496693.0         0.301905        3184727   
mda5+dermatomyositis                 43398.0         0.026379          43398   
healthy                              39970.0         0.024295         864282   
bacterial_pneumonia                  17959.0         0.010916          17959   
covid_xla                            17114.0         0.010402          17114   
alcoholic_liver_disease               8379.0         0.005093           8379   
myocarditis                           6029.0         0.003665          37557   
inhibitor-associated_thyroiditis         0.0         0.000000

# Reasons to Include Only cd3+ Diseases
Relevance to cd3+ T-Cells: If your goal is to study CDR3 sequences specifically from CD3+ T-cells (which mark all T-cells), restricting to these 8 diseases ensures your analysis aligns with the cell type of interest.
Data Consistency: Your output shows 1,645,198 cd3+ rows, so focusing on these diseases leverages this large subset directly.
Simplified Analysis: Fewer diseases (8 vs. 24) reduce computational complexity and noise in BERT embeddings.
# Reasons to Include All Diseases
Broader Context: Excluding diseases like "hiv," "influenza," or "allergic_rhinitis" (common in cd4+ and cd8+) might miss CDR3 patterns relevant to T-cell responses across all subsets (since cd3+ includes both cd4+ and cd8+).
Potential Misclassification: If processed_data.csv is sequence-level data, some rows labeled "cd4+" or "cd8+" might still derive from cd3+ cells (as cd4+ and cd8+ are cd3+ subsets), and excluding them could skew results.
Exploratory Analysis: Including all diseases lets BERT uncover unexpected CDR3-disease associations not tied to predefined "cd3+" labels.

If Focusing on cd3+ Specifics: Filter to the 8 diseases ("cancer," "covid," "mda5+dermatomyositis," "healthy," "bacterial_pneumonia," "covid_xla," "alcoholic_liver_disease," "myocarditis") for a targeted cd3+ CDR3 analysis. This aligns with your current output and keeps the scope manageable (1.6M rows).
If Exploring All T-Cell Responses: Use all diseases, as CDR3 sequences (from T-cell receptors) are relevant across cd3+, cd4+, and cd8+ T-cells, and processed_data.csv’s size suggests it might include sequence data beyond strict metadata labels.

In [14]:
import pandas as pd

# Load the dataset
df = pd.read_csv('processed_data.csv')
print(f"Original dataset rows: {df.shape[0]}")

# Ensure TType is string type and handle NaN
df['TType'] = df['TType'].fillna('').astype(str).str.lower()

# Define the 8 diseases associated with cd3+ from your output
cd3_diseases = [
    "cancer",
    "covid",
    "mda5+dermatomyositis",
    "healthy",
    "bacterial_pneumonia",
    "covid_xla",
    "alcoholic_liver_disease",
    "myocarditis"
]

# Filter rows where TType is 'cd3+' and Disease is one of the 8
cd3_df = df[(df['TType'] == 'cd3+') & (df['Disease'].isin(cd3_diseases))]

# Verify the filtered data
print(f"Filtered rows with cd3+ and 8 diseases: {cd3_df.shape[0]}")
print("Disease counts in filtered data:")
print(cd3_df['Disease'].value_counts())

# Save the filtered DataFrame to a new CSV
output_file = 'cdr3_disease.csv'
cd3_df.to_csv(output_file, index=False)
print(f"Filtered data saved to {output_file}")

# Optional: Preview the first few rows
print("\nPreview of the first 5 rows:")
print(cd3_df.head())

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


  df = pd.read_csv('processed_data.csv')


Original dataset rows: 6436001
Filtered rows with cd3+ and 8 diseases: 1645198
Disease counts in filtered data:
Disease
cancer                     1015656
covid                       496693
mda5+dermatomyositis         43398
healthy                      39970
bacterial_pneumonia          17959
covid_xla                    17114
alcoholic_liver_disease       8379
myocarditis                   6029
Name: count, dtype: int64
Filtered data saved to cdr3_disease.csv

Preview of the first 5 rows:
        productive_beta complete_vdj_beta  v_call_beta d_call_beta  \
2772976               T                 T  TRBV25-1*01    TRBD1*01   
2772977               T                 T   TRBV6-1*01         NaN   
2772978               T                 T    TRBV30*02         NaN   
2772979               T                 T    TRBV18*01         NaN   
2772980               T                 T   TRBV5-4*01    TRBD1*01   

        j_call_beta                         sequence_alignment_aa_beta  \
2772976  