In [267]:
import pandas as pd 
# Load the tab-separated 'vdjdb.txt' table and discard the columns that the
# rest of the notebook never reads.
df = pd.read_csv('vdjdb.txt', sep='\t').drop(
    columns=[
        'method', 'meta', 'cdr3fix',
        'web.method', 'web.method.seq',
        'web.cdr3fix.nc', 'web.cdr3fix.unmp',
        'reference.id',
        'mhc.class', 'j.segm',
    ]
)
# Exclude every record whose 'antigen.gene' is 'Nef'.
# .copy() makes the filtered result an independent frame: later cells assign
# columns on `df`, which would otherwise raise SettingWithCopyWarning.
df = df[df['antigen.gene'] != 'Nef'].copy()

In [255]:
def classify_hla_allele(allele):
    """Map an allele name to a coarse group label based on its prefix.

    Parameters
    ----------
    allele : str
        Allele name such as ``'HLA-A*02:01'``. Any non-string value
        (for example a NaN produced by a missing cell) is accepted and
        classified as ``'Other'`` instead of raising ``AttributeError``.

    Returns
    -------
    str
        One of ``'HLA-A'``, ``'HLA-B'``, ``'HLA-C'``, ``'HLA-DP'``,
        ``'HLA-DQ'``, ``'HLA-DR'``, ``'H-2 (Mouse)'`` or ``'Other'``.
    """
    if not isinstance(allele, str):
        return 'Other'

    # Checked top to bottom; the first matching prefix decides the label.
    # A tuple of prefixes means "starts with any of these".
    prefix_groups = (
        ('HLA-A', 'HLA-A'),
        ('HLA-B', 'HLA-B'),
        ('HLA-C', 'HLA-C'),
        (('HLA-DPA', 'HLA-DPB'), 'HLA-DP'),
        (('HLA-DQA', 'HLA-DQB'), 'HLA-DQ'),
        ('HLA-DR', 'HLA-DR'),
        ('H-2', 'H-2 (Mouse)'),
    )
    for prefix, group in prefix_groups:
        if allele.startswith(prefix):
            return group
    return 'Other'

# Overwrite the 'mhc.a' column of `df` with the coarse group label that
# classify_hla_allele returns for each allele name.
df['mhc.a'] = df['mhc.a'].map(classify_hla_allele)

In [256]:
def classify_allele(allele):
    """Map an allele name to a short label based on a substring match.

    Parameters
    ----------
    allele : str
        Allele name such as ``'HLA-DRB1*01:01'`` or ``'B2M'``. Any
        non-string value (for example a NaN produced by a missing cell) is
        accepted and classified as ``'Other'`` instead of raising
        ``TypeError``.

    Returns
    -------
    str
        One of ``'DRB1'``, ``'DQB1'``, ``'DPB1'``, ``'DRB3'``, ``'B2M'``,
        ``'H-2'`` or ``'Other'``.
    """
    if not isinstance(allele, str):
        return 'Other'

    # Checked top to bottom; the first marker found anywhere in the name wins.
    # 'HLA-DPB' alone covers 'HLA-DPB1', so no separate 'HLA-DPB1' entry is
    # needed.
    markers = (
        ('HLA-DRB1', 'DRB1'),
        ('HLA-DQB1', 'DQB1'),
        ('HLA-DPB', 'DPB1'),
        ('HLA-DRB3', 'DRB3'),
        ('B2M', 'B2M'),
        ('H-2', 'H-2'),
    )
    return next((label for marker, label in markers if marker in allele), 'Other')
    
# Overwrite the 'mhc.b' column of `df` with the short label that
# classify_allele returns for each allele name.
df['mhc.b'] = df['mhc.b'].map(classify_allele)

In [257]:
# Keep only the records whose 'vdjdb.score' is 1, 2 or 3.
# df_f retains the score column; df is the same selection without it.
df_f = df[df['vdjdb.score'].isin([1, 2, 3])]
df = df_f.drop(columns='vdjdb.score')
len(df)

12157

In [258]:
# Split the score-filtered records by the 'gene' column ('TRA' -> df_A,
# 'TRB' -> df_B) and remove the 'complex.id' and 'vdjdb.score' columns
# from both per-gene tables.
df_A = df_f.loc[df_f['gene'] == 'TRA'].drop(columns=['complex.id', 'vdjdb.score'])
df_B = df_f.loc[df_f['gene'] == 'TRB'].drop(columns=['complex.id', 'vdjdb.score'])
print(len(df_B), len(df_A))

8296 3861


In [259]:
# 'v.segm' values, in current row order, of the rows where gene is 'TRB'
# and complex.id is greater than 0.
v_segm_trb = df.loc[
    df['gene'].eq('TRB') & df['complex.id'].gt(0),
    'v.segm',
].values
len(v_segm_trb)

2947

In [260]:
def combine_cdr3(x):
    """Concatenate the strings in ``x`` end to end, with no separator.

    ``x`` is any iterable of strings; the items are joined in iteration
    order and the single resulting string is returned.
    """
    separator = ''
    return separator.join(x)

# Give every row the concatenation of all 'cdr3' strings that share its
# 'complex.id'.
# The join is computed on a temporary copy that is stably sorted by 'gene', so
# within each complex the parts are always joined in alphabetical gene order
# ('TRA' before 'TRB') instead of depending on the row order of the input.
# Assigning the result back aligns on the index, so the row order of `df`
# itself is left unchanged.
df['combined_cdr3'] = (
    df.sort_values('gene', kind='mergesort')
      .groupby('complex.id')['cdr3']
      .transform(combine_cdr3)
)

# The per-row 'cdr3' is no longer needed; keep the first row of each
# 'complex.id' as its single representative.
df = (
    df.drop(columns=['cdr3'])
      .drop_duplicates(subset=['complex.id'], keep='first')
)

# Put 'combined_cdr3' first, keep the other columns in their existing order,
# and renumber the rows from 0.
cols = ['combined_cdr3'] + [col for col in df.columns if col != 'combined_cdr3']
df = df[cols].reset_index(drop=True)

In [261]:
# Keep only rows with complex.id > 0. .copy() makes the result an independent
# frame so the column assignment below does not raise SettingWithCopyWarning.
df = df[df['complex.id'] > 0].copy()

# Replace 'v.segm' with the V segment of the TRB row of the same complex.
# The lookup is keyed on 'complex.id' (built from df_f, which still holds the
# individual TRA/TRB rows) rather than assigned positionally, so the result
# does not depend on row order or on the two row counts being equal. If a
# complex has several TRB rows the first is used; if it has none the value
# is NaN.
df['v.segm'] = df['complex.id'].map(
    df_f.loc[
        (df_f['gene'] == 'TRB') & (df_f['complex.id'] > 0),
        ['complex.id', 'v.segm'],
    ]
    .drop_duplicates(subset='complex.id', keep='first')
    .set_index('complex.id')['v.segm']
)
df

Unnamed: 0,combined_cdr3,complex.id,gene,v.segm,species,mhc.a,mhc.b,antigen.epitope,antigen.gene,antigen.species
1,CAVAGYGGSQGNLIFCASSPQGLGTEAFF,15,TRA,TRBV28*01,HomoSapiens,HLA-A,B2M,ELAGIGILTV,MLANA,HomoSapiens
2,CIVKTNSGGSNYKLTFCASSFEETQYF,77,TRA,TRBV7-2*01,HomoSapiens,HLA-DQ,DQB1,TAAQAAVVRFQEAAN,CFP10,M.tuberculosis
3,CIVHTNSGGSNYKLTFCASSPEETQYF,78,TRA,TRBV7-2*01,HomoSapiens,HLA-DQ,DQB1,TAAQAAVVRFQEAAN,CFP10,M.tuberculosis
4,CIEHTNSGGSNYKLTFCASSLEETQYF,79,TRA,TRBV7-2*01,HomoSapiens,HLA-DQ,DQB1,TAAQAAVVRFQEAAN,CFP10,M.tuberculosis
5,CAVGGLSGANSKLTFCASSVALAGAEYF,80,TRA,TRBV9*01,HomoSapiens,HLA-DR,DRB1,MHVSFVMAYPEMLAA,Rv1195,M.tuberculosis
...,...,...,...,...,...,...,...,...,...,...
2943,CIALNARLMFCASSLRATDTQYF,30550,TRA,TRBV7-2*01,HomoSapiens,HLA-DQ,DQB1,PQPELPYPQPQL,Gluten,Wheat
2944,CAMREGRYSSASKIIFCATSRAGGGGEKLFF,30551,TRA,TRBV15*01,HomoSapiens,HLA-DQ,DQB1,FPQPEQPFPWQP,Gluten,Wheat
2945,CLVGDGDGGATNKLIFCASSQGSGGNEQFF,30552,TRA,TRBV4-3*01,HomoSapiens,HLA-DQ,DQB1,FPQPEQPFPWQP,Gluten,Wheat
2946,CAASVLYGSSNTGKLIFCASSIVGSGGYNEQFF,30554,TRA,TRBV19*01,HomoSapiens,HLA-DQ,DQB1,QLQPFPQPELPY,Gluten,Wheat


In [262]:
# Remove repeated sequences, keeping the first occurrence of each:
# df_AB is de-duplicated on 'combined_cdr3', df_A and df_B on 'cdr3'.
df_AB = df.drop_duplicates(subset='combined_cdr3', keep='first')
df_A = df_A.drop_duplicates(subset='cdr3', keep='first')
df_B = df_B.drop_duplicates(subset='cdr3', keep='first')

In [263]:
# Remove the 'gene' and 'complex.id' columns from df_AB in one step.
df_AB = df_AB.drop(columns=['gene', 'complex.id'])

In [264]:
# Remove the 'gene' column from both per-gene tables.
df_A, df_B = (frame.drop(columns='gene') for frame in (df_A, df_B))

In [265]:
import pandas as pd

def select_characters_starting_from_6(text, start=6, length=19):
    """Return a fixed-width window of ``text``.

    Parameters
    ----------
    text : str
        The string to slice.
    start : int, optional
        0-based index of the first character to keep. The default of 6
        means the window begins at the 7th character. Expected to be
        non-negative.
    length : int, optional
        Maximum number of characters to keep (default 19).

    Returns
    -------
    str
        ``text[start:start + length]``. The result is shorter than ``length``
        (possibly empty) when ``text`` ends before the window does.
    """
    return text[start:start + length]

# Replace every 'combined_cdr3' value in df_AB with the substring returned by
# select_characters_starting_from_6.
df_AB['combined_cdr3'] = df_AB['combined_cdr3'].map(select_characters_starting_from_6)

In [266]:
# Write the three tables as tab-separated text files, without the index column.
for table, path in ((df_A, 'df_AG.txt'), (df_B, 'df_BG.txt'), (df_AB, 'df_AB.txt')):
    table.to_csv(path, sep="\t", index=False)