In [11]:
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
import os 


In [None]:
DATA_BASE = "tc-hard/dataset/ds.hard-splits/pep+cdr3b"

FEW_SHOT_DATA_BASE = "tc-hard/dataset/few_shot_split/pep+cdr3b"

In [13]:
validation_ratio = 0.2

In [14]:
def make_df(df_path):
    df = pd.read_csv(df_path, usecols = ["cdr3.beta", "antigen.epitope", "label"])

    map_keys = {
    'cdr3.beta': 'tcrb',
    'antigen.epitope': 'peptide',
    "label": "label"
    }
    df = df.rename(columns={c: map_keys[c] for c in df.columns})

    df['tcrb'] = df['tcrb'].str.replace('O','X')
    df['peptide'] = df['peptide'].str.replace('O','X')

    return df



In [15]:
def draw_bar(train_num, validation_num, test_num, split_id):

    labels = ["train", "validation", "test"]
    bars = plt.bar(labels, [train_num, validation_num, test_num])

    plt.title(f'Samples Distribution in split {split_id}')
    plt.xlabel('Categories')
    plt.ylabel('Values')

    for bar in bars:
        yval = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2, yval, int(yval), ha='center', va='bottom')

    plt.show()

def check_unseen(train, validation):
    peptide_train = set(train['peptide'])
    peptide_validation = set(validation['peptide'])
    print("num of unseen peptides in validation set: ", len(peptide_validation - peptide_train))
    print("num of unseen peptides in train set: ", len(peptide_train - peptide_validation))
    print("num of common peptides in train and validation set: ", len(peptide_train & peptide_validation))


In [16]:
def get_few_shot_df(df, num_seen_peptide = 0.2, num_few_shots = 5):
    # Sample 20% of the unique peptides
    unique_peptides = df['peptide'].unique()
    sampled_peptides = pd.Series(unique_peptides).sample(frac = num_seen_peptide, random_state=42)  # Ensures reproducibility

    # Filter the DataFrame to only include rows with the sampled peptides
    filtered_df = df[df['peptide'].isin(sampled_peptides)]

    def sample_tcrs(group):
        return group.sample(n=min(len(group), num_few_shots), random_state = 42)  # Ensures reproducibility
    
    result_df = filtered_df.groupby("peptide").apply(sample_tcrs).reset_index(drop = True)

    return result_df

In [17]:
def clean_sampled(A: pd.DataFrame, B: pd.DataFrame):
    A['flag'] = True
    B['flag'] = False

    # Concatenate the two dataframes
    combined = pd.concat([A, B], ignore_index=True)

    # Remove rows that are repeated by keeping only those that have `flag` set to True
    filtered = combined.drop_duplicates(subset=A.columns[:-1], keep=False)

    # Retain only rows that have the original flag as `True`
    A_cleaned = filtered[filtered['flag']].drop(columns='flag')

    return A_cleaned

In [18]:
def validation_split(neg_generate_mode, split_id, unseen_ratio = 0.2, num_few_shot = 5):
    train_df_path = os.path.join(DATA_BASE, "train", neg_generate_mode, f"train-{split_id}.csv")

    test_df_path = os.path.join(DATA_BASE, "test", neg_generate_mode, f"test-{split_id}.csv")
    
    train_df = make_df(train_df_path)
    test_df = make_df(test_df_path)
    
    num_samples = train_df.shape[0]
    num_validation = int(num_samples * validation_ratio)

    from collections import Counter
    peptide_count = Counter(train_df['peptide'])
    peptide_count_len = len(peptide_count)
    peptide_perm = np.random.RandomState(seed=42).permutation(peptide_count_len)

    c = 0
    selected_peptide = []
    for i in peptide_perm:
        selected_peptide.append(peptide_count.most_common()[i][0])
        c += peptide_count.most_common()[i][1]
        if c > num_validation:
            break

    new_train_df = train_df[~train_df['peptide'].isin(selected_peptide)]
    validation_df = train_df[train_df['peptide'].isin(selected_peptide)]

    # get the few-shot knowledge
    sample_validation = get_few_shot_df(validation_df, 0.2, num_few_shot)
    sample_test = get_few_shot_df(test_df, 0.2, num_few_shot)

    print(f"shape of sample_validation: {sample_validation.shape}, shape of sample_test: {sample_test.shape}")

    new_train_df = pd.concat([new_train_df, sample_validation, sample_test], axis=0, ignore_index=True)

    new_validation_df = clean_sampled(validation_df, sample_validation)
    new_test_df = clean_sampled(test_df, sample_test)

    check_unseen(new_train_df, validation_df)
    check_unseen(new_train_df, test_df)
    print(f"num of unique peptide in test set: {test_df['peptide'].unique().shape}")
    # draw_bar(train_df.shape[0], validation_df.shape[0], test_df.shape[0], split_id)
    
    return new_train_df, new_validation_df, new_test_df


In [19]:
num_unseen_ratio = 0.2

for num_few_shot in [1, 2, 3, 4]:
    for neg_generate_mode in ["only-neg-assays", "only-sampled-negs"]:
        for split_id in range(5):
            train_df, validation_df, test_df = validation_split(neg_generate_mode, split_id, 0.2, num_few_shot)
            
            new_train_df_path = os.path.join(FEW_SHOT_DATA_BASE, "train", neg_generate_mode, f"{num_few_shot}-train-{split_id}.csv")
            validation_df_path = os.path.join(FEW_SHOT_DATA_BASE, "validation", neg_generate_mode, f"{num_few_shot}-validation-{split_id}.csv")
            test_df_path = os.path.join(FEW_SHOT_DATA_BASE, "test", neg_generate_mode, f"{num_few_shot}-test-{split_id}.csv")

            train_df.to_csv(new_train_df_path, index=False)
            validation_df.to_csv(validation_df_path, index=False)
            test_df.to_csv(test_df_path, index=False)


shape of sample_validation: (105, 3), shape of sample_test: (4, 3)
num of unseen peptides in validation set:  420
num of unseen peptides in train set:  820
num of common peptides in train and validation set:  105
num of unseen peptides in validation set:  15
num of unseen peptides in train set:  921
num of common peptides in train and validation set:  4
num of unique peptide in test set: (19,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A['flag'] = True


shape of sample_validation: (104, 3), shape of sample_test: (5, 3)
num of unseen peptides in validation set:  417
num of unseen peptides in train set:  818
num of common peptides in train and validation set:  104
num of unseen peptides in validation set:  21
num of unseen peptides in train set:  917
num of common peptides in train and validation set:  5
num of unique peptide in test set: (26,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A['flag'] = True


shape of sample_validation: (104, 3), shape of sample_test: (5, 3)
num of unseen peptides in validation set:  417
num of unseen peptides in train set:  818
num of common peptides in train and validation set:  104
num of unseen peptides in validation set:  21
num of unseen peptides in train set:  917
num of common peptides in train and validation set:  5
num of unique peptide in test set: (26,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A['flag'] = True


shape of sample_validation: (104, 3), shape of sample_test: (5, 3)
num of unseen peptides in validation set:  417
num of unseen peptides in train set:  821
num of common peptides in train and validation set:  104
num of unseen peptides in validation set:  18
num of unseen peptides in train set:  920
num of common peptides in train and validation set:  5
num of unique peptide in test set: (23,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A['flag'] = True


shape of sample_validation: (105, 3), shape of sample_test: (4, 3)
num of unseen peptides in validation set:  419
num of unseen peptides in train set:  819
num of common peptides in train and validation set:  105
num of unseen peptides in validation set:  17
num of unseen peptides in train set:  920
num of common peptides in train and validation set:  4
num of unique peptide in test set: (21,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A['flag'] = True


shape of sample_validation: (94, 3), shape of sample_test: (6, 3)
num of unseen peptides in validation set:  374
num of unseen peptides in train set:  866
num of common peptides in train and validation set:  94
num of unseen peptides in validation set:  26
num of unseen peptides in train set:  954
num of common peptides in train and validation set:  6
num of unique peptide in test set: (32,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A['flag'] = True


shape of sample_validation: (92, 3), shape of sample_test: (6, 3)
num of unseen peptides in validation set:  366
num of unseen peptides in train set:  880
num of common peptides in train and validation set:  92
num of unseen peptides in validation set:  22
num of unseen peptides in train set:  966
num of common peptides in train and validation set:  6
num of unique peptide in test set: (28,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A['flag'] = True


shape of sample_validation: (92, 3), shape of sample_test: (5, 3)
num of unseen peptides in validation set:  367
num of unseen peptides in train set:  882
num of common peptides in train and validation set:  92
num of unseen peptides in validation set:  19
num of unseen peptides in train set:  969
num of common peptides in train and validation set:  5
num of unique peptide in test set: (24,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A['flag'] = True


shape of sample_validation: (92, 3), shape of sample_test: (5, 3)
num of unseen peptides in validation set:  367
num of unseen peptides in train set:  881
num of common peptides in train and validation set:  92
num of unseen peptides in validation set:  20
num of unseen peptides in train set:  968
num of common peptides in train and validation set:  5
num of unique peptide in test set: (25,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A['flag'] = True


shape of sample_validation: (92, 3), shape of sample_test: (5, 3)
num of unseen peptides in validation set:  367
num of unseen peptides in train set:  879
num of common peptides in train and validation set:  92
num of unseen peptides in validation set:  22
num of unseen peptides in train set:  966
num of common peptides in train and validation set:  5
num of unique peptide in test set: (27,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A['flag'] = True


shape of sample_validation: (188, 3), shape of sample_test: (8, 3)
num of unseen peptides in validation set:  420
num of unseen peptides in train set:  820
num of common peptides in train and validation set:  105
num of unseen peptides in validation set:  15
num of unseen peptides in train set:  921
num of common peptides in train and validation set:  4
num of unique peptide in test set: (19,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A['flag'] = True


shape of sample_validation: (177, 3), shape of sample_test: (10, 3)
num of unseen peptides in validation set:  417
num of unseen peptides in train set:  818
num of common peptides in train and validation set:  104
num of unseen peptides in validation set:  21
num of unseen peptides in train set:  917
num of common peptides in train and validation set:  5
num of unique peptide in test set: (26,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A['flag'] = True


shape of sample_validation: (177, 3), shape of sample_test: (10, 3)
num of unseen peptides in validation set:  417
num of unseen peptides in train set:  818
num of common peptides in train and validation set:  104
num of unseen peptides in validation set:  21
num of unseen peptides in train set:  917
num of common peptides in train and validation set:  5
num of unique peptide in test set: (26,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A['flag'] = True


shape of sample_validation: (174, 3), shape of sample_test: (10, 3)
num of unseen peptides in validation set:  417
num of unseen peptides in train set:  821
num of common peptides in train and validation set:  104
num of unseen peptides in validation set:  18
num of unseen peptides in train set:  920
num of common peptides in train and validation set:  5
num of unique peptide in test set: (23,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A['flag'] = True


shape of sample_validation: (187, 3), shape of sample_test: (8, 3)
num of unseen peptides in validation set:  419
num of unseen peptides in train set:  819
num of common peptides in train and validation set:  105
num of unseen peptides in validation set:  17
num of unseen peptides in train set:  920
num of common peptides in train and validation set:  4
num of unique peptide in test set: (21,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A['flag'] = True


shape of sample_validation: (188, 3), shape of sample_test: (12, 3)
num of unseen peptides in validation set:  374
num of unseen peptides in train set:  866
num of common peptides in train and validation set:  94
num of unseen peptides in validation set:  26
num of unseen peptides in train set:  954
num of common peptides in train and validation set:  6
num of unique peptide in test set: (32,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A['flag'] = True


shape of sample_validation: (184, 3), shape of sample_test: (12, 3)
num of unseen peptides in validation set:  366
num of unseen peptides in train set:  880
num of common peptides in train and validation set:  92
num of unseen peptides in validation set:  22
num of unseen peptides in train set:  966
num of common peptides in train and validation set:  6
num of unique peptide in test set: (28,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A['flag'] = True


shape of sample_validation: (184, 3), shape of sample_test: (10, 3)
num of unseen peptides in validation set:  367
num of unseen peptides in train set:  882
num of common peptides in train and validation set:  92
num of unseen peptides in validation set:  19
num of unseen peptides in train set:  969
num of common peptides in train and validation set:  5
num of unique peptide in test set: (24,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A['flag'] = True


shape of sample_validation: (184, 3), shape of sample_test: (10, 3)
num of unseen peptides in validation set:  367
num of unseen peptides in train set:  881
num of common peptides in train and validation set:  92
num of unseen peptides in validation set:  20
num of unseen peptides in train set:  968
num of common peptides in train and validation set:  5
num of unique peptide in test set: (25,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A['flag'] = True


shape of sample_validation: (184, 3), shape of sample_test: (10, 3)
num of unseen peptides in validation set:  367
num of unseen peptides in train set:  879
num of common peptides in train and validation set:  92
num of unseen peptides in validation set:  22
num of unseen peptides in train set:  966
num of common peptides in train and validation set:  5
num of unique peptide in test set: (27,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A['flag'] = True


shape of sample_validation: (245, 3), shape of sample_test: (12, 3)
num of unseen peptides in validation set:  420
num of unseen peptides in train set:  820
num of common peptides in train and validation set:  105
num of unseen peptides in validation set:  15
num of unseen peptides in train set:  921
num of common peptides in train and validation set:  4
num of unique peptide in test set: (19,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A['flag'] = True


shape of sample_validation: (233, 3), shape of sample_test: (15, 3)
num of unseen peptides in validation set:  417
num of unseen peptides in train set:  818
num of common peptides in train and validation set:  104
num of unseen peptides in validation set:  21
num of unseen peptides in train set:  917
num of common peptides in train and validation set:  5
num of unique peptide in test set: (26,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A['flag'] = True


shape of sample_validation: (233, 3), shape of sample_test: (15, 3)
num of unseen peptides in validation set:  417
num of unseen peptides in train set:  818
num of common peptides in train and validation set:  104
num of unseen peptides in validation set:  21
num of unseen peptides in train set:  917
num of common peptides in train and validation set:  5
num of unique peptide in test set: (26,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A['flag'] = True


shape of sample_validation: (228, 3), shape of sample_test: (15, 3)
num of unseen peptides in validation set:  417
num of unseen peptides in train set:  821
num of common peptides in train and validation set:  104
num of unseen peptides in validation set:  18
num of unseen peptides in train set:  920
num of common peptides in train and validation set:  5
num of unique peptide in test set: (23,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A['flag'] = True


shape of sample_validation: (252, 3), shape of sample_test: (12, 3)
num of unseen peptides in validation set:  419
num of unseen peptides in train set:  819
num of common peptides in train and validation set:  105
num of unseen peptides in validation set:  17
num of unseen peptides in train set:  920
num of common peptides in train and validation set:  4
num of unique peptide in test set: (21,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A['flag'] = True


shape of sample_validation: (282, 3), shape of sample_test: (18, 3)
num of unseen peptides in validation set:  374
num of unseen peptides in train set:  866
num of common peptides in train and validation set:  94
num of unseen peptides in validation set:  26
num of unseen peptides in train set:  954
num of common peptides in train and validation set:  6
num of unique peptide in test set: (32,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A['flag'] = True


shape of sample_validation: (276, 3), shape of sample_test: (18, 3)
num of unseen peptides in validation set:  366
num of unseen peptides in train set:  880
num of common peptides in train and validation set:  92
num of unseen peptides in validation set:  22
num of unseen peptides in train set:  966
num of common peptides in train and validation set:  6
num of unique peptide in test set: (28,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A['flag'] = True


shape of sample_validation: (276, 3), shape of sample_test: (15, 3)
num of unseen peptides in validation set:  367
num of unseen peptides in train set:  882
num of common peptides in train and validation set:  92
num of unseen peptides in validation set:  19
num of unseen peptides in train set:  969
num of common peptides in train and validation set:  5
num of unique peptide in test set: (24,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A['flag'] = True


shape of sample_validation: (276, 3), shape of sample_test: (15, 3)
num of unseen peptides in validation set:  367
num of unseen peptides in train set:  881
num of common peptides in train and validation set:  92
num of unseen peptides in validation set:  20
num of unseen peptides in train set:  968
num of common peptides in train and validation set:  5
num of unique peptide in test set: (25,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A['flag'] = True


shape of sample_validation: (276, 3), shape of sample_test: (15, 3)
num of unseen peptides in validation set:  367
num of unseen peptides in train set:  879
num of common peptides in train and validation set:  92
num of unseen peptides in validation set:  22
num of unseen peptides in train set:  966
num of common peptides in train and validation set:  5
num of unique peptide in test set: (27,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A['flag'] = True


shape of sample_validation: (295, 3), shape of sample_test: (16, 3)
num of unseen peptides in validation set:  420
num of unseen peptides in train set:  820
num of common peptides in train and validation set:  105
num of unseen peptides in validation set:  15
num of unseen peptides in train set:  921
num of common peptides in train and validation set:  4
num of unique peptide in test set: (19,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A['flag'] = True


shape of sample_validation: (281, 3), shape of sample_test: (20, 3)
num of unseen peptides in validation set:  417
num of unseen peptides in train set:  818
num of common peptides in train and validation set:  104
num of unseen peptides in validation set:  21
num of unseen peptides in train set:  917
num of common peptides in train and validation set:  5
num of unique peptide in test set: (26,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A['flag'] = True


shape of sample_validation: (281, 3), shape of sample_test: (20, 3)
num of unseen peptides in validation set:  417
num of unseen peptides in train set:  818
num of common peptides in train and validation set:  104
num of unseen peptides in validation set:  21
num of unseen peptides in train set:  917
num of common peptides in train and validation set:  5
num of unique peptide in test set: (26,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A['flag'] = True


shape of sample_validation: (276, 3), shape of sample_test: (20, 3)
num of unseen peptides in validation set:  417
num of unseen peptides in train set:  821
num of common peptides in train and validation set:  104
num of unseen peptides in validation set:  18
num of unseen peptides in train set:  920
num of common peptides in train and validation set:  5
num of unique peptide in test set: (23,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A['flag'] = True


shape of sample_validation: (308, 3), shape of sample_test: (16, 3)
num of unseen peptides in validation set:  419
num of unseen peptides in train set:  819
num of common peptides in train and validation set:  105
num of unseen peptides in validation set:  17
num of unseen peptides in train set:  920
num of common peptides in train and validation set:  4
num of unique peptide in test set: (21,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A['flag'] = True


shape of sample_validation: (349, 3), shape of sample_test: (24, 3)
num of unseen peptides in validation set:  374
num of unseen peptides in train set:  866
num of common peptides in train and validation set:  94
num of unseen peptides in validation set:  26
num of unseen peptides in train set:  954
num of common peptides in train and validation set:  6
num of unique peptide in test set: (32,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A['flag'] = True


shape of sample_validation: (341, 3), shape of sample_test: (24, 3)
num of unseen peptides in validation set:  366
num of unseen peptides in train set:  880
num of common peptides in train and validation set:  92
num of unseen peptides in validation set:  22
num of unseen peptides in train set:  966
num of common peptides in train and validation set:  6
num of unique peptide in test set: (28,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A['flag'] = True


shape of sample_validation: (341, 3), shape of sample_test: (20, 3)
num of unseen peptides in validation set:  367
num of unseen peptides in train set:  882
num of common peptides in train and validation set:  92
num of unseen peptides in validation set:  19
num of unseen peptides in train set:  969
num of common peptides in train and validation set:  5
num of unique peptide in test set: (24,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A['flag'] = True


shape of sample_validation: (334, 3), shape of sample_test: (20, 3)
num of unseen peptides in validation set:  367
num of unseen peptides in train set:  881
num of common peptides in train and validation set:  92
num of unseen peptides in validation set:  20
num of unseen peptides in train set:  968
num of common peptides in train and validation set:  5
num of unique peptide in test set: (25,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A['flag'] = True


shape of sample_validation: (336, 3), shape of sample_test: (20, 3)
num of unseen peptides in validation set:  367
num of unseen peptides in train set:  879
num of common peptides in train and validation set:  92
num of unseen peptides in validation set:  22
num of unseen peptides in train set:  966
num of common peptides in train and validation set:  5
num of unique peptide in test set: (27,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A['flag'] = True


In [20]:
# df = pd.read_csv("/project/zhiwei/cq5/PythonWorkSpace/TCRPrediction/tc-hard/dataset/few_shot_split/pep+cdr3b/test/only-sampled-negs/15-test-0.csv")

# df.shape