In [1]:
import sys
import os
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
sys.path.append(os.path.abspath("../"))  # Adds the parent of the working directory to the module search path
sys.path.append(os.path.abspath("../src"))  # Adds 'src' to the module search path

In [2]:
from utils_data import create_model_dataset
from read_data import apply_index_file

In [55]:
# Seed NumPy's global RNG once; the np.random.shuffle call in the split
# section draws from this global state.
np.random.seed(42)
# Register tqdm's progress-bar integration with pandas.
tqdm.pandas()

# Input file and output directories, all relative to the working directory.
RAW_DATASET = "../raw_data/massive.parquet"
DATA = "../data/"
TRAIN = "../data/train_subsets/"
TEST = "../data/test/"

# Create every output directory up front (no error if it already exists).
for output_dir in (DATA, TRAIN, TEST):
    os.makedirs(output_dir, exist_ok=True)

# Preprocess

In [4]:
# Load the whole raw table into memory; the deduplication below runs on this frame.
df = pd.read_parquet(RAW_DATASET,engine="pyarrow")

In [5]:
def remove_duplicates_keep_median(df):
    """
    Collapse duplicate (modified_sequence, filename) rows to one representative.

    For every (modified_sequence, filename) group, the row whose ``label`` is
    closest to the group's median label is kept; ties go to the row that
    appears first in ``df``. Only vectorized groupby operations are used so the
    function stays usable on very large frames (27M+ rows).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``modified_sequence``, ``filename`` and a
        numeric ``label``. Rows are selected by position, so ``df`` may carry
        any index (it does not have to be a default RangeIndex).

    Returns
    -------
    pandas.DataFrame
        One row per group, ordered by group key, with a fresh 0..n-1 index.
        ``df`` itself is not modified.

    Notes
    -----
    NOTE(review): rows with a missing group key, and groups whose labels are
    all NaN, get no special handling here -- confirm the input has none.
    """
    # Build the grouping once; both the medians and the group ids come from it.
    grouped = df.groupby(['modified_sequence', 'filename'])

    print("calculating medians")
    group_medians = grouped['label'].transform('median')
    # Calculate absolute difference from median (vectorized)
    abs_diff = (df['label'] - group_medians).abs()

    print("Getting groups")
    group_ids = grouped.ngroup()

    print("min ids")
    # The helper frame is built from plain arrays, so it has a default
    # RangeIndex and idxmin() returns row *positions* in df. Using the labels
    # of df.index as positions (as before) is only valid for a RangeIndex.
    distance_by_position = pd.DataFrame({
        'group_id': group_ids.to_numpy(),
        'abs_diff': abs_diff.to_numpy(),
    })
    min_pos_per_group = distance_by_position.groupby('group_id')['abs_diff'].idxmin()

    print("select indices")
    return df.iloc[min_pos_per_group.to_numpy()].reset_index(drop=True)

In [6]:
# Reduce the raw table to one row per (modified_sequence, filename) pair.
df_preprocessed = remove_duplicates_keep_median(df)

calculating medians
Getting groups
min ids
select indices


In [9]:
# Persist the deduplicated table under the DATA directory.
df_preprocessed.to_parquet(f"{DATA}dataset_preprocessed.parquet")

In [20]:
# Show the deduplicated frame as the cell output.
df_preprocessed

Unnamed: 0,modified_sequence,label,filename,dataset,task
0,AAAAAAAAAAAAAAAAG,77.383600,Bibo_20130621_CHS_IEF_3-10linear_24slices_08.m...,MSV000080274,iRT
1,AAAAAAAAAAAAAAAAG,93.708580,Bibo_20130621_CHS_IEF_3-10linear_24slices_15.m...,MSV000080274,iRT
2,AAAAAAAAAAAAAAAAG,89.782983,Bibo_20130621_CHS_IEF_3-10linear_24slices_16.m...,MSV000080274,iRT
3,AAAAAAAAAAAAAAAAG,85.309409,Bibo_20130621_CHS_IEF_3-10linear_24slices_18.m...,MSV000080274,iRT
4,AAAAAAAAAAAAAAAAG,99.475931,Bibo_20130621_CHS_IEF_3-10linear_24slices_20.m...,MSV000080274,iRT
...,...,...,...,...,...
27236451,YYYYWHLR,45.123521,20130502_EXQ6_SaDe_SA_76_05.mzML,MSV000080813,iRT
27236452,YYYYWHLRK,126.525771,20110715_EXQ1_TaGe_SA_PC11_6.mzXML,MSV000080069,iRT
27236453,YYYYWHLRK,78.161016,Bibo_20130110_CHS_IEF100_20121129_3-10linear_S...,MSV000080274,iRT
27236454,YYYYWHLRK,80.638047,HUVEC_ne_con_5a_1.mzXML,MSV000080225,iRT


# Split

In [56]:
# NOTE(review): the preprocessing section saves "dataset_preprocessed.parquet", while this cell
# loads "dataset.parquet"; no visible cell writes that file into DATA -- confirm where it comes from.
df = pd.read_parquet(f"{DATA}dataset.parquet",engine="pyarrow")

In [57]:
# Number of non-null modified_sequence entries per dataset.
# NOTE(review): not referenced again in the visible cells -- presumably for inspection only.
sequences_dataset = df.groupby('dataset')['modified_sequence'].count()

In [58]:
# One dataset is held out for testing and one for validation; every other
# dataset found in the table is used for training.
all_datasets = df['dataset'].unique()
test_datasets = ['MSV000080274']
val_datasets = ['MSV000079550']
held_out_datasets = set(test_datasets) | set(val_datasets)
train_datasets = [name for name in all_datasets if name not in held_out_datasets]

In [59]:
# Draw a seeded random sample of 350000 rows from each held-out dataset group.
# (sample() without replacement raises if a group has fewer rows than that.)
test_df = df.loc[df['dataset'].isin(test_datasets)].sample(n=350000, random_state=42)
val_df = df.loc[df['dataset'].isin(val_datasets)].sample(n=350000, random_state=42)

In [60]:
# Sequences present in each held-out sample, and the ones the two samples share.
test_sequences_with_intersection = set(test_df['modified_sequence'])
val_sequences_with_intersection = set(val_df['modified_sequence'])
common_sequences = test_sequences_with_intersection.intersection(val_sequences_with_intersection)
# Sort instead of list(set): iteration order of a set of strings changes between
# Python processes (hash randomization), so shuffling list(set) with a fixed
# seed would still give a different split in every kernel session.
# Assumes all sequences are strings (mutually comparable) -- TODO confirm.
common_sequences_list = sorted(common_sequences)

In [61]:
# Shuffle the shared sequences in place (global NumPy RNG) and cut the list in
# two: the first half stays in test, the second half stays in validation.
np.random.shuffle(common_sequences_list)
half_point = len(common_sequences_list) // 2
test_keep_sequences, val_keep_sequences = (
    set(common_sequences_list[:half_point]),
    set(common_sequences_list[half_point:]),
)

# Remove from each split the shared sequences that were assigned to the other.
test_df = test_df.loc[~test_df['modified_sequence'].isin(val_keep_sequences)]
val_df = val_df.loc[~val_df['modified_sequence'].isin(test_keep_sequences)]
print(f"test dataset size: {len(test_df)}")
print(f"validation dataset size: {len(val_df)}")

# Sequence sets of the final held-out splits; their union is excluded from training.
test_sequences = set(test_df["modified_sequence"].unique())
val_sequences = set(val_df["modified_sequence"].unique())
test_val_sequences = test_sequences.union(val_sequences)

test dataset size: 247344
validation dataset size: 249607


In [62]:
# Training rows: rows from the training datasets whose sequence does not occur
# in the test or validation split.
train_df = df.loc[
    df['dataset'].isin(train_datasets)
    & ~df['modified_sequence'].isin(test_val_sequences)
]
train_sequences = set(train_df["modified_sequence"].unique())

In [63]:
# Sanity check: the three splits must be pairwise disjoint in their sequences.
splits_are_disjoint = (
    test_sequences.isdisjoint(val_sequences)
    and test_sequences.isdisjoint(train_sequences)
    and val_sequences.isdisjoint(train_sequences)
)
if splits_are_disjoint:
    print("No sequence overlap between the sets")
else:
    print("There is an overlap in sequences between the sets.")

No sequence overlap between the sets


In [64]:
# Store each split as a headerless, single-column CSV of row-index labels.
# DATA already ends with "/", so no extra separator is inserted; this matches
# the f"{DATA}train.csv" path used when the file is read back later on.
train_df.index.to_series().to_csv(f"{DATA}train.csv", index=False, header=False)
test_df.index.to_series().to_csv(f"{DATA}test.csv", index=False, header=False)
val_df.index.to_series().to_csv(f"{DATA}val.csv", index=False, header=False)

## MSV000080814 train indices

In [65]:
# Reload the full table and apply the saved training index file to it
# (presumably this selects the rows listed in train.csv -- see apply_index_file).
df = pd.read_parquet(
    f"{DATA}dataset.parquet",
    engine="pyarrow",
)
train_df = apply_index_file(df, f"{DATA}train.csv")

In [66]:
# Index labels of the training rows that belong to dataset MSV000080814,
# written as a headerless single-column CSV into the train-subsets directory.
msv80814_rows = train_df.loc[train_df['dataset'] == "MSV000080814"]
MSV80814_indices = msv80814_rows.index.to_series()
MSV80814_indices.to_csv(f'{TRAIN}MSV000080814.csv', index=False, header=False)

## Increasing train indices

In [67]:
# Reload the full table and apply the saved training index file, so this
# section does not depend on variables left over from earlier sections.
df = pd.read_parquet(
    f"{DATA}dataset.parquet",
    engine="pyarrow",
)
train_df = apply_index_file(df, f"{DATA}train.csv")

In [68]:
# Runs (files) in order of first appearance in the training frame.
unique_filenames = train_df['filename'].unique()
total_files = len(unique_filenames)
step_size = total_files // 10  # 10% of the runs, rounded down

# Cumulative subset sizes: 10%, 20%, ... 90% of the runs, then all of them.
# With fewer than 10 runs step_size is 0; the partial steps are skipped then,
# because they would only write an empty runs_0.csv nine times over.
steps = [i * step_size for i in range(1, 10)] if step_size else []
steps.append(total_files)

# For each size x, save the index labels of all rows from the first x runs.
for x in steps:
    subset = unique_filenames[:x]
    train_df[train_df['filename'].isin(subset)].index.to_series().to_csv(
        f'{TRAIN}runs_{x}.csv', index=False, header=False
    )

## Small test sample

In [69]:
# Build a tiny 200-row fixture (dataset + index files) in the TEST directory.
df = pd.read_parquet(f"{DATA}dataset.parquet", engine="pyarrow")
# .copy() makes the fixture an independent frame: assigning into the bare
# result of head() raises SettingWithCopyWarning and leaves it ambiguous
# whether df is modified as well.
df_test = df.head(200).copy()
# Give the first three rows distinct filenames.
# NOTE(review): .loc[0..2] and the ranges below assume the row labels are
# 0..199 (default RangeIndex) -- confirm for dataset.parquet.
df_test.loc[0, 'filename'] = 'x'
df_test.loc[1, 'filename'] = 'y'
df_test.loc[2, 'filename'] = 'z'
# Index files: rows 0-99 train, 100-149 validation, 150-199 test.
train = pd.Series(range(0, 100))
val = pd.Series(range(100, 150))
test = pd.Series(range(150, 200))
# Save each to a CSV
train.to_csv(f"{TEST}train.csv", index=False, header=False)
val.to_csv(f"{TEST}val.csv", index=False, header=False)
test.to_csv(f"{TEST}test.csv", index=False, header=False)
df_test.to_parquet(f"{TEST}dataset.parquet")