In [2]:
import sys
import os
sys.path.append(os.path.abspath("../"))  # Adds 'src' to the module search path
sys.path.append(os.path.abspath("../src"))  # Adds 'src' to the module search path

In [4]:
import pandas as pd
import numpy as np
from utils_data import create_dataset,create_dataset_df

# Split

In [4]:
np.random.seed(42)
RAW_DATA = "../raw_data/"

In [5]:
df = pd.read_parquet(f"{RAW_DATA}80_tasks.parquet",engine="pyarrow")

In [6]:
all_datasets = df['dataset'].unique()
test_datasets = ['MSV000080274']
val_datasets = ['MSV000079550']
train_datasets = [d for d in all_datasets if d not in test_datasets + val_datasets]

In [7]:
test_df = df[df['dataset'].isin(test_datasets)].sample(350000, random_state=42) 
val_df = df[df['dataset'].isin(val_datasets)].sample(350000, random_state=42) 

In [8]:
test_sequences_with_intersection = set(test_df['sequence'])
val_sequences_with_intersection = set(val_df['sequence'])
common_sequences = test_sequences_with_intersection.intersection(val_sequences_with_intersection)
common_sequences_list = list(common_sequences)

In [9]:
np.random.shuffle(common_sequences_list)
half_point = len(common_sequences_list) // 2
test_keep_sequences = set(common_sequences_list[:half_point])
val_keep_sequences = set(common_sequences_list[half_point:])

test_df = test_df[~test_df['sequence'].isin(val_keep_sequences)]
val_df = val_df[~val_df['sequence'].isin(test_keep_sequences)]

In [10]:
print(f"Cleaned test dataset size: {len(test_df)}")
print(f"Cleaned validation dataset size: {len(val_df)}")

Cleaned test dataset size: 241921
Cleaned validation dataset size: 249897


In [11]:
test_sequences = set(test_df["sequence"].unique())
val_sequences = set(val_df["sequence"].unique())
test_val_sequences = test_sequences | val_sequences

In [12]:
train_df = df[df['dataset'].isin(train_datasets)]

In [13]:
train_df = train_df[~train_df['sequence'].isin(test_val_sequences)]

In [14]:
train_sequences = set(train_df["sequence"].unique())

In [15]:
if not (test_sequences & val_sequences or test_sequences & train_sequences or val_sequences & train_sequences):
    print("No sequence overlap between the sets")
else:
    print("There is an overlap in sequences between the sets.")

No sequence overlap between the sets


In [16]:
out_dir = "../data/"


In [17]:
create_dataset_df(val_df,f"{out_dir}val.parquet")
create_dataset_df(test_df,f"{out_dir}test.parquet")
create_dataset_df(train_df,f"{out_dir}train.parquet")

## increasing train data split

In [19]:
DATA = "../data/"

In [20]:
train_df = pd.read_parquet(f"{DATA}train.parquet",engine="pyarrow")

In [21]:
output = "../data/increasing_data/"
os.makedirs(output, exist_ok=True)
unique_filenames = train_df['filename'].unique()
subsets=[]
for x in range(150,len(unique_filenames),150):
    subset = unique_filenames[:x]
    train_df[train_df['filename'].isin(subset)].index.to_series().to_csv(f'{output}train_indices_{x}.csv', index=False,header=False)
subset = unique_filenames[0:len(unique_filenames)]
train_df[train_df['filename'].isin(subset)].index.to_series().to_csv(f'{output}train_indices_{len(unique_filenames)}.csv', index=False,header=False)

### Small test set

In [8]:
DATA = "../data/"

In [10]:
train_df = pd.read_parquet(f"{DATA}train.parquet",engine="pyarrow")

In [24]:
output = "../data/increasing_data_small/"
os.makedirs(output, exist_ok=True)

unique_filenames = train_df['filename'].unique()
subsets=[]
for x in range(1,6,1):
    subset = unique_filenames[:x]
    train_df[train_df['filename'].isin(subset)].index.to_series().to_csv(f'{output}train_indices_{x}.csv', index=False,header=False)

## MSV000080814 train indices

In [12]:
DATA = "../data/"

In [14]:
train_df = pd.read_parquet(f"{DATA}train.parquet",engine="pyarrow")

In [20]:
output = "../data/increasing_data/"
os.makedirs(output, exist_ok=True)
MSV80814_indices = train_df[train_df['dataset']=="MSV000080814"].index.to_series()
MSV80814_indices.to_csv(f'{output}train_indices_MSV000080814.csv', index=False,header=False)

4427698    4427698
4427699    4427699
4427700    4427700
4427701    4427701
4427702    4427702
            ...   
5904518    5904518
5904519    5904519
5904520    5904520
5904521    5904521
5904522    5904522
Length: 1476825, dtype: int64


## Train two datasets comparison split

In [26]:
DATA = "../data/"

In [27]:
train_df = pd.read_parquet(f"{DATA}train.parquet",engine="pyarrow")

In [28]:
all_datasets = train_df["dataset"].unique()
low_variety_datasets = ["MSV000080814"]
high_variety_datasets = [d for d in all_datasets if d not in low_variety_datasets]

In [29]:
low_variety_df = train_df[train_df['dataset'].isin(low_variety_datasets)]
high_variety_df = train_df[train_df['dataset'].isin(high_variety_datasets)]

In [30]:
low_len = len(low_variety_df)
high_len = len(high_variety_df)
ratio = low_len / high_len
print(f"low variety df size is: {low_len}")
print(f"high variety df size is: {high_len}")
print(f"Ratio is: {ratio}")

low variety df size is: 1476825
high variety df size is: 4427698
Ratio is: 0.3335423960712768


In [31]:
high_variety_df = high_variety_df.groupby('filename').apply(lambda x: x.sample(frac=ratio, random_state=42)).reset_index(drop=True)

  high_variety_df = high_variety_df.groupby('filename').apply(lambda x: x.sample(frac=ratio, random_state=42)).reset_index(drop=True)


In [32]:
out_dir = "../data/"
os.makedirs(f"{out_dir}/low_variety/", exist_ok=True)
os.makedirs(f"{out_dir}/high_variety/", exist_ok=True)

low_variety_df.to_parquet(f"{out_dir}low_variety/all.parquet", index=False)
high_variety_df.to_parquet(f"{out_dir}high_variety/all.parquet", index=False)

# Pre process