In [23]:
import sys
import os
sys.path.append(os.path.abspath("../"))  # Adds 'src' to the module search path

In [3]:
import pandas as pd
import numpy as np

# Split

In [164]:
np.random.seed(42)
RAW_DATA = "../raw_data/"

In [166]:
df = pd.read_parquet(f"{RAW_DATA}80_tasks.parquet",engine="pyarrow")

In [167]:
all_datasets = df['dataset'].unique()
test_datasets = ['MSV000080274']
val_datasets = ['MSV000079550']
train_datasets = [d for d in all_datasets if d not in test_datasets + val_datasets]

In [168]:
test_df = df[df['dataset'].isin(test_datasets)].sample(350000, random_state=42) 
val_df = df[df['dataset'].isin(val_datasets)].sample(350000, random_state=42) 

In [169]:
test_sequences_with_intersection = set(test_df['sequence'])
val_sequences_with_intersection = set(val_df['sequence'])
common_sequences = test_sequences_with_intersection.intersection(val_sequences_with_intersection)
common_sequences_list = list(common_sequences)

In [170]:
np.random.shuffle(common_sequences_list)
half_point = len(common_sequences_list) // 2
test_keep_sequences = set(common_sequences_list[:half_point])
val_keep_sequences = set(common_sequences_list[half_point:])

test_df = test_df[~test_df['sequence'].isin(val_keep_sequences)]
val_df = val_df[~val_df['sequence'].isin(test_keep_sequences)]

In [171]:
print(f"Cleaned test dataset size: {len(test_df)}")
print(f"Cleaned validation dataset size: {len(val_df)}")

Cleaned test dataset size: 240299
Cleaned validation dataset size: 249888


In [174]:
test_sequences = set(test_df["sequence"].unique())
val_sequences = set(val_df["sequence"].unique())
test_val_sequences = test_sequences | val_sequences

In [180]:
train_df = df[df['dataset'].isin(train_datasets)]

In [181]:
train_df = train_df[~train_df['sequence'].isin(test_val_sequences)]

In [183]:
train_sequences = set(train_df["sequence"].unique())

In [185]:
if not (test_sequences & val_sequences or test_sequences & train_sequences or val_sequences & train_sequences):
    print("No sequence overlap between the sets")
else:
    print("There is an overlap in sequences between the sets.")

No sequence overlap between the sets


In [188]:
out_dir = "../data/parquet/"
test_df.to_parquet(f"{out_dir}test.parquet", index=False)
val_df.to_parquet(f"{out_dir}val.parquet", index=False)
train_df.to_parquet(f"{out_dir}train.parquet", index=False)

## Train two datasets comparison split

In [190]:
PARQUET_FILES = "../data/parquet/"

In [192]:
train_df = pd.read_parquet(f"{PARQUET_FILES}train.parquet",engine="pyarrow")

In [193]:
all_datasets = train_df["dataset"].unique()
low_variety_datasets = ["MSV000080814"]
high_variety_datasets = [d for d in all_datasets if d not in low_variety_datasets]

In [196]:
low_variety_df = train_df[train_df['dataset'].isin(low_variety_datasets)]
high_variety_df = train_df[train_df['dataset'].isin(high_variety_datasets)]

In [198]:
low_len = len(low_variety_df)
high_len = len(high_variety_df)
ratio = low_len / high_len
print(f"low variety df size is: {low_len}")
print(f"high variety df size is: {high_len}")
print(f"Ratio is: {ratio}")

low variety df size is: 1476825
high variety df size is: 4427698
Ratio is: 0.3335423960712768


In [200]:
high_variety_df = high_variety_df.groupby('filename').apply(lambda x: x.sample(frac=ratio, random_state=42)).reset_index(drop=True)

  high_variety_df = high_variety_df.groupby('filename').apply(lambda x: x.sample(frac=ratio, random_state=42)).reset_index(drop=True)


In [204]:
out_dir = "../data/parquet/"
low_variety_df.to_parquet(f"{out_dir}train_low_variety.parquet", index=False)
high_variety_df.to_parquet(f"{out_dir}train_high_variety.parquet", index=False)

# Pre process

In [33]:
from utils_data import create_dataset

In [35]:
PARQUET_FILES = "../data/parquet/"

In [39]:
create_dataset(f"{PARQUET_FILES}val.parquet","../data/all_data_validation.csv")
create_dataset(f"{PARQUET_FILES}test.parquet","../data/all_data_test.csv")

In [None]:
va_df =  pd.read_parquet(f"{PARQUET_FILES}val.parquet",engine="pyarrow")

## two datasets comparison

In [27]:
from utils_data import create_dataset

In [15]:
PARQUET_FILES = "../data/parquet/"

In [17]:
low_variety_df =  pd.read_parquet(f"{PARQUET_FILES}train_low_variety.parquet",engine="pyarrow")
high_variety_df =  pd.read_parquet(f"{PARQUET_FILES}train_high_variety.parquet",engine="pyarrow")

In [31]:
create_dataset(f"{PARQUET_FILES}train_low_variety.parquet","../data/low_variety/all_data.csv")
create_dataset(f"{PARQUET_FILES}train_high_variety.parquet","../data/high_variety/all_data.csv")