**User Story 14 / 15**

The task was completed by @milli2908, @genericusername99, and @elivic734.

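We combined all phage feature tables into one large dataset and split it 80/20 into a training set and a test set, stratified by the `classification_x` label, using `train_test_split` from `sklearn.model_selection`. We saved the training and test files in `data/combined-data-stratified-split` and verified the split: the class proportions match in both sets, the set sizes are about 80% / 20%, and no `Geneid` appears in both sets.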


In [4]:
import os
import glob
import pandas as pd
from sklearn.model_selection import train_test_split


# Directory with the TSV files
directory = "../data/feature_tables"
# Output file path
output_file = "../data/combined-data-stratified-split/combined.tsv"

# List of all .tsv files in the directory.
# glob returns paths in arbitrary (filesystem-dependent) order; sorting keeps
# the row order of the combined file identical on every machine, which the
# seeded train/test split performed on this file relies on.
tsv_files = sorted(glob.glob(os.path.join(directory, "*.tsv")))
if not tsv_files:
    # Fail here with a clear message instead of writing an empty combined file
    raise FileNotFoundError(f"No .tsv files found in '{directory}'.")

# Make sure the output directory exists before opening the target file
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Combine all TSV files into one: the header is written once (taken from the
# first non-empty file), followed by the data rows of every file.
header = None
with open(output_file, "w", encoding="utf-8") as target_file:
    for filename in tsv_files:
        with open(filename, "r", encoding="utf-8") as source_file:
            lines = source_file.readlines()
        if not lines:
            continue  # Empty file: contributes neither header nor data

        # A file without a trailing newline would otherwise glue its last row
        # onto the first row of the next file.
        if not lines[-1].endswith("\n"):
            lines[-1] += "\n"

        if header is None:
            header = lines[0]
            target_file.write(header)
        elif lines[0] != header:
            # Dropping a different header would silently misalign the columns
            raise ValueError(
                f"Header of '{filename}' does not match the header of the "
                "first merged file."
            )
        target_file.writelines(lines[1:])  # Only data, header handled above
print(f"{len(tsv_files)} files successfully merged into '{output_file}'.")


# Folder that holds the combined table and receives the train/test files
split_dir = "../data/combined-data-stratified-split"

# 1. Load the combined TSV file
df = pd.read_csv(f"{split_dir}/combined.tsv", sep='\t')

# 2. Define the target variable. Every step below reads the label through this
#    name, so the stratification and the reports always use the same column.
label_column = "classification_x"

# 3. Perform stratified 80/20 split (fixed random_state for a repeatable split)
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df[label_column],
    random_state=42
)

# 4. Save training and test sets as TSV
train_df.to_csv(f"{split_dir}/train_data.tsv", sep='\t', index=False)
test_df.to_csv(f"{split_dir}/test_data.tsv", sep='\t', index=False)

# Check class distribution in training set
print("Class distribution in training set:")
print(train_df[label_column].value_counts(normalize=True))  # Proportions

# Check class distribution in test set
print("\nClass distribution in test set:")
print(test_df[label_column].value_counts(normalize=True))

# Check for overlapping genes: the split partitions rows, so a Geneid can only
# show up on both sides if it occurs more than once in the combined table.
overlapping_genes = set(train_df["Geneid"]) & set(test_df["Geneid"])

# Print overlap results
if overlapping_genes:
    print(f"{len(overlapping_genes)} genes appear in both training and test sets!")
    print(overlapping_genes)
else:
    print("No overlapping genes training and test sets are properly separated.")

# Print number of rows in each set
n_train = len(train_df)
n_test = len(test_df)
n_total = n_train + n_test

print(f"Training samples: {n_train} ({n_train / n_total:.2%})")
print(f"Test samples: {n_test} ({n_test / n_total:.2%})")

# Per-class sample count in both splits
for phase in train_df[label_column].unique():
    # Vectorized count of matching rows (no Python-level loop over the Series)
    n_train_phase = (train_df[label_column] == phase).sum()
    n_test_phase = (test_df[label_column] == phase).sum()
    n_phase_total = n_train_phase + n_test_phase

    print(f"{phase}: Train {n_train_phase} ({n_train_phase/n_phase_total:.2%}), "
          f"Test {n_test_phase} ({n_test_phase/n_phase_total:.2%})")

7 files successfully merged into '../data/combined-data-stratified-split/combined.tsv'.
Class distribution in training set:
classification_x
late      0.431211
middle    0.318275
early     0.250513
Name: proportion, dtype: float64

Class distribution in test set:
classification_x
late      0.430328
middle    0.319672
early     0.250000
Name: proportion, dtype: float64
No overlapping genes training and test sets are properly separated.
Training samples: 974 (79.97%)
Test samples: 244 (20.03%)
late: Train 420 (80.00%), Test 105 (20.00%)
early: Train 244 (80.00%), Test 61 (20.00%)
middle: Train 310 (79.90%), Test 78 (20.10%)
