In [16]:
import pandas as pd
import os, re, glob
from pathlib import Path
import shutil


def load_data(raw_count_file):
    """Read a tab-separated raw count matrix into a DataFrame.

    The first line of the file is used as the column header.

    Args:
        raw_count_file: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        The parsed table as a ``pandas.DataFrame``.
    """
    return pd.read_csv(raw_count_file, header=0, sep="\t")


def tpm_normalize(count_data):
    """Rescale each sample column to "per million of its column total".

    Sample columns are the ones whose name has the form
    ``<digits>_R<digits>``; all other columns are left untouched. Each value
    is divided by (column sum / 1,000,000). Only this library-size scaling is
    applied here; no length normalization takes place.

    The frame passed in is modified in place and the same object is returned.

    Args:
        count_data: DataFrame holding the count columns.

    Returns:
        ``count_data`` itself, with its sample columns rescaled.
    """
    sample_name = re.compile(r"^\d+_R\d+$")
    sample_cols = [name for name in count_data.columns if sample_name.match(name)]

    samples = count_data[sample_cols]

    # Per-column scaling factor: the column total expressed in millions.
    per_million = samples.sum() / 1000000

    # Column-aligned division; the result replaces the sample columns.
    count_data[sample_cols] = samples / per_million
    return count_data


def drop_host_data(count_data: pd.DataFrame) -> pd.DataFrame:
    """Return only the rows whose ``Entity`` column equals ``'phage'``.

    Args:
        count_data: DataFrame with an ``Entity`` column.

    Returns:
        A new DataFrame (a copy, so later edits do not touch ``count_data``)
        containing the phage rows only.

    Raises:
        KeyError: If ``count_data`` has no ``Entity`` column.
    """
    # Filter the argument itself rather than a module-level frame, so the
    # function works on whatever table the caller passes in.
    df_phage = count_data[count_data['Entity'] == 'phage'].copy()
    return df_phage


# Names of the datasets whose raw count tables are collected.
datasets = [
    "Yang",
    "Sprenger_VC_WT_VP882_delta_cpdS",
    "Lood",
    "Guegler_T7_plusToxIN",
    "Guegler_T4_minusToxIN",
    "Finstrlova_Newman",
    "Brandao_MCCM"
]

# Locate "<dataset>_full_raw_counts.tsv" anywhere below ../data.
tsv_dateien = []
for ds in datasets:
    pattern = f"../data/**/{ds}_full_raw_counts.tsv"
    tsv_dateien.extend(glob.glob(pattern, recursive=True))

# Target folder that receives a flat copy of every file found.
zielordner = "../data/datasets_vivid_virions"
os.makedirs(zielordner, exist_ok=True)

# Files that could not be copied; used for an accurate summary below.
fehlgeschlagen = []
for datei in tsv_dateien:
    dateiname = os.path.basename(datei)
    zielpfad = os.path.join(zielordner, dateiname)
    try:
        shutil.copy2(datei, zielpfad)
    except shutil.SameFileError:
        # The target folder lies below ../data, so on a rerun the recursive
        # glob also finds the copies made earlier. Such a file is already in
        # place; skip it instead of aborting the whole run.
        continue
    except PermissionError as e:
        print(f"Fehler beim Kopieren von {datei} nach {zielpfad}: {e}")
        fehlgeschlagen.append(datei)

# Only claim success when every copy actually succeeded.
if fehlgeschlagen:
    print(
        f"{len(fehlgeschlagen)} von {len(tsv_dateien)} Dateien konnten nicht "
        f"nach {zielordner} kopiert werden."
    )
else:
    print(f"Alle Dateien wurden nach {zielordner} kopiert.")
# Normalize every collected count table and write the result to a separate
# output folder.
input_folder = Path("../data/datasets_vivid_virions")
output_folder = Path("../data/datasets_normalized")
# Without parents=True only the last path component is created, so ../data
# must already exist at this point.
output_folder.mkdir(exist_ok=True)

for ds in input_folder.glob("*.tsv"):
    df = load_data(ds)
    # tpm_normalize rescales the sample columns of df in place and returns
    # that same frame, so df and df_norm refer to one object.
    df_norm = tpm_normalize(df)
    # NOTE(review): df_phage is never used below -- the table written to disk
    # is df_norm, which still contains the rows whose Entity is not 'phage'.
    # Confirm whether the phage-only table was meant to be saved instead.
    df_phage = drop_host_data(df_norm) 

    # Save as "<input stem>_tpm.tsv" in ../data/datasets_normalized,
    # tab-separated and without the index column.
    output_file = f"{output_folder}/{ds.stem}_tpm.tsv"
    df_norm.to_csv(output_file, sep="\t", index=False)

Alle Dateien wurden nach ../data/datasets_vivid_virions kopiert.
