In [None]:
import pandas as pd
from pathlib import Path
import os

# print current working directory
#print(f"Current working directory: {os.getcwd()}")
# go up one level
#os.chdir("..")

# input/output folders
input_folder = Path("data/marked")
output_folder = Path("data/filtered")
output_folder.mkdir(parents=True, exist_ok=True)

# Value in the "Geneid" column that identifies the marker row; a 1 in that
# row flags the corresponding column for removal.
MARKER_ROW_LABEL = "Replicates with missing value"

# Control columns that drive the filtering itself and are never dropped.
PROTECTED_COLUMNS = ("Geneid", "Outlier")


def filter_marked_table(df):
    """Drop outlier rows and marker-flagged columns from one marked table.

    The marker row (``Geneid == MARKER_ROW_LABEL``) is separated from the
    data rows *before* the outlier filter runs, so the column marker cannot
    be lost if that row itself carries ``Outlier == 1``.

    Args:
        df: Table with at least the columns "Geneid" and "Outlier".

    Returns:
        A tuple ``(filtered, removed_rows, removed_columns, marker_found)``:
        the filtered table (marker row excluded), the number of data rows
        dropped because ``Outlier == 1``, the list of dropped column names,
        and whether a marker row was present.

    Raises:
        KeyError: If "Geneid" or "Outlier" is missing from ``df``.
    """
    is_marker = df["Geneid"] == MARKER_ROW_LABEL
    marker_rows = df[is_marker]
    data_rows = df[~is_marker]

    # --- 1. remove outliers ---
    kept = data_rows[data_rows["Outlier"] != 1]
    removed_rows = len(data_rows) - len(kept)

    # --- 2. filter columns ---
    marker_found = not marker_rows.empty
    removed_columns = []
    if marker_found:
        # Only the first marker row is consulted; every marker row is
        # excluded from the output.
        marker = marker_rows.iloc[0]
        removed_columns = [
            col
            for col, flag in marker.items()
            if col not in PROTECTED_COLUMNS and flag == 1
        ]
        kept = kept.drop(columns=removed_columns)

    return kept, removed_rows, removed_columns, marker_found


# logfile
log_lines = []

# find all TSV files in the input folder; sorted because glob order is
# unspecified and the log should be reproducible across machines
tsv_files = sorted(input_folder.glob("*_full_raw_counts_tpm_marked.tsv"))
print(f"{len(tsv_files)} file found.")

for file_path in tsv_files:
    # Best-effort batch: a failure in one file is reported and logged, and
    # the remaining files are still processed.
    try:
        df = pd.read_csv(file_path, sep="\t")
        df, removed_rows, removed_columns, marker_found = filter_marked_table(df)
        marker_status = "Found column marker" if marker_found else "No column marker found"

        # save
        output_file = output_folder / file_path.name.replace("_marked.tsv", "_filtered.tsv")
        df.to_csv(output_file, sep="\t", index=False)
        print(f"Saved: {output_file.name}")

        # log info
        log_lines.append(f"File: {file_path.name}")
        log_lines.append(f"- remove outlier rows: {removed_rows}")
        log_lines.append(f"- {marker_status}")
        if removed_columns:
            log_lines.append(f"- remove columns: {', '.join(removed_columns)}")
        else:
            log_lines.append("- removed columns: none")
        log_lines.append("")

    except Exception as e:
        print(f"Error at {file_path.name}: {e}")
        log_lines.append(f"File: {file_path.name} – ERROR: {e}")
        log_lines.append("")

# save log file
log_path = output_folder / "filter_protocoll.txt"
with open(log_path, "w", encoding="utf-8") as log_file:
    log_file.write("\n".join(log_lines))


7 file found.
Saved: Brandao_MCCM_full_raw_counts_tpm_filtered.tsv
Saved: Finstrlova_Newman_full_raw_counts_tpm_filtered.tsv
Saved: Guegler_T4_minusToxIN_full_raw_counts_tpm_filtered.tsv
Saved: Guegler_T7_plusToxIN_full_raw_counts_tpm_filtered.tsv
Saved: Lood_full_raw_counts_tpm_filtered.tsv
Saved: Sprenger_VC_WT_VP882_delta_cpdS_full_raw_counts_tpm_filtered.tsv
Saved: Yang_full_raw_counts_tpm_filtered.tsv
