# In [2]:
import pandas as pd
import os, re
from pathlib import Path


def load_data(raw_count_file):
    """Read a tab-separated raw count matrix into a DataFrame.

    The first line of the input is used as the column header.

    Args:
        raw_count_file: Path or file-like object accepted by
            ``pandas.read_csv``.

    Returns:
        The parsed table as a ``pandas.DataFrame``.
    """
    return pd.read_csv(raw_count_file, sep="\t", header=0)


def tpm_normalize(count_data):
    """Scale each sample column so that its values sum to one million.

    Sample columns are those whose name matches ``<digits>_R<digits>``
    (for example ``12_R1``). Every other column is passed through unchanged.

    Note: only the per-column total is used; no gene-length correction is
    applied, so despite the function name the result is a per-million
    scaling of the raw counts.

    Args:
        count_data: DataFrame holding the count columns plus any annotation
            columns. It is not modified.

    Returns:
        A new ``pandas.DataFrame`` with the same columns, in which every
        sample column has been divided by (column sum / 1,000,000). A sample
        column whose total is zero yields NaN/inf values, which are left
        as-is.
    """
    # Non-string labels cannot be sample names; skip them instead of letting
    # re.match raise TypeError.
    sample_cols = [
        col
        for col in count_data.columns
        if isinstance(col, str) and re.match(r"^\d+_R\d+$", col)
    ]

    # Work on a copy so the caller's raw counts are not overwritten.
    normalized = count_data.copy()
    if not sample_cols:
        return normalized

    # Per-sample scaling factor: column total expressed in millions.
    per_million = normalized[sample_cols].sum().div(1000000)

    # Divide each entry by its column's scaling factor (aligned on column name).
    normalized[sample_cols] = normalized[sample_cols].div(per_million)

    return normalized


# Folder holding the raw count matrices and folder receiving the results.
input_folder = Path("../data/marked")
output_folder = Path("../data/datasets_normalized")
# parents=True so a missing intermediate directory does not abort the run.
output_folder.mkdir(parents=True, exist_ok=True)

# Normalize every .tsv file in the input folder and write the result as
# "<original stem>_tpm.tsv" in the output folder.
for ds in input_folder.glob("*.tsv"):
    df = load_data(ds)
    df_norm = tpm_normalize(df)

    # Save to ../data/datasets_normalized
    output_file = output_folder / f"{ds.stem}_tpm.tsv"
    df_norm.to_csv(output_file, sep="\t", index=False)