# In [8]:
import glob
import os
import random
import pandas as pd


# Directory that contains this script, with symlinks resolved.
script_dir, _ = os.path.split(os.path.realpath(__file__))
# Parent of the script directory, kept un-normalised as "<script_dir>/..".
base_dir = os.path.join(script_dir, os.pardir)

def make_training_data(file_paths):
    """ In each file, it selects the evalue threshold up to which the number of TPs is equal to the number of FPs"""
    train_data_list = []
    for file_path in file_paths:
        df = pd.read_csv(file_path, sep="\t")
        df["pfam_label"] = (df["query"].str.split("-", expand=True)[3] == df["target"].str.split("-", expand=True)[3])
        df = df.sort_values(by="evalue").reset_index(drop=True)
        df["cum_tps"] = df["pfam_label"].cumsum()  # To select the hits up to the e-value whose TPs count will be equal to the FPs count
        df["cum_fps"] = df.index - df["cum_tps"]
        df["eligible"] = (df["cum_tps"] >= df["cum_fps"])
        max_index = df[df['eligible']].index.max()
        train_data = df.loc[:max_index].drop(columns=["cum_tps", "cum_fps", "eligible"])
        #pos_df = df[(df["pfam_label"])]       # To select all TPs + equal number of FPs
        #neg_df = df[~df["pfam_label"]].head(len(pos_df))  # To select all TPs + equal number of FPs
        #train_data = pd.concat([pos_df, neg_df])    # To select all TPs + equal number of FPs
        train_data_list.append(train_data)
    return pd.concat(train_data_list).sample(frac=1, random_state=0).reset_index(drop=True)
    
# Build and write one training table per alignment-file family. Each entry
# pairs the input file stem with the stem of the output file it produces.
for ali_stem, out_stem in (("reseek_sens", "reseek"), ("fs_pref", "fs")):
    # Only two files (B1 and B2) are selected for training purposes.
    ali_file_paths = [
        f"{base_dir}/tmp/alis/split_pf_seq/{ali_stem}_B{i}_bitscore.tsv"
        for i in range(1, 3)
    ]
    train_data = make_training_data(ali_file_paths)
    train_data.to_csv(
        f"{base_dir}/tmp/alis/split_pf_seq/{out_stem}_train_data.tsv",
        sep="\t",
        index=None,
    )
