In [3]:
import os
import glob
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier

In [4]:
# Relative path to the parent directory of the notebook's working directory.
# NOTE(review): no cell visible here reads `base_dir`; the data paths used below
# are hard-coded as "../tmp/..." instead — confirm whether this is still needed.
base_dir = ".."

In [5]:
def prepare_raw_df_data(df):
    """Add the target length and the log10 of the e-value to a table of hits.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a string column ``target`` whose "-"-separated fields 1 and 2
        (0-based) hold integers (presumably the start and end coordinates of
        the target - confirm against the file format), and a numeric column
        ``evalue``.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the new columns ``tlen`` (field 2 - field 1 + 1)
        and ``log_evalue``, and with e-values of exactly 0 replaced by 1e-100.
    """
    # Split the identifiers once and reuse the result for both coordinates.
    target_fields = df["target"].str.split("-", expand=True)
    df["tlen"] = target_fields[2].astype(int) - target_fields[1].astype(int) + 1

    # log10(0) is -inf, so zero e-values are floored to 1e-100 before the log.
    df["evalue"] = df["evalue"].replace(0, 1e-100)
    df["log_evalue"] = np.log10(df["evalue"])
    return df

In [6]:
def score_ali_df(ml_model, ali_path, cols2read, feature_cols):
    """Score every hit of one alignment table with a fitted classifier and save it.

    The table at ``ali_path`` is read, passed through ``prepare_raw_df_data``,
    given an ``ml_score`` column, and written as a tab-separated file next to
    the input, with "_bitscore.tsv" in the path replaced by "_ml.tsv".

    Parameters
    ----------
    ml_model : object
        Fitted classifier exposing ``predict_proba``; column 1 of its output
        is stored as the score.
    ali_path : str
        Path of the tab-separated input table; must contain "_bitscore.tsv".
    cols2read : list of str
        Columns to load from the input table.
    feature_cols : list of str
        Columns (after preparation) handed to the model, in this order.

    Raises
    ------
    ValueError
        If ``ali_path`` does not contain "_bitscore.tsv". The output path would
        then equal the input path and the input table would be overwritten.
    """
    out_path = ali_path.replace("_bitscore.tsv", "_ml.tsv")
    # Fail before any work is done rather than clobbering the input file.
    if out_path == ali_path:
        raise ValueError(
            f"Expected a path containing '_bitscore.tsv', got {ali_path!r}; "
            "refusing to overwrite the input table."
        )

    ali_df = pd.read_csv(ali_path, sep="\t", usecols=cols2read)
    ali_df = prepare_raw_df_data(ali_df)
    ali_df["ml_score"] = ml_model.predict_proba(ali_df[feature_cols])[:, 1]
    ali_df.to_csv(out_path, sep="\t", index=False)

In [7]:
def train_and_predict(tool):
    """Train a random forest on batch 1 of a tool's hits and score batches 2-16.

    A hit in the training batch is labelled positive when field 3 (0-based,
    "-"-separated) of its ``query`` equals field 3 of its ``target``
    (presumably the Pfam family, given the ``pfam_label`` column name). The
    model is fitted on the bitscore, the log10 e-value and the target length,
    and each remaining batch is scored and written out by ``score_ali_df``.

    Parameters
    ----------
    tool : str
        Either "fs" or "reseek"; selects which set of batch files is used.

    Raises
    ------
    ValueError
        If ``tool`` is not one of the supported names.
    """
    # File-name prefix of the batch tables produced for each supported tool.
    file_prefixes = {"fs": "fs_pref", "reseek": "reseek_sens"}
    if tool not in file_prefixes:
        raise ValueError(
            f"Unknown tool {tool!r}; expected one of {sorted(file_prefixes)}"
        )

    path_template = (
        "../tmp/alis/split_pf_seq/" + file_prefixes[tool] + "_B{batch}_bitscore.tsv"
    )
    # The first batch is used for training, the other batches for scoring.
    training_data_path = path_template.format(batch=1)
    to_score_paths = [path_template.format(batch=x) for x in range(2, 17)]

    # The same raw columns are read for the training table and for scoring.
    cols2read = ["query", "target", "evalue", "bitscore_rep"]
    features = ["bitscore_rep", "log_evalue", "tlen"]
    target = "pfam_label"

    df = pd.read_csv(training_data_path, sep="\t", usecols=cols2read)
    # Label the training hits by comparing field 3 of the query and target ids.
    query_label = df["query"].str.split("-", expand=True)[3]
    target_label = df["target"].str.split("-", expand=True)[3]
    df["pfam_label"] = query_label == target_label
    df = prepare_raw_df_data(df)

    X = df[features]
    y = df[target]

    # Fixed seed for reproducible forests; class weights are balanced.
    rf_model = RandomForestClassifier(
        n_estimators=20, random_state=42, class_weight='balanced'
    )
    rf_model.fit(X, y)

    for path in to_score_paths:
        print(f"scoring {path}")
        score_ali_df(rf_model, path, cols2read, features)


In [8]:
# Train on the fs batch-1 table and write an "_ml.tsv" file for each of batches 2-16.
# NOTE(review): the saved output below starts at B3 although the function scores
# batches 2-16 — it looks stale; re-run the cell to refresh it.
train_and_predict("fs")

scoring ../tmp/alis/split_pf_seq/fs_pref_B3_bitscore.tsv
scoring ../tmp/alis/split_pf_seq/fs_pref_B4_bitscore.tsv
scoring ../tmp/alis/split_pf_seq/fs_pref_B5_bitscore.tsv
scoring ../tmp/alis/split_pf_seq/fs_pref_B6_bitscore.tsv
scoring ../tmp/alis/split_pf_seq/fs_pref_B7_bitscore.tsv
scoring ../tmp/alis/split_pf_seq/fs_pref_B8_bitscore.tsv
scoring ../tmp/alis/split_pf_seq/fs_pref_B9_bitscore.tsv
scoring ../tmp/alis/split_pf_seq/fs_pref_B10_bitscore.tsv
scoring ../tmp/alis/split_pf_seq/fs_pref_B11_bitscore.tsv
scoring ../tmp/alis/split_pf_seq/fs_pref_B12_bitscore.tsv
scoring ../tmp/alis/split_pf_seq/fs_pref_B13_bitscore.tsv
scoring ../tmp/alis/split_pf_seq/fs_pref_B14_bitscore.tsv
scoring ../tmp/alis/split_pf_seq/fs_pref_B15_bitscore.tsv
scoring ../tmp/alis/split_pf_seq/fs_pref_B16_bitscore.tsv


In [None]:
# Train on the reseek batch-1 table and write an "_ml.tsv" file for each of batches 2-16.
# NOTE(review): the saved output below starts at B3 although the function scores
# batches 2-16 — it looks stale; re-run the cell to refresh it.
train_and_predict("reseek")

scoring ../tmp/alis/split_pf_seq/reseek_sens_B3_bitscore.tsv
scoring ../tmp/alis/split_pf_seq/reseek_sens_B4_bitscore.tsv
scoring ../tmp/alis/split_pf_seq/reseek_sens_B5_bitscore.tsv
scoring ../tmp/alis/split_pf_seq/reseek_sens_B6_bitscore.tsv
scoring ../tmp/alis/split_pf_seq/reseek_sens_B7_bitscore.tsv
scoring ../tmp/alis/split_pf_seq/reseek_sens_B8_bitscore.tsv
scoring ../tmp/alis/split_pf_seq/reseek_sens_B9_bitscore.tsv
scoring ../tmp/alis/split_pf_seq/reseek_sens_B10_bitscore.tsv
scoring ../tmp/alis/split_pf_seq/reseek_sens_B11_bitscore.tsv
scoring ../tmp/alis/split_pf_seq/reseek_sens_B12_bitscore.tsv
scoring ../tmp/alis/split_pf_seq/reseek_sens_B13_bitscore.tsv
scoring ../tmp/alis/split_pf_seq/reseek_sens_B14_bitscore.tsv
scoring ../tmp/alis/split_pf_seq/reseek_sens_B15_bitscore.tsv
scoring ../tmp/alis/split_pf_seq/reseek_sens_B16_bitscore.tsv
