In [1]:
import pandas as pd
import os
from matplotlib import pyplot as plt
from Bio import SeqIO
from sklearn.metrics import accuracy_score, roc_auc_score, r2_score, mean_squared_error

In [2]:
# Root folder whose immediate sub-directories are processed one by one below
# (each sub-directory name is used as the RBP name in the result tables).
DATA_PATH = "/home/pbarbosa/git_repos/mutsplice/notebooks/4_all_gappedKmers"

# os.listdir returns entries in arbitrary order; sort the sub-directories so the
# row order of the result tables is reproducible across machines and runs.
DIRS = sorted(
    entry_path
    for entry_path in (os.path.join(DATA_PATH, entry) for entry in os.listdir(DATA_PATH))
    if os.path.isdir(entry_path)
)

#### Classification (exon group)

In [3]:
# Cross-validated accuracy of the linear and RBF gkm-SVM classifiers, one row per
# (RBP directory, model). Each cvpred file is read as a headerless, tab-separated
# table and given the column names below.
cols = ["seq_id", "pred", "label", "cvset"]

# (cvpred file name inside "<rbp_dir>/clf", model label stored in the results table)
clf_models = [
    ("out_gkm.cvpred.txt", "LineargkmSVMClassifier"),
    ("out_gkmrbf.cvpred.txt", "NonLineargkmSVMClassifier"),
]

out = []
for rbp_dir in DIRS:
    rbp_name = os.path.basename(rbp_dir)
    _rbp_dir = os.path.join(rbp_dir, "clf")

    for cv_file, model_name in clf_models:
        cv_pred = pd.read_csv(os.path.join(_rbp_dir, cv_file), sep="\t", header=None)
        cv_pred.columns = cols

        # Threshold the prediction at 0: positive -> 1, anything else -> -1.
        # NOTE(review): assumes "label" is encoded as -1/1 as well - confirm.
        cv_pred["pred_binary"] = cv_pred["pred"].apply(lambda value: 1 if value > 0 else -1)
        score = accuracy_score(cv_pred["label"], cv_pred["pred_binary"])

        # pval, feat_import and features are not computed in this cell; stored as None.
        out.append([rbp_name, "Gapped_Kmers", model_name, score, None, None, None])

clf_df = pd.DataFrame(
    out,
    columns=["rbp_name", "dataset_type", "model", "score", "pval", "feat_import", "features"],
)

#### Regression (SpliceAI score)

In [4]:
# Cross-validated R^2 of the linear and RBF gkm-SVM regressors, one row per
# (RBP directory, model). Each cvpred file is read as a headerless, tab-separated
# table and given the column names below.
cols = ["seq_id", "pred", "label", "cvset"]

# (cvpred file name inside "<rbp_dir>/regr", model label stored in the results table)
regr_models = [
    ("out_gkm.cvpred.txt", "LineargkmSVMRegressor"),
    ("out_gkmrbf.cvpred.txt", "NonLineargkmSVMRegressor"),
]

out = []
for rbp_dir in DIRS:
    rbp_name = os.path.basename(rbp_dir)
    _rbp_dir = os.path.join(rbp_dir, "regr")

    for cv_file, model_name in regr_models:
        cv_pred = pd.read_csv(os.path.join(_rbp_dir, cv_file), sep="\t", header=None)
        cv_pred.columns = cols

        # Score is R^2 between the "label" and "pred" columns of the cvpred table.
        score = r2_score(cv_pred["label"], cv_pred["pred"])

        # pval, feat_import and features are not computed in this cell; stored as None.
        out.append([rbp_name, "Gapped_Kmers", model_name, score, None, None, None])

regr_df = pd.DataFrame(
    out,
    columns=["rbp_name", "dataset_type", "model", "score", "pval", "feat_import", "features"],
)

In [5]:
# Stack classification and regression results into one table. ignore_index=True
# gives a fresh 0..n-1 index instead of repeating each input frame's row labels.
df = pd.concat([clf_df, regr_df], ignore_index=True)

In [7]:
# Write the combined results table. Create the output folder first so the write
# does not fail when the folder is missing (e.g. on a fresh checkout).
results_csv = "5_ml_results/5_results_lsgkm_all_rbps.csv"
os.makedirs(os.path.dirname(results_csv), exist_ok=True)
df.to_csv(results_csv, index=False)