In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer, KNNImputer
import numpy as np

# Base directory (relative to the notebook) that get_df_and_truth prefixes to
# both the `snv-parse-<name>.txt` tables and the `<name>/<truth>.bed` files.
folder_path = "../data/"

In [2]:
def get_df_and_truth(data_list, truth_list, cols=['Chr', 'START_POS_REF', 'END_POS_REF'], split=False):
    """Load the call table and the truth BED file for each dataset.

    Parameters
    ----------
    data_list : list of str
        Dataset names. Each one is read from
        ``folder_path + "snv-parse-<name>.txt"`` (tab separated).
    truth_list : list of str
        Truth file stems, position-matched to ``data_list``. Each one is read
        from ``folder_path + "<data_name>/<truth_name>.bed"`` (tab separated,
        no header row).
    cols : sequence of str
        Column names assigned to the truth file. The default is not mutated.
    split : bool
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    (list of DataFrame, dict)
        The call tables in ``data_list`` order, each with added ``is_real``,
        ``ds_name`` and ``ds_type`` columns, and a mapping
        ``data_name -> truth DataFrame`` where every truth row has ``label == 1``.

    Raises
    ------
    ValueError
        If ``data_list`` and ``truth_list`` differ in length (``zip`` would
        otherwise silently drop the unmatched tail).
    """
    if len(data_list) != len(truth_list):
        raise ValueError(
            f"data_list has {len(data_list)} entries but truth_list has {len(truth_list)}"
        )

    df_res = []
    truth_res = {}
    for data_name, truth_name in zip(data_list, truth_list):
        data_path = folder_path + f"snv-parse-{data_name}.txt"
        df = pd.read_csv(data_path, sep='\t', low_memory=False)
        # Provenance columns derived from the dataset name.
        df['is_real'] = data_name.startswith('real')
        df['ds_name'] = data_name
        df['ds_type'] = data_name[:5]

        truth_path = folder_path + f"{data_name}/{truth_name}.bed"
        # Honour `cols` instead of a second hard-coded copy of the same names.
        truth = pd.read_csv(truth_path, sep='\t', header=None, names=list(cols))
        truth['label'] = 1

        df_res.append(df)
        truth_res[data_name] = truth

    return df_res, truth_res

In [3]:
# Datasets to load. The commented-out lists are the full synthetic + real
# selection; only the two real datasets are active here.
#data_list = ["syn1", "syn2", "syn3", "syn4", "syn5", "real1", "real2_part1"]
data_list = ["real1", "real2_part1"]
# Truth file stems, position-matched to `data_list`.
#truth_list = ["syn1_truth", "syn2_truth", "syn3_truth", "syn4_truth", "syn5_truth", "real1_truth", "real2_truth_chr1to5"]
truth_list = ["real1_truth", "real2_truth_chr1to5"]
# df_list: one call table per dataset; truth_dict: dataset name -> truth table.
df_list, truth_dict = get_df_and_truth(data_list, truth_list)

In [4]:
# Drop the vd_SAMPLE / vd_STATUS / vd_TYPE columns. errors='ignore' keeps the
# cell re-runnable: a second execution would otherwise raise KeyError because
# the columns are already gone.
df_list = [df.drop(columns=['vd_SAMPLE', 'vd_STATUS', 'vd_TYPE'], errors='ignore') for df in df_list]

# Attach the truth label to every call. The join keys are spelled out (they are
# the columns the two tables share) and each table is paired with its truth
# frame by dataset name rather than by dict iteration order. Calls without a
# matching truth row get label 0.
merge_keys = ['Chr', 'START_POS_REF', 'END_POS_REF']
merged_list = [
    df.merge(truth_dict[name], how='left', on=merge_keys).fillna({'label': 0})
    for name, df in zip(data_list, df_list)
]

In [35]:
# Shape of the merged "real2_part1" table (second entry of data_list).
# The previous index -3 only worked after two more frames had been appended
# further down; on a fresh top-to-bottom run the list has two items and -3
# raises IndexError.
merged_list[1].shape

(1658733, 58)

In [5]:
# Cut the last merged table into two halves by row position and give each
# half its own dataset name so it can be selected via `ds_name` later.
tmp = merged_list[-1]
midpoint = len(tmp) // 2
tmp_0, tmp_1 = tmp.iloc[:midpoint].copy(), tmp.iloc[midpoint:].copy()
tmp_0.loc[:, 'ds_name'] = "real2_part1_0"
tmp_1.loc[:, 'ds_name'] = "real2_part1_1"
tmp_0.shape, tmp_1.shape

((829366, 58), (829367, 58))

In [6]:
# Build a truth table for each half: keep the truth rows whose position is a
# labelled-positive call in that half.
# Positions are matched on chromosome AND start coordinate. Matching on
# START_POS_REF alone would also keep a truth row on a different chromosome
# that happens to share the same coordinate.
tmp_label = truth_dict['real2_part1']
truth_keys = tmp_label['Chr'].astype(str) + ':' + tmp_label['START_POS_REF'].astype(str)
for part_name, part_df in (('real2_part1_0', tmp_0), ('real2_part1_1', tmp_1)):
    positives = part_df.loc[part_df['label'].astype(bool)]
    positive_keys = positives['Chr'].astype(str) + ':' + positives['START_POS_REF'].astype(str)
    truth_dict[part_name] = tmp_label.loc[truth_keys.isin(positive_keys)]

In [7]:
# Rebuild the list instead of appending in place, so re-running this cell
# cannot add the two halves a second time (which would duplicate their rows
# in the concatenated frame built later).
merged_list = merged_list[:len(data_list)] + [tmp_0, tmp_1]

In [76]:
def handle_inf_values(df, inf_list=["vd_SOR"]):
    """Return a copy of ``df`` without the columns listed in ``inf_list``.

    The columns are removed outright; an earlier variant that overwrote
    ``inf`` entries with -1 is no longer used. ``df`` itself is not modified.
    """
    return df.drop(columns=inf_list)

def handle_filters(df, columns):
    """Expand ';'-separated tag columns into per-tag count columns.

    For every name in ``columns`` the string values are split on ';', each
    tag is dummy-encoded, and the dummies are summed back per original row.
    The resulting columns are named ``"<column>_<tag>"``.

    Returns a tuple ``(frame, new_names)``: ``df`` with the new columns
    concatenated on the right, and the list of the new column names.
    """
    frames = [df]
    new_names = []

    for col in columns:
        tokens = df[col].str.split(';').explode()
        indicators = pd.get_dummies(tokens).groupby(level=0).sum()
        indicators.columns = [f'{col}_{tag}' for tag in indicators.columns]
        frames.append(indicators)
        new_names.extend(indicators.columns)

    return pd.concat(frames, axis=1), new_names
    
def handle_nan_values(d_list, sample_list, imputer):
    """Concatenate the datasets, expand filter tags and impute missing values.

    Parameters
    ----------
    d_list : list of DataFrame
        Tables to stack row-wise; the index is reset afterwards.
    sample_list : list of str
        ``ds_name`` values the imputer is fitted on. All other rows are only
        transformed with the already-fitted imputer.
    imputer : object
        Anything exposing ``fit_transform`` and ``transform``.

    Returns
    -------
    (DataFrame, list of str, list of str)
        The processed frame, the names of the imputed columns, and the names
        of the added ``*_nan_ind`` indicator columns followed by the expanded
        filter-tag columns.
    """
    df = pd.concat(d_list, axis=0).reset_index(drop=True)

    filter_list = ["FILTER_Mutect2_data", "FILTER_Freebayes_data","FILTER_Vardict_data","FILTER_Varscan_data"]
    # Only the last three filter columns are expanded; the first is skipped,
    # exactly as before.
    df, filter_cols = handle_filters(df, filter_list[1:])
    df = handle_inf_values(df)

    # Keep the frame's own column order. Building this list through a set made
    # the order (and therefore the feature order fed to the model) vary between
    # kernel sessions.
    has_nan = df.isna().any()
    nan_cols = [c for c in df.columns[has_nan] if c not in filter_list]
    new_nan_cols = [c + "_nan_ind" for c in nan_cols]
    # Record where values were missing before they are filled in.
    df[new_nan_cols] = df.loc[:, nan_cols].isna()

    # Fit on the training samples only, then reuse that fit for the rest.
    in_sample = df['ds_name'].isin(sample_list)
    df.loc[in_sample, nan_cols] = imputer.fit_transform(df.loc[in_sample, nan_cols])
    if not in_sample.all():
        df.loc[~in_sample, nan_cols] = imputer.transform(df.loc[~in_sample, nan_cols])

    return df, nan_cols, new_nan_cols + filter_cols
    
def create_more_cat_values(df, columns, rare_threshold=200):
    """Split '/'-separated columns into positional categorical columns.

    For each name in ``columns`` the string is split on '/', the last piece is
    discarded, and the remaining pieces are stored in new columns
    ``"<column>_0"``, ``"<column>_1"``, ... The number of new columns is taken
    from the first row. Values occurring at most ``rare_threshold`` times in a
    new column are replaced by the literal ``'rare'``.

    ``df`` is modified in place and also returned.

    Parameters
    ----------
    df : DataFrame
    columns : list of str
    rare_threshold : int
        Maximum count for a value to be collapsed into ``'rare'``.

    Returns
    -------
    (DataFrame, list of str)
        ``df`` and the names of all columns that were added.
    """
    ret_cols = []
    for c in columns:
        pieces = df[c].str.split('/').apply(lambda x: x[:-1])
        new_cols = [c + f"_{i}" for i in range(len(pieces.iloc[0]))]
        # Reuse df's index: a frame built on a fresh RangeIndex would be aligned
        # by label on assignment and yield NaN for any non-default index.
        df[new_cols] = pd.DataFrame(np.vstack(pieces.values), index=df.index)
        ret_cols += new_cols

        for col in new_cols:
            counts = df[col].value_counts()
            rare_values = counts[counts <= rare_threshold].index
            df.loc[df[col].isin(rare_values), col] = 'rare'

    return df, ret_cols
        
def handle_cat_values(df, cat_list):
    """One-hot encode the columns in ``cat_list`` (first level dropped).

    Returns
    -------
    (DataFrame, list of str)
        The encoded frame and the names of the columns the encoding added,
        in the order they appear in the encoded frame.
    """
    cat_df = pd.get_dummies(df, columns=cat_list, drop_first=True)
    # Preserve column order. A set difference made the order of the returned
    # feature names change between kernel sessions.
    original_cols = set(df.columns)
    cat_cols = [c for c in cat_df.columns if c not in original_cols]
    cat_df['Chr'] = df['Chr']
    return cat_df, cat_cols

def get_X_y(df, sample_list, feature_list):
    """Return the float feature matrix and float label vector for some datasets.

    Only rows whose ``ds_name`` is in ``sample_list`` are kept; the features
    are the ``feature_list`` columns and the target is the ``label`` column.
    """
    selected = df.loc[df['ds_name'].isin(sample_list)]
    features = selected[feature_list].astype(float)
    target = selected['label'].astype(float)
    return features, target

In [77]:
# Datasets the imputer is fitted on and the model is trained on.
# (The commented-out list is the full synthetic + real1 selection.)
#train_sample_list = ["syn1", "syn2", "syn3", "syn4", "syn5", "real1"]
train_sample_list = ['real1']
# Columns that are always part of the feature set.
col_list = ["FILTER_Mutect2", "FILTER_Freebayes", "FILTER_Vardict", "FILTER_Varscan", "is_real"]

# Missing values: concatenate all tables, expand filter tags, then impute.
# The imputer is fitted on `train_sample_list` rows only (see handle_nan_values).
imputer=SimpleImputer(strategy='median')
#imputer=KNNImputer(n_neighbors=10)
# NOTE: `new_nan_cols` here holds the NaN-indicator columns *and* the expanded
# filter-tag columns, because handle_nan_values returns them as one list.
df, nan_cols, new_nan_cols = handle_nan_values(merged_list, train_sample_list, imputer)

# Categorical data: split the '/'-separated columns into positional columns...
add_cols = ["REF_MFVdVs", "ALT_MFVdVs"]
df, ret_cols = create_more_cat_values(df, add_cols)

# ...then one-hot encode them together with REF, ALT and ds_type.
cat_list = ["REF", "ALT", 'ds_type'] + ret_cols
df, cat_cols = handle_cat_values(df, cat_list)

# Assemble the final feature list and extract the training matrix / labels.
feature_list = col_list + nan_cols + new_nan_cols + cat_cols
X_all, y_all = get_X_y(df, train_sample_list, feature_list)

Index(['Chr', 'START_POS_REF', 'END_POS_REF', 'REF', 'ALT', 'REF_MFVdVs',
       'ALT_MFVdVs', 'Sample_Name', 'FILTER_Mutect2', 'FILTER_Freebayes',
       'FILTER_Vardict', 'FILTER_Varscan', 'FILTER_Mutect2_data',
       'FILTER_Freebayes_data', 'FILTER_Vardict_data', 'FILTER_Varscan_data',
       'm2_ClippingRankSum', 'm2_DP', 'm2_ECNT', 'm2_FS', 'm2_MQ', 'm2_MQ0',
       'm2_MQRankSum', 'm2_ReadPosRankSum', 'f_AN', 'f_DP', 'f_DPB', 'f_EPPR',
       'f_GTI', 'f_MQMR', 'f_NS', 'f_NUMALT', 'f_ODDS', 'f_PAIREDR', 'f_PQR',
       'f_PRO', 'f_QR', 'f_RO', 'f_RPPR', 'f_SRF', 'f_SRP', 'f_SRR', 'vs_DP',
       'vs_GPV', 'vs_SPV', 'vs_SS', 'vs_SSC', 'vd_DP', 'vd_MSI', 'vd_MSILEN',
       'vd_SHIFT3', 'vd_SOR', 'vd_SSF', 'vd_VD', 'is_real', 'ds_name',
       'ds_type', 'label', 'FILTER_Freebayes_data_FBQualDepth',
       'FILTER_Freebayes_data_PASS', 'FILTER_Freebayes_data_REJECT',
       'FILTER_Vardict_data_Bias', 'FILTER_Vardict_data_InDelLikely',
       'FILTER_Vardict_data_LowAlleleDepth',

In [78]:
from sklearn.ensemble import IsolationForest
from pyod.models.ecod import ECOD

contamination = 0.0000001
# Outlier removal is switched off: the detector's verdict used to be
# overwritten with all ones right after fitting, so the fit was pure wasted
# compute. The switch keeps that outcome (every row is kept) without paying
# for the fit; set it to True to actually drop rows flagged as outliers (-1).
REMOVE_OUTLIERS = False

clf = IsolationForest(n_estimators=1, contamination=contamination, random_state=42, n_jobs=-1, verbose=2)
#clf = ECOD(contamination=contamination)

if REMOVE_OUTLIERS:
    outlier_res = clf.fit_predict(X_all)
else:
    outlier_res = np.ones(len(X_all), dtype=int)
X_train = X_all.loc[outlier_res == 1]
y_train = y_all.loc[outlier_res == 1]

Building estimator 1 of 1 for this parallel run (total 1)...


In [79]:
# Compare row counts before/after the outlier step (identical in the recorded run).
X_train.shape, X_all.shape

((4661861, 195), (4661861, 195))

In [80]:
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, ExtraTreesClassifier, StackingClassifier, VotingClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier

# Candidate classifiers. Only `rf` is actually used below (see `model`); the
# others are constructed but not fitted in this notebook as shown.
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42, verbose=2, class_weight='balanced')
gb = HistGradientBoostingClassifier()
lgbm = LGBMClassifier(n_estimators=1000, learning_rate=0.05,verbose=2, n_jobs=-1, random_state=42)
lgr = LogisticRegression(C=1000)
extra = ExtraTreesClassifier(n_estimators=1000, n_jobs=-1, random_state=42, verbose=2)
catboost = CatBoostClassifier(num_trees=1000)

# Active model. The trailing comment is a soft-voting ensemble alternative.
model = rf#VotingClassifier(estimators=[("rf", rf), ("extra", extra), ("catboost", catboost)], voting="soft")

In [81]:
from sklearn.feature_selection import SelectFromModel

# Feature selection driven by a single-tree random forest.
# NOTE(review): with threshold=0 the selector kept all 195 features in the
# recorded run (see the shapes printed in the next cell), so this step is
# effectively a pass-through as configured -- confirm that is intended.
sel = SelectFromModel(RandomForestClassifier(n_estimators = 1, random_state=42, n_jobs=-1), threshold=0)
sel.fit(X_train, y_train)

In [82]:
# Restrict the training matrix to the columns the fitted selector supports,
# then show the shapes before and after selection.
selected_features = X_train.columns[sel.get_support()]
X_selected = X_train.loc[:, selected_features]
X_train.shape, X_selected.shape

((4661861, 195), (4661861, 195))

In [83]:
# Train the active model (`model`, currently the random forest) on the selected features.
model.fit(X_selected, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 128 concurrent workers.


building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
b

[Parallel(n_jobs=-1)]: Done  49 out of 100 | elapsed:   21.8s remaining:   22.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   28.4s finished


In [84]:
def calc_F1(pred, truth):
    """Score predicted positions against truth positions.

    A position is identified by its ``Chr`` and ``START_POS_REF`` values.
    TP/FP are counted over the rows of ``pred``, FN over the rows of ``truth``.

    Parameters
    ----------
    pred, truth : DataFrame
        Must contain ``Chr`` and ``START_POS_REF`` columns.

    Returns
    -------
    DataFrame
        One row (index 0) with columns TP, FP, FN, Precision, Recall, F1.
        A ratio whose denominator is zero is reported as 0.0.
    """
    # Join chromosome and coordinate with a separator: plain concatenation
    # made e.g. chr "1" + pos "23" indistinguishable from chr "12" + pos "3".
    pred_keys = pred['Chr'].astype(str) + ':' + pred['START_POS_REF'].astype(str)
    truth_keys = truth['Chr'].astype(str) + ':' + truth['START_POS_REF'].astype(str)

    # Vectorised .sum() instead of the builtin sum(), which iterates in Python.
    pred_is_true = pred_keys.isin(truth_keys)
    tp = int(pred_is_true.sum())
    fp = int((~pred_is_true).sum())
    fn = int((~truth_keys.isin(pred_keys)).sum())

    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) else 0.0

    return pd.DataFrame(
        [[tp, fp, fn, precision, recall, f1]],
        columns=['TP', 'FP', 'FN', 'Precision', 'Recall', 'F1'],
    )

In [85]:
def run_test(ds_name, cols=['Chr', 'START_POS_REF', 'END_POS_REF']):
    """Predict on one dataset with the fitted model and score it.

    Uses the notebook-level ``df``, ``feature_list``, ``model``, ``sel``,
    ``X_train`` and ``truth_dict``. Rows of ``df`` with ``ds_name`` equal to
    the argument are classified; the ``cols`` columns of the rows predicted
    positive are compared with ``truth_dict[ds_name]`` via ``calc_F1``.
    """
    X_test, _ = get_X_y(df, [ds_name], feature_list)
    selected_features = X_train.columns[sel.get_support()]
    is_positive = model.predict(X_test.loc[:, selected_features]).astype(bool)
    dataset_rows = df[df["ds_name"] == ds_name]
    pred = dataset_rows.loc[is_positive, cols]
    return calc_F1(pred, truth_dict[ds_name])

In [86]:
# Held-out evaluation on "real2_part1_1", the second half of real2_part1
# (rows split off by position in an earlier cell).
run_test("real2_part1_1")

[Parallel(n_jobs=100)]: Using backend ThreadingBackend with 100 concurrent workers.
[Parallel(n_jobs=100)]: Done   3 out of 100 | elapsed:    0.1s remaining:    3.8s
[Parallel(n_jobs=100)]: Done  54 out of 100 | elapsed:    0.3s remaining:    0.3s
[Parallel(n_jobs=100)]: Done 100 out of 100 | elapsed:    0.4s finished


Unnamed: 0,TP,FP,FN,Precision,Recall,F1
0,205,1,53,0.995146,0.794574,0.883621


In [87]:
# Same evaluation on "real2_part1_0", the first half of real2_part1.
run_test("real2_part1_0")

[Parallel(n_jobs=100)]: Using backend ThreadingBackend with 100 concurrent workers.
[Parallel(n_jobs=100)]: Done   3 out of 100 | elapsed:    0.1s remaining:    1.7s
[Parallel(n_jobs=100)]: Done  54 out of 100 | elapsed:    0.4s remaining:    0.3s
[Parallel(n_jobs=100)]: Done 100 out of 100 | elapsed:    0.5s finished


Unnamed: 0,TP,FP,FN,Precision,Recall,F1
0,168,1,36,0.994083,0.823529,0.900804


In [88]:
# Evaluation on the complete "real2_part1" table, scored against its full truth file.
run_test("real2_part1")

[Parallel(n_jobs=100)]: Using backend ThreadingBackend with 100 concurrent workers.
[Parallel(n_jobs=100)]: Done   3 out of 100 | elapsed:    0.3s remaining:    8.4s
[Parallel(n_jobs=100)]: Done  54 out of 100 | elapsed:    0.6s remaining:    0.5s
[Parallel(n_jobs=100)]: Done 100 out of 100 | elapsed:    0.8s finished


Unnamed: 0,TP,FP,FN,Precision,Recall,F1
0,373,2,118,0.994667,0.759674,0.861432


In [33]:
# Training data: "real1" is the only entry in train_sample_list, so this score
# is a fit check on data the model has seen, not a held-out estimate.
run_test("real1")

[Parallel(n_jobs=100)]: Using backend ThreadingBackend with 100 concurrent workers.
[Parallel(n_jobs=100)]: Done   3 out of 100 | elapsed:    1.4s remaining:   45.7s
[Parallel(n_jobs=100)]: Done  54 out of 100 | elapsed:    1.9s remaining:    1.6s
[Parallel(n_jobs=100)]: Done 100 out of 100 | elapsed:    2.5s finished


Unnamed: 0,TP,FP,FN,Precision,Recall,F1
0,1308,1,11,0.999236,0.99166,0.995434


In [None]:
# Predicted-positive positions for the whole "real2_part1" table, kept in
# `pred` so they can be written out afterwards.
ds_name = "real2_part1"
X_test, y_test = get_X_y(df, [ds_name], feature_list)
selected_features = X_train.columns[sel.get_support()]
y_pred = model.predict(X_test.loc[:, selected_features])
pred = df.loc[df["ds_name"] == ds_name].loc[y_pred.astype(bool), ['Chr', 'START_POS_REF', 'END_POS_REF']]

In [None]:
# Write the predicted positions as a tab-separated file.
# NOTE(review): `pred` was built for ds_name == "real2_part1" in the previous
# cell, but the file name says "real1" -- confirm which dataset this file is
# meant to hold.
# NOTE(review): a header row is written, whereas the truth .bed files are read
# with header=None -- confirm the consumer of this file expects a header.
pred.to_csv('my-real1-predictions.bed', index=False, sep='\t', header=True)