In [1]:
# from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.experimental import enable_hist_gradient_boosting  # noqa: F401
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from joblib import dump
import pandas as pd
import json, os, re, sys
from joblib import load
import pandas as pd
import numpy as np



In [None]:
# Load in the model

modelPath = "./model1.joblib"
pipe = load(modelPath)  # this is the full sklearn Pipeline you fit

In [None]:
# Open additional data

# Cleaned data
df = pd.read_parquet("./ambiguous_data.parquet") 

In [4]:
# Split off predictors

X = df.copy()
X = df.drop(["chrom", "pos", "ref", "alt", "label", "clnsig"], axis=1)

In [5]:
# Split off target data

y = df.label

In [6]:
# Predict

proba = pipe.predict_proba(X)[:, 1]   # P(y=1 | features)
pred  = (proba >= 0.5).astype(int)    # or pick a tuned threshold

In [7]:
# Minimal and maximial output dataset

out = df[["chrom","pos","ref","alt","clnsig"]]
outmax = df.copy()

In [None]:
# Add results to output dataframe

out.loc[out.index, "proba"] = proba
out.loc[out.index, "pred"] = pred

outmax.loc[out.index, "proba"] = proba
outmax.loc[out.index, "pred"] = pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  out.loc[out.index, "proba"] = proba
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  out.loc[out.index, "pred"] = pred


In [None]:
# Convert 0/1 back to string - "benign"/"pathogenic"

def toPred(s):
    '''Converts 0/1 to likely_benign/likely_pathogenic'''
    if s == 0:
        return "predict_benign"
    elif s == 1:
        return "predict_pathogenic"
    else:
        return "error_not_0or1"

In [10]:
# Convert 0/1 result to written prediction

out["result"] = out.pred.map(toPred)
outmax["result"] = out.pred.map(toPred)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  out["result"] = out.pred.map(toPred)


In [11]:
# Save predictions

out.to_csv("predictionsMin.csv")
outmax.to_csv("predictionsAllCols.csv")

In [12]:
# write first few rows to csv

out.iloc[1:20,].to_csv("predictionsHead.csv")
outmax.iloc[1:20,].to_csv("predictionsAllColsHead.csv")

In [None]:
# Number of predicted benign variants

len(out.loc[out["result"] == "predict_benign"])

954662

In [None]:
# Number of predicted pathogenic variants

len(out.loc[(out["result"] == "predict_pathogenic")])

995382

In [None]:
# Filter low confidence predictions

pp = (outmax.loc[(out["result"] == "predict_pathogenic")])
pp = pp.loc[(pp["proba"] > 0.75) | (pp["proba"] < 0.25),:]