In [2]:
import argparse, json, os, re, sys
import numpy as np
import pandas as pd

In [3]:
# Input data
d = "C:/test/data/9_clean_in_py/trainingData.clean.tsv.gz"
df = pd.read_csv(d, sep="\t", na_values=".")

  df = pd.read_csv(d, sep="\t", na_values=".")


In [4]:
# Step 1: Clean transition/transversions

def ti_flag(v: str):
    '''
    Transition/transversion > 1/0
    Data: convert 'ti'/'tv'/'na' to numeric; ti=1, tv=0, na -> np.nan
    '''
    if v == "ti": return 1.0
    elif v == "tv": return 0.0
    else:
        return -1
    return np.nan

In [5]:
#1) Convert transition/transversion to 1/0 or -1 for neither

df["is_transition_num"] = df["is_transition"].map(ti_flag)

In [6]:
#1) Check that it conversion worked and drop original column

# Note: column "is_transition" (string) is dropped below

# Look at the new data to confirm all numeric (0 or 1)
df["is_transition_num"]

0         -1.0
1          1.0
2          1.0
3          0.0
4          1.0
          ... 
3674808    0.0
3674809    1.0
3674810    1.0
3674811    1.0
3674812    0.0
Name: is_transition_num, Length: 3674813, dtype: float64

In [7]:
#1) Both pos and neg present?

df["is_transition_num"].unique()

array([-1.,  1.,  0.])

In [9]:
# So far:
# 1) transition/transversion to 0/1/-1


# Step 2)
# Cleanup mpc (0 - ~3.5 range, only for missense, all others have NaN.
#    Problem: rare variants AND non-missense entries are BOTH NaN.
#    Need to distinguish between them
#    Note: XGBoost, LightGBM, CatBoost type models can all handle Na values. It's MLR, etc. that can't. 
#    Solution: Create sentinel columns:
#            'mpc'         -- original column with missense tolerance value or NaN
#            'is_missense' -- marks if missense mutation with 1
#            'mpc_filled'  -- marks if missense mutations with -1
#            'mpc_missing' -- marks all missing value with 1. 
#                             with 'is_missense' col informs tells if an mpc *NaN* == a rare variant 

In [8]:
# 2a) Marks missense mutations (0 == missense, 1 == non-missense)

df["is_missense"] = (df["mutation_type"]=="missense").astype(int)

In [17]:
# 2) Check on missense constraint analysis columns:
#     Sentinel columns are designed to identify when NaN value for mpc (missense tolerance) == NaN for an informative rare variant
#     Distinguishes betweeen NaN when the mutaiton is NOT a missense mutation == NaN for not-informative

# Example:

# Row Name (int)		 1685				1						 12
# mpc					 0.626245			NaN						 NaN
# is_missense			 0					1						 0
# mpc_is_missing		 0					1						 1
# mpc_filled			-1					NaN						-1
# Result: 				Mpc Info=valuable	No mpc, not useful		No mpc, USEFUL

In [9]:
# 2) Converts mpc to numeric

df["mpc"] = pd.to_numeric(df["mpc"], errors="coerce")

In [10]:
# 2) Sentinel for not-applicable cases; keep true NaNs for rare missing missense
#     This marks -1 if:
#             - "IS missense"
#             - mpc value == NaN
#             --> indicates rare variant

df["mpc_filled"] = np.where(df["is_missense"]==1, df["mpc"], -1.0)

In [11]:
# 2) 2Sentinel for not-applicable cases; keep true NaNs for rare missing missense

df["mpc_is_missing"] = df["mpc"].isna().astype(int)  # only meaningful when is_missense==1

In [12]:
# 2) See the change - mpc is missing info (a true NaN value) for rare variants

df["mpc_is_missing"]

0          1
1          1
2          1
3          1
4          1
          ..
3674808    1
3674809    1
3674810    1
3674811    1
3674812    1
Name: mpc_is_missing, Length: 3674813, dtype: int32

In [14]:
# 2) Make sentinel for mpc (non-missense)

df["is_missense"] = (df["mutation_type"]=="missense").astype(int)

In [15]:
# 2) Missense tolerance scores or NaN (indicating rare variant or non-missense.)
#     NaN is good - they are disginguished by sentinel columns

df.mpc.unique()

array([      nan, 0.568203 , 0.698117 , ..., 0.0555659, 1.84537  ,
       0.286994 ])

In [16]:
df.is_missense.head()

0    0
1    1
2    1
3    1
4    1
Name: is_missense, dtype: int32

In [17]:
df.mpc_is_missing.head()

0    1
1    1
2    1
3    1
4    1
Name: mpc_is_missing, dtype: int32

In [18]:
df.mpc_filled.head()

0   -1.0
1    NaN
2    NaN
3    NaN
4    NaN
Name: mpc_filled, dtype: float64

In [19]:
# So far:
# 1) transition/transversion to 0/1/-1
# 2) mpc > added three columns to distinguish rare variants from no data

# Step 3) All number columns to numeric (. > NaN)

num_cols = [
    "af","nhomalt","revel","cadd_phred","cadd_raw","mpc",
    "phyloP100","phastCons100","gerp_rs",
    "loeuf","pli","mis_z","syn_z",
    "ccr_pct","ref_len","alt_len","indel_len",
    "is_indel","is_snv","is_transition_num",
    "spliceai_tx_count",
    "spliceai_ds_ag_max","spliceai_ds_al_max","spliceai_ds_dg_max","spliceai_ds_dl_max",
    "mpc_filled","mpc_is_missing","is_frameshift","ccr_top1"
]

# coerce to numeric
for c in num_cols:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce")

In [20]:
# So far:
# 1) transition/transversion to 0/1/-1
# 2) mpc > added three columns to distinguish rare variants from no data
# 3) Number columns to numeric

# Step 4a) Clip columns (should be value of 0-1, just be sure)

clip_cols = ["af","revel","phastCons100", "spliceai_ds_ag_max","spliceai_ds_al_max","spliceai_ds_dg_max","spliceai_ds_dl_max"]
for c in clip_cols:
    if c in df.columns:
        df[c] = df[c].clip(lower=0, upper=1)
        print(f"Column {c}: lower {df[c].min()}, upper {df[c].max()}")

Column af: lower 0.0, upper 1.0
Column revel: lower 0.0, upper 1.0
Column phastCons100: lower 0.0, upper 1.0
Column spliceai_ds_ag_max: lower 0.0, upper 1.0
Column spliceai_ds_al_max: lower 0.0, upper 1.0
Column spliceai_ds_dg_max: lower 0.0, upper 1.0
Column spliceai_ds_dl_max: lower 0.0, upper 1.0


In [21]:
# 4b) Clip other columns wiht specific range requirements

df["ccr_pct"] = df["ccr_pct"].clip(lower=0, upper=100)
df["cadd_phred"] = df["cadd_phred"].clip(lower=0, upper=99)
df["gerp_rs"] = df["gerp_rs"].clip(lower=-10, upper=10)

In [22]:
# So far:
# 1) transition/transversion to 0/1/-1
# 2) mpc > added three columns to distinguish rare variants from no data
# 3) Number columns to numeric
# 4) Clipped columns to ensure proper range

# Step 5a: Mutation type to one hot

mt_dum = pd.get_dummies(df["mutation_type"].fillna("noncoding"), prefix="mt", dtype=int)
mt_dum

Unnamed: 0,mt_missense,mt_noncoding,mt_nonsense,mt_silent
0,0,1,0,0
1,1,0,0,0
2,1,0,0,0
3,1,0,0,0
4,1,0,0,0
...,...,...,...,...
3674808,1,0,0,0
3674809,0,0,1,0
3674810,0,0,0,1
3674811,0,0,0,1


In [23]:
# 5b Add the one-hot mutation types to the df

df = pd.concat([df, mt_dum], axis=1)

In [24]:
# 5c drop mutation column now that it's in 1 hot form

df = df.drop(["mutation_type"], axis=1)

In [25]:
# So far:
# 1) Transition/transversion to 0/1/-1
# 2) mpc > added three columns to distinguish rare variants from no data. Kept mpc.
# 3) Number columns to numeric
# 4) Clipped columns to ensure proper range
# 5) Mutation type to 1-hot, dropped original

# Step 6: Drop no/low-utility columns
df = df.drop([], axis=1)
drop_cols = [
        "gene","gene_id",
        "spliceai_allele_first","spliceai_symbol_first",
        "spliceai_ds_ag_first","spliceai_ds_al_first","spliceai_ds_dg_first","spliceai_ds_dl_first",
        "spliceai_dp_ag_first","spliceai_dp_al_first","spliceai_dp_dg_first","spliceai_dp_dl_first",
        "spliceai_raw", "bcq_raw", "is_transition"
    ]
df = df.drop(columns=[c for c in drop_cols if c in df.columns], errors="ignore")

In [26]:
df.head()

Unnamed: 0,chrom,pos,ref,alt,af,nhomalt,revel,cadd_phred,cadd_raw,mpc,...,spliceai_ds_dl_max,ccr_top1,is_frameshift,is_missense,mpc_filled,mpc_is_missing,mt_missense,mt_noncoding,mt_nonsense,mt_silent
0,chr1,66926,AG,A,,,,,,,...,,,0,0,-1.0,1,0,1,0,0
1,chr1,69134,A,G,0.000318,0.0,0.075,16.91,1.68318,,...,0.0,,0,1,,1,1,0,0,0
2,chr1,69308,A,G,,,0.147,18.68,1.92409,,...,0.0,,0,1,,1,1,0,0,0
3,chr1,69314,T,G,2e-06,0.0,0.208,21.8,2.33789,,...,0.0,,0,1,,1,1,0,0,0
4,chr1,69404,T,C,4.3e-05,0.0,0.05,18.69,1.92614,,...,0.0,,0,1,,1,1,0,0,0


In [27]:
# Check on remaining columns

print(f"Columns remaining: {len(df.columns)}. \n\n Names = {df.columns}, \n\n Number of Rows: {len(df)}")

Columns remaining: 39. 

 Names = Index(['chrom', 'pos', 'ref', 'alt', 'af', 'nhomalt', 'revel', 'cadd_phred',
       'cadd_raw', 'mpc', 'phyloP100', 'phastCons100', 'gerp_rs', 'loeuf',
       'pli', 'mis_z', 'syn_z', 'clnsig', 'ccr_pct', 'ref_len', 'alt_len',
       'is_indel', 'is_snv', 'indel_len', 'is_transition_num',
       'spliceai_tx_count', 'spliceai_ds_ag_max', 'spliceai_ds_al_max',
       'spliceai_ds_dg_max', 'spliceai_ds_dl_max', 'ccr_top1', 'is_frameshift',
       'is_missense', 'mpc_filled', 'mpc_is_missing', 'mt_missense',
       'mt_noncoding', 'mt_nonsense', 'mt_silent'],
      dtype='object'), 

 Number of Rows: 3674813


In [28]:
# So far:
# 1) Transition/transversion to 0/1/-1
# 2) mpc > added three columns to distinguish rare variants from no data. Kept mpc.
# 3) Number columns to numeric
# 4) Clipped columns to ensure proper range
# 5) Mutation type to 1-hot, dropped original
# 6) Drop no/low-utility columns

# Step 7: Deduplicate based on Chrom, pos, ref, alt; keep the first
df = df.sort_values(["chrom","pos"]).drop_duplicates(subset=["chrom","pos","ref","alt"], keep="first")

In [29]:
# So far:
# 1) Transition/transversion to 0/1/-1
# 2) mpc > added three columns to distinguish rare variants from no data. Kept mpc.
# 3) Number columns to numeric
# 4) Clipped columns to ensure proper range
# 5) Mutation type to 1-hot, dropped original
# 6) Drop no/low-utility columns
# 7) Deduplicate based on Chrom, pos, ref, alt

# Step 8) Save dataset

In [30]:
# 8) Save

import json
import pandas as pd
import numpy as np

# Compact dtypes
for c in df.select_dtypes(include=["float64"]).columns:
    df[c] = df[c].astype("float32")
for c in df.select_dtypes(include=["int64","int32"]).columns:
    df[c] = pd.to_numeric(df[c], downcast="integer")

In [31]:
# So far:
# 1) Transition/transversion to 0/1/-1
# 2) mpc > added three columns to distinguish rare variants from no data. Kept mpc.
# 3) Number columns to numeric
# 4) Clipped columns to ensure proper range
# 5) Mutation type to 1-hot, dropped original
# 6) Drop no/low-utility columns
# 7) Deduplicate based on Chrom, pos, ref, alt
# 8) Save dataset

# Step 9a) Map target column, clinical significance ("clnsig") to 0/1 - column "label"

def map_label(s):
    if not isinstance(s, str): return np.nan
    t = s.lower().replace(" ", "_")
    if "pathogenic" in t and "benign" not in t: return 1
    if "benign" in t and "pathogenic" not in t: return 0
    return np.nan

df["label"] = df["clnsig"].apply(map_label)

In [32]:
# 9b) Save ambiguous data rows. Use later to make predictions.

amb = df["label"].isna()

In [33]:
# Check on the ambiguous calls

df.loc[amb,["clnsig", "label"]]

Unnamed: 0,clnsig,label
0,Uncertain_significance,
2,Uncertain_significance,
3,Uncertain_significance,
4,Uncertain_significance,
5,Uncertain_significance,
...,...,...
3674797,Uncertain_significance,
3674805,Uncertain_significance,
3674808,Uncertain_significance,
3674809,Uncertain_significance,


In [34]:
# 9b) Make ambiguous-only dataset, then save
amb_db = df.loc[amb]

# 1) Readable dataset (ambiguous)
amb_db.to_csv("ambiguous_data.tsv.gz", sep="\t", index=False,
          na_rep=".", float_format="%.6g", compression="gzip")

# 1) Machine input (ambiguous)
amb_db.to_parquet("ambiguous_data.parquet", engine="pyarrow",
              compression="snappy", index=False)

In [35]:
# 9c) Remove unclear outcomes (anything that's not a clinical significance of benigh or pathogenic)
#     These are 0/1 in the "label" column
#     Reduce full dataset down to good rows

# Save data points with clear, unambiguous benign/pathogenic calls (0 or 1)
keep = df["label"].notna()
df = df.loc[keep]

# Report the number of benign or pathogenic datapoints
numpath = sum(df["label"] == 1)
numben = sum(df["label"] == 0)
print("Number pathogenic: ", numpath)
print("Number benign: ", numben)
print("Number total: ", len(df["label"]))

Number pathogenic:  460877
Number benign:  1263888
Number total:  1724765


In [36]:
# 9d) Save the training data (note: clnsig will have to be removed prior to training)

# 1) Readable dataset
df.to_csv("training.pycleaned.tsv.gz", sep="\t", index=False,
          na_rep=".", float_format="%.6g", compression="gzip")

# 1) Canonical artifact for training
df.to_parquet("training.pycleaned.parquet", engine="pyarrow",
              compression="snappy", index=False)