In [1]:
import pandas as pd
import numpy as np

# Read the preprocessed ClinVar table into the working DataFrame `df`.
df = pd.read_csv(filepath_or_buffer='data/prep_clinvar_result.csv')

In [2]:
# Show every column name available in the loaded table.
print(df.columns)

Index(['Name', 'Gene(s)', 'Protein change', 'Condition(s)', 'Accession',
       'GRCh37Chromosome', 'GRCh37Location', 'GRCh38Chromosome',
       'GRCh38Location', 'VariationID', 'AlleleID(s)', 'dbSNP ID',
       'Canonical SPDI', 'Variant type', 'Molecular consequence',
       'Germline classification', 'Germline date last evaluated',
       'Germline review status', 'Somatic clinical impact',
       'Somatic clinical impact date last evaluated',
       'Somatic clinical impact review status', 'Oncogenicity classification',
       'Oncogenicity date last evaluated', 'Oncogenicity review status',
       'Unnamed: 24', 'Sequence_ID', 'position', 'Deleted_Sequence',
       'Inserted_Sequence', 'sequence_window', 'mutated_sequence',
       'prev_position_allele', 'next_position_allele', 'aligned_ref',
       'aligned_alt', 'alignment_score', 'mc_synonymous_variant',
       'mc_frameshift_variant', 'mc_3_prime_UTR_variant',
       'mc_5_prime_UTR_variant', 'mc_splice_donor_variant',
       

In [3]:
# One-hot encode the categorical `region` column: the original column is
# replaced by boolean `region_<value>` indicator columns appended at the end.
region_encoded = pd.get_dummies(df['region'], prefix='region')
df = pd.concat([df.drop(columns=['region']), region_encoded], axis=1)

In [4]:
# One-hot encode `nearest_splice_type`: the original column is replaced by
# boolean `splice_type_<value>` indicator columns appended at the end.
splice_type_encoded = pd.get_dummies(df['nearest_splice_type'], prefix='splice_type')
df = pd.concat([df.drop(columns=['nearest_splice_type']), splice_type_encoded], axis=1)


In [5]:
# Integer codes for the variant types kept as features. Rows whose
# `Variant type` is not a key of this mapping are removed below.
label_mapping = {
    "single nucleotide variant": 0,
    "Deletion": 1,
    "Insertion": 2,
    "Duplication": 3,
    "Indel": 4,
}

# Map first, then filter on "was it mapped?" instead of going through a -1
# sentinel: `.map(...).fillna(-1)` yields float64 as soon as one value is
# unmapped, so the dtype of `type` used to depend on the data. Casting after
# the filter guarantees an int64 column in every case.
variant_type_codes = df["Variant type"].map(label_mapping)
is_supported = variant_type_codes.notna()

# `assign` returns a new frame, so the filtered slice is never mutated in
# place; `type` is appended as the last column and `Variant type` is dropped.
df = (
    df.loc[is_supported]
    .drop(columns=["Variant type"])
    .assign(type=variant_type_codes[is_supported].astype("int64"))
)



In [6]:
# One-hot encode the alleles flanking the variant. The two source columns are
# replaced by boolean `prev_<allele>` / `next_<allele>` indicator columns.
prev_encoded = pd.get_dummies(df['prev_position_allele'], prefix='prev')
next_encoded = pd.get_dummies(df['next_position_allele'], prefix='next')
df = pd.concat(
    [
        df.drop(columns=['prev_position_allele', 'next_position_allele']),
        prev_encoded,
        next_encoded,
    ],
    axis=1,
)

In [7]:
def calculate_gc_content(seq):
    """Return the fraction of unambiguous bases in ``seq`` that are G or C.

    The sequence is upper-cased first, so matching is case-insensitive. Only
    A, C, G and T count towards the denominator; ``N`` and any other character
    (alignment gaps, whitespace, ambiguity codes) are ignored so that they
    cannot dilute the ratio.

    Args:
        seq: Nucleotide sequence as a string.

    Returns:
        (G + C) / (A + C + G + T) as a float, or 0.0 when the sequence
        contains none of those four bases (including the empty string).
    """
    seq = seq.upper()
    gc_count = seq.count('G') + seq.count('C')
    at_count = seq.count('A') + seq.count('T')
    # Denominator is built from the counted bases themselves rather than
    # "everything that is not N", so stray characters are excluded.
    total = gc_count + at_count
    return gc_count / total if total > 0 else 0.0

def calculate_gc_skew(seq):
    """Return the GC skew of ``seq``: (G - C) / (G + C).

    Matching is case-insensitive because the sequence is upper-cased first.

    Args:
        seq: Nucleotide sequence as a string.

    Returns:
        The skew as a float, or 0 when the sequence contains neither G nor C.
    """
    bases = seq.upper()
    g_count, c_count = bases.count('G'), bases.count('C')
    gc_total = g_count + c_count
    if gc_total == 0:
        # No G or C at all: skew is undefined, report 0.
        return 0
    return (g_count - c_count) / gc_total

# Derive both GC-based features from the same source column.
mutated_seqs = df['mutated_sequence_fixed']
df['gc_content'] = mutated_seqs.apply(calculate_gc_content)
df['gc_skew'] = mutated_seqs.apply(calculate_gc_skew)


In [8]:
def calculate_at_content(seq):
    """Return the fraction of unambiguous bases in ``seq`` that are A or T.

    The sequence is upper-cased first, so matching is case-insensitive. Only
    A, C, G and T count towards the denominator; ``N`` and any other character
    (alignment gaps, whitespace, ambiguity codes) are ignored so that they
    cannot dilute the ratio.

    Args:
        seq: Nucleotide sequence as a string.

    Returns:
        (A + T) / (A + C + G + T) as a float, or 0.0 when the sequence
        contains none of those four bases (including the empty string).
    """
    seq = seq.upper()
    at_count = seq.count('A') + seq.count('T')
    gc_count = seq.count('G') + seq.count('C')
    # Denominator is built from the counted bases themselves rather than
    # "everything that is not N", so stray characters are excluded.
    total = at_count + gc_count
    return at_count / total if total > 0 else 0.0

# AT fraction of each mutated sequence, stored as a new feature column.
df['at_content'] = df['mutated_sequence_fixed'].apply(func=calculate_at_content)


In [9]:
# Divisor used to scale positions into a fraction.
# NOTE(review): presumably the chrX length for one specific genome assembly —
# confirm that `position` uses coordinates from the same assembly.
MAX_CHR_X_POS = 156_040_895

# Scaled position, plus the quantile bucket (0-9) each scaled position falls
# into across the rows currently in `df`.
relative_position = df['position'] / MAX_CHR_X_POS
df['position_bin'] = relative_position
df['position_decile'] = pd.qcut(relative_position, q=10, labels=False)


In [11]:
# Missing distances are replaced with the sentinel value -1.
distance_cols = ['donor_distance', 'acceptor_distance', 'dist_to_exon_start', 'dist_to_exon_end']
df[distance_cols] = df[distance_cols].fillna(-1)


In [12]:
# Every row is labelled 'chrX', so the one-hot encoding produces a single
# constant indicator column, `chr_chrX`, appended to the frame.
chr_encoded = pd.get_dummies(
    pd.Series('chrX', index=df.index, name='chr'),
    prefix='chr',
)
df = pd.concat([df, chr_encoded], axis=1)

In [13]:
# Inspect the dtype of every column after the feature-engineering steps above
# (bare last expression, so it is rendered as the cell output).
df.dtypes

Name                object
Gene(s)             object
Protein change      object
Condition(s)        object
Accession           object
                    ...   
gc_skew            float64
at_content         float64
position_bin       float64
position_decile      int64
chr_chrX              bool
Length: 69, dtype: object

In [14]:
# === 11. Optional: Flag Known Hotspot Positions (Example Placeholder) ===
# Placeholder for known MECP2 pathogenic hotspots (replace with real positions if known).
# Until real positions are supplied, `hotspot_flag` only reflects these example values.
known_hotspots = [153296543, 153296544, 153296545]  # Example positions

# Vectorised membership test: 1 where `position` is one of the listed
# hotspots, 0 otherwise. Replaces a row-wise `apply` that rescanned the list
# for every row.
df['hotspot_flag'] = df['position'].isin(known_hotspots).astype('int64')


In [15]:
# === 12. Optional: CpG Site Overlap (Example Placeholder) ===
# Assumes a list of CpG positions is available (replace with actual CpG island data).
# Until real positions are supplied, `cpg_overlap` only reflects these example values.
cpg_sites = [153296540, 153296541, 153296542]  # Example positions

# Vectorised membership test: 1 where `position` is one of the listed CpG
# sites, 0 otherwise. Replaces a row-wise `apply` that rescanned the list for
# every row.
df['cpg_overlap'] = df['position'].isin(cpg_sites).astype('int64')


In [20]:
# === 13. Drop Non-ML Columns (Sequence Strings, IDs, etc) ===
# Columns removed before export. Names that are not present in `df` are
# skipped rather than raising.
drop_cols = [
    'Name',
    'Gene(s)',
    'Protein change',
    'Condition(s)',
    'Accession',
    'GRCh37Chromosome',
    'GRCh37Location',
    'GRCh38Chromosome',
    'GRCh38Location',
    'VariationID',
    'AlleleID(s)',
    'dbSNP ID',
    'Canonical SPDI',
    'Germline date last evaluated',
    'Somatic clinical impact',
    'Somatic clinical impact date last evaluated',
    'Somatic clinical impact review status',
    'Oncogenicity classification',
    'Oncogenicity date last evaluated',
    'Oncogenicity review status',
    'Unnamed: 24',
    'Sequence_ID',
    'sequence_window',
    'mutated_sequence',
    'aligned_ref',
    'aligned_alt',
    'Deleted_Sequence',
    'Molecular consequence',
    'Germline classification',
    'Germline review status',
    'sequence_window_fixed',
    'Inserted_Sequence',
    'mutated_sequence_fixed',
]
df = df.drop(columns=drop_cols, errors='ignore')


In [21]:
# Inspect the dtypes of the columns that remain after the drop step
# (bare last expression, so it is rendered as the cell output).
df.dtypes

position                        int64
alignment_score                 int64
mc_synonymous_variant           int64
mc_frameshift_variant           int64
mc_3_prime_UTR_variant          int64
mc_5_prime_UTR_variant          int64
mc_splice_donor_variant         int64
mc_splice_acceptor_variant      int64
mc_nonsense                     int64
mc_intron_variant               int64
mc_missense_variant             int64
mc_stop_lost                    int64
donor_distance                  int64
acceptor_distance               int64
dist_to_exon_start            float64
dist_to_exon_end              float64
label                           int64
region_exon                      bool
region_non-exon                  bool
splice_type_acceptor             bool
splice_type_donor                bool
type                            int64
prev_A                           bool
prev_C                           bool
prev_G                           bool
prev_T                           bool
next_A      

In [22]:
# === 14. Final Check ===
# Report the remaining column names and the (rows, columns) shape.
final_columns, final_shape = df.columns, df.shape
print("✅ Final feature columns:", final_columns)
print("✅ Data shape:", final_shape)

✅ Final feature columns: Index(['position', 'alignment_score', 'mc_synonymous_variant',
       'mc_frameshift_variant', 'mc_3_prime_UTR_variant',
       'mc_5_prime_UTR_variant', 'mc_splice_donor_variant',
       'mc_splice_acceptor_variant', 'mc_nonsense', 'mc_intron_variant',
       'mc_missense_variant', 'mc_stop_lost', 'donor_distance',
       'acceptor_distance', 'dist_to_exon_start', 'dist_to_exon_end', 'label',
       'region_exon', 'region_non-exon', 'splice_type_acceptor',
       'splice_type_donor', 'type', 'prev_A', 'prev_C', 'prev_G', 'prev_T',
       'next_A', 'next_C', 'next_G', 'next_T', 'gc_content', 'gc_skew',
       'at_content', 'position_bin', 'position_decile', 'chr_chrX',
       'hotspot_flag', 'cpg_overlap'],
      dtype='object')
✅ Data shape: (1215, 38)


In [23]:
# Write the feature table to disk without the index column, then report its shape.
df.to_csv(path_or_buf='data/featured_clinvar_result.csv', index=False)
print("✅ Feature extraction complete. Shape:", df.shape)

✅ Feature extraction complete. Shape: (1215, 38)
