In [1]:
import pandas as pd

# Read the CSV at `file_path` into the working DataFrame `df`
file_path = "datap/featured_clinical_result.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the leading five records to confirm the load worked
print(df.head(n=5))


                               Name Gene(s) Protein change  \
0  NM_001110792.2(MECP2):c.*7856A>C   MECP2            NaN   
1  NM_001110792.2(MECP2):c.*7748C>T   MECP2            NaN   
2  NM_001110792.2(MECP2):c.*5839C>T   MECP2            NaN   
3  NM_001110792.2(MECP2):c.*5348T>C   MECP2            NaN   
4  NM_001110792.2(MECP2):c.*4576A>C   MECP2            NaN   

                 Condition(s)     Accession GRCh37Chromosome  GRCh37Location  \
0  not provided|Rett syndrome  VCV000143283                X       153287962   
1               Rett syndrome  VCV000143282                X       153288070   
2               Rett syndrome  VCV000143280                X       153289979   
3  not provided|Rett syndrome  VCV000143275                X       153290470   
4               Rett syndrome  VCV000143271                X       153291242   

  GRCh38Chromosome  GRCh38Location  VariationID  ...  Prev_A Prev_C Prev_G  \
0                X       154022511       143283  ...   False  False 

In [None]:
# Columns treated as categorical: a missing entry becomes the explicit
# placeholder string 'Unknown'
categorical_cols = ['Mutation_Type', 'Molecular consequence', 'Germline classification']
df[categorical_cols] = df[categorical_cols].fillna(value='Unknown')

# Columns treated as numerical: a missing entry becomes that column's median
numerical_cols = ['position']
df[numerical_cols] = df[numerical_cols].fillna(value=df[numerical_cols].median())

print("‚úÖ Missing values handled successfully.")


‚úÖ Missing values handled successfully.


In [None]:
# Columns to exclude from the working frame.
columns_to_remove = [
    'Name', 'Gene(s)', 'Accession', 'VariationID', 'AlleleID(s)', 'dbSNP ID', 'Sequence_ID',
    'GRCh37Chromosome', 'GRCh37Location', 'GRCh38Chromosome', 'GRCh38Location',
    'Canonical SPDI', 'Protein change', 'Condition(s)',
    'sequence_window', 'mutated_sequence', 'Variant type', 'Molecular consequence',
    'Germline classification', 'Deleted_Sequence', 'Inserted_Sequence', 'Mutation_Type',
]

# Remove them from df in place; errors='ignore' means any name that is not
# present in df is skipped silently instead of raising.
df.drop(labels=columns_to_remove, axis="columns", inplace=True, errors='ignore')

print("‚úÖ Unnecessary columns removed. Final dataset shape:", df.shape)


‚úÖ Unnecessary columns removed. Final dataset shape: (802, 34)


In [None]:

# Tally how many rows carry each value of the 'label' column
label_counts = df.loc[:, 'label'].value_counts()
print("Class Distribution:\n", label_counts)



Class Distribution:
 label
0    589
1    213
Name: count, dtype: int64


In [None]:
# Separate the predictors (every column except 'label') from the target column.
# df itself is left unchanged because drop() returns a new frame here.
X = df.drop(labels="label", axis=1)
y = df["label"]


In [None]:
# Collect the labels of every column in X whose dtype is not 'object'
numeric_cols = X.select_dtypes(exclude='object').columns
print("üîç Numeric Columns:\n", numeric_cols)


üîç Categorical Columns:
 Index([], dtype='object')
üîç Numeric Columns:
 Index(['position', 'Mutation_Type_Encoded', 'BP_A>C', 'BP_A>G', 'BP_A>T',
       'BP_C>A', 'BP_C>G', 'BP_C>T', 'BP_G>A', 'BP_G>C', 'BP_G>T', 'BP_T>A',
       'BP_T>C', 'BP_T>G', 'Normalized_Position', 'mc_synonymous_variant',
       'mc_3_prime_UTR_variant', 'mc_5_prime_UTR_variant',
       'mc_splice_donor_variant', 'mc_splice_acceptor_variant', 'mc_nonsense',
       'mc_intron_variant', 'mc_missense_variant', 'mc_stop_lost', 'Prev_A',
       'Prev_C', 'Prev_G', 'Prev_T', 'Next_A', 'Next_C', 'Next_G', 'Next_T',
       'Mutation_Frequency'],
      dtype='object')


In [None]:
# Work out which columns still need one-hot encoding by inspecting X itself.
# The `categorical_cols` list defined in the missing-value cell names raw
# columns ('Mutation_Type', 'Molecular consequence', 'Germline classification')
# that are dropped from df before X is built, so handing that stale list to
# get_dummies raises KeyError on a fresh Restart & Run All. Rebinding the name
# here makes this cell independent of that leftover state.
categorical_cols = X.select_dtypes(include=['object']).columns
print("üîç Categorical Columns:\n", categorical_cols)

# Apply One-Hot Encoding to the detected columns. drop_first=True omits the
# first level of each encoded column. When no object-dtype columns remain,
# X_encoded simply carries the same columns as X.
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# Verify all features are now numeric
print("‚úÖ Data Types After Encoding:\n", X_encoded.dtypes)


üîç Categorical Columns:
 Index([], dtype='object')
‚úÖ Data Types After Encoding:
 position                        int64
Mutation_Type_Encoded           int64
BP_A>C                           bool
BP_A>G                           bool
BP_A>T                           bool
BP_C>A                           bool
BP_C>G                           bool
BP_C>T                           bool
BP_G>A                           bool
BP_G>C                           bool
BP_G>T                           bool
BP_T>A                           bool
BP_T>C                           bool
BP_T>G                           bool
Normalized_Position           float64
mc_synonymous_variant           int64
mc_3_prime_UTR_variant          int64
mc_5_prime_UTR_variant          int64
mc_splice_donor_variant         int64
mc_splice_acceptor_variant      int64
mc_nonsense                     int64
mc_intron_variant               int64
mc_missense_variant             int64
mc_stop_lost                    int64
Pre

In [8]:
# Show the column labels df currently carries (for a DataFrame, keys() is the columns Index)
print(df.keys())

Index(['position', 'Mutation_Type_Encoded', 'BP_A>C', 'BP_A>G', 'BP_A>T',
       'BP_C>A', 'BP_C>G', 'BP_C>T', 'BP_G>A', 'BP_G>C', 'BP_G>T', 'BP_T>A',
       'BP_T>C', 'BP_T>G', 'Normalized_Position', 'mc_synonymous_variant',
       'mc_3_prime_UTR_variant', 'mc_5_prime_UTR_variant',
       'mc_splice_donor_variant', 'mc_splice_acceptor_variant', 'mc_nonsense',
       'mc_intron_variant', 'mc_missense_variant', 'mc_stop_lost', 'Prev_A',
       'Prev_C', 'Prev_G', 'Prev_T', 'Next_A', 'Next_C', 'Next_G', 'Next_T',
       'Mutation_Frequency', 'label'],
      dtype='object')


In [9]:
from collections import Counter

# Tally the occurrences of each value in y prior to any resampling
print("üîç Class Distribution Before Resampling:\n", Counter(label for label in y))


üîç Class Distribution Before Resampling:
 Counter({0: 589, 1: 213})


In [10]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class with SMOTE. sampling_strategy=0.75 targets a
# minority:majority ratio of 0.75 rather than a full 1:1 balance; the seed
# makes the synthetic rows reproducible.
# NOTE(review): this is fit on the full dataset, before any train/test split,
# and X_resampled / y_resampled are not consumed by the later split or save
# cells — confirm whether resampling should instead be applied to the training
# partition only.
smote = SMOTE(random_state=42, sampling_strategy=0.75)
X_resampled, y_resampled = smote.fit_resample(X_encoded, y)

# Report the class counts after oversampling
print("‚úÖ New Class Distribution After SMOTE:\n", Counter(y_resampled))


‚úÖ New Class Distribution After SMOTE:
 Counter({0: 589, 1: 441})


In [11]:
# from imblearn.under_sampling import RandomUnderSampler

# # Apply random undersampling
# undersampler = RandomUnderSampler(sampling_strategy=0.75, random_state=42)  # Keep 75% of Benign cases
# X_resampled, y_resampled = undersampler.fit_resample(X_encoded, y)

# # Check new distribution
# print("‚úÖ New Class Distribution After Undersampling:\n", Counter(y_resampled))


In [12]:
from sklearn.model_selection import train_test_split

# Rebuild the target (y) and predictors (X) directly from df.
# NOTE(review): this rebinds X and y from df, so the split below does not use
# X_encoded or the SMOTE output — confirm that is intended.
y = df['label']
X = df.drop(labels='label', axis="columns")

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both partitions, and the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"‚úÖ Train-Test Split Done: {X_train.shape[0]} Train Samples, {X_test.shape[0]} Test Samples")


‚úÖ Train-Test Split Done: 641 Train Samples, 161 Test Samples


In [13]:
# Write df to CSV without the row index. This saves df as it stands after the
# column drop (including 'label'); the resampled and split frames are not written.
df.to_csv(path_or_buf="datap/processed_clinical_result.csv", index=False)
print("‚úÖ Processed dataset saved as 'processed_clinical_result.csv'")


‚úÖ Processed dataset saved as 'processed_clinical_result.csv'
