In [2]:
import pandas as pd


Step 1: Load the Dataset

In [3]:
# Read the labelled clinical-result CSV into the working frame `df`.
df = pd.read_csv("datap/clinical_result_labeled.csv")

# Preview the first five rows as a sanity check on the load.
print(df.head(5))


                               Name Gene(s) Protein change  \
0  NM_001110792.2(MECP2):c.*7856A>C   MECP2            NaN   
1  NM_001110792.2(MECP2):c.*7748C>T   MECP2            NaN   
2  NM_001110792.2(MECP2):c.*5839C>T   MECP2            NaN   
3  NM_001110792.2(MECP2):c.*5348T>C   MECP2            NaN   
4  NM_001110792.2(MECP2):c.*4576A>C   MECP2            NaN   

                 Condition(s)     Accession GRCh37Chromosome  GRCh37Location  \
0  not provided|Rett syndrome  VCV000143283                X       153287962   
1               Rett syndrome  VCV000143282                X       153288070   
2               Rett syndrome  VCV000143280                X       153289979   
3  not provided|Rett syndrome  VCV000143275                X       153290470   
4               Rett syndrome  VCV000143271                X       153291242   

  GRCh38Chromosome  GRCh38Location  VariationID  ...   Sequence_ID   position  \
0                X       154022511       143283  ...  NC_000023.1

Step 2: Encode Mutation Type

In [4]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the mutation-type strings, keeping the fitted object
# under `le_mutation_type` so the learned classes remain accessible.
le_mutation_type = LabelEncoder().fit(df['Mutation_Type'])
df['Mutation_Type_Encoded'] = le_mutation_type.transform(df['Mutation_Type'])

# Show each original value next to its integer code.
print(df.loc[:, ['Mutation_Type', 'Mutation_Type_Encoded']].head())


  Mutation_Type  Mutation_Type_Encoded
0  Transversion                      1
1    Transition                      0
2    Transition                      0
3    Transition                      0
4  Transversion                      1


Step 3: Encode Base Pair Changes (G:A, C:T, etc.)

In [5]:
# One indicator column per distinct base-pair change, named "BP_<value>".
df_bp_onehot = pd.get_dummies(df['BP_Mutation'], prefix="BP")

# Replace the raw column with its indicator columns in a single concat:
# the original columns (minus 'BP_Mutation') come first, the dummies last.
df = pd.concat([df.drop(columns=['BP_Mutation']), df_bp_onehot], axis=1)

print(df.head())


                               Name Gene(s) Protein change  \
0  NM_001110792.2(MECP2):c.*7856A>C   MECP2            NaN   
1  NM_001110792.2(MECP2):c.*7748C>T   MECP2            NaN   
2  NM_001110792.2(MECP2):c.*5839C>T   MECP2            NaN   
3  NM_001110792.2(MECP2):c.*5348T>C   MECP2            NaN   
4  NM_001110792.2(MECP2):c.*4576A>C   MECP2            NaN   

                 Condition(s)     Accession GRCh37Chromosome  GRCh37Location  \
0  not provided|Rett syndrome  VCV000143283                X       153287962   
1               Rett syndrome  VCV000143282                X       153288070   
2               Rett syndrome  VCV000143280                X       153289979   
3  not provided|Rett syndrome  VCV000143275                X       153290470   
4               Rett syndrome  VCV000143271                X       153291242   

  GRCh38Chromosome  GRCh38Location  VariationID  ...  BP_A>T BP_C>A BP_C>G  \
0                X       154022511       143283  ...   False  False 

Step 4: Use Mutation Position as a Numerical Feature

In [6]:
from sklearn.preprocessing import MinMaxScaler

# Learn the min/max of the position column, then rescale it with the fitted
# scaler; `scaler` is kept so the same transform can be reapplied.
position_cols = ['position']
scaler = MinMaxScaler().fit(df[position_cols])
df['Normalized_Position'] = scaler.transform(df[position_cols])

# Compare raw and scaled positions side by side.
print(df.loc[:, ['position', 'Normalized_Position']].head())


    position  Normalized_Position
0  154022510             0.000000
1  154022618             0.001436
2  154024527             0.026823
3  154025018             0.033352
4  154025790             0.043619


Step 5: Encode Molecular Consequence (One-Hot Encoding)

In [7]:
# Consequence terms to flag; each becomes its own 0/1 column.
mc_labels = [
    "synonymous variant",
    "3 prime UTR variant",
    "5 prime UTR variant",
    "splice donor variant",
    "splice acceptor variant",
    "nonsense",
    "intron variant",
    "missense variant",
    "stop lost",
]

# Stringify once so missing entries are handled the same way as str(x) would.
consequence_text = df["Molecular consequence"].astype(str)

# A row gets 1 in "mc_<label>" when the label occurs as a literal substring of
# its consequence text (a row may therefore carry several flags), else 0.
for mc in mc_labels:
    indicator_name = "mc_" + mc.replace(' ', '_')
    df[indicator_name] = (
        consequence_text.str.contains(mc, regex=False, na=False).astype("int64")
    )

print("\n✅ Sample After Modification:")
print(df.head())





✅ Sample After Modification:
                               Name Gene(s) Protein change  \
0  NM_001110792.2(MECP2):c.*7856A>C   MECP2            NaN   
1  NM_001110792.2(MECP2):c.*7748C>T   MECP2            NaN   
2  NM_001110792.2(MECP2):c.*5839C>T   MECP2            NaN   
3  NM_001110792.2(MECP2):c.*5348T>C   MECP2            NaN   
4  NM_001110792.2(MECP2):c.*4576A>C   MECP2            NaN   

                 Condition(s)     Accession GRCh37Chromosome  GRCh37Location  \
0  not provided|Rett syndrome  VCV000143283                X       153287962   
1               Rett syndrome  VCV000143282                X       153288070   
2               Rett syndrome  VCV000143280                X       153289979   
3  not provided|Rett syndrome  VCV000143275                X       153290470   
4               Rett syndrome  VCV000143271                X       153291242   

  GRCh38Chromosome  GRCh38Location  VariationID  ...  Normalized_Position  \
0                X       154022511     

In [8]:
# Tally, per molecular-consequence label, how many rows have its indicator set
# (the "mc_<label>" columns hold 0/1, so the column sum is the row count).
frequency_counts = {
    mc: df["mc_" + mc.replace(' ', '_')].sum()
    for mc in mc_labels
}

# Report the tallies.
print("\n✅ Frequency Counts:")
print(frequency_counts)



✅ Frequency Counts:
{'synonymous variant': 297, '3 prime UTR variant': 58, '5 prime UTR variant': 224, 'splice donor variant': 7, 'splice acceptor variant': 4, 'nonsense': 63, 'intron variant': 133, 'missense variant': 299, 'stop lost': 5}


Step 6: Extract Sequence Context (Previous & Next Base)

In [9]:
# Indicator columns for the flanking alleles: "Prev_<base>" and "Next_<base>".
df_prev_onehot = pd.get_dummies(df['prev_position_allele'], prefix="Prev")
df_next_onehot = pd.get_dummies(df['next_position_allele'], prefix="Next")

# Swap the two raw allele columns for their indicator columns in one concat,
# keeping the remaining columns first and the new dummies at the end.
context_cols = ['prev_position_allele', 'next_position_allele']
df = pd.concat(
    [df.drop(columns=context_cols), df_prev_onehot, df_next_onehot],
    axis=1,
)

print(df.head())


                               Name Gene(s) Protein change  \
0  NM_001110792.2(MECP2):c.*7856A>C   MECP2            NaN   
1  NM_001110792.2(MECP2):c.*7748C>T   MECP2            NaN   
2  NM_001110792.2(MECP2):c.*5839C>T   MECP2            NaN   
3  NM_001110792.2(MECP2):c.*5348T>C   MECP2            NaN   
4  NM_001110792.2(MECP2):c.*4576A>C   MECP2            NaN   

                 Condition(s)     Accession GRCh37Chromosome  GRCh37Location  \
0  not provided|Rett syndrome  VCV000143283                X       153287962   
1               Rett syndrome  VCV000143282                X       153288070   
2               Rett syndrome  VCV000143280                X       153289979   
3  not provided|Rett syndrome  VCV000143275                X       153290470   
4               Rett syndrome  VCV000143271                X       153291242   

  GRCh38Chromosome  GRCh38Location  VariationID  ...  mc_missense_variant  \
0                X       154022511       143283  ...                 

Step 7: Compute Mutation Frequency

In [10]:
# Count how often each mutation type occurs across the whole frame ...
mutation_type_col = df['Mutation_Type']
mutation_counts = mutation_type_col.value_counts()

# ... and attach that count to every row of the corresponding type.
df['Mutation_Frequency'] = mutation_type_col.map(mutation_counts)

print(df.loc[:, ['Mutation_Type', 'Mutation_Frequency']].head())


  Mutation_Type  Mutation_Frequency
0  Transversion                 352
1    Transition                 450
2    Transition                 450
3    Transition                 450
4  Transversion                 352


In [11]:
# Peek at the raw classification strings that are later mapped to a binary label.
print(df[['Germline classification']].head())

  Germline classification
0                  Benign
1                  Benign
2           Likely benign
3    Benign/Likely benign
4                  Benign


Step 8: Encode Clinical Significance (Target Variable)

In [12]:
# List every column currently in the frame, including the engineered features added so far.
print(df.columns)


Index(['Name', 'Gene(s)', 'Protein change', 'Condition(s)', 'Accession',
       'GRCh37Chromosome', 'GRCh37Location', 'GRCh38Chromosome',
       'GRCh38Location', 'VariationID', 'AlleleID(s)', 'dbSNP ID',
       'Canonical SPDI', 'Variant type', 'Molecular consequence',
       'Germline classification', 'Sequence_ID', 'position',
       'Deleted_Sequence', 'Inserted_Sequence', 'Mutation_Type',
       'sequence_window', 'mutated_sequence', 'Mutation_Type_Encoded',
       'BP_A>C', 'BP_A>G', 'BP_A>T', 'BP_C>A', 'BP_C>G', 'BP_C>T', 'BP_G>A',
       'BP_G>C', 'BP_G>T', 'BP_T>A', 'BP_T>C', 'BP_T>G', 'Normalized_Position',
       'mc_synonymous_variant', 'mc_3_prime_UTR_variant',
       'mc_5_prime_UTR_variant', 'mc_splice_donor_variant',
       'mc_splice_acceptor_variant', 'mc_nonsense', 'mc_intron_variant',
       'mc_missense_variant', 'mc_stop_lost', 'Prev_A', 'Prev_C', 'Prev_G',
       'Prev_T', 'Next_A', 'Next_C', 'Next_G', 'Next_T', 'Mutation_Frequency'],
      dtype='object')


In [13]:
# Collapse the germline classifications into a binary target:
# 1 = (likely) pathogenic, 0 = (likely) benign.
label_mapping = {
    "Pathogenic": 1,
    "Likely pathogenic": 1,
    "Pathogenic/Likely pathogenic": 1,
    "Benign": 0,
    "Likely benign": 0,
    "Benign/Likely benign": 0,
}

# Series.map leaves any classification missing from the mapping as NaN, which
# silently promotes the column to float (1.0 / 0.0). Fill those rows with the
# -1 sentinel and cast back to an integer dtype so the target holds clean
# integer class labels.
df["label"] = (
    df["Germline classification"]
    .map(label_mapping)
    .fillna(-1)
    .astype("int64")
)

# Keep only rows with a usable label and drop the raw classification text,
# which would otherwise leak the target. `df` itself is left intact.
df_variants = (
    df.loc[df["label"] != -1]
    .drop(columns=['Germline classification'])
    .copy()
)

# Display first few rows
print(df_variants.head())


                               Name Gene(s) Protein change  \
0  NM_001110792.2(MECP2):c.*7856A>C   MECP2            NaN   
1  NM_001110792.2(MECP2):c.*7748C>T   MECP2            NaN   
2  NM_001110792.2(MECP2):c.*5839C>T   MECP2            NaN   
3  NM_001110792.2(MECP2):c.*5348T>C   MECP2            NaN   
4  NM_001110792.2(MECP2):c.*4576A>C   MECP2            NaN   

                 Condition(s)     Accession GRCh37Chromosome  GRCh37Location  \
0  not provided|Rett syndrome  VCV000143283                X       153287962   
1               Rett syndrome  VCV000143282                X       153288070   
2               Rett syndrome  VCV000143280                X       153289979   
3  not provided|Rett syndrome  VCV000143275                X       153290470   
4               Rett syndrome  VCV000143271                X       153291242   

  GRCh38Chromosome  GRCh38Location  VariationID  ...  Prev_A Prev_C Prev_G  \
0                X       154022511       143283  ...   False  False 

Step 9: Save the Processed Dataset

In [14]:

# df.to_csv("datap/featured_clinical_result.csv", index=False)
# print("✅ Processed dataset saved as 'featured_clinical_result.csv'")


In [15]:
# Re-list the columns; 'label' now appears after the engineered features.
print(df.columns)


Index(['Name', 'Gene(s)', 'Protein change', 'Condition(s)', 'Accession',
       'GRCh37Chromosome', 'GRCh37Location', 'GRCh38Chromosome',
       'GRCh38Location', 'VariationID', 'AlleleID(s)', 'dbSNP ID',
       'Canonical SPDI', 'Variant type', 'Molecular consequence',
       'Germline classification', 'Sequence_ID', 'position',
       'Deleted_Sequence', 'Inserted_Sequence', 'Mutation_Type',
       'sequence_window', 'mutated_sequence', 'Mutation_Type_Encoded',
       'BP_A>C', 'BP_A>G', 'BP_A>T', 'BP_C>A', 'BP_C>G', 'BP_C>T', 'BP_G>A',
       'BP_G>C', 'BP_G>T', 'BP_T>A', 'BP_T>C', 'BP_T>G', 'Normalized_Position',
       'mc_synonymous_variant', 'mc_3_prime_UTR_variant',
       'mc_5_prime_UTR_variant', 'mc_splice_donor_variant',
       'mc_splice_acceptor_variant', 'mc_nonsense', 'mc_intron_variant',
       'mc_missense_variant', 'mc_stop_lost', 'Prev_A', 'Prev_C', 'Prev_G',
       'Prev_T', 'Next_A', 'Next_C', 'Next_G', 'Next_T', 'Mutation_Frequency',
       'label'],
      dtype=