In [21]:
import pandas as pd

# Relative path of the CSV this notebook reads.
file = 'data/prep_clinvar_result.csv'

# Load the table using pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=file)

In [22]:
# List every column name present in the loaded frame.
print(df.columns)

Index(['Name', 'Gene(s)', 'Protein change', 'Condition(s)', 'Accession',
       'GRCh37Chromosome', 'GRCh37Location', 'GRCh38Chromosome',
       'GRCh38Location', 'VariationID', 'AlleleID(s)', 'dbSNP ID',
       'Canonical SPDI', 'Variant type', 'Molecular consequence',
       'Germline classification', 'Germline date last evaluated',
       'Germline review status', 'Somatic clinical impact',
       'Somatic clinical impact date last evaluated',
       'Somatic clinical impact review status', 'Oncogenicity classification',
       'Oncogenicity date last evaluated', 'Oncogenicity review status',
       'Unnamed: 24', 'Sequence_ID', 'position', 'Deleted_Sequence',
       'Inserted_Sequence', 'sequence_window', 'mutated_sequence',
       'prev_position_allele', 'next_position_allele', 'aligned_ref',
       'aligned_alt', 'alignment_score', 'mc_synonymous_variant',
       'mc_3_prime_UTR_variant', 'mc_5_prime_UTR_variant',
       'mc_splice_donor_variant', 'mc_splice_acceptor_variant', 'm

In [23]:
# Show the dtype pandas assigned to each column of df.
df.dtypes


Name                                            object
Gene(s)                                         object
Protein change                                  object
Condition(s)                                    object
Accession                                       object
GRCh37Chromosome                                object
GRCh37Location                                  object
GRCh38Chromosome                                object
GRCh38Location                                  object
VariationID                                      int64
AlleleID(s)                                      int64
dbSNP ID                                        object
Canonical SPDI                                  object
Variant type                                    object
Molecular consequence                           object
Germline classification                         object
Germline date last evaluated                    object
Germline review status                          object
Somatic cl

In [24]:
# Columns removed from the frame before feature engineering.
# The order is kept exactly as originally written.
drop_cols = [
    # Identifiers, names and genomic coordinates
    'Name',
    'Gene(s)',
    'Protein change',
    'Condition(s)',
    'Accession',
    'GRCh37Chromosome',
    'GRCh37Location',
    'GRCh38Chromosome',
    'GRCh38Location',
    'VariationID',
    'AlleleID(s)',
    'dbSNP ID',
    'Canonical SPDI',
    # Evaluation dates, somatic and oncogenicity annotations
    'Germline date last evaluated',
    'Somatic clinical impact',
    'Somatic clinical impact date last evaluated',
    'Somatic clinical impact review status',
    'Oncogenicity classification',
    'Oncogenicity date last evaluated',
    'Oncogenicity review status',
    # Unnamed column and raw sequence / alignment strings
    'Unnamed: 24',
    'Sequence_ID',
    'sequence_window',
    'mutated_sequence',
    'aligned_ref',
    'aligned_alt',
    # Germline annotations and the raw molecular consequence text
    'Germline classification',
    'Germline review status',
    'Molecular consequence',
]


In [25]:
# Remove every column listed in drop_cols, then show the dtypes of what remains.
df = df.drop(labels=drop_cols, axis="columns")

df.dtypes

Variant type                  object
position                       int64
Deleted_Sequence              object
Inserted_Sequence             object
prev_position_allele          object
next_position_allele          object
alignment_score                int64
mc_synonymous_variant          int64
mc_3_prime_UTR_variant         int64
mc_5_prime_UTR_variant         int64
mc_splice_donor_variant        int64
mc_splice_acceptor_variant     int64
mc_nonsense                    int64
mc_intron_variant              int64
mc_missense_variant            int64
mc_stop_lost                   int64
splice_distance                int64
mutated_sequence_fixed        object
label                          int64
dtype: object

In [26]:
# Distinct values present in the 'Variant type' column.
df.loc[:, 'Variant type'].unique()

array(['Duplication', 'Deletion', 'single nucleotide variant',
       'Insertion', 'Indel'], dtype=object)

In [27]:
# Integer code assigned to each value of the "Variant type" column.
label_mapping = {
    "Duplication": 3,
    "Deletion": 1,
    "single nucleotide variant": 0,
    "Insertion": 2,
    "Indel": 4,
}

# Series.map yields NaN for any value that is not a key of label_mapping.
df["type"] = df["Variant type"].map(label_mapping)

# Keep only the rows whose variant type was recognised.
df = df[df["type"].notna()].copy()

# A column that held NaN is float64; cast back so the codes are stored as
# integers (3, not 3.0) regardless of whether any rows were filtered out.
df["type"] = df["type"].astype("int64")

# The text column is now redundant with the integer "type" column.
df = df.drop(columns=["Variant type"])

# Display first few rows
print(df.head())

    position Deleted_Sequence Inserted_Sequence prev_position_allele  \
0  154021863       GGGGGGGGGG       GGGGGGGGGGG                    G   
1  154021863       GGGGGGGGGG         GGGGGGGGG                    G   
2  154022510                T                 G                    G   
3  154022618                G                 A                    A   
4  154024527                G                 A                    C   

  next_position_allele  alignment_score  mc_synonymous_variant  \
0                    G               92                      0   
1                    G               80                      0   
2                    T              110                      0   
3                    G              101                      0   
4                    G              110                      0   

   mc_3_prime_UTR_variant  mc_5_prime_UTR_variant  mc_splice_donor_variant  \
0                       1                       0                        0   
1             

In [28]:
# Re-inspect the dtype of each column in the current df.
df.dtypes

position                       int64
Deleted_Sequence              object
Inserted_Sequence             object
prev_position_allele          object
next_position_allele          object
alignment_score                int64
mc_synonymous_variant          int64
mc_3_prime_UTR_variant         int64
mc_5_prime_UTR_variant         int64
mc_splice_donor_variant        int64
mc_splice_acceptor_variant     int64
mc_nonsense                    int64
mc_intron_variant              int64
mc_missense_variant            int64
mc_stop_lost                   int64
splice_distance                int64
mutated_sequence_fixed        object
label                          int64
type                           int64
dtype: object

In [29]:
# One-hot encode the two flanking-allele columns in a single call.
# get_dummies removes the encoded source columns and appends the indicator
# columns at the end, in the order the columns are listed here.
allele_prefixes = {
    'prev_position_allele': 'prev',
    'next_position_allele': 'next',
}
df = pd.get_dummies(
    df,
    columns=list(allele_prefixes),
    prefix=allele_prefixes,
)
print(df.head())

    position Deleted_Sequence Inserted_Sequence  alignment_score  \
0  154021863       GGGGGGGGGG       GGGGGGGGGGG               92   
1  154021863       GGGGGGGGGG         GGGGGGGGG               80   
2  154022510                T                 G              110   
3  154022618                G                 A              101   
4  154024527                G                 A              110   

   mc_synonymous_variant  mc_3_prime_UTR_variant  mc_5_prime_UTR_variant  \
0                      0                       1                       0   
1                      0                       1                       0   
2                      0                       1                       0   
3                      0                       1                       0   
4                      0                       1                       0   

   mc_splice_donor_variant  mc_splice_acceptor_variant  mc_nonsense  ...  \
0                        0                           0    

In [30]:
# Inspect column dtypes of the current df, including any indicator columns.
df.dtypes

position                       int64
Deleted_Sequence              object
Inserted_Sequence             object
alignment_score                int64
mc_synonymous_variant          int64
mc_3_prime_UTR_variant         int64
mc_5_prime_UTR_variant         int64
mc_splice_donor_variant        int64
mc_splice_acceptor_variant     int64
mc_nonsense                    int64
mc_intron_variant              int64
mc_missense_variant            int64
mc_stop_lost                   int64
splice_distance                int64
mutated_sequence_fixed        object
label                          int64
type                           int64
prev_A                          bool
prev_C                          bool
prev_G                          bool
prev_T                          bool
next_A                          bool
next_C                          bool
next_G                          bool
next_T                          bool
dtype: object

In [35]:
# Remove the three sequence-string columns from the frame.
df = df.drop(
    labels=[
        'Deleted_Sequence',
        'Inserted_Sequence',
        'mutated_sequence_fixed',
    ],
    axis=1,
)

In [36]:
# Final check of the column dtypes in df.
df.dtypes

position                      int64
alignment_score               int64
mc_synonymous_variant         int64
mc_3_prime_UTR_variant        int64
mc_5_prime_UTR_variant        int64
mc_splice_donor_variant       int64
mc_splice_acceptor_variant    int64
mc_nonsense                   int64
mc_intron_variant             int64
mc_missense_variant           int64
mc_stop_lost                  int64
splice_distance               int64
label                         int64
type                          int64
prev_A                         bool
prev_C                         bool
prev_G                         bool
prev_T                         bool
next_A                         bool
next_C                         bool
next_G                         bool
next_T                         bool
dtype: object

In [37]:
# Write df to CSV using to_csv's default options.
# NOTE(review): the default index=True also writes the row index as an
# unnamed first column — confirm downstream readers expect that column.
df.to_csv(path_or_buf='data/featured_clinvar_result.csv')