In [71]:
from bisect import bisect_left

import pandas as pd

# Load the annotation as a headerless, tab-separated table; '#' starts a
# comment, and the nine GTF fields are named right after the read.
gtf_file = "Homo_sapiens.GRCh38.109.gtf"
gtf = pd.read_csv(
    gtf_file,
    sep='\t',
    comment='#',
    header=None,
).set_axis(
    ['chr', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute'],
    axis=1,
)
print(gtf.columns)


Index(['chr', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame',
       'attribute'],
      dtype='object')


In [72]:
# Preview the first rows to check that the nine named GTF columns parsed as expected.
print(gtf.head())

    chr  source     feature  start    end score strand frame  \
0  chr1  HAVANA        gene  11121  24894     .      +     .   
1  chr1  HAVANA  transcript  11121  14413     .      +     .   
2  chr1  HAVANA        exon  11121  11211     .      +     .   
3  chr1  HAVANA        exon  12010  12227     .      +     .   
4  chr1  HAVANA        exon  12613  12721     .      +     .   

                                           attribute  
0  gene_id "ENSG00000290825.2"; gene_type "lncRNA...  
1  gene_id "ENSG00000290825.2"; transcript_id "EN...  
2  gene_id "ENSG00000290825.2"; transcript_id "EN...  
3  gene_id "ENSG00000290825.2"; transcript_id "EN...  
4  gene_id "ENSG00000290825.2"; transcript_id "EN...  


In [73]:
# Keep only the exon records that lie on chromosome X.
exons = gtf.loc[gtf['feature'].eq('exon') & gtf['chr'].eq('chrX')]


In [74]:
# Derive splice-site coordinates from the chrX exon boundaries.
#
# GTF coordinates always satisfy start <= end regardless of strand, so which
# boundary is the donor depends on the direction of transcription:
#   '+' strand: donor = exon end,   acceptor = exon start
#   '-' strand: donor = exon start, acceptor = exon end
# Any strand value other than '-' is treated like '+'.
#
# NOTE(review): every exon boundary is emitted, including the outermost
# boundaries of each transcript, which are transcript ends rather than true
# splice junctions -- confirm this is acceptable for the distance feature.
on_minus = exons['strand'].eq('-')

donor_sites = pd.DataFrame({
    'chr': exons['chr'],
    # Replace the exon end with the exon start wherever the exon is on '-'.
    'position': exons['end'].mask(on_minus, exons['start']),
    'site_type': 'donor',
})
acceptor_sites = pd.DataFrame({
    'chr': exons['chr'],
    'position': exons['start'].mask(on_minus, exons['end']),
    'site_type': 'acceptor',
})

# The stable sort on the shared exon index interleaves the rows as
# (donor, acceptor) per exon, in the original exon order.
splice_df = (
    pd.concat([donor_sites, acceptor_sites])
    .sort_index(kind='mergesort')
    .reset_index(drop=True)
)

# Kept as a list of {'chr', 'position', 'site_type'} records for any code that
# still reads the original list variable.
splice_sites = splice_df.to_dict('records')


In [75]:
# Show the column names of the splice-site table.
print(splice_df.columns)


Index(['chr', 'position', 'site_type'], dtype='object')


In [76]:
# Path of the variant table (CSV) that the rest of the notebook works on.
file='data/aligned_clinvar_result.csv'

# Read the CSV with pandas defaults (first row is used as the header).
df_variants= pd.read_csv(file)

In [77]:
def nearest_splice_distance(pos, splice_positions, assume_sorted=False):
    """Return the absolute distance from ``pos`` to the closest splice position.

    Parameters
    ----------
    pos : int
        Query coordinate.
    splice_positions : iterable of int
        Candidate coordinates. When ``assume_sorted`` is True this must be an
        ascending, indexable sequence (e.g. a sorted list).
    assume_sorted : bool, default False
        If True, locate the neighbours of ``pos`` by binary search instead of
        scanning every candidate. The result is the same as the full scan as
        long as the input really is sorted.

    Returns
    -------
    int
        ``min(abs(pos - sp))`` over all candidates.

    Raises
    ------
    ValueError
        If ``splice_positions`` is empty.
    """
    if assume_sorted:
        if len(splice_positions) == 0:
            raise ValueError("splice_positions is empty; cannot compute a nearest distance")
        # Only the elements immediately left and right of the insertion point
        # can be the nearest one in a sorted sequence.
        idx = bisect_left(splice_positions, pos)
        neighbours = splice_positions[max(idx - 1, 0):idx + 1]
        return min(abs(pos - sp) for sp in neighbours)

    nearest = min((abs(pos - sp) for sp in splice_positions), default=None)
    if nearest is None:
        raise ValueError("splice_positions is empty; cannot compute a nearest distance")
    return nearest

# Splice positions on Chr X
splice_positions = splice_df['position'].tolist()

# Sort and de-duplicate once, so each variant lookup is a binary search
# rather than a scan over every splice position.
sorted_splice_positions = sorted(set(splice_positions))

# NOTE(review): the candidates come from chrX exons only, so this assumes every
# row of df_variants is a chrX variant and that 'position' uses the same
# coordinate system as the GTF -- confirm.
df_variants['splice_distance'] = df_variants['position'].apply(
    lambda pos: nearest_splice_distance(int(pos), sorted_splice_positions, assume_sorted=True)
)



In [78]:
# Inspect the variant table after adding 'splice_distance': column names first,
# then the first rows.
print(df_variants.columns)
print(df_variants.head())

Index(['Name', 'Gene(s)', 'Protein change', 'Condition(s)', 'Accession',
       'GRCh37Chromosome', 'GRCh37Location', 'GRCh38Chromosome',
       'GRCh38Location', 'VariationID', 'AlleleID(s)', 'dbSNP ID',
       'Canonical SPDI', 'Variant type', 'Molecular consequence',
       'Germline classification', 'Germline date last evaluated',
       'Germline review status', 'Somatic clinical impact',
       'Somatic clinical impact date last evaluated',
       'Somatic clinical impact review status', 'Oncogenicity classification',
       'Oncogenicity date last evaluated', 'Oncogenicity review status',
       'Unnamed: 24', 'Sequence_ID', 'position', 'Deleted_Sequence',
       'Inserted_Sequence', 'sequence_window', 'mutated_sequence',
       'prev_position_allele', 'next_position_allele', 'aligned_ref',
       'aligned_alt', 'alignment_score', 'mc_synonymous_variant',
       'mc_3_prime_UTR_variant', 'mc_5_prime_UTR_variant',
       'mc_splice_donor_variant', 'mc_splice_acceptor_variant', 'm

In [79]:
# Distribution of 'mutated_sequence' string lengths (number of rows per length).
df_variants['mutated_sequence'].str.len().value_counts()


mutated_sequence
101    897
102     79
103     66
100     59
104     30
105     22
106     14
107      8
109      8
110      5
111      4
116      4
108      3
131      3
121      2
119      1
112      1
150      1
151      1
114      1
118      1
117      1
115      1
137      1
143      1
127      1
Name: count, dtype: int64

In [80]:
# `df` is bound to the same DataFrame object as `df_variants` (no copy is made):
# columns added through either name are visible through both until one of the
# names is rebound.
df=df_variants

In [81]:
# Target length that every sequence is normalised to.
DESIRED_LENGTH = 101


def fix_length(seq, desired_len=DESIRED_LENGTH):
    """Upper-case ``seq`` and force it to exactly ``desired_len`` characters.

    Sequences that are too long keep only their first ``desired_len``
    characters; sequences that are too short are right-padded with 'N'.
    """
    return seq.upper()[:desired_len].ljust(desired_len, 'N')

# Normalise every mutated sequence to the common fixed length.
df['mutated_sequence_fixed'] = df['mutated_sequence'].map(fix_length)


In [82]:
from tensorflow.keras.utils import to_categorical
import numpy as np

# Integer code for each base; 'N' doubles as the bucket for unknown symbols.
base_map = {
    'A': 0,
    'C': 1,
    'G': 2,
    'T': 3,
    'N': 4,
}
NUM_BASES = len(base_map)


def one_hot_encode(seq):
    """One-hot encode a sequence, one row per character.

    Each character is looked up in ``base_map``; characters that are not in
    the map get the same code as 'N'. The codes are then expanded with
    ``to_categorical`` using ``NUM_BASES`` classes.
    """
    unknown_code = base_map['N']
    codes = []
    for base in seq:
        codes.append(base_map.get(base, unknown_code))
    return to_categorical(codes, num_classes=NUM_BASES)

# One-hot encode every fixed-length sequence and stack the results into a
# single array (one entry per row of df).
encoded_sequences = np.array(
    [one_hot_encode(seq) for seq in df['mutated_sequence_fixed']]
)


In [83]:
# Check the dimensions of the stacked one-hot array.
encoded_sequences.shape


(1215, 101, 5)

In [84]:
# Every column whose name starts with 'mc_' is treated as a label column.
label_cols = list(filter(lambda col: col.startswith('mc_'), df.columns))

# Missing entries become 0 and all label columns are stored as integers.
df[label_cols] = df[label_cols].fillna(0).astype(int)

# Final label matrix
Y = df[label_cols].to_numpy()


In [85]:
# List the distinct values of 'Germline classification' before mapping them to labels.
df['Germline classification'].unique()


array(['Benign', 'Likely benign', 'Benign/Likely benign',
       'Likely pathogenic', 'Pathogenic', 'Pathogenic/Likely pathogenic'],
      dtype=object)

In [86]:
# Collapse the germline classifications into a binary target:
# any pathogenic flavour -> 1, any benign flavour -> 0.
label_mapping = {
    "Pathogenic": 1,
    "Likely pathogenic": 1,
    "Pathogenic/Likely pathogenic": 1,
    "Benign": 0,
    "Likely benign": 0,
    "Benign/Likely benign": 0,
}

# Classifications that are not in the mapping are marked with -1.
df["label"] = df["Germline classification"].map(label_mapping).fillna(-1)

# Keep only the labelled rows, without the original classification column.
df_variants = df.loc[df["label"].ne(-1)].drop(columns=['Germline classification'])

# Display first few rows
print(df_variants.head())


                               Name Gene(s) Protein change  \
0  NM_001110792.2(MECP2):c.*8503dup   MECP2            NaN   
1  NM_001110792.2(MECP2):c.*8503del   MECP2            NaN   
2  NM_001110792.2(MECP2):c.*7856A>C   MECP2            NaN   
3  NM_001110792.2(MECP2):c.*7748C>T   MECP2            NaN   
4  NM_001110792.2(MECP2):c.*5839C>T   MECP2            NaN   

                 Condition(s)     Accession GRCh37Chromosome  \
0               Rett syndrome  VCV000143289                X   
1               Rett syndrome  VCV000143288                X   
2  not provided|Rett syndrome  VCV000143283                X   
3               Rett syndrome  VCV000143282                X   
4               Rett syndrome  VCV000143280                X   

          GRCh37Location GRCh38Chromosome         GRCh38Location  VariationID  \
0  153287314 - 153287315                X  154021863 - 154021864       143289   
1              153287315                X              154021864       143288   

In [87]:
# Show the columns of `df`, the frame that is written to disk in the final cell.
print(df.columns)

Index(['Name', 'Gene(s)', 'Protein change', 'Condition(s)', 'Accession',
       'GRCh37Chromosome', 'GRCh37Location', 'GRCh38Chromosome',
       'GRCh38Location', 'VariationID', 'AlleleID(s)', 'dbSNP ID',
       'Canonical SPDI', 'Variant type', 'Molecular consequence',
       'Germline classification', 'Germline date last evaluated',
       'Germline review status', 'Somatic clinical impact',
       'Somatic clinical impact date last evaluated',
       'Somatic clinical impact review status', 'Oncogenicity classification',
       'Oncogenicity date last evaluated', 'Oncogenicity review status',
       'Unnamed: 24', 'Sequence_ID', 'position', 'Deleted_Sequence',
       'Inserted_Sequence', 'sequence_window', 'mutated_sequence',
       'prev_position_allele', 'next_position_allele', 'aligned_ref',
       'aligned_alt', 'alignment_score', 'mc_synonymous_variant',
       'mc_3_prime_UTR_variant', 'mc_5_prime_UTR_variant',
       'mc_splice_donor_variant', 'mc_splice_acceptor_variant', 'm

In [88]:
# Write the prepared table to CSV without the row index.
# NOTE(review): this saves `df`, which still carries 'Germline classification'
# and any rows labelled -1; the filtered `df_variants` built in the labelling
# cell is not what gets written -- confirm this is intended.
df.to_csv("data/prep_clinvar_result.csv", index=False)