In [1]:
import pandas as pd

# Load the tab-separated ClinVar export and write it back out as a comma-separated file.
source_path = "data/clinvar_result (21).txt"
csv_path = "data/clinvar_result.csv"

df = pd.read_csv(source_path, sep="\t")  # adjust `sep` if the export uses another delimiter
df.to_csv(csv_path, index=False)

print("TXT file successfully converted to CSV!")


TXT file successfully converted to CSV!


In [2]:
# Show the first rows of the table loaded from the ClinVar export.
print(df.head())

                                              Name  \
0                                    Single allele   
1  GRCh38/hg38 Xp22.33-q28(chrX:10001-156030895)x1   
2  GRCh38/hg38 Xp22.33-q28(chrX:10679-156013167)x1   
3  GRCh38/hg38 Xp22.33-q28(chrX:10679-156022206)x3   
4  GRCh38/hg38 Xp22.33-q28(chrX:10679-156022206)x1   

                                             Gene(s) Protein change  \
0  ARSF|CFAP47|LOC130068640|ABCB7|ARSH|ARSL|CFP|L...            NaN   
1  ABCB7|ABCD1|ACE2|ACE2-DT|ACOT9|ACSL4|ACTRT1|AD...            NaN   
2  LOC130067918|LOC130067919|LOC130067920|LOC1300...            NaN   
3  LOC130068528|LOC130068529|LOC130068530|LOC1300...            NaN   
4  ARMCX5|ARMCX5-GPRASP2|ARMCX6|ARR3|ARSD|ARSD-AS...            NaN   

           Condition(s)     Accession GRCh37Chromosome     GRCh37Location  \
0  Autism|Schizophrenia  VCV000488014              NaN                NaN   
1             See cases  VCV000146764                X  60001 - 155260560   
2             See

In [3]:
# List the column labels of the loaded table.
print(df.columns)

Index(['Name', 'Gene(s)', 'Protein change', 'Condition(s)', 'Accession',
       'GRCh37Chromosome', 'GRCh37Location', 'GRCh38Chromosome',
       'GRCh38Location', 'VariationID', 'AlleleID(s)', 'dbSNP ID',
       'Canonical SPDI', 'Variant type', 'Molecular consequence',
       'Germline classification', 'Germline date last evaluated',
       'Germline review status', 'Somatic clinical impact',
       'Somatic clinical impact date last evaluated',
       'Somatic clinical impact review status', 'Oncogenicity classification',
       'Oncogenicity date last evaluated', 'Oncogenicity review status',
       'Unnamed: 24'],
      dtype='object')


In [4]:
def _has_four_spdi_fields(value):
    """Return True when `value` is a string containing exactly three ':' separators."""
    if not isinstance(value, str):
        return False
    return value.count(':') == 3


# Boolean mask over the rows of `df`, then the number of rows that pass the check.
valid_spdi_mask = df['Canonical SPDI'].apply(_has_four_spdi_fields)
num_valid_spdi = valid_spdi_mask.sum()

print(f"Number of valid SPDI entries: {num_valid_spdi}")


Number of valid SPDI entries: 1215


In [5]:
# Discard every row whose Canonical SPDI is not a string with exactly 3 colons,
# then renumber the surviving rows from 0.
has_valid_spdi = df['Canonical SPDI'].apply(
    lambda value: isinstance(value, str) and value.count(':') == 3
)
df = df[has_valid_spdi].reset_index(drop=True)

print(f"Remaining rows with valid SPDI: {len(df)}")


Remaining rows with valid SPDI: 1215


In [6]:


# Start from a copy so the filtered frame `df` itself is left untouched.
df_clean = df.copy()

# Target columns, in the order the fields appear inside "seq_id:position:deleted:inserted".
spdi_fields = ('Sequence_ID', 'position', 'Deleted_Sequence', 'Inserted_Sequence')

# One list of four strings per row; rows without a valid SPDI get four empty strings.
split_rows = [
    value.split(':')
    if isinstance(value, str) and value.count(':') == 3
    else [""] * len(spdi_fields)
    for value in df_clean['Canonical SPDI']
]

for field_idx, field in enumerate(spdi_fields):
    df_clean[field] = [parts[field_idx] for parts in split_rows]

# `position` is kept as text here. To work with numbers instead:
# df_clean['position'] = pd.to_numeric(df_clean['position'], errors='coerce')


In [7]:
# Spot-check the split: show the original SPDI next to the extracted deleted/inserted alleles.
print(df_clean[['Canonical SPDI', 'Deleted_Sequence', 'Inserted_Sequence']].head(10))

                                  Canonical SPDI Deleted_Sequence  \
0  NC_000023.11:154021863:GGGGGGGGGG:GGGGGGGGGGG       GGGGGGGGGG   
1    NC_000023.11:154021863:GGGGGGGGGG:GGGGGGGGG       GGGGGGGGGG   
2                     NC_000023.11:154022510:T:G                T   
3                     NC_000023.11:154022618:G:A                G   
4                     NC_000023.11:154024527:G:A                G   
5                     NC_000023.11:154025018:A:G                A   
6                     NC_000023.11:154025790:T:G                T   
7                     NC_000023.11:154026488:C:G                C   
8                     NC_000023.11:154026704:T:C                T   
9                     NC_000023.11:154026708:G:A                G   

  Inserted_Sequence  
0       GGGGGGGGGGG  
1         GGGGGGGGG  
2                 G  
3                 A  
4                 A  
5                 G  
6                 G  
7                 G  
8                 C  
9                 A 

In [8]:
# Persist the table, including the four SPDI-derived columns, without the index column.
df_clean.to_csv("data/SPDI_clinvar_result.csv", index=False)

In [9]:
import requests
# Work on an independent copy: a plain assignment would only alias df_clean, so every
# column added to df_variants in this cell would silently appear on df_clean as well.
df_variants = df_clean.copy()
def fetch_dna_sequence(position, window=50, chromosome="X", timeout=30):
    """Fetch the reference bases surrounding `position` from the Ensembl REST API.

    Args:
        position: Coordinate at the centre of the window, in the coordinate system
            expected by the Ensembl region endpoint; anything `int()` accepts.
        window: Number of bases requested on each side of `position`.
        chromosome: Region name placed in the request URL (defaults to "X").
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body with surrounding whitespace stripped, or a string of
        `2 * window + 1` "N" characters when the request fails, times out, or
        returns a non-200 status.

    Note:
        `start` is clamped to 1, so for positions within `window` bases of the
        start of the chromosome the returned string is shorter and `position`
        is no longer at index `window`.
    """
    start = max(1, int(position) - window)
    end = int(position) + window
    url = f"https://rest.ensembl.org/sequence/region/human/{chromosome}:{start}..{end}?content-type=text/plain"
    placeholder = "N" * (2 * window + 1)

    try:
        # Without a timeout a single stalled connection blocks the whole run.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Treat transport errors like a non-200 reply so one bad request
        # does not abort a long batch of lookups.
        return placeholder

    if response.status_code == 200:
        return response.text.strip()
    return placeholder

def _fetch_window_for_spdi_position(pos):
    """Fetch the window centred on the first reference base of an SPDI variant.

    SPDI positions are 0-based, whereas the Ensembl region endpoint uses 1-based
    inclusive coordinates, so the position is shifted by one before the lookup.
    Without the shift the window is centred on the base *before* the variant.
    Missing or non-numeric positions (for example "" or "N/A") yield 101 "N"s.
    """
    try:
        zero_based = int(pos)
    except (TypeError, ValueError):
        return "N" * 101
    return fetch_dna_sequence(zero_based + 1)


# Apply function to get sequence windows (Chromosome is always 'X')
df_variants["sequence_window"] = df_variants["position"].apply(_fetch_window_for_spdi_position)

# Replace the reference allele with the alternate allele, starting at window index 50
def mutate_sequence(sequence, variant, position=50, ref_length=1):
    """Return `sequence` with `ref_length` bases at `position` replaced by `variant`.

    Args:
        sequence: Reference window as a string.
        variant: Alternate allele to splice in (may be empty for a pure deletion).
        position: 0-based index in `sequence` of the first replaced base.
        ref_length: Number of reference bases removed. The default of 1 is a
            single-base substitution; pass the length of the deleted allele for
            deletions, insertions (0) and delins.

    Returns:
        The mutated window; its length changes by `len(variant) - ref_length`.
    """
    return sequence[:position] + variant + sequence[position + ref_length:]

# Convert alternate alleles to strings and handle NaNs
df_variants["Inserted_Sequence"] = df_variants["Inserted_Sequence"].fillna("N").astype(str)

# Apply function to mutate sequence. The number of reference bases removed is the
# length of the SPDI deleted allele; replacing a single base regardless of that
# length produces wrong sequences for every variant that is not a 1-base substitution.
# Rows without a usable Deleted_Sequence fall back to the single-base replacement.
df_variants["mutated_sequence"] = df_variants.apply(
    lambda row: mutate_sequence(
        row["sequence_window"],
        row["Inserted_Sequence"],
        ref_length=len(row["Deleted_Sequence"]) if isinstance(row["Deleted_Sequence"], str) else 1,
    )
    if row["sequence_window"] != "N" * 101 else row["sequence_window"],
    axis=1
)

def get_prev_alleles(sequence, position=50):
    """Return the base immediately before index `position` in `sequence`.

    Returns "N" when there is no such base: `position` is 0 or negative, or the
    sequence is too short to contain index `position - 1` (for example an empty
    or truncated window). This mirrors the bounds check in `get_next_alleles`
    instead of raising IndexError.
    """
    if 0 < position <= len(sequence):
        return sequence[position - 1]
    return "N"

def get_next_alleles(sequence, position=50):
    """Return the base right after index `position`, or "N" when `position` is at or past the last index."""
    last_index = len(sequence) - 1
    if position >= last_index:
        return "N"
    return sequence[position + 1]

# Derive the bases flanking the window centre, one output column per extractor.
for column, extractor in (
    ("prev_position_allele", get_prev_alleles),
    ("next_position_allele", get_next_alleles),
):
    df_variants[column] = df_variants["sequence_window"].apply(extractor)


KeyboardInterrupt: 

In [None]:
# Write df_variants to CSV without the index column; the following cell reads this same path back.
df_variants.to_csv("data/variant_clinvar_result.csv", index=False)

In [10]:
# Reload the variant table from the CSV file on disk.
# NOTE(review): a CSV round trip lets pandas re-infer dtypes (e.g. `position` may come back
# numeric and empty strings as NaN) — confirm the downstream cells tolerate this.
file = 'data/variant_clinvar_result.csv'
df_variants = pd.read_csv(file)

In [11]:
import random

# Column-wise alignment score
def calculate_fitness(seq1, seq2, match=1, mismatch=-1, gap=-2):
    """Score two aligned strings column by column.

    A column containing '-' on either side contributes `gap`, equal characters
    contribute `match`, and anything else contributes `mismatch`. Columns are
    paired with `zip`, so scoring stops at the end of the shorter string.
    """
    def column_score(a, b):
        if '-' in (a, b):
            return gap
        return match if a == b else mismatch

    return sum(column_score(a, b) for a, b in zip(seq1, seq2))

# Bring both sequences to a common length by right-padding the shorter one
def pad_sequences(seq1, seq2, pad_char='-'):
    """Return `(seq1, seq2)` right-padded with `pad_char` to the length of the longer input."""
    target = len(seq1) if len(seq1) >= len(seq2) else len(seq2)
    padded_first = seq1.ljust(target, pad_char)
    padded_second = seq2.ljust(target, pad_char)
    return padded_first, padded_second

# Genetic algorithm for single pair alignment
def genetic_align(seq1, seq2, pop_size=50, generations=100, seed=None):
    """Align two sequences with a small genetic algorithm.

    Every candidate is a pair of equal-length strings made of the input
    characters plus '-' gaps. Candidates are scored with `calculate_fitness`,
    the 10 best survive each generation unchanged, and the rest of the next
    generation is bred from the 20 best by crossover and random gap insertion.

    All operators preserve the residues: removing '-' from either returned row
    gives back the corresponding input (with any '-' it already contained
    removed). A crossover that cuts both parents at the same *column* does not
    have this property, because parents carry different gap layouts and the
    spliced child can lose or duplicate bases.

    Args:
        seq1, seq2: Sequences to align. '-' is always interpreted as a gap.
        pop_size: Number of candidates kept per generation (at least 1).
        generations: Number of breeding rounds.
        seed: Optional seed. When given, a private random generator is used so
            the result is reproducible and the global `random` state is left
            untouched; when None, the global `random` module is used.

    Returns:
        `((aligned_seq1, aligned_seq2), score)` for the best candidate found.
        Two empty inputs return `(("", ""), 0)`.

    Raises:
        ValueError: If `pop_size` is smaller than 1.
    """
    if pop_size < 1:
        raise ValueError("pop_size must be at least 1")

    rng = random.Random(seed) if seed is not None else random
    gap = '-'

    def fitness(pair):
        return calculate_fitness(pair[0], pair[1])

    def normalize(row1, row2):
        # Right-pad to a common width and drop columns that are gaps in both
        # rows; such columns carry no information and only cost a gap penalty.
        width = max(len(row1), len(row2))
        columns = [
            (a, b)
            for a, b in zip(row1.ljust(width, gap), row2.ljust(width, gap))
            if a != gap or b != gap
        ]
        return (''.join(a for a, _ in columns), ''.join(b for _, b in columns))

    seq1, seq2 = pad_sequences(seq1, seq2)
    baseline = normalize(seq1, seq2)
    if not baseline[0]:
        # Nothing to align (both inputs empty or made of gaps only).
        return baseline, 0

    def with_random_gap(row):
        idx = rng.randint(0, len(row))
        return row[:idx] + gap + row[idx:]

    def random_individual():
        # Scatter up to five gaps into each row independently.
        row1, row2 = baseline
        for _ in range(rng.randint(0, 5)):
            row1 = with_random_gap(row1)
        for _ in range(rng.randint(0, 5)):
            row2 = with_random_gap(row2)
        return normalize(row1, row2)

    def residue_cut(row, n_residues):
        # Index in `row` just past its first `n_residues` non-gap characters.
        if n_residues == 0:
            return 0
        seen = 0
        for idx, char in enumerate(row):
            if char != gap:
                seen += 1
                if seen == n_residues:
                    return idx + 1
        return len(row)

    def crossover(left, right):
        # Head: the first `cut` columns of `left`. Tail: for each row, whatever
        # `right` places after the residues already present in the head.
        if len(left[0]) < 3:
            return left  # too short to cut strictly inside the alignment
        cut = rng.randint(1, len(left[0]) - 2)
        heads = [row[:cut] for row in left]
        tails = [
            row[residue_cut(row, len(head) - head.count(gap)):]
            for head, row in zip(heads, right)
        ]
        # Both rows of `right` end at the same column, so right-justifying the
        # tails keeps the column pairing `right` had in that region.
        width = max(len(tail) for tail in tails)
        return normalize(*(head + tail.rjust(width, gap) for head, tail in zip(heads, tails)))

    def mutate(pair):
        # Insert a gap into ONE row; a gap in both rows at the same column
        # would only add a penalised all-gap column.
        rows = list(pair)
        target = rng.randrange(2)
        rows[target] = with_random_gap(rows[target])
        return normalize(*rows)

    # Initialize population; the ungapped pairing is always included so the
    # result is never worse than leaving the sequences as they are.
    population = [baseline] + [random_individual() for _ in range(pop_size - 1)]

    for _ in range(generations):
        # Evaluate fitness
        population.sort(key=fitness, reverse=True)
        next_generation = population[:10]  # Elitism
        breeders = population[:20]

        while len(next_generation) < pop_size:
            parent_a, parent_b = rng.sample(breeders, 2)
            for child in (crossover(parent_a, parent_b), crossover(parent_b, parent_a)):
                if rng.random() < 0.3:
                    child = mutate(child)
                next_generation.append(child)

        population = next_generation[:pop_size]

    # Children of the final generation have not been ranked yet, so pick the
    # best explicitly instead of assuming it sits at index 0.
    best = max(population, key=fitness)
    return best, fitness(best)


In [12]:
# Align the reference and mutated windows of the first row and print the result.
first_variant = df_variants.iloc[0]
ref_seq = first_variant["sequence_window"]
mut_seq = first_variant["mutated_sequence"]

best_pair, score = genetic_align(ref_seq, mut_seq)
aligned1, aligned2 = best_pair

print("Best Alignment:", aligned1, aligned2, sep="\n")
print("Score:", score)


Best Alignment:
AAATTTATAAGGCAAACTCTTTATATAATAAATAGGTTTTAACAGGGATCAGGGGGGGGGGGGGGTGGTTTTGAAGTATACAGGTACATTCAAGACAAGGTTTTAC------
AAATTTATAAGGCAAACTCTTTATATAATAAATAGGTTTTAACAGGGATCAGGGGGGGGGGGGGGGGGGGGGGGTTTTTTGAAACGTATACAAGGTAACATTTCAGAATTAC
Score: 48


In [13]:
# The aligner draws from the global `random` generator; seed it so the alignments
# (and the CSV written from them) are reproducible from one run to the next.
ALIGNMENT_SEED = 42
random.seed(ALIGNMENT_SEED)

# Align every reference window with its mutated counterpart, in row order.
alignments = []
for ref, alt in zip(df_variants["sequence_window"], df_variants["mutated_sequence"]):
    (a1, a2), score = genetic_align(ref, alt)
    alignments.append((a1, a2, score))

# Build each column explicitly: unpacking `zip(*alignments)` into three names
# raises ValueError when the frame has no rows.
df_variants["aligned_ref"] = [aligned_ref for aligned_ref, _, _ in alignments]
df_variants["aligned_alt"] = [aligned_alt for _, aligned_alt, _ in alignments]
df_variants["alignment_score"] = [pair_score for _, _, pair_score in alignments]


In [14]:
# Consequence terms to turn into 0/1 indicator columns (matched as substrings).
mc_labels = [
    "synonymous variant",
    "frameshift variant",
    "3 prime UTR variant",
    "5 prime UTR variant",
    "splice donor variant",
    "splice acceptor variant",
    "nonsense",
    "intron variant",
    "missense variant",
    "stop lost",
]

# Stringify the column once; each label is then tested against every row's text.
consequence_text = [str(value) for value in df_variants["Molecular consequence"]]
for label in mc_labels:
    column_name = "mc_" + label.replace(" ", "_")
    df_variants[column_name] = [1 if label in text else 0 for text in consequence_text]


print("\n✅ Sample After Modification:")
print(df_variants.head())



✅ Sample After Modification:
                               Name Gene(s) Protein change  \
0  NM_001110792.2(MECP2):c.*8503dup   MECP2            NaN   
1  NM_001110792.2(MECP2):c.*8503del   MECP2            NaN   
2  NM_001110792.2(MECP2):c.*7856A>C   MECP2            NaN   
3  NM_001110792.2(MECP2):c.*7748C>T   MECP2            NaN   
4  NM_001110792.2(MECP2):c.*5839C>T   MECP2            NaN   

                 Condition(s)     Accession GRCh37Chromosome  \
0               Rett syndrome  VCV000143289                X   
1               Rett syndrome  VCV000143288                X   
2  not provided|Rett syndrome  VCV000143283                X   
3               Rett syndrome  VCV000143282                X   
4               Rett syndrome  VCV000143280                X   

          GRCh37Location GRCh38Chromosome         GRCh38Location  VariationID  \
0  153287314 - 153287315                X  154021863 - 154021864       143289   
1              153287315                X         

In [15]:
# List all columns of df_variants, including the ones added by the preceding cells.
print(df_variants.columns)

Index(['Name', 'Gene(s)', 'Protein change', 'Condition(s)', 'Accession',
       'GRCh37Chromosome', 'GRCh37Location', 'GRCh38Chromosome',
       'GRCh38Location', 'VariationID', 'AlleleID(s)', 'dbSNP ID',
       'Canonical SPDI', 'Variant type', 'Molecular consequence',
       'Germline classification', 'Germline date last evaluated',
       'Germline review status', 'Somatic clinical impact',
       'Somatic clinical impact date last evaluated',
       'Somatic clinical impact review status', 'Oncogenicity classification',
       'Oncogenicity date last evaluated', 'Oncogenicity review status',
       'Unnamed: 24', 'Sequence_ID', 'position', 'Deleted_Sequence',
       'Inserted_Sequence', 'sequence_window', 'mutated_sequence',
       'prev_position_allele', 'next_position_allele', 'aligned_ref',
       'aligned_alt', 'alignment_score', 'mc_synonymous_variant',
       'mc_frameshift_variant', 'mc_3_prime_UTR_variant',
       'mc_5_prime_UTR_variant', 'mc_splice_donor_variant',
       

In [16]:
# Write the final df_variants table to CSV without the index column.
df_variants.to_csv("data/aligned_clinvar_result.csv", index=False)