# Analysis Plan for MECP2, CDKL5, and FOXG1 Variants in Rett Syndrome

### Conversion of .txt to .csv


In [14]:
import pandas as pd

# Read the tab-separated ClinVar export for CDKL5 into a DataFrame
df = pd.read_csv("datap/clinvar_result_CDKL5.txt", sep="\t")

# Write the same table back out comma-separated, leaving out the index column
df.to_csv("datap/clinvar_result_CDKL5.csv", index=False)

print("✅ TXT file successfully converted to CSV!")


✅ TXT file successfully converted to CSV!


In [15]:
# Read the tab-separated ClinVar export for MECP2 into a DataFrame
df = pd.read_csv("datap/clinvar_result_MECP2.txt", sep="\t")

# Write the same table back out comma-separated, leaving out the index column
df.to_csv("datap/clinvar_result_MECP2.csv", index=False)

print("✅ TXT file successfully converted to CSV!")


✅ TXT file successfully converted to CSV!


In [16]:
# Read the tab-separated ClinVar export for FOXG1 into a DataFrame
df = pd.read_csv("datap/clinvar_result_FOXG1.txt", sep="\t")

# Write the same table back out comma-separated, leaving out the index column
df.to_csv("datap/clinvar_result_FOXG1.csv", index=False)

print("✅ TXT file successfully converted to CSV!")


✅ TXT file successfully converted to CSV!


In [17]:
# Per-gene CSV files, listed in the order they are read and concatenated
csv_files = [
    "datap/clinvar_result_MECP2.csv",
    "datap/clinvar_result_CDKL5.csv",
    "datap/clinvar_result_FOXG1.csv",
]

In [23]:
# Read each per-gene CSV, then stack the frames into one table with a fresh 0..n-1 index
gene_frames = [pd.read_csv(csv_path) for csv_path in csv_files]
df = pd.concat(gene_frames, ignore_index=True)

# Persist the combined table
df.to_csv("datap/clinvar_result.csv", index=False)

In [27]:

# Boolean mask: True where 'Condition(s)' contains the phrase "Rett syndrome"
# (on its own or next to other conditions); rows with a missing value count as False
rett_mask = df["Condition(s)"].str.contains(r"\bRett syndrome\b", regex=True, na=False)
df_res = df[rett_mask]

# Write the matching rows to disk
df_res.to_csv("filtered_variants.csv", index=False)

# Preview the first rows of the filtered table
print(df_res.head())


                               Name Gene(s) Protein change  \
0  NM_001110792.2(MECP2):c.*7856A>C   MECP2            NaN   
1  NM_001110792.2(MECP2):c.*7748C>T   MECP2            NaN   
2  NM_001110792.2(MECP2):c.*5839C>T   MECP2            NaN   
3  NM_001110792.2(MECP2):c.*5348T>C   MECP2            NaN   
4  NM_001110792.2(MECP2):c.*4576A>C   MECP2            NaN   

                 Condition(s)     Accession GRCh37Chromosome  GRCh37Location  \
0  not provided|Rett syndrome  VCV000143283                X       153287962   
1               Rett syndrome  VCV000143282                X       153288070   
2               Rett syndrome  VCV000143280                X       153289979   
3  not provided|Rett syndrome  VCV000143275                X       153290470   
4               Rett syndrome  VCV000143271                X       153291242   

  GRCh38Chromosome  GRCh38Location  VariationID  ... Germline classification  \
0                X       154022511       143283  ...              

### Load CSV file into a DataFrame

In [29]:

import requests

# Reload the combined ClinVar table from disk
df = pd.read_csv("datap/clinvar_result.csv")

# Show the first 5 rows
print(df.head(n=5))


                               Name Gene(s) Protein change  \
0  NM_001110792.2(MECP2):c.*7856A>C   MECP2            NaN   
1  NM_001110792.2(MECP2):c.*7748C>T   MECP2            NaN   
2  NM_001110792.2(MECP2):c.*5839C>T   MECP2            NaN   
3  NM_001110792.2(MECP2):c.*5348T>C   MECP2            NaN   
4  NM_001110792.2(MECP2):c.*4576A>C   MECP2            NaN   

                 Condition(s)     Accession GRCh37Chromosome  GRCh37Location  \
0  not provided|Rett syndrome  VCV000143283                X       153287962   
1               Rett syndrome  VCV000143282                X       153288070   
2               Rett syndrome  VCV000143280                X       153289979   
3  not provided|Rett syndrome  VCV000143275                X       153290470   
4               Rett syndrome  VCV000143271                X       153291242   

  GRCh38Chromosome  GRCh38Location  VariationID  ... Germline classification  \
0                X       154022511       143283  ...              

### Split Canonical SPDI into four separate columns

In [35]:
if "Canonical SPDI" in df.columns:
    # A canonical SPDI string has the form "sequence:position:deletion:insertion";
    # expand=True returns one column per colon-separated component.
    spdi_split = df["Canonical SPDI"].str.split(":", expand=True)

    if spdi_split.shape[1] != 4:
        print(
            "Warning: Canonical SPDI did not split into exactly four components; "
            "absent components are set to 'Missing' and extra components are dropped."
        )
        # reindex guarantees exactly columns 0..3: it pads columns that the split did
        # not produce and discards any beyond the fourth, so the assignment below
        # always receives four columns (slicing alone fails when there are fewer).
        spdi_split = spdi_split.reindex(columns=range(4)).fillna("Missing")

    df[['Sequence_ID', 'position', 'Deleted_Sequence', 'Inserted_Sequence']] = spdi_split

    # Convert position to a number for numerical analysis; anything non-numeric
    # (including the "Missing" filler) becomes NaN
    df["position"] = pd.to_numeric(df["position"], errors="coerce")

    print("✅ Canonical SPDI successfully split into separate columns.")
else:
    # Do not report success when there was nothing to split
    print("Warning: 'Canonical SPDI' column not found; no SPDI columns were created.")


✅ Canonical SPDI successfully split into separate columns.


### Labeling mutation types

In [36]:

# Define a function to label mutation types
def label_mutation_type(row):
    """Classify a single-base substitution as a transition or a transversion.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing
            'Deleted_Sequence' and 'Inserted_Sequence'.

    Returns:
        'Transition' for purine<->purine or pyrimidine<->pyrimidine changes,
        'Transversion' for purine<->pyrimidine changes, and 'Unknown' for
        everything else: missing values, sequences that are not exactly one
        of A/C/G/T, or identical deleted and inserted bases (no substitution).
    """
    purines = {'A', 'G'}
    pyrimidines = {'C', 'T'}
    deleted = row['Deleted_Sequence']
    inserted = row['Inserted_Sequence']

    # Missing values (NaN/None) are not strings and cannot be classified
    if not (isinstance(deleted, str) and isinstance(inserted, str)):
        return 'Unknown'

    single_bases = purines | pyrimidines
    if deleted not in single_bases or inserted not in single_bases:
        return 'Unknown'

    # The same base on both sides is not a substitution at all
    if deleted == inserted:
        return 'Unknown'

    stays_in_class = (deleted in purines) == (inserted in purines)
    return 'Transition' if stays_in_class else 'Transversion'

# Label every variant; axis="columns" passes each row to the function in turn
df["Mutation_Type"] = df.apply(label_mutation_type, axis="columns")



### Combine Deleted and Inserted Sequences and Find the Most Frequent Change

In [37]:
# df_new is a second name for df (plain assignment, no copy), so the column
# added below is visible through both names
df_new = df

# Build a "<deleted>><inserted>" label for every variant
deleted_seq = df_new['Deleted_Sequence']
inserted_seq = df_new['Inserted_Sequence']
df_new['BP_Mutation'] = deleted_seq + '>' + inserted_seq

# Tally how often each label occurs
bp_mutation_counts = df_new['BP_Mutation'].value_counts()

# Pick out the most common label and the number of times it appears
most_frequent_bp_mutation = bp_mutation_counts.idxmax()
most_frequent_bp_mutation_count = bp_mutation_counts.max()

print(f"✅ The most frequent base pair mutation is: {most_frequent_bp_mutation}")
print(f"✅ It occurs {most_frequent_bp_mutation_count} times.")


✅ The most frequent base pair mutation is: G>A
✅ It occurs 496 times.


In [None]:
# Alias, not a copy: df_variants refers to the same DataFrame object as df_new,
# so columns added or dropped through df_variants are visible through df_new too.
df_variants = df_new
def fetch_dna_sequence(position, window=50, chromosome="X", zero_based=False, timeout=30):
    """Fetch a reference DNA window around a position from the Ensembl REST API.

    Requests the inclusive region ``centre - window .. centre + window`` (start
    clamped to 1) from the ``sequence/region/human`` endpoint as plain text.

    Args:
        position: Coordinate of the variant; anything ``int()`` accepts.
        window: Number of bases requested on each side of the centre.
        chromosome: Chromosome name used in the region string (default "X").
        zero_based: Set True when ``position`` is 0-based (as in SPDI); it is
            shifted by one because Ensembl region coordinates are 1-based.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The sequence text on HTTP 200. Otherwise a placeholder of
        ``2 * window + 1`` "N" characters: when ``position`` or ``chromosome``
        is missing/unusable, the request raises, or the status is not 200.
    """
    placeholder = "N" * (2 * window + 1)

    try:
        center = int(position)
    except (TypeError, ValueError, OverflowError):
        # int(float("nan")) raises ValueError: missing positions get the placeholder
        return placeholder
    if zero_based:
        center += 1

    # Normalise the chromosome label; NaN is the only value unequal to itself
    if chromosome is None or chromosome != chromosome:
        return placeholder
    if isinstance(chromosome, float):
        if not chromosome.is_integer():
            return placeholder
        chromosome = int(chromosome)  # e.g. 14.0 read from a CSV -> "14"
    chromosome = str(chromosome).strip()
    if not chromosome:
        return placeholder

    start = max(1, center - window)
    end = center + window
    url = f"https://rest.ensembl.org/sequence/region/human/{chromosome}:{start}..{end}?content-type=text/plain"

    try:
        # Without a timeout a stalled connection can block the notebook indefinitely
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        return placeholder

    if response.status_code == 200:
        return response.text.strip()
    return placeholder  # If request fails, return Ns

# Fetch a sequence window for every variant.
# - The table combines MECP2, CDKL5 and FOXG1; FOXG1 is not on chromosome X, so the
#   chromosome is taken per row from 'GRCh38Chromosome' (falling back to "X" when
#   that column is absent).
# - 'position' comes from the Canonical SPDI, whose coordinates are 0-based, hence
#   zero_based=True so the variant base sits at index 50 of the returned window.
# - NOTE(review): assumes SPDI positions are on GRCh38, the assembly served by
#   rest.ensembl.org — confirm.
if "GRCh38Chromosome" in df_variants.columns:
    variant_chromosomes = df_variants["GRCh38Chromosome"]
else:
    variant_chromosomes = ["X"] * len(df_variants)

df_variants["sequence_window"] = [
    fetch_dna_sequence(pos, chromosome=chrom, zero_based=True)
    for pos, chrom in zip(df_variants["position"], variant_chromosomes)
]

# Replace reference allele with alternate allele at position 51
def mutate_sequence(sequence, variant, position=50):
    """Return ``sequence`` with the single character at index ``position`` replaced by ``variant``.

    ``position`` is a 0-based index, so the default of 50 targets the 51st character.
    """
    upstream = sequence[:position]
    downstream = sequence[position + 1:]
    return upstream + variant + downstream

# Normalise alternate alleles: missing values become "N" and every entry is a str
alt_alleles = df_variants["Inserted_Sequence"].fillna("N")
df_variants["Inserted_Sequence"] = alt_alleles.astype(str)


def _apply_variant(row):
    """Return the row's window with the alternate allele applied; all-N placeholders pass through."""
    window_seq = row["sequence_window"]
    if window_seq == "N" * 101:
        return window_seq
    return mutate_sequence(window_seq, row["Inserted_Sequence"])


# Build the mutated sequence row by row
df_variants["mutated_sequence"] = df_variants.apply(_apply_variant, axis=1)

def get_prev_alleles(sequence, position=50):
    """Return the character immediately before index ``position`` in ``sequence``.

    Returns "N" when there is no such character: ``position`` is 0 or negative,
    or the sequence is too short to contain index ``position - 1`` (for example
    a truncated window).
    """
    # Bounds-check both ends so a short sequence yields "N" instead of IndexError
    if 0 < position <= len(sequence):
        return sequence[position - 1]
    return "N"

def get_next_alleles(sequence, position=50):
    """Return the character immediately after index ``position``, or "N" when ``position`` is at or past the last index."""
    last_index = len(sequence) - 1
    if position >= last_index:
        return "N"
    return sequence[position + 1]

# Derive the flanking-allele columns from the sequence window, one extractor per column
flank_extractors = (
    ("prev_position_allele", get_prev_alleles),
    ("next_position_allele", get_next_alleles),
)
for flank_column, extractor in flank_extractors:
    df_variants[flank_column] = df_variants["sequence_window"].apply(extractor)


In [None]:
# Columns to remove, grouped by name prefix
germline_columns = [
    'Germline date last evaluated',
    'Germline review status',
]
somatic_columns = [
    'Somatic clinical impact',
    'Somatic clinical impact date last evaluated',
    'Somatic clinical impact review status',
]
oncogenicity_columns = [
    'Oncogenicity classification',
    'Oncogenicity date last evaluated',
    'Oncogenicity review status',
]
columns_to_drop = germline_columns + somatic_columns + oncogenicity_columns + ['Unnamed: 24']

# Drop in place; errors='ignore' skips any listed column that is not present
df_variants.drop(columns=columns_to_drop, errors='ignore', inplace=True)

# Show the result. df is printed here; earlier plain assignments bound df, df_new and
# df_variants to one DataFrame object, so the in-place drop is reflected in df.
print("✅ Unwanted columns dropped successfully.")
print(df.head())


✅ Unwanted columns dropped successfully.
                               Name Gene(s) Protein change  \
0  NM_001110792.2(MECP2):c.*7856A>C   MECP2            NaN   
1  NM_001110792.2(MECP2):c.*7748C>T   MECP2            NaN   
2  NM_001110792.2(MECP2):c.*5839C>T   MECP2            NaN   
3  NM_001110792.2(MECP2):c.*5348T>C   MECP2            NaN   
4  NM_001110792.2(MECP2):c.*4576A>C   MECP2            NaN   

                 Condition(s)     Accession GRCh37Chromosome  GRCh37Location  \
0  not provided|Rett syndrome  VCV000143283                X       153287962   
1               Rett syndrome  VCV000143282                X       153288070   
2               Rett syndrome  VCV000143280                X       153289979   
3  not provided|Rett syndrome  VCV000143275                X       153290470   
4               Rett syndrome  VCV000143271                X       153291242   

  GRCh38Chromosome  GRCh38Location  VariationID  ...  \
0                X       154022511       143283  

In [None]:
# Write the labelled table to disk, leaving out the index column
labeled_csv_path = 'data/clinvar_result(9)_labeled.csv'
df_variants.to_csv(labeled_csv_path, index=False)

print("✅ Mutation type labeling complete. Saved to 'data/clinvar_result(9)_labeled.csv'")
