In [14]:
import pandas as pd
import re
import requests

In [2]:
# Three-letter -> one-letter amino acid codes. 'Ter' and '*' are the two HGVS
# spellings of a translation stop; both map to '*' so callers can detect
# nonsense variants.
_AA_THREE_TO_ONE = {
    'Ala': 'A', 'Arg': 'R', 'Asn': 'N', 'Asp': 'D', 'Cys': 'C',
    'Glu': 'E', 'Gln': 'Q', 'Gly': 'G', 'His': 'H', 'Ile': 'I',
    'Leu': 'L', 'Lys': 'K', 'Met': 'M', 'Phe': 'F', 'Pro': 'P',
    'Ser': 'S', 'Thr': 'T', 'Trp': 'W', 'Tyr': 'Y', 'Val': 'V',
    'Ter': '*', '*': '*',
}


# Function to convert three-letter amino acid code to one-letter code
def aa_three_to_one(three_letter_code):
    """Convert a three-letter amino acid code to its one-letter code.

    Parameters
    ----------
    three_letter_code : str
        Case-sensitive code such as 'Ala' or 'Trp'. The stop symbols 'Ter'
        and '*' are also accepted.

    Returns
    -------
    str
        The one-letter code, '*' for a stop, or '' when the code is not in
        the table.
    """
    return _AA_THREE_TO_ONE.get(three_letter_code, '')

In [3]:
# Function to apply missense or nonsense mutation
def apply_protein_mutation(protein_sequence, mutation):
    """Apply a single-residue HGVS protein substitution to a sequence.

    Only missense (e.g. 'p.Pro2Ser') and nonsense (e.g. 'p.Arg3Ter' or
    'p.Arg3*') variants are supported; any other form raises ValueError
    rather than silently producing a wrong sequence.

    Parameters
    ----------
    protein_sequence : str
        Wild-type sequence in one-letter codes.
    mutation : str
        Variant of the form 'p.<Ref><position><Alt>' with three-letter
        residue codes and a 1-based position.

    Returns
    -------
    str
        The sequence with the residue replaced (missense), or the sequence
        truncated just before ``position`` (nonsense).

    Raises
    ------
    ValueError
        If the string is malformed, the position lies outside the sequence,
        the reference residue does not match the sequence, or the
        replacement is not a recognised amino acid or stop.
    """
    if not mutation.startswith("p."):
        raise ValueError(f"Invalid mutation format: {mutation}")

    # fullmatch (rather than match) so trailing text such as a second
    # position in 'p.Gly4_Ser6del' is rejected instead of being ignored.
    match = re.fullmatch(r"p\.(\D+)(\d+)(\D*)", mutation)
    if not match:
        raise ValueError(f"Invalid protein mutation format: {mutation}")

    original_aa, position, new_aa = match.groups()
    position = int(position)

    # Position 0 would otherwise wrap around to the last residue, and a
    # position past the end would raise IndexError.
    if not 1 <= position <= len(protein_sequence):
        raise ValueError(
            f"Position {position} is outside the sequence (length {len(protein_sequence)})"
        )

    # Convert three-letter amino acid codes to one-letter codes
    original_aa_one = aa_three_to_one(original_aa)
    new_aa_one = aa_three_to_one(new_aa)

    # Check if the original amino acid matches the expected one in the sequence
    if protein_sequence[position - 1] != original_aa_one:
        raise ValueError(f"Original amino acid {original_aa} does not match the one in the sequence at position {position}")

    # Nonsense mutation: the stop is recognised here directly so truncation
    # works whether or not the lookup table knows the stop symbols.
    if new_aa in ("Ter", "*") or new_aa_one == "*":
        return protein_sequence[:position - 1]

    # An empty or unknown replacement (e.g. 'fs', '=', 'dup') maps to '';
    # splicing '' into the sequence would silently delete the residue.
    if not new_aa_one:
        raise ValueError(f"Unsupported or unrecognised replacement '{new_aa}' in mutation {mutation}")

    # Apply the mutation
    return protein_sequence[:position - 1] + new_aa_one + protein_sequence[position:]

In [13]:
# Function to fetch protein sequence using Ensembl API
def get_protein_sequence(transcript_id, timeout=30):
    """Fetch the protein sequence record for a transcript from Ensembl REST.

    Parameters
    ----------
    transcript_id : str
        Identifier placed in the ``/sequence/id/<id>`` path of the request.
    timeout : float or tuple, optional
        Passed to ``requests.get``. Without a timeout a stalled connection
        would block the notebook indefinitely.

    Returns
    -------
    The decoded JSON body of the response.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.RequestException
        For connection problems or when the timeout expires.
    """
    server = "https://rest.ensembl.org"
    ext = f"/sequence/id/{transcript_id}"

    response = requests.get(
        server + ext,
        params={"type": "protein"},
        headers={"Content-Type": "application/json"},
        timeout=timeout,
    )
    # raise_for_status() is a no-op for successful responses, so no separate
    # `ok` check is needed.
    response.raise_for_status()

    return response.json()

In [4]:
# Read the input spreadsheet into a DataFrame.
file_path = 'dataset.xlsx'
data = pd.read_excel(io=file_path)

In [7]:
# Columns carried over from the loaded table. .copy() yields an independent
# frame, so columns added to it later do not touch `data`.
columns_to_copy = [
    'rsIDs',
    'Transcript',
    'Allele Frequency',
    'HGVS Consequence',
]
missense_df = data.loc[:, columns_to_copy].copy()

In [15]:
# Distinct transcript IDs with everything from the first '.' onward removed
# (e.g. 'ENST00000541774.5' -> 'ENST00000541774').
unique_transcripts = {
    transcript.split('.')[0]
    for transcript in missense_df['Transcript'].unique()
}

In [16]:
# Wild-type protein sequence per transcript, keyed by the versionless ID.
wildtype_seq = {}

for transcript in unique_transcripts:
    try:
        sequence = get_protein_sequence(transcript)
        wildtype_seq[transcript] = sequence['seq']
    except Exception as e:
        # Best-effort: report the failure and continue with the remaining
        # transcripts. A failed transcript is simply absent from the dict.
        print(f"Error fetching sequence for {transcript}: {e}")

In [17]:
# Show the fetched wild-type sequences (dict keyed by versionless transcript ID).
wildtype_seq

{'ENST00000269571': 'MELAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDMLRHLYQGCQVVQGNLELTYLPTNASLSFLQDIQEVQGYVLIAHNQVRQVPLQRLRIVRGTQLFEDNYALAVLDNGDPLNNTTPVTGASPGGLRELQLRSLTEILKGGVLIQRNPQLCYQDTILWKDIFHKNNQLALTLIDTNRSRACHPCSPMCKGSRCWGESSEDCQSLTRTVCAGGCARCKGPLPTDCCHEQCAAGCTGPKHSDCLACLHFNHSGICELHCPALVTYNTDTFESMPNPEGRYTFGASCVTACPYNYLSTDVGSCTLVCPLHNQEVTAEDGTQRCEKCSKPCARVCYGLGMEHLREVRAVTSANIQEFAGCKKIFGSLAFLPESFDGDPASNTAPLQPEQLQVFETLEEITGYLYISAWPDSLPDLSVFQNLQVIRGRILHNGAYSLTLQGLGISWLGLRSLRELGSGLALIHHNTHLCFVHTVPWDQLFRNPHQALLHTANRPEDECVGEGLACHQLCARGHCWGPGPTQCVNCSQFLRGQECVEECRVLQGLPREYVNARHCLPCHPECQPQNGSVTCFGPEADQCVACAHYKDPPFCVARCPSGVKPDLSYMPIWKFPDEEGACQPCPINCTHSCVDLDDKGCPAEQRASPLTSIISAVVGILLVVVLGVVFGILIKRRQQKIRKYTMRRLLQETELVEPLTPSGAMPNQAQMRILKETELRKVKVLGSGAFGTVYKGIWIPDGENVKIPVAIKVLRENTSPKANKEILDEAYVMAGVGSPYVSRLLGICLTSTVQLVTQLMPYGCLLDHVRENRGRLGSQDLLNWCMQIAKGMSYLEDVRLVHRDLAARNVLVKSPNHVKITDFGLARLLDIDETEYHADGGKVPIKWMALESILRRRFTHQSDVWSYGVTVWELMTFGAKPYDGIPAREIPDLLEKGERLPQPPICTIDVYMIMVKCWMIDSECRPRFRELVSEFSRM

In [18]:
# Apply mutations and store results
def apply_mutation(row):
    """Return the mutated protein sequence for one row of the variant table.

    Parameters
    ----------
    row : mapping
        Must provide 'HGVS Consequence' (the variant string) and
        'Transcript' (transcript ID, optionally with a '.<version>' suffix).

    Returns
    -------
    str or None
        The mutated sequence; an 'Error: ...' string when the variant cannot
        be applied or no wild-type sequence is available for the transcript;
        None when the consequence is not a 'p.' protein variant.
    """
    mutation = row['HGVS Consequence']

    # pandas reads an empty cell as NaN (a float), which has no .startswith.
    if not isinstance(mutation, str) or not mutation.startswith("p."):
        return None

    transcript = str(row['Transcript']).split('.')[0]

    # The fetch loop skips transcripts whose download failed, so a missing
    # key is expected and must not abort the whole DataFrame.apply.
    sequence = wildtype_seq.get(transcript)
    if sequence is None:
        return f"Error: No wild-type sequence available for {transcript}"

    try:
        return apply_protein_mutation(sequence, mutation)
    except (ValueError, IndexError) as e:
        # IndexError covers a position beyond the end of the sequence.
        return f"Error: {e}"

# Run apply_mutation on every row (axis=1) and store the results in a new
# 'Mutated sequence' column.
missense_df['Mutated sequence'] = missense_df.apply(apply_mutation, axis=1)

In [19]:
# Show the table, now including the 'Mutated sequence' column.
missense_df

Unnamed: 0,rsIDs,Transcript,Allele Frequency,HGVS Consequence,Mutated sequence
0,rs781138665,ENST00000541774.5,1.304412e-06,p.Pro2Ser,MSRGSWKPQVCTGTDMKLRLPASPETHLDMLRHLYQGCQVVQGNLE...
1,rs1193961962,ENST00000541774.5,1.500189e-05,p.Arg3Trp,MPWGSWKPQVCTGTDMKLRLPASPETHLDMLRHLYQGCQVVQGNLE...
2,rs1193961962,ENST00000541774.5,5.218049e-06,p.Arg3Gly,MPGGSWKPQVCTGTDMKLRLPASPETHLDMLRHLYQGCQVVQGNLE...
3,rs966070730,ENST00000541774.5,5.222328e-06,p.Arg3Gln,MPQGSWKPQVCTGTDMKLRLPASPETHLDMLRHLYQGCQVVQGNLE...
4,rs966070730,ENST00000541774.5,1.958375e-06,p.Arg3Pro,MPPGSWKPQVCTGTDMKLRLPASPETHLDMLRHLYQGCQVVQGNLE...
5,,ENST00000541774.5,6.530177e-07,p.Gly4Val,MPRVSWKPQVCTGTDMKLRLPASPETHLDMLRHLYQGCQVVQGNLE...
6,rs1211697061,ENST00000541774.5,6.540479e-07,p.Trp6Arg,MPRGSRKPQVCTGTDMKLRLPASPETHLDMLRHLYQGCQVVQGNLE...
7,,ENST00000541774.5,6.559863e-07,p.Lys7Thr,MPRGSWTPQVCTGTDMKLRLPASPETHLDMLRHLYQGCQVVQGNLE...
8,rs4252596,ENST00000541774.5,0.1193098,p.Pro8Thr,MPRGSWKTQVCTGTDMKLRLPASPETHLDMLRHLYQGCQVVQGNLE...
9,,ENST00000541774.5,6.673705e-07,p.Val10Met,MPRGSWKPQMCTGTDMKLRLPASPETHLDMLRHLYQGCQVVQGNLE...


In [21]:
# Export the table as CSV, leaving out the DataFrame index.
file_path = 'test_10_missense_mutation.csv'
missense_df.to_csv(path_or_buf=file_path, index=False)

# Confirm where the file was written.
print(f"DataFrame saved to {file_path}")

DataFrame saved to test_10_missense_mutation.csv


In [22]:
# Export the table as JSON; orient='records' writes a list with one object per row.
file_path = 'test_10_missense_mutation.json'
missense_df.to_json(path_or_buf=file_path, orient='records')

# Confirm where the file was written.
print(f"DataFrame saved to {file_path}")

DataFrame saved to test_10_missense_mutation.json
