In [1]:
import pandas as pd
import re
import requests

In [2]:
# Function to convert three-letter amino acid code to one-letter code
def aa_three_to_one(three_letter_code):
    """Convert a three-letter amino acid code to its one-letter code.

    The table covers the 20 standard amino acids. The lookup ignores letter
    case and surrounding whitespace, so 'Glu', 'GLU' and ' glu ' all give 'E'.

    Parameters
    ----------
    three_letter_code : str
        Three-letter amino acid code, e.g. 'Ala'.

    Returns
    -------
    str
        The one-letter code, or '' when the input is not a string or is not
        one of the 20 codes in the table (for example 'Ter').
    """
    aa_dict = {
        'Ala': 'A', 'Arg': 'R', 'Asn': 'N', 'Asp': 'D', 'Cys': 'C',
        'Glu': 'E', 'Gln': 'Q', 'Gly': 'G', 'His': 'H', 'Ile': 'I',
        'Leu': 'L', 'Lys': 'K', 'Met': 'M', 'Phe': 'F', 'Pro': 'P',
        'Ser': 'S', 'Thr': 'T', 'Trp': 'W', 'Tyr': 'Y', 'Val': 'V'
    }
    if not isinstance(three_letter_code, str):
        return ''
    # Normalise to the 'Xxx' capitalisation used by the table keys.
    return aa_dict.get(three_letter_code.strip().capitalize(), '')

In [3]:
# Function to fetch protein sequence using Ensembl API
def get_protein_sequence(transcript_id, timeout=30):
    """Fetch the protein sequence record for a transcript from the Ensembl REST API.

    Parameters
    ----------
    transcript_id : str
        Identifier placed in the ``/sequence/id/<id>?type=protein`` request path.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so that a stalled connection raises instead
        of blocking the notebook forever. Defaults to 30 seconds.

    Returns
    -------
    object
        The decoded JSON body of the response. The notebook reads its 'seq' entry.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems, timeouts, or an HTTP error status.
    """
    server = "https://rest.ensembl.org"
    ext = f"/sequence/id/{transcript_id}?type=protein"

    response = requests.get(
        server + ext,
        headers={"Content-Type": "application/json"},
        timeout=timeout,
    )
    # raise_for_status() is a no-op on success, so no separate `ok` check is needed.
    response.raise_for_status()

    return response.json()

In [13]:
# Compiled once when the cell runs; applied with fullmatch() so the whole
# string must have the form p.<letters><digits>Ter.
_STOP_GAINED_PATTERN = re.compile(r'p\.([A-Za-z]+)(\d+)Ter')


def parse_hgvs_stop_gained(hgvs_consequence):
    """Parse a stop-gained HGVS protein consequence such as 'p.Glu2Ter'.

    Parameters
    ----------
    hgvs_consequence : str
        Consequence string; leading and trailing whitespace is ignored.

    Returns
    -------
    tuple of (str, int)
        The residue code exactly as written in the string, and its position.

    Raises
    ------
    ValueError
        If the input is not a string or does not consist solely of
        ``p.<letters><digits>Ter`` (trailing text after 'Ter' is rejected).
    """
    if not isinstance(hgvs_consequence, str):
        # e.g. NaN from a missing cell; report it the same way as a bad string.
        raise ValueError(f"Invalid HGVS consequence format: {hgvs_consequence}")

    match = _STOP_GAINED_PATTERN.fullmatch(hgvs_consequence.strip())
    if match is None:
        raise ValueError(f"Invalid HGVS consequence format: {hgvs_consequence}")

    return (match.group(1), int(match.group(2)))

In [14]:
def apply_stop_gained(wildtype_sequence, hgvs_consequence):
    """Return the wild-type sequence with the affected residue replaced by '*'.

    Parameters
    ----------
    wildtype_sequence : str
        Protein sequence in one-letter codes.
    hgvs_consequence : str
        Stop-gained consequence, parsed by ``parse_hgvs_stop_gained``.

    Returns
    -------
    str
        A sequence of the same length with '*' at the mutated position.
        Residues after that position are left unchanged.

    Raises
    ------
    ValueError
        If the consequence cannot be parsed, the position lies outside the
        sequence, the residue code is unknown, or the residue at that
        position does not match the one named in the consequence.
    """
    start_aa, pos = parse_hgvs_stop_gained(hgvs_consequence)

    # Positions are 1-based. Without this check, pos == 0 would index the last
    # residue via wildtype_sequence[-1], and pos > len would raise IndexError.
    if not 1 <= pos <= len(wildtype_sequence):
        raise ValueError(
            f"Position {pos} is outside the sequence (length {len(wildtype_sequence)})"
        )

    # Convert start_aa to one-letter code
    start_aa_one_letter = aa_three_to_one(start_aa)
    if not start_aa_one_letter:
        # aa_three_to_one returns '' for codes it does not know.
        raise ValueError(f"Unknown amino acid code: {start_aa}")

    # Validate the amino acid at the specified position
    if wildtype_sequence[pos - 1] != start_aa_one_letter:
        raise ValueError(f"Amino acid at position {pos} is not {start_aa_one_letter}")

    # Replace the amino acid at the specified position with a stop codon
    return wildtype_sequence[:pos - 1] + '*' + wildtype_sequence[pos:]

In [23]:
# Load the variant table that the following cells subset and annotate.
latest_df = pd.read_csv(filepath_or_buffer='dataset_v2.csv')

In [28]:
# Columns carried forward; .copy() detaches the subset from latest_df.
columns_to_copy = [
    'rsIDs',
    'Transcript',
    'Allele Frequency',
    'HGVS Consequence',
    'VEP Annotation',
]
stop_gained_df = latest_df.loc[:, columns_to_copy].copy()

In [29]:
# Keep only rows annotated as stop_gained. The explicit .copy() makes the
# result an independent frame, so adding a column to it later does not
# trigger pandas' SettingWithCopyWarning.
stop_gained_df = stop_gained_df[stop_gained_df['VEP Annotation'] == 'stop_gained'].copy()

In [30]:
# Distinct transcript IDs with everything from the first '.' onward removed
# (e.g. 'ENST00000269571.10' -> 'ENST00000269571').
unique_transcripts = {
    versioned_id.split('.')[0]
    for versioned_id in stop_gained_df['Transcript'].unique()
}

In [31]:
# Map each unversioned transcript ID to its wild-type protein sequence.
wildtype_seq = {}

for transcript in unique_transcripts:
    try:
        sequence = get_protein_sequence(transcript)
        wildtype_seq[transcript] = sequence['seq']
    except Exception as e:
        # Best effort: report the failure and continue with the remaining
        # transcripts. The message uses the loop variable `transcript`; the
        # previous name `transcript_id` was undefined here and raised a
        # NameError inside the handler.
        print(f"Error fetching sequence for {transcript}: {e}")

In [32]:
# Show the fetched sequences, keyed by unversioned transcript ID.
wildtype_seq

{'ENST00000269571': 'MELAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDMLRHLYQGCQVVQGNLELTYLPTNASLSFLQDIQEVQGYVLIAHNQVRQVPLQRLRIVRGTQLFEDNYALAVLDNGDPLNNTTPVTGASPGGLRELQLRSLTEILKGGVLIQRNPQLCYQDTILWKDIFHKNNQLALTLIDTNRSRACHPCSPMCKGSRCWGESSEDCQSLTRTVCAGGCARCKGPLPTDCCHEQCAAGCTGPKHSDCLACLHFNHSGICELHCPALVTYNTDTFESMPNPEGRYTFGASCVTACPYNYLSTDVGSCTLVCPLHNQEVTAEDGTQRCEKCSKPCARVCYGLGMEHLREVRAVTSANIQEFAGCKKIFGSLAFLPESFDGDPASNTAPLQPEQLQVFETLEEITGYLYISAWPDSLPDLSVFQNLQVIRGRILHNGAYSLTLQGLGISWLGLRSLRELGSGLALIHHNTHLCFVHTVPWDQLFRNPHQALLHTANRPEDECVGEGLACHQLCARGHCWGPGPTQCVNCSQFLRGQECVEECRVLQGLPREYVNARHCLPCHPECQPQNGSVTCFGPEADQCVACAHYKDPPFCVARCPSGVKPDLSYMPIWKFPDEEGACQPCPINCTHSCVDLDDKGCPAEQRASPLTSIISAVVGILLVVVLGVVFGILIKRRQQKIRKYTMRRLLQETELVEPLTPSGAMPNQAQMRILKETELRKVKVLGSGAFGTVYKGIWIPDGENVKIPVAIKVLRENTSPKANKEILDEAYVMAGVGSPYVSRLLGICLTSTVQLVTQLMPYGCLLDHVRENRGRLGSQDLLNWCMQIAKGMSYLEDVRLVHRDLAARNVLVKSPNHVKITDFGLARLLDIDETEYHADGGKVPIKWMALESILRRRFTHQSDVWSYGVTVWELMTFGAKPYDGIPAREIPDLLEKGERLPQPPICTIDVYMIMVKCWMIDSECRPRFRELVSEFSRM

In [12]:
# Show the stop_gained subset of the variant table.
stop_gained_df

Unnamed: 0,rsIDs,Transcript,Allele Frequency,HGVS Consequence,VEP Annotation
19,rs2145266957,ENST00000269571.10,6.920032e-07,p.Glu2Ter,stop_gained
106,rs2145407678,ENST00000269571.10,6.218712e-07,p.Tyr50Ter,stop_gained
135,rs1449036371,ENST00000269571.10,1.27621e-06,p.Gln75Ter,stop_gained
164,rs1555614298,ENST00000269571.10,1.858692e-06,p.Arg103Ter,stop_gained
166,rs1395467940,ENST00000269571.10,6.195081e-07,p.Gln106Ter,stop_gained
201,rs1249755832,ENST00000269571.10,1.858911e-06,p.Arg143Ter,stop_gained
279,,ENST00000269571.10,6.197484e-07,p.Trp205Ter,stop_gained
427,rs2058829206,ENST00000269571.10,6.196317e-07,p.Arg340Ter,stop_gained
438,rs2145576104,ENST00000269571.10,6.195887e-07,p.Leu350Ter,stop_gained
441,rs770327076,ENST00000269571.10,6.209745e-07,p.Arg351Ter,stop_gained


In [15]:
def apply_mutation(row):
    """Build the mutated protein sequence for one row of ``stop_gained_df``.

    Parameters
    ----------
    row : pandas.Series
        Must provide 'HGVS Consequence' and 'Transcript'.

    Returns
    -------
    str or None
        The mutated sequence; a string starting with "Error: " when the
        mutation cannot be applied; or None when the consequence is not a
        string starting with "p.".
    """
    mutation = row['HGVS Consequence']
    # Non-string values (e.g. NaN for a missing cell) have no .startswith.
    if not isinstance(mutation, str) or not mutation.startswith("p."):
        return None

    # wildtype_seq is keyed by the transcript ID without its version suffix.
    transcript = str(row['Transcript']).split('.')[0]
    sequence = wildtype_seq.get(transcript)
    if sequence is None:
        # The sequence fetch for this transcript failed or never ran; report
        # it per row instead of aborting the whole apply() with a KeyError.
        return f"Error: No wild-type sequence available for {transcript}"

    try:
        return apply_stop_gained(sequence, mutation)
    except (ValueError, IndexError) as e:
        # IndexError covers a position beyond the end of the sequence.
        return f"Error: {e}"

stop_gained_df['Mutated sequence'] = stop_gained_df.apply(apply_mutation, axis=1)

In [16]:
# Show the table again, now including the 'Mutated sequence' column.
stop_gained_df

Unnamed: 0,rsIDs,Transcript,Allele Frequency,HGVS Consequence,VEP Annotation,Mutated sequence
19,rs2145266957,ENST00000269571.10,6.920032e-07,p.Glu2Ter,stop_gained,M*LAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDML...
106,rs2145407678,ENST00000269571.10,6.218712e-07,p.Tyr50Ter,stop_gained,MELAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDML...
135,rs1449036371,ENST00000269571.10,1.27621e-06,p.Gln75Ter,stop_gained,MELAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDML...
164,rs1555614298,ENST00000269571.10,1.858692e-06,p.Arg103Ter,stop_gained,MELAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDML...
166,rs1395467940,ENST00000269571.10,6.195081e-07,p.Gln106Ter,stop_gained,MELAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDML...
201,rs1249755832,ENST00000269571.10,1.858911e-06,p.Arg143Ter,stop_gained,MELAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDML...
279,,ENST00000269571.10,6.197484e-07,p.Trp205Ter,stop_gained,MELAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDML...
427,rs2058829206,ENST00000269571.10,6.196317e-07,p.Arg340Ter,stop_gained,MELAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDML...
438,rs2145576104,ENST00000269571.10,6.195887e-07,p.Leu350Ter,stop_gained,MELAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDML...
441,rs770327076,ENST00000269571.10,6.209745e-07,p.Arg351Ter,stop_gained,MELAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDML...


In [19]:
# Export the annotated table as a CSV file.
file_path = 'stop_gained.csv'

# index=False leaves the DataFrame index out of the written file.
stop_gained_df.to_csv(path_or_buf=file_path, index=False)

print(f"DataFrame saved to {file_path}")

DataFrame saved to stop_gained.csv


In [20]:
# Export the annotated table as a JSON file.
file_path = 'stop_gained.json'

# orient='records' writes a list with one object per row.
stop_gained_df.to_json(path_or_buf=file_path, orient='records')

print(f"DataFrame saved to {file_path}")

DataFrame saved to stop_gained.json


In [22]:
import json

# Input (JSON records) and output (plain text) locations
json_file = 'stop_gained.json'
txt_file = 'stop_gained.txt'

# Load the list of per-variant records
with open(json_file, 'r') as f:
    data = json.load(f)

# Render each record as one "label: value" line per field, followed by a
# whitespace-only line and a separator rule.
record_fields = (
    'rsIDs',
    'Transcript',
    'Allele Frequency',
    'HGVS Consequence',
    'VEP Annotation',
    'Mutated sequence',
)
chunks = []
for item in data:
    field_lines = '\n'.join(f'{name}: {item[name]}' for name in record_fields)
    chunks.append(field_lines + '\n    \n============================================\n')
text_data = ''.join(chunks)

# Write the rendered text out in one go
with open(txt_file, 'w') as f:
    f.write(text_data)

print(f"Data converted and saved to {txt_file}")

Data converted and saved to stop_gained.txt
