In [1]:
import pandas as pd
import re
import requests

In [2]:
# Three-letter -> one-letter codes for the 20 standard amino acids.
# Built once at definition time instead of on every call.
_AA_THREE_TO_ONE = {
    'Ala': 'A', 'Arg': 'R', 'Asn': 'N', 'Asp': 'D', 'Cys': 'C',
    'Glu': 'E', 'Gln': 'Q', 'Gly': 'G', 'His': 'H', 'Ile': 'I',
    'Leu': 'L', 'Lys': 'K', 'Met': 'M', 'Phe': 'F', 'Pro': 'P',
    'Ser': 'S', 'Thr': 'T', 'Trp': 'W', 'Tyr': 'Y', 'Val': 'V'
}


def aa_three_to_one(three_letter_code):
    """Convert three-letter amino acid code(s) to one-letter code(s).

    Accepts either a single code (``'Ala'`` -> ``'A'``) or several codes
    concatenated without separators (``'ArgGly'`` -> ``'RG'``), which is how
    multi-residue insertions appear after ``delins`` in an HGVS string.

    Args:
        three_letter_code: String of one or more capitalised three-letter
            codes (e.g. ``'Lys'`` or ``'LysGly'``).

    Returns:
        The corresponding one-letter string, or ``''`` when the input is not
        a string, is empty, is not a multiple of three characters long, or
        contains any code that is not one of the 20 standard amino acids.
    """
    if not isinstance(three_letter_code, str) or not three_letter_code:
        return ''
    if len(three_letter_code) % 3 != 0:
        return ''

    residues = []
    for offset in range(0, len(three_letter_code), 3):
        one_letter = _AA_THREE_TO_ONE.get(three_letter_code[offset:offset + 3])
        if one_letter is None:
            # A partial translation would be misleading; signal failure
            # the same way a single unknown code always has.
            return ''
        residues.append(one_letter)
    return ''.join(residues)

In [3]:
def get_protein_sequence(transcript_id, timeout=30):
    """Fetch the protein sequence record for a transcript from the Ensembl REST API.

    Args:
        transcript_id: Transcript identifier interpolated into the
            ``/sequence/id/{id}?type=protein`` endpoint.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection, which would hang the whole notebook.

    Returns:
        The decoded JSON body of the response (callers in this notebook read
        its ``'seq'`` key).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    server = "https://rest.ensembl.org"
    ext = f"/sequence/id/{transcript_id}?type=protein"

    response = requests.get(
        server + ext,
        headers={"Content-Type": "application/json"},
        timeout=timeout,
    )
    # raise_for_status() is a no-op on success, so no separate `ok` check is needed.
    response.raise_for_status()

    return response.json()

In [19]:
# Protein-level deletion / deletion-insertion, e.g.
#   p.Lys369del, p.Gly222_Ala225del, p.Lys937_Gly938delinsArg
# The end residue and end position are only accepted together (after '_'),
# and the pattern is applied with fullmatch so trailing text is rejected.
_HGVS_DELETION_RE = re.compile(
    r'p\.([A-Za-z]+)(\d+)(?:_([A-Za-z]+)(\d+))?del(?:ins([A-Za-z]+))?'
)


def parse_hgvs_consequence(hgvs_consequence):
    """Parse an HGVS protein deletion or delins description.

    Args:
        hgvs_consequence: String such as ``'p.Gly222_Ala225del'``,
            ``'p.Lys369del'`` or ``'p.Lys937_Gly938delinsArg'``. Surrounding
            whitespace is ignored.

    Returns:
        Tuple ``(start_aa, start_pos, end_aa, end_pos, insertion)``. For a
        single-residue deletion the end residue/position equal the start
        ones. ``insertion`` is the residue text following ``delins`` or
        ``''`` when there is none. Positions are the integers written in the
        string (1-based).

    Raises:
        ValueError: If the input is not a string, does not match the expected
            format in full, or describes an impossible range (position < 1 or
            end before start).
    """
    match = None
    if isinstance(hgvs_consequence, str):
        match = _HGVS_DELETION_RE.fullmatch(hgvs_consequence.strip())
    if match is None:
        raise ValueError(f"Invalid HGVS consequence format: {hgvs_consequence}")

    start_aa = match.group(1)
    start_pos = int(match.group(2))
    end_aa = match.group(3) or start_aa
    end_pos = int(match.group(4)) if match.group(4) else start_pos
    insertion = match.group(5) or ''

    # A reversed or zero-based range would make the caller slice the
    # sequence incorrectly without any error, so reject it here.
    if start_pos < 1 or end_pos < start_pos:
        raise ValueError(f"Invalid HGVS consequence format: {hgvs_consequence}")

    return (start_aa, start_pos, end_aa, end_pos, insertion)

In [20]:
def apply_inframe_deletion(wildtype_sequence, hgvs_consequence):
    """Apply an HGVS in-frame deletion (or delins) to a protein sequence.

    Args:
        wildtype_sequence: Wild-type protein sequence in one-letter codes.
        hgvs_consequence: HGVS protein description understood by
            ``parse_hgvs_consequence`` (e.g. ``'p.Gly222_Ala225del'``).

    Returns:
        The sequence with residues ``start_pos..end_pos`` (1-based,
        inclusive) removed and, for a delins, the inserted residues placed
        at the deletion site.

    Raises:
        ValueError: If the description cannot be parsed, the positions fall
            outside the sequence or are reversed, or the inserted residues
            are not valid three-letter amino acid codes.
    """
    _start_aa, start_pos, _end_aa, end_pos, insertion = parse_hgvs_consequence(hgvs_consequence)

    # Out-of-range or reversed positions would otherwise slice silently and
    # return a plausible-looking but wrong sequence.
    if not 1 <= start_pos <= end_pos <= len(wildtype_sequence):
        raise ValueError(
            f"Deletion range {start_pos}-{end_pos} is outside the sequence "
            f"(length {len(wildtype_sequence)}): {hgvs_consequence}"
        )

    inserted_residues = ''
    if insertion:
        if len(insertion) % 3 != 0:
            raise ValueError(f"Invalid inserted residues '{insertion}': {hgvs_consequence}")
        # Translate one three-letter code at a time so multi-residue
        # insertions (e.g. 'ArgGly') are supported and a bad code is reported.
        for offset in range(0, len(insertion), 3):
            code = insertion[offset:offset + 3]
            residue = aa_three_to_one(code)
            if not residue:
                raise ValueError(f"Unknown amino acid code '{code}': {hgvs_consequence}")
            inserted_residues += residue

    # HGVS positions are 1-based inclusive: keep everything before start_pos
    # and everything after end_pos, with any insertion in between.
    return wildtype_sequence[:start_pos - 1] + inserted_residues + wildtype_sequence[end_pos:]

In [9]:
# Load the variant table; every later cell derives its data from this frame.
latest_df = pd.read_csv(filepath_or_buffer='dataset_v2.csv')

In [12]:
# Columns carried forward into the in-frame deletion analysis.
columns_to_copy = [
    'rsIDs',
    'Transcript',
    'Allele Frequency',
    'HGVS Consequence',
    'VEP Annotation',
]
# Independent copy so later edits never touch latest_df.
inframe_del_df = latest_df.loc[:, columns_to_copy].copy()

In [13]:
# Keep only rows annotated as in-frame deletions. The explicit .copy() makes
# the result an independent frame, so assigning a new column to it later does
# not raise pandas' SettingWithCopyWarning.
is_inframe_deletion = inframe_del_df['VEP Annotation'] == 'inframe_deletion'
inframe_del_df = inframe_del_df.loc[is_inframe_deletion].copy()

In [14]:
# Distinct transcript IDs with the '.<version>' suffix dropped
# (e.g. 'ENST00000269571.10' -> 'ENST00000269571').
unique_transcripts = {
    versioned_id.split('.')[0]
    for versioned_id in inframe_del_df['Transcript'].unique()
}

In [15]:
# Wild-type protein sequence per unversioned transcript ID.
wildtype_seq = {}

for transcript in unique_transcripts:
    try:
        sequence = get_protein_sequence(transcript)
        wildtype_seq[transcript] = sequence['seq']
    except Exception as e:
        # Best-effort fetch: report the failure and continue with the
        # remaining transcripts. Failed transcripts are simply absent
        # from wildtype_seq.
        print(f"Error fetching sequence for {transcript}: {e}")

In [21]:
def apply_mutation(row):
    """Return the mutated protein sequence for one row of the variant table.

    Args:
        row: Row with 'HGVS Consequence' and 'Transcript' entries.

    Returns:
        The mutated sequence; ``None`` when the consequence is missing or is
        not a protein-level (``p.``) description; or a string starting with
        ``"Error: "`` when no wild-type sequence was fetched for the
        transcript or the consequence cannot be applied.
    """
    mutation = row['HGVS Consequence']
    # Missing values arrive as NaN (a float), so guard before calling str methods.
    if not isinstance(mutation, str) or not mutation.startswith("p."):
        return None

    transcript_id = str(row['Transcript']).split('.')[0]
    wildtype = wildtype_seq.get(transcript_id)
    if wildtype is None:
        # The sequence fetch is best-effort, so a transcript may be missing;
        # report it per row instead of aborting the whole apply with KeyError.
        return f"Error: no wild-type sequence available for {transcript_id}"

    try:
        return apply_inframe_deletion(wildtype, mutation)
    except ValueError as e:
        return f"Error: {e}"

inframe_del_df['Mutated sequence'] = inframe_del_df.apply(apply_mutation, axis=1)

In [23]:
# Display the in-frame deletion table, including the 'Mutated sequence' column added above.
inframe_del_df

Unnamed: 0,rsIDs,Transcript,Allele Frequency,HGVS Consequence,VEP Annotation,Mutated sequence
300,,ENST00000269571.10,6.216369e-07,p.Gly222_Ala225del,inframe_deletion,MELAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDML...
460,,ENST00000269571.10,1.239157e-06,p.Lys369del,inframe_deletion,MELAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDML...
765,rs1185835397,ENST00000269571.10,7.438299e-06,p.Glu620del,inframe_deletion,MELAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDML...
815,,ENST00000269571.10,6.197154e-07,p.Leu667_Val670del,inframe_deletion,MELAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDML...
899,,ENST00000269571.10,6.195464e-07,p.Arg756del,inframe_deletion,MELAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDML...
928,rs756428331,ENST00000269571.10,6.194942e-07,p.Thr798_Met801del,inframe_deletion,MELAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDML...
1057,,ENST00000269571.10,6.197907e-07,p.Lys937_Gly938delinsArg,inframe_deletion,MELAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDML...
1130,,ENST00000269571.10,6.199874e-07,p.Leu1000_Thr1003del,inframe_deletion,MELAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDML...
1172,,ENST00000269571.10,6.196493e-07,p.Phe1031del,inframe_deletion,MELAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDML...
1216,,ENST00000269571.10,8.682655e-06,p.Glu1069del,inframe_deletion,MELAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDML...


In [24]:
# Export the annotated table as a CSV file; the DataFrame index is not written.
file_path = 'inframe_deletion.csv'
inframe_del_df.to_csv(path_or_buf=file_path, index=False)

# Confirm where the file was written.
print(f"DataFrame saved to {file_path}")

DataFrame saved to inframe_deletion.csv


In [25]:
# Export the annotated table as JSON: a list with one object per row.
file_path = 'inframe_deletion.json'
inframe_del_df.to_json(path_or_buf=file_path, orient='records')

# Confirm where the file was written.
print(f"DataFrame saved to {file_path}")

DataFrame saved to inframe_deletion.json


In [26]:
import json

# Specify paths
json_file = 'inframe_deletion.json'
# NOTE(review): the doubled 'n' looks like a typo; the name is kept so the
# file this cell produces does not change location.
txt_file = 'inframe_deletionn.txt'

# Read the exported records (a list with one dict per variant row).
with open(json_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Render each record as labelled lines followed by a blank line and a
# separator. The pieces are joined once instead of growing a string in a
# loop, and the blank line no longer carries stray indentation spaces from
# the source layout.
text_data = ''.join(
    f"rsIDs: {item['rsIDs']}\n"
    f"Transcript: {item['Transcript']}\n"
    f"Allele Frequency: {item['Allele Frequency']}\n"
    f"HGVS Consequence: {item['HGVS Consequence']}\n"
    f"VEP Annotation: {item['VEP Annotation']}\n"
    f"Mutated sequence: {item['Mutated sequence']}\n"
    "\n============================================\n"
    for item in data
)

# Write to text file
with open(txt_file, 'w', encoding='utf-8') as f:
    f.write(text_data)

print(f"Data converted and saved to {txt_file}")


Data converted and saved to inframe_deletionn.txt


In [27]:
# Inspect the fetched wild-type protein sequences, keyed by unversioned transcript ID.
wildtype_seq

{'ENST00000269571': 'MELAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDMLRHLYQGCQVVQGNLELTYLPTNASLSFLQDIQEVQGYVLIAHNQVRQVPLQRLRIVRGTQLFEDNYALAVLDNGDPLNNTTPVTGASPGGLRELQLRSLTEILKGGVLIQRNPQLCYQDTILWKDIFHKNNQLALTLIDTNRSRACHPCSPMCKGSRCWGESSEDCQSLTRTVCAGGCARCKGPLPTDCCHEQCAAGCTGPKHSDCLACLHFNHSGICELHCPALVTYNTDTFESMPNPEGRYTFGASCVTACPYNYLSTDVGSCTLVCPLHNQEVTAEDGTQRCEKCSKPCARVCYGLGMEHLREVRAVTSANIQEFAGCKKIFGSLAFLPESFDGDPASNTAPLQPEQLQVFETLEEITGYLYISAWPDSLPDLSVFQNLQVIRGRILHNGAYSLTLQGLGISWLGLRSLRELGSGLALIHHNTHLCFVHTVPWDQLFRNPHQALLHTANRPEDECVGEGLACHQLCARGHCWGPGPTQCVNCSQFLRGQECVEECRVLQGLPREYVNARHCLPCHPECQPQNGSVTCFGPEADQCVACAHYKDPPFCVARCPSGVKPDLSYMPIWKFPDEEGACQPCPINCTHSCVDLDDKGCPAEQRASPLTSIISAVVGILLVVVLGVVFGILIKRRQQKIRKYTMRRLLQETELVEPLTPSGAMPNQAQMRILKETELRKVKVLGSGAFGTVYKGIWIPDGENVKIPVAIKVLRENTSPKANKEILDEAYVMAGVGSPYVSRLLGICLTSTVQLVTQLMPYGCLLDHVRENRGRLGSQDLLNWCMQIAKGMSYLEDVRLVHRDLAARNVLVKSPNHVKITDFGLARLLDIDETEYHADGGKVPIKWMALESILRRRFTHQSDVWSYGVTVWELMTFGAKPYDGIPAREIPDLLEKGERLPQPPICTIDVYMIMVKCWMIDSECRPRFRELVSEFSRM