In [95]:
import pandas as pd
import re
import requests

In [96]:
# Function to convert three-letter amino acid code to one-letter code
def aa_three_to_one(three_letter_code):
    """Translate a three-letter amino-acid code into its one-letter symbol.

    Only the 20 standard residues are known, and the lookup is
    case-sensitive (e.g. 'Ala' is recognised, 'ALA' is not).

    Returns:
        The one-letter symbol, or an empty string when the code is not in
        the table.
    """
    # Parallel tuples: the symbol at index i belongs to the code at index i.
    codes = (
        'Ala', 'Arg', 'Asn', 'Asp', 'Cys',
        'Glu', 'Gln', 'Gly', 'His', 'Ile',
        'Leu', 'Lys', 'Met', 'Phe', 'Pro',
        'Ser', 'Thr', 'Trp', 'Tyr', 'Val',
    )
    symbols = (
        'A', 'R', 'N', 'D', 'C',
        'E', 'Q', 'G', 'H', 'I',
        'L', 'K', 'M', 'F', 'P',
        'S', 'T', 'W', 'Y', 'V',
    )
    lookup = dict(zip(codes, symbols))

    try:
        return lookup[three_letter_code]
    except KeyError:
        # Unknown codes map to the empty string rather than raising.
        return ''

In [97]:
# Function to fetch protein sequence using Ensembl API
def get_protein_sequence(transcript_id, timeout=30):
    """Fetch the protein sequence record for a transcript from the Ensembl REST API.

    Args:
        transcript_id: Identifier interpolated into the ``/sequence/id/``
            endpoint path.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook
            indefinitely.

    Returns:
        The decoded JSON body of the response (this notebook reads its
        ``'seq'`` entry).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    server = "https://rest.ensembl.org"
    ext = f"/sequence/id/{transcript_id}?type=protein"

    response = requests.get(
        server + ext,
        headers={"Content-Type": "application/json"},
        timeout=timeout,
    )
    # raise_for_status() does nothing on success, so no separate `ok` check is needed.
    response.raise_for_status()

    return response.json()

In [98]:
def parse_hgvs_insertion(hgvs_consequence):
    """Parse a protein-level HGVS duplication or insertion.

    Accepted shapes (the whole string must match):
        p.Val480dup
        p.Ala1070_Ala1081dup
        p.Pro1205_Gln1206insHisPro

    Args:
        hgvs_consequence: The HGVS string to parse.

    Returns:
        A tuple ``(start_aa, start_pos, end_aa, end_pos, insertion,
        insertion_type)``. For a single-position notation the end residue
        and position repeat the start ones. ``insertion`` is the residue
        text following ``ins``/``dup`` (possibly empty) and
        ``insertion_type`` is ``'dup'`` or ``'ins'``.

    Raises:
        ValueError: If the input is not a string or is not a ``dup``/``ins``
            notation of the shapes above (e.g. substitutions, deletions,
            or coding-level ``c.`` notations).
    """
    if not isinstance(hgvs_consequence, str):
        raise ValueError(f"Invalid HGVS consequence format: {hgvs_consequence}")

    insertion_pattern = re.compile(
        r'p\.'
        r'(?P<start_aa>[A-Za-z]+)(?P<start_pos>\d+)'      # first residue + position
        r'(?:_(?P<end_aa>[A-Za-z]+)(?P<end_pos>\d+))?'    # optional range end
        r'(?P<kind>dup|ins)'                              # event type is mandatory
        r'(?P<inserted>[A-Za-z]*)'                        # residues after the keyword
    )

    # fullmatch (not match) so trailing text such as "delins..." is rejected
    # rather than ignored, and the event type can never be silently assumed.
    match = insertion_pattern.fullmatch(hgvs_consequence.strip())
    if match is None:
        raise ValueError(f"Invalid HGVS consequence format: {hgvs_consequence}")

    start_aa = match.group('start_aa')
    start_pos = int(match.group('start_pos'))
    # A single-position notation has no explicit end: reuse the start.
    end_aa = match.group('end_aa') or start_aa
    end_pos = int(match.group('end_pos')) if match.group('end_pos') else start_pos

    return (
        start_aa,
        start_pos,
        end_aa,
        end_pos,
        match.group('inserted'),
        match.group('kind'),
    )

In [99]:
def apply_inframe_insertion(wildtype_sequence, hgvs_consequence):
    """Apply a protein-level HGVS duplication or insertion to a sequence.

    Args:
        wildtype_sequence: One-letter amino-acid sequence to modify.
        hgvs_consequence: HGVS string understood by ``parse_hgvs_insertion``.

    Returns:
        The modified sequence. For ``dup`` the residues from the start to the
        end position are repeated right after the end position. For ``ins``
        the inserted residues (given as three-letter codes) are placed right
        after the start position.

    Raises:
        ValueError: If the notation cannot be parsed, the positions fall
            outside the sequence, an inserted three-letter code is not
            recognised, or the event type is unsupported.
    """
    start_aa, start_pos, end_aa, end_pos, insertion, insertion_type = parse_hgvs_insertion(hgvs_consequence)

    # Positions are 1-based. Slicing would silently clamp out-of-range values
    # and produce a wrong sequence, so reject them explicitly.
    if not 1 <= start_pos <= end_pos <= len(wildtype_sequence):
        raise ValueError(
            f"Position range {start_pos}-{end_pos} is outside the sequence "
            f"(length {len(wildtype_sequence)}): {hgvs_consequence}"
        )

    if insertion_type == 'dup':
        # Duplication: repeat residues start_pos..end_pos directly after end_pos.
        dup_sequence = wildtype_sequence[start_pos - 1:end_pos]
        return wildtype_sequence[:end_pos] + dup_sequence + wildtype_sequence[end_pos:]

    if insertion_type == 'ins':
        # Insertion: split the three-letter codes, convert each, and refuse to
        # continue if any code is unknown (aa_three_to_one returns '' for
        # those, which would otherwise drop residues silently).
        codes = [insertion[i:i + 3] for i in range(0, len(insertion), 3)]
        residues = [aa_three_to_one(code) for code in codes]
        unknown = [code for code, residue in zip(codes, residues) if not residue]
        if unknown:
            raise ValueError(
                f"Unrecognised amino acid code(s) {unknown} in: {hgvs_consequence}"
            )
        return wildtype_sequence[:start_pos] + ''.join(residues) + wildtype_sequence[start_pos:]

    raise ValueError(f"Unsupported insertion type '{insertion_type}' in: {hgvs_consequence}")

In [100]:
# Read the source variant table (path is relative to the working directory).
latest_df = pd.read_csv(filepath_or_buffer='dataset_v2.csv')

In [101]:
# Columns carried into the working table; .copy() detaches it from latest_df.
columns_to_copy = [
    'rsIDs',
    'Transcript',
    'Allele Frequency',
    'HGVS Consequence',
    'VEP Annotation',
]
inframe_ins_df = latest_df.loc[:, columns_to_copy].copy()

In [102]:
# Keep only in-frame insertions. The explicit .copy() makes the filtered frame
# independent of its parent, so adding a column to it later does not trigger
# pandas' SettingWithCopyWarning.
inframe_ins_df = inframe_ins_df[inframe_ins_df['VEP Annotation'] == 'inframe_insertion'].copy()

In [103]:
# Distinct transcript IDs with the version suffix (text after the first '.')
# removed; the set collapses IDs that differ only by version.
unique_transcripts = {
    versioned_id.split('.')[0]
    for versioned_id in inframe_ins_df['Transcript'].unique()
}

In [104]:
# Wild-type protein sequence per unversioned transcript ID.
wildtype_seq = {}

for transcript in unique_transcripts:
    try:
        sequence = get_protein_sequence(transcript)
        wildtype_seq[transcript] = sequence['seq']
    except Exception as e:
        # Best-effort: report the failure and continue with the remaining IDs.
        # The message must use the loop variable; the previous name
        # (`transcript_id`) is not defined at this scope and raised NameError
        # from inside the handler.
        print(f"Error fetching sequence for {transcript}: {e}")

In [105]:
# Display the fetched sequences, keyed by unversioned transcript ID.
wildtype_seq

{'ENST00000269571': 'MELAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDMLRHLYQGCQVVQGNLELTYLPTNASLSFLQDIQEVQGYVLIAHNQVRQVPLQRLRIVRGTQLFEDNYALAVLDNGDPLNNTTPVTGASPGGLRELQLRSLTEILKGGVLIQRNPQLCYQDTILWKDIFHKNNQLALTLIDTNRSRACHPCSPMCKGSRCWGESSEDCQSLTRTVCAGGCARCKGPLPTDCCHEQCAAGCTGPKHSDCLACLHFNHSGICELHCPALVTYNTDTFESMPNPEGRYTFGASCVTACPYNYLSTDVGSCTLVCPLHNQEVTAEDGTQRCEKCSKPCARVCYGLGMEHLREVRAVTSANIQEFAGCKKIFGSLAFLPESFDGDPASNTAPLQPEQLQVFETLEEITGYLYISAWPDSLPDLSVFQNLQVIRGRILHNGAYSLTLQGLGISWLGLRSLRELGSGLALIHHNTHLCFVHTVPWDQLFRNPHQALLHTANRPEDECVGEGLACHQLCARGHCWGPGPTQCVNCSQFLRGQECVEECRVLQGLPREYVNARHCLPCHPECQPQNGSVTCFGPEADQCVACAHYKDPPFCVARCPSGVKPDLSYMPIWKFPDEEGACQPCPINCTHSCVDLDDKGCPAEQRASPLTSIISAVVGILLVVVLGVVFGILIKRRQQKIRKYTMRRLLQETELVEPLTPSGAMPNQAQMRILKETELRKVKVLGSGAFGTVYKGIWIPDGENVKIPVAIKVLRENTSPKANKEILDEAYVMAGVGSPYVSRLLGICLTSTVQLVTQLMPYGCLLDHVRENRGRLGSQDLLNWCMQIAKGMSYLEDVRLVHRDLAARNVLVKSPNHVKITDFGLARLLDIDETEYHADGGKVPIKWMALESILRRRFTHQSDVWSYGVTVWELMTFGAKPYDGIPAREIPDLLEKGERLPQPPICTIDVYMIMVKCWMIDSECRPRFRELVSEFSRM

In [106]:
# Display the rows kept by the 'inframe_insertion' filter.
inframe_ins_df

Unnamed: 0,rsIDs,Transcript,Allele Frequency,HGVS Consequence,VEP Annotation
603,,ENST00000269571.10,6.200674e-07,p.Val480dup,inframe_insertion
1219,,ENST00000269571.10,6.203343e-07,p.Ala1070_Ala1081dup,inframe_insertion
1306,,ENST00000269571.10,6.248102e-07,c.3406_3412+2dup,inframe_insertion
1401,,ENST00000269571.10,6.195349e-07,p.Pro1205_Gln1206insHisPro,inframe_insertion


In [107]:
def apply_mutation(row):
    """Build the mutated protein sequence for one row of the variant table.

    Args:
        row: A row exposing 'HGVS Consequence' and 'Transcript' entries.

    Returns:
        The mutated sequence as a string; a string starting with "Error: "
        when the wild-type sequence is unavailable or the consequence cannot
        be applied; or None when the consequence is missing or is not a
        protein-level ("p.") notation.
    """
    mutation = row['HGVS Consequence']
    # Missing values and non-protein notations (e.g. "c.") are left empty.
    if not isinstance(mutation, str) or not mutation.startswith("p."):
        return None

    transcript_id = row['Transcript'].split('.')[0]
    # wildtype_seq only holds transcripts whose fetch succeeded; a missing key
    # must not abort the whole DataFrame.apply with a KeyError.
    sequence = wildtype_seq.get(transcript_id)
    if sequence is None:
        return f"Error: no wild-type sequence available for {transcript_id}"

    try:
        return apply_inframe_insertion(sequence, mutation)
    except ValueError as e:
        return f"Error: {e}"

inframe_ins_df['Mutated sequence'] = inframe_ins_df.apply(apply_mutation, axis=1)

In [108]:
# Display the table again, now including the 'Mutated sequence' column.
inframe_ins_df

Unnamed: 0,rsIDs,Transcript,Allele Frequency,HGVS Consequence,VEP Annotation,Mutated sequence
603,,ENST00000269571.10,6.200674e-07,p.Val480dup,inframe_insertion,MELAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDML...
1219,,ENST00000269571.10,6.203343e-07,p.Ala1070_Ala1081dup,inframe_insertion,MELAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDML...
1306,,ENST00000269571.10,6.248102e-07,c.3406_3412+2dup,inframe_insertion,
1401,,ENST00000269571.10,6.195349e-07,p.Pro1205_Gln1206insHisPro,inframe_insertion,MELAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDML...


In [109]:
# Export the annotated table as CSV; the row index is not written.
file_path = 'inframe_insertion.csv'
inframe_ins_df.to_csv(path_or_buf=file_path, index=False)

# Confirm where the file went
print(f"DataFrame saved to {file_path}")

DataFrame saved to inframe_insertion.csv


In [110]:
# Export the annotated table as JSON, one object per row ('records' layout).
file_path = 'inframe_insertion.json'
inframe_ins_df.to_json(path_or_buf=file_path, orient='records')

# Confirm where the file went
print(f"DataFrame saved to {file_path}")

DataFrame saved to inframe_insertion.json


In [111]:
import json

# Input JSON export and the plain-text report derived from it
json_file = 'inframe_insertion.json'
txt_file = 'inframe_insertion.txt'

# Load the exported records back from disk
with open(json_file, 'r') as src:
    data = json.load(src)

# Render each record as labelled lines followed by a separator rule,
# collecting the pieces in a list and joining them once at the end.
chunks = []
for item in data:
    chunks.append(f'''rsIDs: {item['rsIDs']}\nTranscript: {item['Transcript']}\nAllele Frequency: {item['Allele Frequency']}\nHGVS Consequence: {item['HGVS Consequence']}\nVEP Annotation: {item['VEP Annotation']}\nMutated sequence: {item['Mutated sequence']}
    \n============================================\n''')
text_data = ''.join(chunks)

# Persist the rendered report
with open(txt_file, 'w') as dst:
    dst.write(text_data)

print(f"Data converted and saved to {txt_file}")

Data converted and saved to inframe_insertion.txt


### Testing

In [93]:
# Take the mutated sequence from the first row of the table for spot checks.
test = inframe_ins_df['Mutated sequence'].iloc[0]

In [94]:
# 0-based slice [479:481] covers residues 480-481 (1-based) of the mutated sequence.
test[479:481]

'VV'

In [86]:
# 0-based slice [1081:1093] covers residues 1082-1093 (1-based) of `test`.
# NOTE(review): this cell's execution count is lower than that of the cell
# assigning `test`, so the recorded output may stem from a different row -
# re-run top to bottom to confirm.
test[1081:1093]

'APRSPLAPSEGA'

In [87]:
# Wild-type sequence of the transcript, for comparison with the mutated one.
ori = wildtype_seq['ENST00000269571']

In [92]:
# Residues 480-481 (1-based) of the wild-type sequence.
ori[479:481]

'VP'

In [89]:
# Residues 1082-1093 (1-based) of the wild-type sequence.
ori[1081:1093]

'GSDVFDGDLGMG'

In [55]:
# Length of the mutated sequence held in `test`.
len(test)

1267