In [1]:
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq, MutableSeq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
from io import StringIO
from Bio.SeqUtils import seq3, seq1
import pandas as pd
import datetime
import re


# Process variants from gnomAD into relevant sequences

We are interested in the structure of relevant in-population sequences 

We combine gnomAD info with the reference sequence from Uniprot to make our variants which we will then co-fold downstream

In [14]:
data = {"CYP3A4": ("./CYP3A4/CYP3A4_gnomAD_v4.1.0_ENSG00000160868.csv", "./CYP3A4/CYP3A4_P08684.fasta", "P08684"),
        "CYP2C9": ("./CYP2C9/CYP2C9_gnomAD_v4.1.0_ENSG00000138109.csv", "./CYP2C9/CYP2C9_P11712.fasta", "P11712"),
        "CYP2D6": ("./CYP2D6/CYP2D6_gnomAD_v4.1.0_ENSG00000100197.csv", "./CYP2D6/CYP2D6_P10635.fasta", "P10635"),
        "CYP2J2": ("./CYP2J2/CYP2J2_gnomAD_v4.1.0_ENSG00000134716.csv", "./CYP2J2/CYP2J2_P51589.fasta", "P51589"),
        "PXR":    ("./PXR/PXR_gnomAD_v4.1.0_ENSG00000144852.csv", "./PXR/PXR_O75469.fasta", "O75469"),
        "AHR":    ("./AHR/AHR_gnomAD_v4.1.0_ENSG00000106546.csv", "./AHR/AHR_P35869.fasta", "P35869")
        
       }

## VEPS

We are not interested in intron or splice variants as we wish to be working with protein sequences, PLOF in these sequences is a bioinformatics not structural biology problem. 

We could keep stop-lost, stop_gained and inframe and frameshifts in but will assume PLOF at this stage. 

In [3]:
mis_and_PLOF_VEPs = ["stop_lost", "stop_gained", "missense_variant", "inframe_deletion", "frameshift_variant"]

In [4]:
mis_VEPs = ["missense_variant", "inframe_deletion"]

# Transcripts

We are not considering transcripts other than the canonical MANE transcript as they may not match the Uniprot, NCBI and EMBL-EBI sequences, we need to record their numbers here

In [5]:
canonical_MANE_transcripts = {
    "CYP3A4": "ENST00000651514.1"
}


In [6]:
def read_seq(path):
    seq = SeqIO.parse(path, "fasta")

    seqs = []
    for r in seq.records:
        seqs.append(r.seq)
    assert len(seqs) == 1
    sequence = seqs[0]
    return sequence

In [7]:
def parse_hgvs(hgvs):
    match = re.match(r"p\.([A-Za-z]+)(\d+)([A-Za-z]+)$", hgvs)
    if match:
        return match.groups()  # Returns (original_aa, position, new_aa)
    return None  # Return None for frameshifts or other invalid cases


In [23]:
def apply_hgvs_annotation(ref_seq, hgvs_annotation, uniprot_id):
    data = parse_hgvs(hgvs_annotation)
    print(f"HGVS: {hgvs_annotation}")
    print(f"parsed {data}")

    if data == None:
        return None
    else:
        # unpack
        original, hgvs_position, new = data

    # adjust for 1 indexing 
    biopython_idx = int(hgvs_position) -1
    # check original matches
    ref_seq_original = ref_seq[biopython_idx]
        
    # convert to 1 letter 
    ref_seq_original = seq3(ref_seq_original)
        
    # check they line up        
    print(f"original {original}, reference {ref_seq_original}")
    assert original == ref_seq_original

    mutable_seq = MutableSeq(ref_seq)

    
    if new == "del":
        print("Handling deletion...")
        del mutable_seq[biopython_idx]  
    
    else:
        # apply new
        new = seq1(new)
        mutable_seq[biopython_idx] = new

    # Convert to FASTA format
    modified_seq = Seq(str(mutable_seq))
    record = SeqRecord(modified_seq, id=uniprot_id, description=f"Modified on HGVS annotation: {hgvs_annotation}")

    fasta_output = StringIO()
    SeqIO.write(record, fasta_output, "fasta")
    return fasta_output.getvalue()
        


    
    
    
    
    
    

In [24]:
def process_data(target, gnomad_csv, reference_seq_fasta, uniprot_id,  veps, canoncial_transcript):
    print(f"processing {target} {gnomad_csv} {reference_seq_fasta}")
    
    # grab gnomad data 
    variant_data = pd.read_csv(gnomad_csv)
    canonical_only = variant_data[variant_data["Transcript"] == canoncial_transcript]
    sorted = canonical_only.sort_values("Allele Frequency", ascending=False)
    relevant_veps = sorted[sorted["VEP Annotation"].isin(veps)]

    # grab ref seq 
    ref_seq = read_seq(reference_seq_fasta)
    print(ref_seq)
    
    relevant_veps["mutant_seq"] = relevant_veps["Protein Consequence"].apply(lambda x: apply_hgvs_annotation(ref_seq, x, uniprot_id))

    relevant_veps.to_csv("test.csv", index=False)    

    

In [25]:
process_data("CYP3A4", data["CYP3A4"][0], data["CYP3A4"][1], data["CYP3A4"][2], mis_VEPs, canonical_MANE_transcripts["CYP3A4"])

processing CYP3A4 ./CYP3A4/CYP3A4_gnomAD_v4.1.0_ENSG00000160868.csv ./CYP3A4/CYP3A4_P08684.fasta
MALIPDLAMETWLLLAVSLVLLYLYGTHSHGLFKKLGIPGPTPLPFLGNILSYHKGFCMFDMECHKKYGKVWGFYDGQQPVLAITDPDMIKTVLVKECYSVFTNRRPFGPVGFMKSAISIAEDEEWKRLRSLLSPTFTSGKLKEMVPIIAQYGDVLVRNLRREAETGKPVTLKDVFGAYSMDVITSTSFGVNIDSLNNPQDPFVENTKKLLRFDFLDPFFLSITVFPFLIPILEVLNICVFPREVTNFLRKSVKRMKESRLEDTQKHRVDFLQLMIDSQNSKETESHKALSDLELVAQSIIFIFAGYETTSSVLSFIMYELATHPDVQQKLQEEIDAVLPNKAPPTYDTVLQMEYLDMVVNETLRLFPIAMRLERVCKKDVEINGMFIPKGVVVMIPSYALHRDPKYWTEPEKFLPERFSKKNKDNIDPYIYTPFGSGPRNCIGMRFALMNMKLALIRVLQNFSFKPCKETQIPLKLSLGGLLQPEKPVVLKVESRDGTVSGA
HGVS: p.Met445Thr
parsed ('Met', '445', 'Thr')
original Met, reference Met
HGVS: p.Asp174His
parsed ('Asp', '174', 'His')
original Asp, reference Asp
HGVS: p.Arg162Gln
parsed ('Arg', '162', 'Gln')
original Arg, reference Arg
HGVS: p.Leu293Pro
parsed ('Leu', '293', 'Pro')
original Leu, reference Leu
HGVS: p.Gly56Asp
parsed ('Gly', '56', 'Asp')
original Gly, reference Gly
HGVS: p.Ser222Pro
parsed (

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_veps["mutant_seq"] = relevant_veps["Protein Consequence"].apply(lambda x: apply_hgvs_annotation(ref_seq, x, uniprot_id))
