In [50]:
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq, MutableSeq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
from io import StringIO
from Bio.SeqUtils import seq3, seq1
import pandas as pd
import datetime
import re
from pathlib import Path
import shutil



# Process variants from gnomAD into relevant sequences

We are interested in the structure of relevant in-population sequences 

We combine gnomAD info with the reference sequence from Uniprot to make our variants which we will then co-fold downstream

In [51]:
tlist = [f.name for f in Path('.').iterdir() if f.is_dir() and not f.name.startswith('.')]
tlist

['PDE3A',
 'CB1',
 'M1',
 'PXR',
 'Cav1.2',
 'CYP2J2',
 'B1AR',
 'SLC6A3',
 'SLC6A4',
 'Nav1.5',
 'CYP3A4',
 'A1AR',
 'AHR',
 'M2',
 'CYP2D6',
 'M3',
 'HERG',
 'CYP2C9',
 'HTR2B']

In [52]:

def generate_target_data(folder_list, base_path="."):
    """
    Scans folders for specific gnomAD CSVs and FASTA files.
    Returns a dictionary of (csv_path, fasta_path, uniprot_id).
    Raises FileNotFoundError if required files are missing.
    """
    target_data = {}
    base = Path(base_path)

    for folder in folder_list:
        folder_path = base / folder
        
        # 1. Search for files using glob patterns
        csv_files = list(folder_path.glob(f"{folder}_gnomAD_*.csv"))
        fasta_files = list(folder_path.glob(f"{folder}_*.fasta"))

        # 2. Strict Error Handling
        if not csv_files:
            raise FileNotFoundError(f"Required CSV not found in: {folder_path}")
        if not fasta_files:
            raise FileNotFoundError(f"Required FASTA not found in: {folder_path}")

        # 3. Path and ID Extraction
        # We take the first match [0] found by glob
        csv_path = csv_files[0]
        fasta_path = fasta_files[0]
        
        # Extract UniProt ID: 'CYP3A4_P08684' -> 'P08684'
        uniprot_id = fasta_path.stem.split('_')[-1]

        # 4. Add to dictionary (using .as_posix() for consistent "/" separators)
        target_data[folder] = (
            str(csv_path.as_posix()),
            str(fasta_path.as_posix()),
            uniprot_id
        )

    return target_data

In [53]:
data = generate_target_data(tlist)
data

{'PDE3A': ('PDE3A/PDE3A_gnomAD_v4.1.0_ENSG00000172572.csv',
  'PDE3A/PDE3A_Q14432.fasta',
  'Q14432'),
 'CB1': ('CB1/CB1_gnomAD_v4.1.0_ENSG00000118432.csv',
  'CB1/CB1_P21554.fasta',
  'P21554'),
 'M1': ('M1/M1_gnomAD_v4.1.0_ENSG00000168539.csv',
  'M1/M1_P11229.fasta',
  'P11229'),
 'PXR': ('PXR/PXR_gnomAD_v4.1.0_ENSG00000144852.csv',
  'PXR/PXR_O75469.fasta',
  'O75469'),
 'Cav1.2': ('Cav1.2/Cav1.2_gnomAD_v4.1.0_ENSG00000151067.csv',
  'Cav1.2/Cav1.2_Q13936.fasta',
  'Q13936'),
 'CYP2J2': ('CYP2J2/CYP2J2_gnomAD_v4.1.0_ENSG00000134716.csv',
  'CYP2J2/CYP2J2_P51589.fasta',
  'P51589'),
 'B1AR': ('B1AR/B1AR_gnomAD_v4.1.0_ENSG00000043591.csv',
  'B1AR/B1AR_P08588.fasta',
  'P08588'),
 'SLC6A3': ('SLC6A3/SLC6A3_gnomAD_v4.1.0_ENSG00000142319.csv',
  'SLC6A3/SLC6A3_Q01959.fasta',
  'Q01959'),
 'SLC6A4': ('SLC6A4/SLC6A4_gnomAD_v4.1.0_ENSG00000108576.csv',
  'SLC6A4/SLC6A4_P31645.fasta',
  'P31645'),
 'Nav1.5': ('Nav1.5/Nav1.5_gnomAD_v4.1.0_ENSG00000183873.csv',
  'Nav1.5/Nav1.5_Q14524.fasta'

In [38]:
# write to YAML

import yaml

# Assuming 'data' is the dictionary returned by generate_target_data(tlist)
def save_to_yaml(data, filename="targets.yaml"):
    with open(filename, 'w') as file:
        # default_flow_style=False ensures a clean, nested layout
        # sort_keys=False keeps the order of the dictionary as it is
        yaml.dump(data, file, default_flow_style=False, sort_keys=False)
    
    print(f"Successfully wrote data to {filename}")

# Usage
save_to_yaml(data)

Successfully wrote data to targets.yaml


## VEPS

We are not interested in intron or splice variants as we wish to be working with protein sequences, PLOF in these sequences is a bioinformatics not structural biology problem. 

We could keep stop-lost, stop_gained and inframe and frameshifts in but will assume PLOF at this stage. 

In [54]:
mis_and_PLOF_VEPs = ["stop_lost", "stop_gained", "missense_variant", "inframe_deletion", "frameshift_variant"]

In [55]:
mis_VEPs = ["missense_variant", "inframe_deletion"]

# Transcripts

We are not considering transcripts other than the canonical MANE transcript as they may not match the Uniprot, NCBI and EMBL-EBI sequences, we need to record their numbers here

In [56]:
canonical_MANE_transcripts = {
    "CYP3A4": "ENST00000651514.1",
    "CYP2C9": "ENST00000260682.8",
    "CYP2D6": "ENST00000645361.2",
    "CYP2J2": "ENST00000371204.4",
    "AHR": "ENST00000242057.9",
    "PXR": "ENST00000393716.8",
    "HERG": "ENST00000262186.10",
    "Nav1.5": "ENST00000423572.7", 
    "Cav1.2": "ENST00000399655.6",
    "M1": "ENST00000306960.4",
    "M2": "ENST00000680005.1",
    "M3": "ENST00000676153.1",
    "A1AR": "ENST00000337894.9", 
    "B1AR": "ENST00000369295.4",
    "HTR2B": "ENST00000258400.4",
    "PDE3A": "ENST00000359062.4",
    "SLC6A3": "ENST00000270349.12",
    "SLC6A4": "ENST00000650711.1",
    "CB1": "ENST00000369501.3",

}




In [57]:
save_to_yaml(canonical_MANE_transcripts, filename="MANE_transcripts.yaml")

Successfully wrote data to MANE_transcripts.yaml


In [58]:
for k in canonical_MANE_transcripts:
    if k not in data:
        raise ValueError(f"missing MANE transcript {k}")

In [66]:
def read_seq(path):
    seq = SeqIO.parse(path, "fasta")

    seqs = []
    for r in seq:
        seqs.append(r.seq)
    assert len(seqs) == 1
    sequence = seqs[0]
    return sequence

In [67]:
def parse_hgvs(hgvs):
    match = re.match(r"p\.([A-Za-z]+)(\d+)([A-Za-z]+)$", hgvs)
    if match:
        return match.groups()  # Returns (original_aa, position, new_aa)
    return None  # Return None for frameshifts or other invalid cases


In [104]:
def apply_hgvs_annotation(ref_seq, hgvs_annotation, gnomad_id, uniprot_id):
    # print(f"processing {hgvs_annotation}")
    try:
        data = parse_hgvs(hgvs_annotation)
    except:
        return None
    if data == None:
        return None
    else:
        # unpack
        original, hgvs_position, new = data

    # adjust for 1 indexing 
    biopython_idx = int(hgvs_position) -1
    
    # check original matches
    ref_seq_original = ref_seq[biopython_idx]
        
    # convert to 1 letter 
    ref_seq_original = seq3(ref_seq_original)
        
    # check they line up        
    if not original == ref_seq_original:
        print(f"something went wrong with HGVS {hgvs_annotation} non matching seq data = {data}")
        return None        

    mutable_seq = MutableSeq(ref_seq)

    
    if new == "del":
        print("Handling deletion...")
        del mutable_seq[biopython_idx]  
    
    else:
        # apply new
        new = seq1(new)
        mutable_seq[biopython_idx] = new

    # Convert to FASTA format
    modified_seq = Seq(str(mutable_seq))
    id = f"{uniprot_id}_{hgvs_annotation}_{gnomad_id}"
    record = SeqRecord(modified_seq, id=id , description=f"{uniprot_id} HGVS annotation: {hgvs_annotation}")

    fasta_output = StringIO()
    SeqIO.write(record, fasta_output, "fasta")
    return fasta_output.getvalue()
        


    
    
    
    
    
    

In [105]:

def force_overwrite_directory(directory_path):
    """
    Force overwrite a directory by first removing it if it exists,
    then creating a new empty directory.
    
    Args:
        directory_path: Path to the directory to overwrite
    """
    # Convert to Path object if it's a string
    path = Path(directory_path)
    
    # Remove directory if it exists
    if path.exists():
        # Use shutil.rmtree to remove directory and all its contents
        shutil.rmtree(path)
    
    # Create new empty directory
    path.mkdir(parents=True)
    
    return path

In [106]:
def write_fasta(gnomad_id, hgvs_annotation, uniprot_id, fasta, output_dir):
    if fasta == None:
        print("skipping unparseable ", hgvs_annotation)
    else:
        fname = f"{uniprot_id}_{hgvs_annotation}_{gnomad_id}.fasta"
        path = output_dir / fname
        with open(path, 'w') as f:
            f.write(fasta)

In [107]:
def process_data(target, gnomad_csv, reference_seq_fasta, uniprot_id,  veps, canoncial_transcript):
    print(f"processing {target} {gnomad_csv} {reference_seq_fasta}")
    
    # grab gnomad data 
    variant_data = pd.read_csv(gnomad_csv)
    canonical_only = variant_data[variant_data["Transcript"] == canoncial_transcript]
    sorted = canonical_only.sort_values("Allele Frequency", ascending=False)
    relevant_veps = sorted[sorted["VEP Annotation"].isin(veps)]

    # grab ref seq 
    ref_seq = read_seq(reference_seq_fasta)
    print(ref_seq)
    
    relevant_veps["mutant_seq"] = relevant_veps.apply(lambda x: apply_hgvs_annotation(ref_seq, x["Protein Consequence"], x["gnomAD ID"], uniprot_id), axis=1)

    summary_workdir = f"./{target}/variant_summary"
    summary_path = force_overwrite_directory(summary_workdir)

    relevant_veps.to_csv(summary_path/"variant_summary.csv", index=False)
    
    all_workdir = f"./{target}/all_variants"
    all_path = force_overwrite_directory(all_workdir)

    
    relevant_veps.apply(lambda x: write_fasta(x["gnomAD ID"], x["Protein Consequence"], uniprot_id, x["mutant_seq"], all_path), axis=1)

    
    

In [108]:
for k, v in data.items():
    target = k 
    gnomad, ref_seq_fasta, uniprot_id = v
    canonical_transcript = canonical_MANE_transcripts[target]

    process_data(target, gnomad, ref_seq_fasta, uniprot_id, mis_VEPs, canonical_MANE_transcripts[target])

processing PDE3A PDE3A/PDE3A_gnomAD_v4.1.0_ENSG00000172572.csv PDE3A/PDE3A_Q14432.fasta
MAVPGDAARVRDKPVHSGVSQAPTAGRDCHHRADPASPRDSGCRGCWGDLVLQPLRSSRKLSSALCAGSLSFLLALLVRLVRGEVGCDLEQCKEAAAAEEEEAAPGAEGGVFPGPRGGAPGGGARLSPWLQPSALLFSLLCAFFWMGLYLLRAGVRLPLAVALLAACCGGEALVQIGLGVGEDHLLSLPAAGVVLSCLAAATWLVLRLRLGVLMIALTSAVRTVSLISLERFKVAWRPYLAYLAGVLGILLARYVEQILPQSAEAAPREHLGSQLIAGTKEDIPVFKRRRRSSSVVSAEMSGCSSKSHRRTSLPCIPREQLMGHSEWDHKRGPRGSQSSGTSITVDIAVMGEAHGLITDLLADPSLPPNVCTSLRAVSNLLSTQLTFQAIHKPRVNPVTSLSENYTCSDSEESSEKDKLAIPKRLRRSLPPGLLRRVSSTWTTTTSATGLPTLEPAPVRRDRSTSIKLQEAPSSSPDSWNNPVMMTLTKSRSFTSSYAISAANHVKAKKQSRPGALAKISPLSSPCSSPLQGTPASSLVSKISAVQFPESADTTAKQSLGSHRALTYTQSAPDLSPQILTPPVICSSCGRPYSQGNPADEPLERSGVATRTPSRTDDTAQVTSDYETNNNSDSSDIVQNEDETECLREPLRKASACSTYAPETMMFLDKPILAPEPLVMDNLDSIMEQLNTWNFPIFDLVENIGRKCGRILSQVSYRLFEDMGLFEAFKIPIREFMNYFHALEIGYRDIPYHNRIHATDVLHAVWYLTTQPIPGLSTVINDHGSTSDSDSDSGFTHGHMGYVFSKTYNVTDDKYGCLSGNIPALELMALYVAAAMHDYDHPGRTNAFLVATSAPQAVLYNDRSVLENHHAAAAWNLFMSRPEYNFLINLDHVEFKHFRFLVIEAILATDLKK

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_veps["mutant_seq"] = relevant_veps.apply(lambda x: apply_hgvs_annotation(ref_seq, x["Protein Consequence"], x["gnomAD ID"], uniprot_id), axis=1)


skipping unparseable  p.Leu76_Val78del
skipping unparseable  p.Met146_Leu148del
skipping unparseable  p.Leu163_Cys167del
skipping unparseable  p.Arg209_Leu210del
skipping unparseable  p.Pro105_Ala107del
skipping unparseable  nan
skipping unparseable  p.Lys706_Gly708del
skipping unparseable  p.Gly245_Gly248del
processing CB1 CB1/CB1_gnomAD_v4.1.0_ENSG00000118432.csv CB1/CB1_P21554.fasta
MKSILDGLADTTFRTITTDLLYVGSNDIQYEDIKGDMASKLGYFPQKFPLTSFRGSPFQEKMTAGDNPQLVPADQVNITEFYNKSLSSFKENEENIQCGENFMDIECFMVLNPSQQLAIAVLSLTLGTFTVLENLLVLCVILHSRSLRCRPSYHFIGSLAVADLLGSVIFVYSFIDFHVFHRKDSRNVFLFKLGGVTASFTASVGSLFLTAIDRYISIHRPLAYKRIVTRPKAVVAFCLMWTIAIVIAVLPLLGWNCEKLQSVCSDIFPHIDETYLMFWIGVTSVLLLFIVYAYMYILWKAHSHAVRMIQRGTQKSIIIHTSEDGKVQVTRPDQARMDIRLAKTLVLILVVLIICWGPLLAIMVYDVFGKMNKLIKTVFAFCSMLCLLNSTVNPIIYALRSKDLRHAFRSMFPSCEGTAQPLDNSMGDSDCLHKHANNAASVHRAAESCIKSTVKIAKVTMSVSTDTSAEAL
Handling deletion...
Handling deletion...
Handling deletion...
Handling deletion...
Handling deletion...
processing M1 M1/M1_gnomAD_v4.1.0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_veps["mutant_seq"] = relevant_veps.apply(lambda x: apply_hgvs_annotation(ref_seq, x["Protein Consequence"], x["gnomAD ID"], uniprot_id), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_veps["mutant_seq"] = relevant_veps.apply(lambda x: apply_hgvs_annotation(ref_seq, x["Protein Consequence"], x["gnomAD ID"], uniprot_id), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: 

Handling deletion...
Handling deletion...
Handling deletion...
skipping unparseable  p.Ala265_Arg267del
processing PXR PXR/PXR_gnomAD_v4.1.0_ENSG00000144852.csv PXR/PXR_O75469.fasta
MEVRPKESWNHADFVHCEDTESVPGKPSVNADEEVGGPQICRVCGDKATGYHFNVMTCEGCKGFFRRAMKRNARLRCPFRKGACEITRKTRRQCQACRLRKCLESGMKKEMIMSDEAVEERRALIKRKKSERTGTQPLGVQGLTEEQRMMIRELMDAQMKTFDTTFSHFKNFRLPGVLSSGCELPESLQAPSREEAAKWSQVRKDLCSLKVSLQLRGEDGSVWNYKPPADSGGKEIFSLLPHMADMSTYMFKGIISFAKVISYFRDLPIEDQISLLKGAAFELCQLRFNTVFNAETGTWECGRLSYCLEDTAGGFQQLLLEPMLKFHYMLKKLQLHEEEYVLMQAISLFSPDRPGVLQHRVVDQLQEQFAITLKSYIECNRPQPAHRFLFLKIMAMLTELRSINAQHTQRLLRIQDIHPFATPLMQELFGITGS
Handling deletion...
Handling deletion...
Handling deletion...
Handling deletion...
Handling deletion...
skipping unparseable  p.Gln141_Leu143del
skipping unparseable  p.Val118_Ala123del
skipping unparseable  p.Ser114_Val118del
skipping unparseable  p.Met107_Lys108del
skipping unparseable  p.Ala96_Leu103del
processing Cav1.2 Cav1.2/Cav1.2_gnomAD_v4.1.0_ENSG00000151067.csv Cav1.2/C

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_veps["mutant_seq"] = relevant_veps.apply(lambda x: apply_hgvs_annotation(ref_seq, x["Protein Consequence"], x["gnomAD ID"], uniprot_id), axis=1)


skipping unparseable  p.Met1426Val
skipping unparseable  p.Gln1243His
skipping unparseable  p.Leu981Phe
skipping unparseable  p.Leu981Val
skipping unparseable  p.Val1406Ala
skipping unparseable  p.Cys2100Arg
skipping unparseable  p.Met1426Ile
skipping unparseable  p.Ile982Leu
skipping unparseable  p.Glu2112Lys
skipping unparseable  p.Glu2112Gln
skipping unparseable  p.Lys1573Glu
skipping unparseable  p.Arg2107Gln
skipping unparseable  p.Thr1723Pro
skipping unparseable  p.Ser1849Arg
skipping unparseable  p.His1817Tyr
skipping unparseable  p.His1017Arg
skipping unparseable  p.Ala1847Ser
skipping unparseable  p.Arg1889Ser
skipping unparseable  p.Cys1848Ser
skipping unparseable  p.Pro1781Leu
skipping unparseable  p.Arg1819Leu
skipping unparseable  p.Ala1847Thr
skipping unparseable  p.Gly2110Ala
skipping unparseable  p.Pro1427Gln
skipping unparseable  p.Ser1818Tyr
skipping unparseable  p.Gly1116Glu
skipping unparseable  p.Glu792_Pro794del
skipping unparseable  p.Ala2108Pro
skipping unparsea

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_veps["mutant_seq"] = relevant_veps.apply(lambda x: apply_hgvs_annotation(ref_seq, x["Protein Consequence"], x["gnomAD ID"], uniprot_id), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_veps["mutant_seq"] = relevant_veps.apply(lambda x: apply_hgvs_annotation(ref_seq, x["Protein Consequence"], x["gnomAD ID"], uniprot_id), axis=1)


skipping unparseable  p.Ser278_Pro279del
skipping unparseable  p.Asp433_Asp434del
skipping unparseable  p.Val280_Pro287del
skipping unparseable  p.Ser276_Pro279del
skipping unparseable  p.Pro283_Ala293del
skipping unparseable  p.Ala284_Pro285del
skipping unparseable  p.Arg222_Tyr224del
skipping unparseable  p.Asp464_Ser473del
skipping unparseable  p.Phe372_Asn373delinsTyr
skipping unparseable  p.Thr117_Ser128del
processing SLC6A3 SLC6A3/SLC6A3_gnomAD_v4.1.0_ENSG00000142319.csv SLC6A3/SLC6A3_Q01959.fasta
MSKSKCSVGLMSSVVAPAKEPNAVGPKEVELILVKEQNGVQLTSSTLTNPRQSPVEAQDRETWGKKIDFLLSVIGFAVDLANVWRFPYLCYKNGGGAFLVPYLLFMVIAGMPLFYMELALGQFNREGAAGVWKICPILKGVGFTVILISLYVGFFYNVIIAWALHYLFSSFTTELPWIHCNNSWNSPNCSDAHPGDSSGDSSGLNDTFGTTPAAEYFERGVLHLHQSHGIDDLGPPRWQLTACLVLVIVLLYFSLWKGVKTSGKVVWITATMPYVVLTALLLRGVTLPGAIDGIRAYLSVDFYRLCEASVWIDAATQVCFSLGVGFGVLIAFSSYNKFTNNCYRDAIVTTSINSLTSFSSGFVVFSFLGYMAQKHSVPIGDVAKDGPGLIFIIYPEAIATLPLSSAWAVVFFIMLLTLGIDSAMGGMESVITGLIDEFQLLHRHRELFTLFIVLATFLLSLFCVTNGGIYVFTLLDHFAAGTSILFGVLIE

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_veps["mutant_seq"] = relevant_veps.apply(lambda x: apply_hgvs_annotation(ref_seq, x["Protein Consequence"], x["gnomAD ID"], uniprot_id), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_veps["mutant_seq"] = relevant_veps.apply(lambda x: apply_hgvs_annotation(ref_seq, x["Protein Consequence"], x["gnomAD ID"], uniprot_id), axis=1)


Handling deletion...
skipping unparseable  p.Pro418_Ala419del
processing Nav1.5 Nav1.5/Nav1.5_gnomAD_v4.1.0_ENSG00000183873.csv Nav1.5/Nav1.5_Q14524.fasta
MANFLLPRGTSSFRRFTRESLAAIEKRMAEKQARGSTTLQESREGLPEEEAPRPQLDLQASKKLPDLYGNPPQELIGEPLEDLDPFYSTQKTFIVLNKGKTIFRFSATNALYVLSPFHPIRRAAVKILVHSLFNMLIMCTILTNCVFMAQHDPPPWTKYVEYTFTAIYTFESLVKILARGFCLHAFTFLRDPWNWLDFSVIIMAYTTEFVDLGNVSALRTFRVLRALKTISVISGLKTIVGALIQSVKKLADVMVLTVFCLSVFALIGLQLFMGNLRHKCVRNFTALNGTNGSVEADGLVWESLDLYLSDPENYLLKNGTSDVLLCGNSSDAGTCPEGYRCLKAGENPDHGYTSFDSFAWAFLALFRLMTQDCWERLYQQTLRSAGKIYMIFFMLVIFLGSFYLVNLILAVVAMAYEEQNQATIAETEEKEKRFQEAMEMLKKEHEALTIRGVDTVSRSSLEMSPLAPVNSHERRSKRRKRMSSGTEECGEDRLPKSDSEDGPRAMNHLSLTRGLSRTSMKPRSSRGSIFTFRRRDLGSEADFADDENSTAGESESHHTSLLVPWPLRRTSAQGQPSPGTSAPGHALHGKKNSTVDCNGVVSLLGAGDPEATSPGSHLLRPVMLEHPPDTTTPSEEPGGPQMLTSQAPCVDGFEEPGARQRALSAVSVLTSALEELEESRHKCPPCWNRLAQRYLIWECCPLWMSIKQGVKLVVMDPFTDLTITMCIVLNTLFMALEHYNMTSEFEEMLQVGNLVFTGIFTAEMTFKIIALDPYYYFQQGWNIFDSIIVILSLMELGLSRMSNLSVLRSFRLLRVFKLAKSWPTLNTLIKIIGNSVGALGNLTLV

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_veps["mutant_seq"] = relevant_veps.apply(lambda x: apply_hgvs_annotation(ref_seq, x["Protein Consequence"], x["gnomAD ID"], uniprot_id), axis=1)


skipping unparseable  p.Ser1102Tyr
skipping unparseable  p.Phe2003Leu
skipping unparseable  p.Arg1192Gln
skipping unparseable  p.Val1950Leu
skipping unparseable  p.Ser1786Asn
skipping unparseable  p.Phe1292Ser
skipping unparseable  p.Thr1303Met
skipping unparseable  p.Leu1307Phe
skipping unparseable  p.Leu1987Arg
skipping unparseable  p.Phe1595Ile
skipping unparseable  p.Arg1896Trp
skipping unparseable  p.Val1531Ile
skipping unparseable  p.Ile1835Thr
skipping unparseable  p.Val1278Ile
skipping unparseable  p.Asp1818Asn
skipping unparseable  p.Val1250Met
skipping unparseable  p.Gln1831Glu
skipping unparseable  p.Pro1961Leu
skipping unparseable  p.Asp1242Asn
skipping unparseable  p.Ala586_Leu587del
skipping unparseable  p.Ser1903Leu
skipping unparseable  p.Ala1679Thr
skipping unparseable  p.Ile1659Val
skipping unparseable  p.Arg1825His
skipping unparseable  p.Ala1087Thr
skipping unparseable  p.Arg1115Gln
skipping unparseable  p.Asp1113Asn
skipping unparseable  p.Arg1625His
skipping unpar

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_veps["mutant_seq"] = relevant_veps.apply(lambda x: apply_hgvs_annotation(ref_seq, x["Protein Consequence"], x["gnomAD ID"], uniprot_id), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_veps["mutant_seq"] = relevant_veps.apply(lambda x: apply_hgvs_annotation(ref_seq, x["Protein Consequence"], x["gnomAD ID"], uniprot_id), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: 

skipping unparseable  p.Met318_Glu320delinsLys
processing A1AR A1AR/A1AR_gnomAD_v4.1.0_ENSG00000163485.csv A1AR/A1AR_P30542.fasta
MPPSISAFQAAYIGIEVLIALVSVPGNVLVIWAVKVNQALRDATFCFIVSLAVADVAVGALVIPLAILINIGPQTYFHTCLMVACPVLILTQSSILALLAIAVDRYLRVKIPLRYKMVVTPRRAAVAIAGCWILSFVVGLTPMFGWNNLSAVERAWAANGSMGEPVIKCEFEKVISMEYMVYFNFFVWVLPPLLLMVLIYLEVFYLIRKQLNKKVSASSGDPQKYYGKELKIAKSLALILFLFALSWLPLHILNCITLFCPSCHKPSILTYIAIFLTHGNSAMNPIVYAFRIQKFRVTFLKIWNDHFRCQPAPPIDEDLPEERPDD
Handling deletion...
Handling deletion...
skipping unparseable  p.Glu178_Met180del
processing AHR AHR/AHR_gnomAD_v4.1.0_ENSG00000106546.csv AHR/AHR_P35869.fasta
MNSSSANITYASRKRRKPVQKTVKPIPAEGIKSNPSKRHRDRLNTELDRLASLLPFPQDVINKLDKLSVLRLSVSYLRAKSFFDVALKSSPTERNGGQDNCRAANFREGLNLQEGEFLLQALNGFVLVVTTDALVFYASSTIQDYLGFQQSDVIHQSVYELIHTEDRAEFQRQLHWALNPSQCTESGQGIEEATGLPQTVVCYNPDQIPPENSPLMERCFICRLRCLLDNSSGFLAMNFQGKLKYLHGQKKKGKDGSILPPQLALFAIATPLQPPSILEIRTKNFIFRTKHKLDFTPIGCDAKGRIVLGYTEAELCTRGSGYQFIHAADMLYCAESHIRMIKTGESGMIVFRLLTKNNRWTWVQSNARLLYKNGRPDYIIVT

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_veps["mutant_seq"] = relevant_veps.apply(lambda x: apply_hgvs_annotation(ref_seq, x["Protein Consequence"], x["gnomAD ID"], uniprot_id), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_veps["mutant_seq"] = relevant_veps.apply(lambda x: apply_hgvs_annotation(ref_seq, x["Protein Consequence"], x["gnomAD ID"], uniprot_id), axis=1)


skipping unparseable  p.Thr256_His258delinsAsn
skipping unparseable  p.Pro55_Cys57del
skipping unparseable  p.Val87_Thr93del
skipping unparseable  p.Ile109_Gln117del
processing M3 M3/M3_gnomAD_v4.1.0_ENSG00000133019.csv M3/M3_P20309.fasta
MTLHNNSTTSPLFPNISSSWIHSPSDAGLPPGTVTHFGSYNVSRAAGNFSSPDGTTDDPLGGHTVWQVVFIAFLTGILALVTIIGNILVIVSFKVNKQLKTVNNYFLLSLACADLIIGVISMNLFTTYIIMNRWALGNLACDLWLAIDYVASNASVMNLLVISFDRYFSITRPLTYRAKRTTKRAGVMIGLAWVISFVLWAPAILFWQYFVGKRTVPPGECFIQFLSEPTITFGTAIAAFYMPVTIMTILYWRIYKETEKRTKELAGLQASGTEAETENFVHPTGSSRSCSSYELQQQSMKRSNRRKYGRCHFWFTTKSWKPSSEQMDQDHSSSDSWNNNDAAASLENSASSDEEDIGSETRAIYSIVLKLPGHSTILNSTKLPSSDNLQVPEEELGMVDLERKADKLQAQKSVDDGGSFPKSFSKLPIQLESAVDTAKTSDVNSSVGKSTATLPLSFKEATLAKRFALKTRSQITKRKRMSLVKEKKAAQTLSAILLAFIITWTPYNIMVLVNTFCDSCIPKTFWNLGYWLCYINSTVNPVCYALCNKTFRTTFKMLLLCQCDKKKRRKQQYQQRQSVIFHKRAPEQAL
Handling deletion...
Handling deletion...
skipping unparseable  p.Ser39_Gly47del
processing HERG HERG/HERG_gnomAD_v4.1.0_ENSG00000055118.csv HERG/HERG_Q12809.fasta
MPVRRG

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_veps["mutant_seq"] = relevant_veps.apply(lambda x: apply_hgvs_annotation(ref_seq, x["Protein Consequence"], x["gnomAD ID"], uniprot_id), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_veps["mutant_seq"] = relevant_veps.apply(lambda x: apply_hgvs_annotation(ref_seq, x["Protein Consequence"], x["gnomAD ID"], uniprot_id), axis=1)


Handling deletion...
Handling deletion...
Handling deletion...
Handling deletion...
skipping unparseable  p.Gly187_Gly189del
skipping unparseable  p.Arg791_Ala797delinsPro
skipping unparseable  p.Pro1084_Thr1089del
skipping unparseable  p.Ser304_Thr305del
skipping unparseable  p.Ile288_Ala290del
skipping unparseable  p.Asp286_Ile288del
skipping unparseable  p.Ala188_Ala190del
skipping unparseable  p.Glu904_Gly911del
skipping unparseable  p.Gln884_Lys886del
skipping unparseable  p.Phe881_Ser882del
skipping unparseable  p.Pro1026_Val1038delinsLeu
skipping unparseable  p.Pro968_Pro972del
skipping unparseable  p.Arg887_Lys888del
skipping unparseable  p.Ile42_Tyr43delinsAsn
skipping unparseable  p.His147_Ser153del
skipping unparseable  p.Glu575_Pro577delinsAla
processing CYP2C9 CYP2C9/CYP2C9_gnomAD_v4.1.0_ENSG00000138109.csv CYP2C9/CYP2C9_P11712.fasta
MDSLVVLVLCLSCLLLLSLWRQSSGRGKLPPGPTPLPVIGNILQIGIKDISKSLTNLSKVYGPVFTLYFGLKPIVVLHGYEAVKEALIDLGEEFSGRGIFPLAERANRGFGIVFSNGKKWKEIRRFSLMTLRNFGMGKRSI

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_veps["mutant_seq"] = relevant_veps.apply(lambda x: apply_hgvs_annotation(ref_seq, x["Protein Consequence"], x["gnomAD ID"], uniprot_id), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_veps["mutant_seq"] = relevant_veps.apply(lambda x: apply_hgvs_annotation(ref_seq, x["Protein Consequence"], x["gnomAD ID"], uniprot_id), axis=1)


skipping unparseable  p.Arg270_Pro274delinsThr
