In [1]:
import pandas as pd

In [None]:
# Load data
file_path = "raw_data/redlat.genes.chr.counts.hg38_multianno.tsv"
original_df = pd.read_csv(file_path, sep='\t')

# Filter data we care about 
df = original_df[
    (original_df["Func.refGene"] == "exonic") & 
    (original_df["ExonicFunc.refGene"] == "nonsynonymous SNV")
]

In [3]:
# Verify filtering worked
df["AAChange.refGene"].head()

45       TARDBP:NM_007375:exon2:c.G149C:p.C50S
323     TARDBP:NM_007375:exon6:c.A776G:p.N259S
324     TARDBP:NM_007375:exon6:c.G881T:p.G294V
326    TARDBP:NM_007375:exon6:c.A1147G:p.I383V
327    TARDBP:NM_007375:exon6:c.G1154T:p.W385L
Name: AAChange.refGene, dtype: object

In [4]:
desired_isoforms = {
    "PSEN1": "NM_000021",
    "PSEN2": "NM_000447",
    "TARDBP": "NM_007375",
    "MAPT": "NM_005910"
}

In [5]:
# Let's parse all the rows and select the correct variant name
# It will be in the AAChange.refGene column, but there is a lot of fields to parse

def extract_variant(row):
    isoforms = row["AAChange.refGene"].split(",")
    all_affected = row["All_affected"]
    all_unaffected = row["All_unaffected"]

    case_cnt = int(all_affected.split("/")[1])
    control_cnt = int(all_unaffected.split("/")[1])
    
    for isoform in isoforms:
        parts = isoform.split(":")
        if len(parts) < 4:
            continue  # Skip malformed entries

        gene_name, isoform_id, exon = parts[0], parts[1], parts[2]

        if gene_name not in desired_isoforms:
            continue  # Skip unknown gene names

        if desired_isoforms[gene_name] == isoform_id:
            aavariant = parts[-1][2:]  # Extract amino acid variant
            aa_number = aavariant[1:-1] if len(aavariant) > 2 else ""  # Extract the inner number
            return pd.Series([aavariant, aa_number, case_cnt, control_cnt, exon])

    return pd.Series([None, None, 0, 0, ""])  # No match found

df[["variant", "AA", "case", "control", "exon"]] = df.apply(extract_variant, axis=1)
df = df.dropna(subset=["AA"])
df["case"] = df["case"].astype(int)
df["control"] = df["control"].astype(int)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[["variant", "AA", "case", "control", "exon"]] = df.apply(extract_variant, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[["variant", "AA", "case", "control", "exon"]] = df.apply(extract_variant, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[["variant", "AA", "case", "contr

In [None]:
# Now export the selected columns
for gene in desired_isoforms.keys():
    print(gene)
    filtered_df = df[df["Gene.refGene"] == gene][["Gene.refGene", "AA", "variant", "case", "control", "exon"]]
    file_name = f"data/{gene}_variants.txt"
    filtered_df.to_csv(file_name, index=False, sep='\t')

PSEN1
PSEN2
TARDBP
MAPT
