Bioinformatics Project


In [16]:
import pandas as pd

In [48]:
# Read in / clean up the input files

# Load the 23andMe file (tab-separated; the ".gz" suffix lets pandas infer gzip).
#   header=None     -> the file has no header row, so labels are attached below
#   comment="#"     -> anything following a "#" on a line is ignored by the parser
#   skiprows=[1458] -> skip the bad line at 0-based file line index 1458
df23 = pd.read_csv(
    "23andme_v5_hg19_ref.txt.gz",
    header=None,
    comment="#",
    skiprows=[1458],
    sep="\t",
).set_axis(["chrom", "pos", "rsid", "allele_23andme"], axis="columns")

# Last expression of the cell: display the loaded table
df23


Unnamed: 0,chrom,pos,rsid,allele_23andme
0,chr1,69869,rs548049170,T
1,chr1,74792,rs13328684,G
2,chr1,565508,rs9283150,G
3,chr1,726912,i713426,A
4,chr1,727841,rs116587930,G
...,...,...,...,...
638457,chrM,16524,i4000693,A
638458,chrM,16524,i704756,A
638459,chrM,16525,i705255,A
638460,chrM,16526,i4000757,G


In [49]:
# Load the PharmGKB variant-drug annotation table (tab-separated).
# dtype=str reads every column as a string.
# Malformed rows are still dropped, but on_bad_lines="warn" reports each one
# instead of discarding it silently, so any data loss shows up in the cell output.
pharmdf = pd.read_csv("var_drug_ann.tsv", sep="\t", dtype=str, on_bad_lines="warn")

# Keep only rows whose Variant/Haplotypes value starts with "rs".
# na=False treats missing values as non-matches; .copy() makes the subset
# an independent frame rather than a slice of pharmdf.
pharm_rs = pharmdf[pharmdf["Variant/Haplotypes"].str.startswith("rs", na=False)].copy()



1. Map/merge the 23andMe file and the variant-drug annotation file on dbSNP_ID (also known as rsID).



In [53]:
# Inner-join the 23andMe table with the rs-only PharmGKB rows:
# a row is kept only when df23["rsid"] equals pharm_rs["Variant/Haplotypes"].
merged = pd.merge(
    df23,
    pharm_rs,
    how="inner",
    left_on="rsid",
    right_on="Variant/Haplotypes",
)

# Columns to keep from the merged table, mapped to their output names
# (source column -> final column); the order here is the output column order.
final_cols = dict(
    [
        ("rsid", "dbSNP_ID"),
        ("Gene", "GENE_SYMBOL"),
        ("Drug(s)", "DRUG_NAME"),
        ("PMID", "PMID"),
        ("Phenotype Category", "PHENOTYPE_CATEGORY"),
        ("Significance", "SIGNIFICANCE"),
        ("Notes", "NOTES"),
        ("Sentence", "SENTENCE"),
        ("Alleles", "ALLELE_PharmGKB"),
        ("allele_23andme", "ALLELE_23andme"),
    ]
)

# Select the source columns first, then rename them to the final labels.
selected = merged.loc[:, list(final_cols)]
final_df = selected.rename(columns=final_cols)

# Persist the merged, annotated table for later use (row index is not written).
final_df.to_csv("23andme_pharmgkb_merged.csv", index=False)

# Last expression of the cell: display the result
final_df

Unnamed: 0,dbSNP_ID,GENE_SYMBOL,DRUG_NAME,PMID,PHENOTYPE_CATEGORY,SIGNIFICANCE,NOTES,SENTENCE,ALLELE_PharmGKB,ALLELE_23andme
0,rs2651899,PRDM16,Selective serotonin (5HT1) agonists,26502740,Efficacy,yes,"""Hence, the following risk alleles were determ...",Allele C is associated with increased response...,C,T
1,rs11807862,PRDM16,"atorvastatin, HMG-CoA reductase inhibitors, si...",24096969,Efficacy,no,There could be strand confusion with this A/T ...,Genotype AA is associated with decreased respo...,AA,T
2,rs228729,PER3,lithium,21781277,Efficacy,no,,Allele T is not associated with increased resp...,T,T
3,rs228642,PER3,lithium,21781277,Efficacy,no,,Allele C is not associated with increased resp...,C,C
4,rs228666,PER3,lithium,21781277,Efficacy,no,,Allele C is not associated with increased resp...,C,T
...,...,...,...,...,...,...,...,...,...,...
6115,rs3810651,GABRQ,botulinum toxin type a,31014225,Efficacy,no,No significant difference in allele frequency ...,Allele T is not associated with response to bo...,T,T
6116,rs17435,MECP2,"cisplatin, fluorouracil, mitoxantrone",21635146,Efficacy,yes,The study described this variant within the ME...,Allele A is associated with response to cispla...,A,T
6117,rs1734787,MECP2,"cisplatin, fluorouracil, mitoxantrone",21635146,Efficacy,yes,The study described this variant within the ME...,Allele A is associated with response to cispla...,A,A
6118,rs1734791,MECP2,"cisplatin, fluorouracil, mitoxantrone",21635146,Efficacy,yes,The study described this variant within the ME...,Allele A is associated with response to cispla...,A,A
