# Bonferroni-corrected Fisher's exact test

In this notebook, we perform Fisher's exact tests with Bonferroni correction to determine whether allele frequencies differ significantly between population groups.

In [2]:
import pandas as pd
import statsmodels.api as sm
import numpy as np
import scipy as sci

# Load the UCSC canonical-transcripts table (tab-separated). vep_read() joins
# against its 'hg19.refGene.name' column.
canon = pd.read_csv(
    "../rawData/canonical.txt",
    sep="\t",
)

In [3]:
def vep_read (geneDict):
    """Load Ensembl Variant Effect Predictor (VEP) output into one DataFrame per gene.

    Each tab-separated file is read, its 'Feature' column is split on "." into
    'Reference Name' / 'Reference Version', variant names of the form
    '<chr>:<pos><REF>-<ALT>;<rsID>' are reduced to the rsID, rows whose variant
    name contains "<CN" are removed, and the result is inner-joined against the
    module-level `canon` table ('Reference Name' == 'hg19.refGene.name').


    Parameters:
    geneDict (dict): A dict of gene names and filepaths to import.

    Returns:
    dict:Returns a dict of pandas DataFrames, one per gene, restricted to the
    annotation columns listed in `keptColumns`.
    """
    renamedColumns = {
        0: "Reference Name",
        1: "Reference Version",
        "#Uploaded_variation": "Variant",
        "Existing_variation": "Existing Variation",
    }
    keptColumns = ['Variant', 'Location', 'Existing Variation', 'Consequence', 'Reference Name', 'SOURCE', 'SIFT', 'PolyPhen', 'Condel', 'CADD_PHRED', 'PHENOTYPES']

    dataFrames = dict()
    for geneName, vepFile in geneDict.items():
        rawVep = pd.read_csv(vepFile, sep='\t')

        # 'Feature' holds "<transcript>.<version>"; expand it into two new columns.
        featureParts = rawVep['Feature'].str.split(".", expand=True)
        annotated = rawVep.join(featureParts).rename(columns=renamedColumns)

        # Keep only the rsID when a variant carries both a positional name and an rsID.
        annotated['Variant'] = annotated['Variant'].str.replace(r'^[0-9]{1,2}:[0-9]+[A-Z]+-[A-Z]+;(rs[0-9]+)$', r'\1', regex=True)

        # Discard variants whose name contains "<CN".
        isCopyNumber = annotated.Variant.str.contains("<CN")
        annotated = annotated[~isCopyNumber]

        # Keep only transcripts that appear in the canonical-transcripts table.
        canonical = pd.merge(left=annotated, right=canon, how="inner", left_on="Reference Name", right_on='hg19.refGene.name')
        dataFrames[str(geneName)] = canonical[keptColumns]
    return dataFrames


def pd_read_fwf (geneDict):
    """Imports whitespace-delimited frequency files as Pandas DataFrames, cleans them up, and pivots each table into wide format.

    Each file must start with a header row containing at least the columns
    SNP, CLST, A1, A2, MAF, MAC and NCHROBS. Blank lines are ignored.

    Processing per file:
      1. Variant names of the form '<chr>:<pos><REF>-<ALT>;<rsID>' are reduced to the rsID.
      2. Rows whose SNP name contains '<CN[0-9]+>' are dropped.
      3. MAC / NCHROBS are cast to int32 and MAF to a numeric dtype.
      4. 'Reference_OBS' is computed as NCHROBS - MAC.
      5. Zero-filled 'P Value' and 'Odds Ratio' columns are added.
      6. The table is pivoted to one row per (Variant, Alternate Allele,
         Reference Allele) with one column per (value, CLST) pair.


    Parameters:
    geneDict (dict): A dict of gene names and filepaths to import.

    Returns:
    dict:Returns a dict of pandas DataFrames, one per gene.

    Raises:
    ValueError: If a file contains no non-blank lines (no header row).
    KeyError: If an expected column is missing from the header.
    """
    dataFrames = dict()
    for geneName, geneFile in geneDict.items():
        # The context manager closes the handle even if parsing fails. Blank
        # lines are skipped so they do not create ragged rows.
        with open(geneFile) as handle:
            rows = [line.split() for line in handle if line.strip()]
        if not rows:
            raise ValueError("No header row found in frequency file: " + str(geneFile))

        # First row is the header; every value starts out as a string.
        table = pd.DataFrame(rows[1:], columns=rows[0])

        # Clean up naming of variants with both custom names and rsID's.
        table['SNP'] = table['SNP'].str.replace(r'^[0-9]{1,2}:[0-9]+[A-Z]+-[A-Z]+;(rs[0-9]+)$', r'\1', regex=True)

        # Strip out copy number variants.
        table = table[~table['SNP'].str.contains(r"<CN[0-9]+>")].astype({"MAC": "int32", "NCHROBS": "int32"})

        # Rename columns
        table = table.rename(columns={"MAC": "Alternate_OBS", "SNP": "Variant", "A1": "Alternate Allele", "A2": "Reference Allele"}, errors="raise")

        # Calculate new column for A2 observations (Needed for contingency table later)
        table['Reference_OBS'] = table['NCHROBS'] - table['Alternate_OBS']
        table['MAF'] = pd.to_numeric(table['MAF'])

        # Zero-filled columns so the pivoted table has 'P Value' / 'Odds Ratio' entries.
        table['P Value'] = 0
        table['Odds Ratio'] = 0

        dataFrames[str(geneName)] = pd.pivot_table(table, index=['Variant', 'Alternate Allele', 'Reference Allele'], columns=['CLST'], values=['Alternate_OBS', 'Reference_OBS', 'MAF', 'P Value', 'Odds Ratio'])
    return dataFrames


# Per-gene stratified frequency tables (one wide DataFrame per gene).
Genes = pd_read_fwf({
    "CYP2A6": "../Final/SUPER/ALL_CYP2A6_SUPER.frq.strat",
    "CYP2B6": "../Final/SUPER/ALL_CYP2B6_SUPER.frq.strat",
    "UGT2B7": "../Final/SUPER/ALL_UGT2B7_SUPER.frq.strat",
})

# Per-gene VEP annotations, restricted to canonical transcripts.
VEP = vep_read({
    "CYP2A6": "../Final/CYP2A6_VEP.txt",
    "CYP2B6": "../Final/CYP2B6_VEP.txt",
    "UGT2B7": "../Final/UGT2B7_VEP.txt",
})

In [4]:
# Preview the per-cluster minor allele frequencies for CYP2A6.
display(
    Genes["CYP2A6"]['MAF'].reset_index()
)

CLST,Variant,Alternate Allele,Reference Allele,AFR,AMR,EAS,EUR,SAS
0,19:41349573T-C,C,T,0.06250,0.00000,0.000000,0.00000,0.00000
1,19:41349640C-G,G,C,0.27600,0.43370,0.426600,0.57850,0.58690
2,19:41349906T-G,G,T,0.25000,0.00000,0.000000,0.00000,0.00000
3,19:41349912T-G,G,T,0.06250,0.00000,0.000000,0.00000,0.00000
4,19:41352497C-G,G,C,0.06250,0.00000,0.000000,0.00000,0.00000
...,...,...,...,...,...,...,...,...
386,rs8192723,T,C,0.00000,0.00000,0.000992,0.00000,0.00000
387,rs8192726,A,C,0.08169,0.04323,0.179600,0.06859,0.13090
388,rs8192728,T,G,0.01210,0.00000,0.116100,0.00000,0.02965
389,rs8192729,T,C,0.08419,0.02450,0.030750,0.08350,0.05521


In [5]:
SupTable = dict()

# Columns (and their order) kept in each gene's supplementary table:
# VEP annotation first, then the per-population minor allele frequencies.
supColumns = ['Variant', 'Location', 'Reference Allele', 'Alternate Allele', 'Existing Variation', 'Consequence', 'Reference Name', 'SOURCE', 'SIFT', 'PolyPhen', 'Condel', 'CADD_PHRED', 'PHENOTYPES', 'AFR', 'AMR', 'EUR', 'EAS', 'SAS']

for gene in Genes:
    geneKey = str(gene)
    # Flatten the MAF block of the wide frequency table so 'Variant' is a plain column.
    mafByPopulation = Genes[geneKey]['MAF'].reset_index()
    # Outer join: keep variants that appear in only one of the two tables.
    annotatedFrequencies = pd.merge(
        left=VEP[geneKey],
        right=mafByPopulation,
        how="outer",
        left_on='Variant',
        right_on='Variant'
    )
    SupTable[geneKey] = annotatedFrequencies[supColumns]

In [6]:
# Inspect the merged annotation / allele-frequency table for CYP2A6.
display(
    SupTable['CYP2A6']
)

Unnamed: 0,Variant,Location,Reference Allele,Alternate Allele,Existing Variation,Consequence,Reference Name,SOURCE,SIFT,PolyPhen,Condel,CADD_PHRED,PHENOTYPES,AFR,AMR,EUR,EAS,SAS
0,rs72549453,19:40843438-40843438,T,C,rs72549453,downstream_gene_variant,NM_000762,RefSeq,-,-,-,0.539,-,0.000756,0.002882,0.013920,0.000992,0.009202
1,rs146895354,19:40843446-40843446,C,G,rs146895354,downstream_gene_variant,NM_000762,RefSeq,-,-,-,5.910,-,0.000000,0.000000,0.000000,0.001984,0.000000
2,rs569152950,19:40843552-40843552,G,A,rs569152950,3_prime_UTR_variant,NM_000762,RefSeq,-,-,-,7.991,COUMARIN_RESISTANCE+MIM_morbid+ENSG00000255974...,0.000000,0.000000,0.000000,0.000992,0.000000
3,rs192752442,19:40843567-40843567,C,G,rs192752442,3_prime_UTR_variant,NM_000762,RefSeq,-,-,-,6.913,-,0.000000,0.001441,0.003976,0.000000,0.000000
4,rs558177610,19:40843603-40843603,C,T,rs558177610,3_prime_UTR_variant,NM_000762,RefSeq,-,-,-,7.442,-,0.000000,0.000000,0.000994,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
388,rs201793905,19:40850453-40850453,C,T,rs201793905,upstream_gene_variant,NM_000762,RefSeq,-,-,-,0.022,-,0.000000,0.000000,0.000000,0.001984,0.000000
389,rs28399433,19:40850474-40850474,A,C,"rs28399433,CR015160",upstream_gene_variant,NM_000762,RefSeq,-,-,-,16.85,-,0.082450,0.099420,0.071570,0.239100,0.154400
390,rs528755675,19:40850518-40850518,T,C,rs528755675,upstream_gene_variant,NM_000762,RefSeq,-,-,-,2.089,-,0.000000,0.000000,0.000994,0.000000,0.000000
391,rs547290288,19:40850520-40850520,C,T,rs547290288,upstream_gene_variant,NM_000762,RefSeq,-,-,-,0.834,-,0.000000,0.000000,0.000994,0.000000,0.000000


In [7]:
def Fishers (dataframe, refPop, compPop):
    """Runs a row-wise Fisher's Exact Test between a reference population and each comparison population.

    For every variant (row) and every comparison population a 2x2 contingency
    table is tested:

                            refPop    comparison population
        Reference_OBS         a                b
        Alternate_OBS         c                d


    Parameters:
    dataframe (pandas.DataFrame): Wide table whose index has a 'Variant' level and whose
        columns are a two-level MultiIndex providing ('Reference_OBS', <population>) and
        ('Alternate_OBS', <population>) for every population used, as produced by pd_read_fwf.
    refPop (str): Population label used as the reference column of each contingency table.
    compPop (list or str): Population label(s) to compare against refPop. A single
        str is treated as a one-element list.

    Returns:
    pandas.DataFrame: One row per variant with the columns 'Variant',
        '<refPop>_OR_<pop>' (odds ratio) and '<refPop>_P_<pop>' (p-value) for each
        comparison population, with columns in alphabetical order. If any of the four
        counts is missing, both values for that row are NaN.
    """
    # Import the submodule explicitly: a bare `import scipy` does not guarantee
    # that `scipy.stats` has been loaded.
    from scipy import stats

    def _fisher_from_counts(refReference, refAlternate, compReference, compAlternate):
        # Returns (odds ratio, p-value) for one 2x2 table, or NaNs when a count is missing.
        counts = (refReference, refAlternate, compReference, compAlternate)
        if any(pd.isna(count) for count in counts):
            return float('nan'), float('nan')
        oddsRatio, pValue = stats.fisher_exact([
            [refReference, compReference],
            [refAlternate, compAlternate],
        ])
        return oddsRatio, pValue

    # A bare string would otherwise be iterated character by character.
    if isinstance(compPop, str):
        compPop = [compPop]

    refLabel = str(refPop)
    refReferenceObs = dataframe[('Reference_OBS', refLabel)].to_numpy()
    refAlternateObs = dataframe[('Alternate_OBS', refLabel)].to_numpy()

    results = {'Variant': dataframe.index.get_level_values('Variant').to_numpy()}
    for pop in compPop:
        compLabel = str(pop)
        compReferenceObs = dataframe[('Reference_OBS', compLabel)].to_numpy()
        compAlternateObs = dataframe[('Alternate_OBS', compLabel)].to_numpy()

        # Collect into plain lists so an empty input yields empty columns instead of
        # failing on tuple unpacking.
        oddsRatios = list()
        pValues = list()
        for counts in zip(refReferenceObs, refAlternateObs, compReferenceObs, compAlternateObs):
            oddsRatio, pValue = _fisher_from_counts(*counts)
            oddsRatios.append(oddsRatio)
            pValues.append(pValue)

        results[refLabel + "_OR_" + compLabel] = oddsRatios
        results[refLabel + "_P_" + compLabel] = pValues

    # Alphabetical column order matches what the previous
    # `.stack().droplevel('CLST')` implementation produced, without depending on the
    # legacy DataFrame.stack behaviour.
    return pd.DataFrame(results).sort_index(axis=1)

In [8]:
# Fisher's exact test for CYP2A6: AFR as the reference against each other population.
wenis = Fishers(
    Genes['CYP2A6'],
    "AFR",
    ["EUR", 'AMR', 'EAS', "SAS"],
)

In [9]:
# Show the odds ratios and p-values computed by Fishers() for CYP2A6.
wenis

Unnamed: 0,AFR_OR_AMR,AFR_OR_EAS,AFR_OR_EUR,AFR_OR_SAS,AFR_P_AMR,AFR_P_EAS,AFR_P_EUR,AFR_P_SAS,Variant
0,,,,,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,19:41349573T-C
1,2.009226,1.951621,3.600908,3.727224,1.085330e-12,2.827227e-14,5.134219e-50,1.037685e-51,19:41349640C-G
2,,,,,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,19:41349906T-G
3,,,,,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,19:41349912T-G
4,,,,,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,19:41352497C-G
...,...,...,...,...,...,...,...,...,...
386,,inf,,,1.000000e+00,4.326180e-01,1.000000e+00,1.000000e+00,rs8192723
387,0.507865,2.460186,0.827760,1.692723,1.088472e-03,1.783012e-12,2.691282e-01,1.649831e-04,rs8192726
388,0.000000,10.718434,0.000000,2.494336,2.187284e-03,2.535064e-28,1.432758e-04,3.451960e-03,rs8192728
389,0.273135,0.345131,0.990981,0.635680,3.072189e-08,4.868325e-08,1.000000e+00,7.397930e-03,rs8192729


In [10]:
# Append the AFR-vs-other-population Fisher's exact test results to each gene's
# supplementary table and write one worksheet per gene.
# The context manager saves and closes the workbook on exit, including on error;
# ExcelWriter.save() was deprecated in pandas 1.5 and removed in pandas 2.0.
with pd.ExcelWriter("../Final/Supplementary Table.xlsx") as excelFile:
    for gene in Genes:
        print(gene)
        fisherResults = Fishers(Genes[str(gene)], "AFR", ["EUR", 'AMR', 'EAS', "SAS"])

        # Drop Fisher columns left over from an earlier run of this cell so that
        # re-running it does not produce duplicated '_x' / '_y' columns.
        staleColumns = [
            column for column in fisherResults.columns
            if column != 'Variant' and column in SupTable[str(gene)].columns
        ]
        SupTable[str(gene)] = pd.merge(
            right=fisherResults,
            left=SupTable[str(gene)].drop(columns=staleColumns),
            on='Variant'
        )
        SupTable[str(gene)].to_excel(excelFile, sheet_name=str(gene), index=False)




CYP2A6
CYP2B6
UGT2B7
