# Bonferroni corrected Fisher's exact test

In this notebook, we will be performing a Fisher's exact test with Bonferroni correction to determine if there is a significant difference in allele frequencies between population groups.

In [1]:
import pandas as pd
import statsmodels.api as sm
import numpy as np
import scipy as sci

# Import UCSC Canonical Transcripts file:
canon = pd.read_csv("../rawData/canonical.txt", sep="\t")

In [2]:
def vep_read (geneDict):
    """Imports E! Ensembl Variant Effect Prediction files as a Pandas DataFrame.


    Parameters:
    geneDict (dict): A dict of gene names and filepaths to import.

    Returns:
    dict:Returns a dict of pandas DataFrames, one per gene.
    """
    dataFrames = dict()
    for geneName, vepFile in geneDict.items():
        dataFrames[str(geneName)] = pd.read_csv(vepFile, sep='\t')
        dataFrames[str(geneName)] = dataFrames[str(geneName)].join(dataFrames[str(geneName)]['Feature'].str.split(".", expand=True)).rename(columns={0: "Reference Name", 1: "Reference Version", "#Uploaded_variation": "Variant", "Existing_variation": "Existing Variation"})
        #dataFrames[str(geneName)].set_index('Variant', inplace=True)
        dataFrames[str(geneName)]['Variant'] = dataFrames[str(geneName)]['Variant'].str.replace(r'^[0-9]{1,2}:[0-9]+[A-Z]+-[A-Z]+;(rs[0-9]+)$', r'\1', regex=True)
        dataFrames[str(geneName)] = dataFrames[str(geneName)][~dataFrames[str(geneName)].Variant.str.contains("<CN")]
        dataFrames[str(geneName)] = pd.merge(left=dataFrames[str(geneName)], right=canon, how="inner", left_on="Reference Name", right_on='hg19.refGene.name')[['Variant', 'Location', 'Existing Variation', 'Consequence', 'Reference Name', 'SOURCE', 'SIFT', 'PolyPhen', 'Condel', 'CADD_PHRED', 'PHENOTYPES']]
    return dataFrames


def pd_read_fwf (geneDict):
    """Imports Fixed-Width files as a Pandas DataFrame and performs cleanup, and pivots the table into wide format.


    Parameters:
    geneDict (dict): A dict of gene names and filepaths to import.

    Returns:
    dict:Returns a dict of pandas DataFrames, one per gene.
    """
    dataFrames = dict()
    for geneName, geneFile in geneDict.items():
        listie = list()
        rf = open(geneFile)
        for row in rf.readlines():
            listie.append(row.split())
        rf.close()
        listie = pd.DataFrame(np.asarray(listie))
        listie.columns = listie.iloc[0]
        listie = listie[1:]
        dataFrames[str(geneName)] = listie

        # Clean up naming of variants with both custom names and rsID's.
        dataFrames[str(geneName)]['SNP'] = dataFrames[str(geneName)]['SNP'].str.replace(r'^[0-9]{1,2}:[0-9]+[A-Z]+-[A-Z]+;(rs[0-9]+)$', r'\1', regex=True)

        # Strip out copy number variants.
        dataFrames[str(geneName)] = dataFrames[str(geneName)][~dataFrames[str(geneName)].SNP.str.contains(r"<CN[0-9]+>")].astype({"MAC": "int32", "NCHROBS": "int32"})
       
        # Rename columns
        dataFrames[str(geneName)] = dataFrames[str(geneName)].rename(columns={"MAC": "Alternate_OBS", "SNP": "Variant", "A1": "Alternate Allele", "A2": "Reference Allele"}, errors="raise")
        
        # Calculate new column for A2 observations (Needed for contingency table later)
        dataFrames[str(geneName)]['Reference_OBS'] = dataFrames[str(geneName)].apply(lambda row: row.NCHROBS - row.Alternate_OBS, axis=1)
        dataFrames[str(geneName)]['MAF'] = pd.to_numeric(dataFrames[str(geneName)]['MAF'])
        dataFrames[str(geneName)] = pd.pivot_table(dataFrames[str(geneName)], index=['Variant', 'Alternate Allele', 'Reference Allele'], columns=['CLST'], values=['Alternate_OBS', 'Reference_OBS', 'MAF'])
    return dataFrames


Genes = pd_read_fwf({"CYP2A6": "../Final/SUPER/ALL_CYP2A6_SUPER.frq.strat", "CYP2B6": "../Final/SUPER/ALL_CYP2B6_SUPER.frq.strat", "UGT2B7": "../Final/SUPER/ALL_UGT2B7_SUPER.frq.strat"})
VEP = vep_read({"CYP2A6": "../Final/CYP2A6_VEP.txt", "CYP2B6": "../Final/CYP2B6_VEP.txt", "UGT2B7": "../Final/UGT2B7_VEP.txt"})

In [3]:
SupTable = dict()

for gene in Genes:
    SupTable[str(gene)] = pd.merge(
        right=Genes[str(gene)]['MAF'].reset_index(), 
        left=VEP[str(gene)], 
        how="outer", 
        right_on='Variant', 
        left_on='Variant'
    )[['Variant', 'Location', 'Reference Allele', 'Alternate Allele', 'Existing Variation', 'Consequence', 'Reference Name', 'SOURCE', 'SIFT', 'PolyPhen', 'Condel', 'CADD_PHRED', 'PHENOTYPES', 'AFR', 'AMR', 'EUR', 'EAS', 'SAS']]

#display(Genes['CYP2A6']['MAF'].stack(0).reset_index(level=1))
#SupTable['CYP2A6'] = VEP['CYP2A6'].join(Genes['CYP2A6']['MAF'].stack(0).reset_index(level=1), on='Variant')

In [4]:
print(SupTable['CYP2A6'].columns.values)
# Strip erroneously named variants:
#CYP2A6['SNP'] = CYP2A6['SNP'].str.replace(r'^[0-9]{1,2}:[0-9]+[A-Z]+-[A-Z]+;(rs[0-9]+)$', r'\1', regex=True)
#CYP2B6['SNP'] = CYP2B6['SNP'].str.replace(r'^[0-9]{1,2}:[0-9]+[A-Z]+-[A-Z]+;(rs[0-9]+)$', r'\1', regex=True)
#UGT2B7['SNP'] = UGT2B7['SNP'].str.replace(r'^[0-9]{1,2}:[0-9]+[A-Z]+-[A-Z]+;(rs[0-9]+)$', r'\1', regex=True)

#CYP2A6.to_excel("text.xlsx")
#Genes['CYP2A6'] = Genes['CYP2A6'][['SNP', 'A1_OBS', 'A2_OBS', 'CLST']].melt(id_vars=['SNP', 'CLST'], var_name='Alleles', value_name='Observations')

['Variant' 'Location' 'Reference Allele' 'Alternate Allele'
 'Existing Variation' 'Consequence' 'Reference Name' 'SOURCE' 'SIFT'
 'PolyPhen' 'Condel' 'CADD_PHRED' 'PHENOTYPES' 'AFR' 'AMR' 'EUR' 'EAS'
 'SAS']


In [108]:
def Fishers (dataframe, refPop, compPop):
    """Runs a row-wise Fisher's Exact Test between the two listed populations


    Parameters:
    dataframe (dict): A dict of DataFrames to work on.
    pop1 (str): A str matching a column name in each dataset corresponding to that populations frequency data. 
    pop2 (str): A str matching a column name in each dataset corresponding to that populations frequency data. 

    Returns:
    dict:Returns a dict of pandas DataFrames, one per gene.
    """
    #display(dataframe)
    def fishersP (row, referencePop, pop1):
        data = {
            str(referencePop): [row['Reference_OBS'][str(referencePop)], row['Alternate_OBS'][str(referencePop)]],
            str(pop): [row['Reference_OBS'][str(pop1)], row['Alternate_OBS'][str(pop1)]]
        }
        index=['Reference Allele', "Alternate Allele"]
        dataFrame = pd.DataFrame(data, index=index)
        oddsRatio, pValue = sci.stats.fisher_exact(dataFrame)
        return pValue
    
    def fishersOR (row, referencePop, pop1):
        data = {
            str(referencePop): [row['Reference_OBS'][str(referencePop)], row['Alternate_OBS'][str(referencePop)]],
            str(pop): [row['Reference_OBS'][str(pop1)], row['Alternate_OBS'][str(pop1)]]
        }
        index=['Reference Allele', "Alternate Allele"]
        dataFrame = pd.DataFrame(data, index=index)
        oddsRatio, pValue = sci.stats.fisher_exact(dataFrame)
        return oddsRatio
        
    
    rstData = dataframe.stack(level="CLST").reset_index(['Variant', 'Reference Allele', 'Alternate Allele', 'CLST'])
    columns = list()
    columns.append('Variant')
    for pop in compPop:
        oddsLabel = str(refPop) + "_" + str(pop)
        pLabel = str(refPop) + "_" + str(pop)
        columns.append(oddsLabel)
        columns.append(pLabel)
        rstData.apply(lambda row: fishersOR(row, refPop, pop), axis=1)
        
        #rstData[str(pop)] = rstData.apply(lambda row: fishersOR(row, refPop, pop), axis=1)
        #rstData[str(pop)] = rstData.apply(lambda row: fishersP(row, refPop, pop), axis=1)
        #display(rstData.apply(lambda row: fishers(row, refPop, pop), axis=1).to_list())
    return rstData#rstData[columns].stack().droplevel('CLST')

In [68]:
Fishers(Genes['CYP2A6'], "AFR", ["EUR", 'AMR', 'EAS', "SAS"])

#Genes['CYP2A6'].reset_index().set_index('Variant').columns.values#.columns.values

KeyError: 'Odds Ratio'

In [34]:
for gene in Genes:
    SupTable[str(gene)] = pd.merge(
        right=Fishers(Genes[str(gene)], "AFR", ["EUR", 'AMR', 'EAS', "SAS"]),
        left=SupTable[str(gene)],
        on='Variant'
    )
    SupTable[str(gene)].to_excel("../Final/Supplementary Table.xlsx", sheet_name=str(gene))
#Fishers(Genes['CYP2A6']['MAF'].reset_index(), "AFR", "AMR")
#Fishers(Genes['CYP2A6']['MAF'].reset_index(), "AFR", "EAS")
#Fishers(Genes['CYP2A6']['MAF'].reset_index(), "AFR", "SAS")

#display(Genes['CYP2A6'].loc['rs8192723']['A1_OBS'])

#display(Genes['CYP2A6'].loc['rs8192723']['A2_OBS'])

TypeError: 'tuple' object does not support item assignment

In [None]:
Fishers = dict()
for key, value in Genes.items():
    Fishers[str(key)] = sm.stats.Table(value)

In [None]:
print(Fishers['CYP2A6'].test_nominal_association().pvalue)

In [114]:
Genes['CYP2A6'].stack(level="CLST").reset_index(['Variant', 'Reference Allele', 'Alternate Allele', 'CLST'])#.unstack(level=0).reorder_levels([0,"CLST"], axis=1)

Unnamed: 0,Variant,Alternate Allele,Reference Allele,CLST,Alternate_OBS,MAF,Reference_OBS,AFR_OR_EUR,AFR_P_EUR,AFR_OR_AMR,AFR_P_AMR,AFR_OR_EAS,AFR_P_EAS,AFR_OR_SAS,AFR_P_SAS
0,19:41349573T-C,C,T,AFR,1.0,0.0625,15.0,,,,,,,,
1,19:41349573T-C,C,T,AMR,0.0,0.0000,0.0,,,,,,,,
2,19:41349573T-C,C,T,EAS,0.0,0.0000,0.0,,,,,,,,
3,19:41349573T-C,C,T,EUR,0.0,0.0000,0.0,,,,,,,,
4,19:41349573T-C,C,T,SAS,0.0,0.0000,0.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2341,rs8192733,C,G,AMR,382.0,0.5504,312.0,,,,,,,,
2342,rs8192733,C,G,EAS,567.0,0.5625,441.0,,,,,,,,
2343,rs8192733,C,G,EUR,517.0,0.5139,489.0,,,,,,,,
2344,rs8192733,C,G,SAS,427.0,0.4366,551.0,,,,,,,,
