In [36]:
import pandas as pd
from scipy.stats import fisher_exact
import matplotlib.pylab as pylab

# Set the global matplotlib defaults for the font family and text sizes.
pylab.rcParams['font.family'] = 'DejaVu Serif'
pylab.rcParams['axes.titlesize'] = 14   # font size of the axes title
pylab.rcParams['axes.labelsize'] = 12   # font size of the x- and y-axis labels
pylab.rcParams['xtick.labelsize'] = 10  # font size of the x-axis tick labels
pylab.rcParams['ytick.labelsize'] = 10  # font size of the y-axis tick labels

In [37]:
# Flatten every sheet of the workbook into one tab-separated long table with
# one row per (gene, RNAi phenotype, sheet) triple.
# `xls` stays open at cell scope because the Fisher cell reads `xls.sheet_names`.
xls = pd.ExcelFile('PZM-zcf2.xls', engine='xlrd')

# Explicit UTF-8: the file is read back with pd.read_csv, whose default
# encoding is UTF-8, so the platform locale must not decide how it is written.
with open('PZM-zcf_gene2RNAiPO.txt', 'w', encoding='utf-8') as o:
    o.write('Gene\tRNAiPO\tSheet\n')
    for sheet in xls.sheet_names:
        # Only the first two columns are used; rows missing either value are
        # skipped. Chaining avoids an in-place dropna on a column subset.
        df = xls.parse(sheet).iloc[:, :2].dropna()
        if df.empty:
            # Nothing to export for this sheet (and no column lookups needed).
            continue

        # Which column holds the gene name is a property of the sheet, not of
        # the row, so decide it once instead of catching a failure per row.
        gene_col = 'Your Input' if 'Your Input' in df.columns else 'Public Name'

        for genename, RNAiPO in zip(df[gene_col], df['RNAi Phenotype Observed']):
            # A cell may list several phenotypes separated by commas;
            # write one output row per phenotype.
            for PO in RNAiPO.split(','):
                o.write(f'{genename}\t{PO.strip()}\t{sheet}\n')

### Fisher's exact test

In [38]:
# Per-sheet phenotype enrichment: for every sheet, test whether each RNAi
# phenotype is over-represented among the genes found only in that sheet
# (foreground) compared with all remaining genes (background).

# Threshold for the on-screen summary only. Every test is written to the CSV
# regardless of its p-value.
ALPHA = 0.05

df = pd.read_csv('PZM-zcf_gene2RNAiPO.txt', sep='\t')
# Remove RNAiPO 'N.A.'
df = df[df['RNAiPO'] != 'N.A.']

# Find the genes present in exactly one sheet, paired with that sheet
sheets_per_gene = df.groupby('Gene')['Sheet'].nunique()
single_sheet_genes = sheets_per_gene[sheets_per_gene == 1].index
gene_sheet = df.loc[df['Gene'].isin(single_sheet_genes), ['Gene', 'Sheet']].drop_duplicates()

# Do Fisher's exact test for each sheet.
# Explicit UTF-8 so the result can be read back with pd.read_csv's default encoding.
with open('PZM-zcf_Fisher.csv', 'w', encoding='utf-8') as o:

    o.write('Sheet,PO,Count(PO) in foreground,Count(PO) in background,Count(non-PO) in foreground,Count(non-PO) in background,Gene(fore),Gene(Back),oddsratio,pvalue\n')
    for sheet in xls.sheet_names:

        uniq_gene = gene_sheet.loc[gene_sheet['Sheet'] == sheet, 'Gene'].tolist()

        # One membership mask, used for both halves of the split
        in_foreground = df['Gene'].isin(uniq_gene)
        foreground = df[in_foreground]
        background = df[~in_foreground]

        # These gene sets depend only on the sheet, so build them once here
        # rather than once per phenotype.
        fore_genes = set(foreground['Gene'].unique())
        back_genes = set(background['Gene'].unique())

        for PO in foreground['RNAiPO'].unique():

            gene_PO_fore = foreground.loc[foreground['RNAiPO'] == PO, 'Gene'].unique().tolist()
            gene_PO_back = background.loc[background['RNAiPO'] == PO, 'Gene'].unique().tolist()

            # 2x2 contingency table, counted in genes:
            #                 foreground  background
            #   has PO            a           b
            #   lacks PO          c           d
            a = len(gene_PO_fore)
            b = len(gene_PO_back)
            c = len(fore_genes - set(gene_PO_fore))
            d = len(back_genes - set(gene_PO_back))

            # One-sided test: only enrichment in the foreground is of interest
            oddsratio, pvalue = fisher_exact([[a, b], [c, d]], alternative='greater')

            o.write(f'{sheet},{PO},{a},{b},{c},{d},{"/".join(gene_PO_fore)},{"/".join(gene_PO_back)},{oddsratio},{pvalue}\n')

            if pvalue < ALPHA:
                print(f'{sheet},{PO},{a},{b},{c},{d},{oddsratio},{pvalue}')

simplemine-M,germline transgene silencing variant,3,5,56,552,5.914285714285715,0.033020953568689414
simplemine-M,nuclear number variant,3,4,56,553,7.40625,0.022131462378378756
simplemine-M,protein expression variant,3,6,56,551,4.919642857142857,0.04620296922015563
simplemine-M,mitochondria morphology variant,4,7,55,550,5.714285714285714,0.015044285316220093
simplemine-M,aldicarb resistant,6,18,53,539,3.389937106918239,0.020450030157714415
simplemine-M,body wall muscle sarcomere morphology variant,4,7,55,550,5.714285714285714,0.015044285316220093
simplemine-M,early larval lethal,5,9,54,548,5.637860082304527,0.006943228061268611
simplemine-M,paralyzed,4,7,55,550,5.714285714285714,0.015044285316220093
simplemine-Z,centration defective early emb,1,0,23,592,inf,0.038961038961016575
simplemine-Z,pharyngeal pumping irregular,1,0,23,592,inf,0.038961038961016575
simplemine-Z,uterus morphology variant,1,0,23,592,inf,0.038961038961016575
simplemine-P,extended life span,53,19,321,223,1.93785866535

In [19]:
import pandas as pd
from statsmodels.stats.multitest import multipletests

# Benjamini-Hochberg FDR correction over all p-values stored in the Fisher result table.
pzm_df = pd.read_csv('PZM-zcf_Fisher.csv')
p_values = pzm_df.loc[:, 'pvalue']
fdr_corrected = multipletests(p_values, method='fdr_bh')

# Element 1 of the multipletests result holds the corrected p-values;
# append them as a new column.
pzm_df = pzm_df.assign(fdr_pvalue=fdr_corrected[1])

# Comma-separated output; the row index is written as the first (unnamed) column.
pzm_df.to_csv("PZMFisherFDR.csv")