In [1]:
import os
import pandas as pd
from scipy.stats import fisher_exact

In [2]:
# Result accumulator: the per-gene-set loop further down appends one record
# (a list) to this for every gene-set file it processes.
experiments = []

In [3]:
def create_subject():
    """Build the subject gene list from ``geneName.txt``.

    Reads the file with ``pd.read_table``, takes its ``Unknown`` column and
    ranks the distinct values with ``value_counts`` (most frequent first).
    The top-ranked value is dropped and the remaining ones are returned.

    Returns:
        list: distinct values of the ``Unknown`` column in ``value_counts``
        order, without the most frequent one.
    """
    unknown_column = pd.read_table('geneName.txt')['Unknown']
    ranked_names = unknown_column.value_counts().index
    # NOTE(review): the most frequent value is skipped -- presumably it is a
    # placeholder label rather than a real gene; confirm against the input.
    return ranked_names[1:].tolist()

def create_gene_sets():
    """List the gene-set files found in the ``data`` directory.

    Returns:
        list[str]: bare file names (no directory part) of every entry in
        ``data`` whose name ends with ``.txt``, sorted alphabetically.
    """
    # os.listdir yields entries in arbitrary, platform-dependent order; sort
    # so the downstream experiment order (and DataFrame row index) is the
    # same on every run and every machine.
    return sorted(name for name in os.listdir('data') if name.endswith('.txt'))

def create_gene_set(set_name):
    """Read one gene-set file from the ``data`` directory.

    Args:
        set_name: file name inside ``data/``.

    Returns:
        list[str]: every line of the file, in file order, with leading and
        trailing whitespace removed.
    """
    stripped_lines = []
    with open('data/' + set_name) as handle:
        for raw_line in handle:
            stripped_lines.append(raw_line.strip())
    return stripped_lines


def create_all_genes():
    """Read the background gene list from the file named ``all``.

    Returns:
        list[str]: every line of the file, in file order, with leading and
        trailing whitespace removed.
    """
    with open('all') as handle:
        # Iterating the handle yields the same lines readlines() would.
        return list(map(str.strip, handle))

In [4]:
def contingency_matrix(subject, gene_set, all_genes):
    """Split genes into the four cells of a 2x2 contingency table.

    Args:
        subject: gene names under study.
        gene_set: gene names belonging to the reference gene set.
        all_genes: gene names forming the background.

    Returns:
        tuple: four lists ``(a, b, c, d)`` where
            a -- genes of ``subject`` that are in ``gene_set``
            b -- genes of ``subject`` that are not in ``gene_set``
            c -- genes of ``gene_set`` that are not in ``subject``
            d -- genes of ``all_genes`` in neither ``subject`` nor ``gene_set``
        Each list keeps the order (and any duplicates) of the input it was
        drawn from.

    NOTE(review): ``a``, ``b`` and ``c`` are not restricted to ``all_genes``,
    so the four cells only partition the background when ``subject`` and
    ``gene_set`` are subsets of it -- confirm this holds for the inputs.
    """
    # Materialise once so the inputs can be both iterated and looked up.
    subject = list(subject)
    gene_set = list(gene_set)

    # Hash-based lookups replace the repeated linear `in list` scans, which
    # made the original quadratic in the number of genes.
    subject_lookup = frozenset(subject)
    gene_set_lookup = frozenset(gene_set)

    a = [gene for gene in subject if gene in gene_set_lookup]
    b = [gene for gene in subject if gene not in gene_set_lookup]
    c = [gene for gene in gene_set if gene not in subject_lookup]
    d = [
        gene for gene in all_genes
        if gene not in subject_lookup and gene not in gene_set_lookup
    ]
    return a, b, c, d

In [5]:
# Load the three inputs of the analysis. The loads are independent of each
# other, so their order does not matter.
all_genes = create_all_genes()   # background gene list (file `all`)
gene_sets = create_gene_sets()   # names of the gene-set files under `data/`
subject = create_subject()       # subject genes (from `geneName.txt`)

In [6]:
# Start from an empty list inside this cell: `experiments` is otherwise only
# created in an earlier cell, so re-running this loop would append a second
# copy of every result.
experiments = []
for set_name in gene_sets:
    gene_set = create_gene_set(set_name)
    # The loop reads each file as: line 0 = title, line 1 = description,
    # remaining lines = genes.
    title, description, genes = gene_set[0], gene_set[1], gene_set[2:]
    a, b, c, d = contingency_matrix(subject, genes, all_genes)
    counts = [[len(a), len(b)], [len(c), len(d)]]
    # Fisher's exact test on the 2x2 counts; only the p-value is kept.
    _, p_value = fisher_exact(counts)
    # Positional layout is relied on later (the DataFrame is sorted by column 4).
    experiments.append([
        title,              # 0
        description,        # 1
        [[a, b], [c, d]],   # 2: gene lists per contingency cell
        counts,             # 3: sizes of those lists
        p_value,            # 4
    ])

In [14]:
# Tabulate the results, one row per gene set. Columns keep their positional
# labels: 0 = title, 1 = description, 2 = gene lists per contingency cell,
# 3 = cell counts, 4 = Fisher exact p-value.
df = pd.DataFrame(experiments)
df

Unnamed: 0,0,1,2,3,4
0,HALLMARK_ADIPOGENESIS,> Genes up-regulated during adipocyte differen...,"[[[], [SDK1, MAD1L1, PRKAR1B, CARD11, GNA12, S...","[[0, 60], [200, 4186]]",0.113194
1,HALLMARK_ALLOGRAFT_REJECTION,> Genes up-regulated during transplant rejection.,"[[[], [SDK1, MAD1L1, PRKAR1B, CARD11, GNA12, S...","[[0, 60], [200, 4187]]",0.113188
2,HALLMARK_ANDROGEN_RESPONSE,> Genes defining response to androgens.,"[[[], [SDK1, MAD1L1, PRKAR1B, CARD11, GNA12, S...","[[0, 60], [101, 4286]]",0.647125
3,HALLMARK_ANGIOGENESIS,> Genes up-regulated during formation of blood...,"[[[PDGFA], [SDK1, MAD1L1, PRKAR1B, CARD11, GNA...","[[1, 59], [35, 4351]]",0.388039
4,HALLMARK_APICAL_JUNCTION,> Genes encoding components of apical junction...,"[[[], [SDK1, MAD1L1, PRKAR1B, CARD11, GNA12, S...","[[0, 60], [200, 4186]]",0.113194
5,HALLMARK_APICAL_SURFACE,> Genes encoding proteins over-represented on ...,"[[[], [SDK1, MAD1L1, PRKAR1B, CARD11, GNA12, S...","[[0, 60], [44, 4341]]",1.0
6,HALLMARK_APOPTOSIS,> Genes mediating programmed cell death (apopt...,"[[[], [SDK1, MAD1L1, PRKAR1B, CARD11, GNA12, S...","[[0, 60], [161, 4225]]",0.278864
7,HALLMARK_BILE_ACID_METABOLISM,> Genes involve in metabolism of bile acids an...,"[[[], [SDK1, MAD1L1, PRKAR1B, CARD11, GNA12, S...","[[0, 60], [112, 4273]]",0.405679
8,HALLMARK_CHOLESTEROL_HOMEOSTASIS,> Genes involved in cholesterol homeostasis.,"[[[], [SDK1, MAD1L1, PRKAR1B, CARD11, GNA12, S...","[[0, 60], [74, 4311]]",0.626499
9,HALLMARK_COAGULATION,> Genes encoding components of blood coagulati...,"[[[], [SDK1, MAD1L1, PRKAR1B, CARD11, GNA12, S...","[[0, 60], [138, 4249]]",0.26343


In [16]:
# Rank gene sets by p-value (column 4), smallest first; rows are sorted, which
# is the default axis.
df.sort_values(by=4)

Unnamed: 0,0,1,2,3,4
1,HALLMARK_ALLOGRAFT_REJECTION,> Genes up-regulated during transplant rejection.,"[[[], [SDK1, MAD1L1, PRKAR1B, CARD11, GNA12, S...","[[0, 60], [200, 4187]]",0.113188
28,HALLMARK_KRAS_SIGNALING_UP,> Genes up-regulated by KRAS activation.,"[[[], [SDK1, MAD1L1, PRKAR1B, CARD11, GNA12, S...","[[0, 60], [200, 4187]]",0.113188
26,HALLMARK_INTERFERON_GAMMA_RESPONSE,> Genes up-regulated in response to IFNG [Gene...,"[[[], [SDK1, MAD1L1, PRKAR1B, CARD11, GNA12, S...","[[0, 60], [200, 4187]]",0.113188
20,HALLMARK_HEME_METABOLISM,> Genes involved in metabolism of heme (a cofa...,"[[[], [SDK1, MAD1L1, PRKAR1B, CARD11, GNA12, S...","[[0, 60], [200, 4187]]",0.113188
14,HALLMARK_ESTROGEN_RESPONSE_EARLY,> Genes defining early response to estrogen.,"[[[], [SDK1, MAD1L1, PRKAR1B, CARD11, GNA12, S...","[[0, 60], [200, 4187]]",0.113188
13,HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION,> Genes defining epithelial-mesenchymal transi...,"[[[], [SDK1, MAD1L1, PRKAR1B, CARD11, GNA12, S...","[[0, 60], [200, 4187]]",0.113188
0,HALLMARK_ADIPOGENESIS,> Genes up-regulated during adipocyte differen...,"[[[], [SDK1, MAD1L1, PRKAR1B, CARD11, GNA12, S...","[[0, 60], [200, 4186]]",0.113194
35,HALLMARK_OXIDATIVE_PHOSPHORYLATION,> Genes encoding proteins involved in oxidativ...,"[[[], [SDK1, MAD1L1, PRKAR1B, CARD11, GNA12, S...","[[0, 60], [200, 4186]]",0.113194
30,HALLMARK_MTORC1_SIGNALING,> Genes up-regulated through activation of mTO...,"[[[], [SDK1, MAD1L1, PRKAR1B, CARD11, GNA12, S...","[[0, 60], [200, 4186]]",0.113194
22,HALLMARK_IL2_STAT5_SIGNALING,> Genes up-regulated by STAT5 in response to I...,"[[[], [SDK1, MAD1L1, PRKAR1B, CARD11, GNA12, S...","[[0, 60], [200, 4186]]",0.113194
