# Extract Gene Expression Files

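This notebook reads the per-tissue differential gene expression (DGE) result files and the Ensembl-ID-to-symbol mapping files from `../data`, and writes a tab-separated summary (tissue, gene symbol, Ensembl ID) to `../assets/dge_by_tissue_summary.tsv`.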

In [2]:
import os
from os import listdir
from os.path import isfile, join


In [21]:
# Directory holding the raw per-tissue CSV files, relative to this notebook.
base_directory = '../data'

if os.path.exists(base_directory):
    # Keep regular files only; anything else in the directory is skipped.
    myfiles = [
        entry
        for entry in listdir(base_directory)
        if isfile(join(base_directory, entry))
    ]
else:
    raise ValueError("Could not find base directory")

In [22]:
## Get the refined DGE files for each of the 39 tissues

In [23]:
# Pick out the refined DGE result files by the marker in their file name.
dgefiles = list(filter(lambda name: 'DGE_refined' in name, myfiles))

In [24]:
# Sanity check: exactly one refined DGE file is expected per tissue (39 tissues).
if len(dgefiles) != 39:
    # Format the count into the message; passing it as a second positional
    # argument makes the exception display as a tuple instead of a sentence.
    raise ValueError("We were expecting 39 tissues, but we got %d" % len(dgefiles))
print("We extracted a total of %d DGE files, e.g. %s, %s" % (len(dgefiles), dgefiles[0], dgefiles[1]))

We extracted a total of 39 DGE files, e.g. pituitary_DGE_refined.csv, brain_cortex_DGE_refined.csv


## Get the mapping files for each of the 39 tissues

In [26]:
# Pick out the Ensembl-ID-to-symbol mapping files by the marker in their file name.
ensgfiles = [f for f in myfiles if 'ensg' in f]
# Sanity check: exactly one mapping file is expected per tissue (39 tissues).
if len(ensgfiles) != 39:
    # Format the count into the message; passing it as a second positional
    # argument makes the exception display as a tuple instead of a sentence.
    raise ValueError("We were expecting 39 tissues, but we got %d" % len(ensgfiles))
# These are the mapping files, not the DGE result files, so label them as such.
print("We extracted a total of %d mapping files, e.g. %s, %s" % (len(ensgfiles), ensgfiles[0], ensgfiles[1]))

We extracted a total of 39 DGE files, e.g. brain_frontal_cortex_ba_9_DGE_ensg_map.csv, liver_DGE_ensg_map.csv


## Make a dictionary of ensg - symbol and vice versa

In [27]:
from collections import defaultdict

In [30]:
# Build lookup tables in both directions between Ensembl gene IDs and gene
# symbols, merging the two-column mapping files of all tissues. When the same
# key occurs more than once, the entry read last wins.
# NOTE: these are defaultdict(str), so indexing a missing key returns '' and
# inserts it; use `key in ens2sym` to test for membership.
ens2sym = defaultdict(str)
sym2ens = defaultdict(str)
for f in ensgfiles:
    mypath = os.path.join(base_directory, f)
    with open(mypath) as fh:
        for lineno, line in enumerate(fh, start=1):
            fields = line.rstrip('\n').split(',')
            if len(fields) != 2:
                # Name the file and line so a malformed record can be located.
                raise ValueError("Bad line %d in %s: %r" % (lineno, mypath, line))
            ensg = fields[0]
            sym = fields[1]
            ens2sym[ensg] = sym
            sym2ens[sym] = ensg
print("We got %d entries for ens2sym and %d entries for sym2ens" % (len(ens2sym), len(sym2ens)))

We got 8202 entries for ens2sym and 8202 entries for sym2ens


## Make a summary of all DGE events

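The header of each DGE file is `logFC,AveExpr,t,P.Value,adj.P.Val,B`. The header row is skipped, and the Ensembl gene ID (with a version suffix such as `.12`) is read from the first column of each data row.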

In [39]:
import csv

# Write one tab-separated line (tissue, gene symbol, Ensembl ID) for every data
# row of every refined DGE file, and collect the set of gene symbols seen.
outfile = '../assets/dge_by_tissue_summary.tsv'
c = 0  # number of summary lines written
dge_genes = set()
# The context manager closes the output file even if a lookup below raises.
with open(outfile, 'wt') as gh:
    for f in dgefiles:
        # Keep the full tissue name (e.g. 'brain_cortex'). Splitting on the
        # first underscore would collapse every multi-word tissue to its first
        # word and merge distinct tissues in the summary.
        tissue = f.partition('_DGE_refined')[0]
        mypath = os.path.join(base_directory, f)
        # newline='' is what the csv module recommends for files passed to csv.reader.
        with open(mypath, newline='') as fh:
            csvreader = csv.reader(fh, delimiter=',')
            header = next(csvreader, None)  # discard header (None if the file is empty)
            for row in csvreader:
                if not row:
                    continue  # csv.reader yields [] for blank lines
                ensg = row[0].split('.')[0]  # remove version number
                if ensg not in ens2sym:
                    raise ValueError("Could not find symbol for %s" % ensg)
                sym = ens2sym[ensg]
                gh.write("%s\t%s\t%s\n" % (tissue, sym, ensg))
                # The original added an undefined name `s` here, which fails on
                # a fresh kernel; the gene symbol is what belongs in this set.
                dge_genes.add(sym)
                c += 1
print("Wrote %d DGE by tissue events" % c)

    

Wrote 12716 DGE by tissue events
