In [1]:
import pandas as pd
import numpy as np

import os.path
import urllib.request 

In [2]:
# Define the data directory and file paths.
# Create the data directory if it doesn't exist yet.
DATA_DIR = "data/"
os.makedirs(DATA_DIR, exist_ok=True)

ATAC_SEQ_PATH = os.path.join(DATA_DIR, "ImmGenATAC18_AllOCRsInfo.csv")
RNA_SEQ_PATH = os.path.join(DATA_DIR, "mmc2.csv")
MMC1_PATH = os.path.join(DATA_DIR, "mmc1.xlsx")
REFFLAT_PATH = os.path.join(DATA_DIR, "refFlat.txt.gz")


def _download_if_missing(url, dest_path):
    """Download ``url`` to ``dest_path`` unless that file already exists.

    The payload is first written to ``dest_path + ".part"`` and renamed to
    ``dest_path`` only after ``urlretrieve`` returned without raising. An
    interrupted or short transfer therefore never leaves a truncated file at
    ``dest_path`` that the ``os.path.isfile`` check would accept as complete
    on the next run.

    Parameters
    ----------
    url : str
        Remote location of the file.
    dest_path : str
        Local path the file should end up at.

    Raises
    ------
    Whatever ``urllib.request.urlretrieve`` raises on failure; the partial
    file is removed before the exception propagates.
    """
    if os.path.isfile(dest_path):
        return

    partial_path = dest_path + ".part"
    try:
        urllib.request.urlretrieve(url, partial_path)
        # Same-directory rename, so the final file appears all at once.
        os.replace(partial_path, dest_path)
    finally:
        # Only still present when the download or the rename failed.
        if os.path.exists(partial_path):
            os.remove(partial_path)


# Fetch every input file that is not on disk yet, in the order:
# ATAC-seq, RNA-seq, mmc1 spreadsheet, refFlat annotation.
# NOTE(review): the refFlat URL uses plain HTTP — consider switching to HTTPS
# once confirmed that the host serves it.
for _url, _dest in (
    ("https://sharehost.hms.harvard.edu/immgen/ImmGenATAC18_AllOCRsInfo.csv", ATAC_SEQ_PATH),
    ("https://www.cell.com/cms/10.1016/j.cell.2018.12.036/attachment/4392da81-c56e-471a-b1df-0e72f03ecd77/mmc2.csv", RNA_SEQ_PATH),
    ("https://www.cell.com/cms/10.1016/j.cell.2018.12.036/attachment/e5df7329-d77d-40b3-a03a-34bdbe4b402c/mmc1.xlsx", MMC1_PATH),
    ("http://hgdownload.cse.ucsc.edu/goldenPath/mm10/database/refFlat.txt.gz", REFFLAT_PATH),
):
    _download_if_missing(_url, _dest)

# Load the ATAC-seq data (first CSV column becomes the index).
atac = pd.read_csv(ATAC_SEQ_PATH, index_col=0)
print(atac.head())

# Load the RNA-seq data (first CSV column becomes the index).
rna = pd.read_csv(RNA_SEQ_PATH, index_col=0)
print(rna.head())

# Load the annotation data: gzip-compressed, tab-separated, no header row,
# so the column names are assigned explicitly afterwards.
refFlat = pd.read_csv(REFFLAT_PATH, sep="\t", header=None, comment="#", compression="gzip")
refFlat.columns = ["geneName", "name", "chrom", "strand", "txStart", "txEnd",
    "cdsStart", "cdsEnd", "exonCount", "exonStarts", "exonEnds"]
print(refFlat.head())


                      chrom   Summit  mm10.60way.phastCons_scores  \
ImmGenATAC1219.peakID                                               
ImmGenATAC1219.peak_1  chr1  3020786                         0.00   
ImmGenATAC1219.peak_2  chr1  3087226                         0.00   
ImmGenATAC1219.peak_3  chr1  3120109                         0.07   
ImmGenATAC1219.peak_4  chr1  3121485                         0.15   
ImmGenATAC1219.peak_5  chr1  3372787                         0.03   

                       _-log10_bestPvalue  Included.in.systematic.analysis  \
ImmGenATAC1219.peakID                                                        
ImmGenATAC1219.peak_1                0.56                              NaN   
ImmGenATAC1219.peak_2                0.50                              NaN   
ImmGenATAC1219.peak_3               10.80                              1.0   
ImmGenATAC1219.peak_4                3.02                              1.0   
ImmGenATAC1219.peak_5                1.31       

In [9]:
# Print the mean and then the variance of the "Summit" column.
summit = atac["Summit"]
print(summit.mean())
print(summit.var())

75968662.46984266
1980732023775990.8
