# Setup
Run this file to generate the folder structure of the project and download the necessary data files. If a download fails, you can download the affected files manually from the links below and place them in the *data* folder, keeping their original file names.


- [Processed ATAC-seq data and called peaks](https://sharehost.hms.harvard.edu/immgen/ImmGenATAC18_AllOCRsInfo.csv)
- [Processed RNA-seq data](https://www.cell.com/cms/10.1016/j.cell.2018.12.036/attachment/4392da81-c56e-471a-b1df-0e72f03ecd77/mmc2.csv)
- [Summary of Immune Cell Populations Profiled by ATAC-Seq and Their QC Matrices](https://www.cell.com/cms/10.1016/j.cell.2018.12.036/attachment/e5df7329-d77d-40b3-a03a-34bdbe4b402c/mmc1.xlsx)
- [Transcript, Coding, and Exon Start and End Positions](http://hgdownload.cse.ucsc.edu/goldenPath/mm10/database/refFlat.txt.gz)
- [Chromvar TF motif associations for all OCRs](https://sharehost.hms.harvard.edu/immgen/ImmGenATAC18_AllTFmotifsInOCRs.txt)

In [2]:
#Define the data directory and file paths
DATA_DIR = "data/"

import os.path
import urllib.request

#Create a directory for data if it doesn't exist
os.makedirs(DATA_DIR, exist_ok=True)

#Define file paths for the datasets
ATAC_SEQ_PATH = os.path.join(DATA_DIR, "ImmGenATAC18_AllOCRsInfo.csv")
RNA_SEQ_PATH = os.path.join(DATA_DIR, "mmc2.csv")
MMC1_PATH = os.path.join(DATA_DIR, "mmc1.xlsx")
REFFLAT_PATH = os.path.join(DATA_DIR, "refFlat.txt.gz")
TFMOTIF_PATH = os.path.join(DATA_DIR, "ImmGenATAC18_AllTFmotifsInOCRs.txt")

#Map every local file path to the URL it is downloaded from
_DOWNLOADS = {
    #ATAC-seq
    ATAC_SEQ_PATH: "https://sharehost.hms.harvard.edu/immgen/ImmGenATAC18_AllOCRsInfo.csv",
    #RNA-seq
    RNA_SEQ_PATH: "https://www.cell.com/cms/10.1016/j.cell.2018.12.036/attachment/4392da81-c56e-471a-b1df-0e72f03ecd77/mmc2.csv",
    #QC metrics
    MMC1_PATH: "https://www.cell.com/cms/10.1016/j.cell.2018.12.036/attachment/e5df7329-d77d-40b3-a03a-34bdbe4b402c/mmc1.xlsx",
    #Gene references
    REFFLAT_PATH: "http://hgdownload.cse.ucsc.edu/goldenPath/mm10/database/refFlat.txt.gz",
    #TF motifs
    TFMOTIF_PATH: "https://sharehost.hms.harvard.edu/immgen/ImmGenATAC18_AllTFmotifsInOCRs.txt",
}


def _download(url, path):
    """Download *url* to *path* without ever leaving a partial file at *path*.

    The data is first written to ``path + ".part"`` and only renamed to
    *path* once the transfer has finished. An interrupted download therefore
    cannot be mistaken for a complete file by the ``os.path.isfile`` check
    on the next run.

    Raises:
        OSError: if the transfer fails (``urllib.error.URLError`` and its
            subclasses are ``OSError`` subclasses).
    """
    partial_path = path + ".part"
    try:
        urllib.request.urlretrieve(url, partial_path)
    except BaseException:
        #Also clean up on KeyboardInterrupt so no stale ".part" file remains
        try:
            os.remove(partial_path)
        except FileNotFoundError:
            pass
        raise
    os.replace(partial_path, path)


def _download_missing(downloads):
    """Download every file in *downloads* that does not exist yet.

    Args:
        downloads: mapping of local file path -> source URL.

    Returns:
        List of ``(path, url)`` pairs whose download failed.
    """
    failed = []
    for path, url in downloads.items():
        #Check if the data file is already downloaded and if not, download it
        if os.path.isfile(path):
            continue
        try:
            _download(url, path)
        except OSError as err:
            #Keep going so one unreachable server does not block the other files
            failed.append((path, url))
            print(f"Download failed: {url} ({err})")
            print(f"  Download it manually and save it as {path}")
    return failed


_download_missing(_DOWNLOADS)