# MitoCheck Gene Movie Downloader
### Downloads all movies for each gene listed in Ens82-gene_phenotypes.txt (from MitoCheck database)

#### Import libraries

In [6]:
import pandas as pd
import wget
import os
import datetime
import logging
import concurrent.futures
from itertools import repeat
import time

#### Load gene IDs for use in API calls

In [7]:
# The MitoCheck phenotype table is tab-separated; the gene IDs are taken from
# its first column.
gene_IDs_dataframe = pd.read_csv("Ens82-gene_phenotypes.txt", sep="\t")
gene_IDs = gene_IDs_dataframe.iloc[:, 0]
# Display the gene ID Series (notebook cell output)
gene_IDs

0        ENSG00000167711
1        ENSG00000198478
2        ENSG00000187118
3        ENSG00000086758
4        ENSG00000101276
              ...       
19955    ENSG00000215454
19956    ENSG00000276438
19957    ENSG00000278535
19958    ENSG00000136425
19959    ENSG00000087884
Name: Unnamed: 0, Length: 19960, dtype: object

#### Get movie IDs for each gene and URL for each movie ID

In [8]:
def get_movie_IDs(gene_ID):
    """Return the movie IDs MitoCheck lists for one gene.

    Requests the gene's image records from the MitoCheck ``get_data`` endpoint
    as JSON, keeps only the records whose ``source`` is
    "Mitocheck primary screen", and returns the first column of what is left
    (presumably the movie/image ID column -- confirm against the API output).

    Args:
        gene_ID: gene identifier string inserted into the query URL.

    Returns:
        pandas Series holding the first column of the filtered records. The
        original row labels are kept, and the Series may be empty if no
        record came from the primary screen.

    Raises:
        ValueError: if the endpoint returns no records at all for the gene.
    """
    query_url = (
        "https://www.mitocheck.org/cgi-bin/mtc?action=get_data;gene="
        + gene_ID
        + ";data=images;format=json"
    )
    records = pd.read_json(query_url)
    if records.empty:
        raise ValueError("No movie data for gene " + gene_ID)
    # Keep only the records that come from the Mitocheck primary screen source.
    from_primary_screen = records.source == "Mitocheck primary screen"
    return records[from_primary_screen].iloc[:, 0]

def get_movie_url(movie_ID):
    """Return the full download URL of one movie.

    Looks the movie up through the MitoCheck ``get_data`` endpoint and appends
    the value in the first row / first column of the response to the MitoCheck
    data root.

    Args:
        movie_ID: movie identifier string inserted into the query URL.

    Returns:
        The movie's URL as a string.
    """
    lookup_url = (
        "https://www.mitocheck.org/cgi-bin/mtc?action=get_data;images="
        + movie_ID
        + ";data=images;format=json"
    )
    lookup_result = pd.read_json(lookup_url)
    # The first cell holds a server-relative path, something like
    # /mitocheck_screen/movies/mp4/LT0020_40--ex2005_06_01--sp2005_04_18-tt17--c3/236--20--08--(11,13)--9332--SERPINF2-gfp.mp4
    relative_path = lookup_result.iloc[0, 0]
    return "https://www.mitocheck.org/data" + relative_path


# movie_IDs = get_movie_IDs(gene_IDs[0])
# url = get_movie_url(movie_IDs[0])

#### Download all movies for a particular gene ID

In [9]:
def download_movie(movie_ID, gene_ID, gene_movie_downloads):
    """Download a single movie into a gene's download directory.

    Args:
        movie_ID: movie identifier; also used as the output file name.
        gene_ID: not used by this function; kept so the signature matches the
            arguments supplied by the caller.
        gene_movie_downloads: directory path the movie is saved into.
    """
    source_url = get_movie_url(movie_ID)
    destination = "/".join((gene_movie_downloads, movie_ID))
    wget.download(source_url, destination)

def download_gene_movies(gene_ID):
    """Download every primary-screen movie of one gene into its own directory.

    The movies are saved under ``<cwd>/movie_downloads/<gene_ID>/`` using up to
    four worker threads. A gene without movie data is logged and skipped.
    A failure of an individual movie download is logged with its traceback and
    does not stop the remaining downloads of the gene.

    Args:
        gene_ID: gene identifier string; also used as the directory name.
    """
    try:
        movie_IDs = get_movie_IDs(gene_ID)
    except ValueError:
        logging.info("No movie data for " + gene_ID)
        return

    # Make the directory for all movie downloads of this particular gene.
    # makedirs(exist_ok=True) also creates a missing "movie_downloads" parent
    # and tolerates a directory left over from an earlier (interrupted) run,
    # where os.mkdir would raise and abort the whole batch.
    gene_movie_downloads = os.getcwd() + "/movie_downloads/" + gene_ID
    os.makedirs(gene_movie_downloads, exist_ok=True)

    # Threaded download of all movies for the particular gene. Each future's
    # result is checked explicitly: an executor.map() iterator that is never
    # consumed would silently discard any exception raised in a worker.
    failed_count = 0
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        pending = {
            executor.submit(download_movie, movie_ID, gene_ID, gene_movie_downloads): movie_ID
            for movie_ID in movie_IDs
        }
        for future in concurrent.futures.as_completed(pending):
            try:
                future.result()
            except Exception:
                # Best-effort batch job: record the failure and carry on.
                failed_count += 1
                logging.exception(
                    "Failed to download movie %s for gene %s", pending[future], gene_ID
                )

    timestamp = str(datetime.datetime.now())
    if failed_count:
        logging.warning(
            timestamp + " Downloaded " + gene_ID
            + " with " + str(failed_count) + " failed movie download(s)"
        )
    else:
        logging.info(timestamp + " Downloaded " + gene_ID)

#### Notes from downloading first 100 genes:
Run time: 313s
Genes with Data: 94
Total Movies: 907
Average Number of Movies/Gene: 9.6
Download Size: 961 MB

Extrapolating linearly (×200) to all ~20,000 genes:
Run time: ~17 hours
Genes with Data: ~18800
Total Movies: ~181400
Average Number of Movies/Gene: 9.6
Download Size: ~192.2 GB

#### Run the code and log gene download completion time


In [10]:
# Configure the logger.
logger = logging.getLogger()
# The root logger defaults to WARNING, which would drop every INFO record
# (the start message and the per-gene completion messages) before it reaches
# the log file.
logger.setLevel(logging.INFO)
# Only attach the file handler once, so re-running this cell does not write
# every record to the log file multiple times.
log_file = os.path.abspath('phenotype_movie_downloader.log')
if not any(
    isinstance(existing, logging.FileHandler) and existing.baseFilename == log_file
    for existing in logger.handlers
):
    handler = logging.FileHandler(log_file)
    logger.addHandler(handler)
logger.info(str(datetime.datetime.now()) + " Starting Download")

# Make sure the parent directory for the per-gene download folders exists.
os.makedirs(os.getcwd() + "/movie_downloads", exist_ok=True)

print("Starting download")

start = time.time()

# Download the genes one after another; the movies of each gene are fetched
# in parallel inside download_gene_movies.
for gene_ID in gene_IDs:
    download_gene_movies(gene_ID)
    print(gene_ID + ", Total time: " + str(time.time()-start))

Starting download


KeyboardInterrupt: 