In [1]:
import os
import gzip
import shutil
import fp_utils
import requests
import pandas as pd
from pathlib import Path
from bs4 import BeautifulSoup

In [2]:
# Project-level setup. Judging by the INFO lines this cell prints, it presumably also
# initialises pandarallel (needed for `parallel_apply` further down) — TODO confirm in fp_utils.
fp_utils.settings.init_fp_utils()

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
# Page that is scraped for archive links. The trailing slash matters:
# download_zips() builds each file URL as `url + file_name`.
url = "https://ftp.ncbi.nlm.nih.gov/pubchem/Compound/CURRENT-Full/SDF/"

In [4]:
# Local directory that receives the downloaded archives and their decompressed files.
dir_to_download = Path('../data/pubchem/sdfs')
# Create it (including missing parents); a no-op when it already exists.
dir_to_download.mkdir(parents=True, exist_ok=True)

In [5]:
# Text file listing finished archives, one name per line (tag_processed() appends to it).
processed_file = Path("../data/pubchem/processed.txt")
# Opening in append mode creates the file when it is missing and leaves existing
# content untouched, so the later read cannot fail with FileNotFoundError.
with processed_file.open('a'):
    pass

In [6]:
# Snapshot of the archive names already recorded as processed when this cell ran;
# names are whitespace-separated in the file.
processed_before = set(processed_file.read_text().split())

In [7]:
# Show which archives earlier runs already finished; download_zips() skips these.
processed_before

{'Compound_000000001_000500000.sdf.gz',
 'Compound_000500001_001000000.sdf.gz',
 'Compound_001000001_001500000.sdf.gz',
 'Compound_001500001_002000000.sdf.gz',
 'Compound_041500001_042000000.sdf.gz',
 'Compound_042000001_042500000.sdf.gz',
 'Compound_042500001_043000000.sdf.gz',
 'Compound_043000001_043500000.sdf.gz',
 'Compound_043500001_044000000.sdf.gz',
 'Compound_083000001_083500000.sdf.gz',
 'Compound_083500001_084000000.sdf.gz',
 'Compound_084000001_084500000.sdf.gz',
 'Compound_084500001_085000000.sdf.gz',
 'Compound_085000001_085500000.sdf.gz',
 'Compound_085500001_086000000.sdf.gz',
 'Compound_086000001_086500000.sdf.gz',
 'Compound_086500001_087000000.sdf.gz',
 'Compound_124000001_124500000.sdf.gz',
 'Compound_124500001_125000000.sdf.gz',
 'Compound_125000001_125500000.sdf.gz',
 'Compound_125500001_126000000.sdf.gz'}

In [8]:
def get_urls(url, ext, timeout=30):
    """Return the links on the page at ``url`` whose target ends with ``ext``.

    Parameters
    ----------
    url : str
        Address of an HTML page to scrape for ``<a>`` tags.
    ext : str or tuple of str
        Suffix (or suffixes) handed to ``str.endswith``.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get`` so a
        stalled connection cannot hang the notebook forever.

    Returns
    -------
    pandas.Series
        The matching ``href`` values, in document order.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    page = requests.get(url, timeout=timeout)
    # Fail loudly instead of parsing an error page into an empty result.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, features='lxml')
    # href=True skips anchors that have no href attribute; link.get("href")
    # returns None for those and None.endswith(...) raises AttributeError.
    hrefs = [link["href"] for link in soup.find_all("a", href=True)]
    return pd.Series([href for href in hrefs if href.endswith(ext)])

In [9]:
# Collect every link on the listing page that ends in ".sdf.gz".
sdfs = get_urls(url, ".sdf.gz")

In [10]:
# Archive names in ascending order; sorted() already returns a list.
file_names = pd.Series(sorted(sdfs))

In [11]:
def tag_processed(file_name):
    """Record ``file_name`` as finished.

    Appends the name on its own line to ``processed_file`` and prints a
    confirmation message.
    """
    entry = f"{file_name!s}\n"
    with processed_file.open('a') as log:
        log.write(entry)
    # NOTE(review): 'prcessed' looks like a typo of 'processed'; the message is left
    # unchanged here in case anything matches on the existing output.
    print('prcessed:', file_name)

In [12]:
def download_zips(file_name):
    """Download one ``.gz`` archive, decompress it and mark it as processed.

    Names contained in ``processed_before`` are skipped. Otherwise the archive
    is streamed from ``url + file_name`` into ``dir_to_download``, decompressed
    next to itself (same name minus the trailing ``.gz``), deleted, and finally
    recorded through ``tag_processed``.

    Parameters
    ----------
    file_name : str
        Archive name relative to ``url``; expected to end in ``.gz``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    if file_name in processed_before:
        print("skipped", file_name)
        return
    print("start", file_name)
    archive_path = dir_to_download / file_name
    # Drop the 3-character ".gz" suffix to get the decompressed file's name.
    sdf_path = dir_to_download / file_name[:-3]
    try:
        with requests.get(url + file_name, stream=True) as r:
            # Without this check an HTML error page would be saved as the archive.
            r.raise_for_status()
            with open(archive_path, 'wb') as f:
                shutil.copyfileobj(r.raw, f)
        try:
            with gzip.open(archive_path, 'rb') as f_in:
                with open(sdf_path, 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)
        except BaseException:
            # A half-written output file is never tagged as processed, so do not
            # leave it on disk; the original error is re-raised unchanged.
            if sdf_path.exists():
                sdf_path.unlink()
            raise
    finally:
        # The archive is deleted on success (as before) and now also on failure,
        # so truncated downloads do not accumulate in the directory.
        if archive_path.exists():
            archive_path.unlink()
    tag_processed(file_name)

In [13]:
# Run download_zips() over every archive name in parallel (parallel_apply presumably comes
# from pandarallel, per the INFO lines printed during setup).
# NOTE(review): the recorded output of this cell ends in `EOFError: Ran out of input`,
# i.e. this run did not finish; remove_unprocessed() cleans up files left by such a run.
file_names.parallel_apply(download_zips)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=83), Label(value='0 / 83'))), HBox…

skippedskippedskippedskipped   Compound_041500001_042000000.sdf.gz 
Compound_000000001_000500000.sdf.gz
Compound_083000001_083500000.sdf.gzCompound_124000001_124500000.sdf.gzskipped

skippedskipped  skipped Compound_042000001_042500000.sdf.gzCompound_000500001_001000000.sdf.gz 
Compound_124500001_125000000.sdf.gzCompound_083500001_084000000.sdf.gz

skippedskipped  Compound_042500001_043000000.sdf.gzCompound_084000001_084500000.sdf.gz

skipped
skippedskippedskipped    Compound_001000001_001500000.sdf.gzCompound_084500001_085000000.sdf.gzCompound_043000001_043500000.sdf.gz
Compound_125000001_125500000.sdf.gz
skippedskipped
  Compound_085000001_085500000.sdf.gzCompound_043500001_044000000.sdf.gzskipped
 startCompound_125500001_126000000.sdf.gz
 Compound_044000001_044500000.sdf.gzskipped
 
skipped
Compound_001500001_002000000.sdf.gz 
startstart Compound_085500001_086000000.sdf.gz
Compound_126000001_126500000.sdf.gz skipped
 Compound_086000001_086500000.sdf.gz
skipped Compound_086500001_087

EOFError: Ran out of input

In [14]:
def check_processed(file_name, processed):
    """Tell whether ``file_name`` occurs inside any entry of ``processed``.

    The comparison is a substring test, so a decompressed name such as
    ``X.sdf`` matches the recorded archive name ``X.sdf.gz``.

    Parameters
    ----------
    file_name : str
        Name to look for.
    processed : iterable of str
        Recorded names to search in.

    Returns
    -------
    bool
        True if at least one entry contains ``file_name``, else False.
    """
    return any(file_name in done_name for done_name in processed)

In [15]:
def remove_unprocessed():
    """Delete every entry of ``dir_to_download`` that is not recorded as processed.

    The recorded names are re-read from ``processed_file`` on each call. An
    entry is kept when ``check_processed`` finds it inside one of those names;
    every other entry is removed and its path is printed.
    """
    recorded = processed_file.read_text().split()
    for entry in os.listdir(dir_to_download):
        if not check_processed(entry, recorded):
            stale_path = dir_to_download / entry
            os.remove(stale_path)
            print('removed:', stale_path)


In [16]:
# Clean up files left in the download directory that are not recorded as processed.
remove_unprocessed()

removed: ../data/pubchem/sdfs/Compound_019000001_019500000.sdf.gz
removed: ../data/pubchem/sdfs/Compound_105000001_105500000.sdf
removed: ../data/pubchem/sdfs/Compound_063000001_063500000.sdf.gz
removed: ../data/pubchem/sdfs/Compound_019000001_019500000.sdf
