In [1]:
import os
import gzip
import shutil
import fp_utils
import requests
import pandas as pd
from pathlib import Path
from bs4 import BeautifulSoup

In [2]:
fp_utils.settings.init_fp_utils()

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
url = "https://ftp.ncbi.nlm.nih.gov/pubchem/Compound/CURRENT-Full/SDF/"

In [4]:
dir_to_download = Path('../data/pubchem/sdfs')
dir_to_download.mkdir(parents=True, exist_ok=True)

In [5]:
processed_file = Path("../data/pubchem/processed.txt")
with processed_file.open('a'):
    pass

In [6]:
with open(processed_file, 'r') as f:
    processed_before = set(f.read().split())

In [7]:
processed_before

{'Compound_000000001_000500000.sdf.gz',
 'Compound_041500001_042000000.sdf.gz',
 'Compound_042000001_042500000.sdf.gz',
 'Compound_083000001_083500000.sdf.gz',
 'Compound_083500001_084000000.sdf.gz',
 'Compound_124000001_124500000.sdf.gz'}

In [8]:
def get_urls(url, ext):
    page = requests.get(url)
    soup = BeautifulSoup(page.content, features='lxml')
    all_links = [link.get("href") for link in soup("a")]
    return pd.Series(filter(lambda x: x.endswith(ext), all_links))

In [9]:
sdfs = get_urls(url, ".sdf.gz")

In [10]:
file_names = pd.Series(list(sorted(sdfs)))

In [11]:
def tag_processed(file_name):
    with open(processed_file, 'a') as f:
        f.write(str(file_name) + '\n')
    print('prcessed:', file_name)

In [12]:
def download_zips(file_name):
    if file_name in processed_before:
        print("skipped", file_name)
        return
    print("start", file_name)
    with requests.get(url + file_name, stream=True) as r:
        with open(dir_to_download / file_name, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
    with gzip.open(dir_to_download / file_name, 'rb') as f_in:
        with open(dir_to_download / file_name[:-3], 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    os.remove(dir_to_download / file_name)
    tag_processed(file_name)

In [13]:
file_names.parallel_apply(download_zips)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=83), Label(value='0 / 83'))), HBox…

startstartstart start  Compound_124000001_124500000.sdf.gzCompound_083000001_083500000.sdf.gzCompound_000000001_000500000.sdf.gz 
Compound_041500001_042000000.sdf.gz


prcessed: Compound_041500001_042000000.sdf.gz
start Compound_042000001_042500000.sdf.gz
prcessed: Compound_042000001_042500000.sdf.gz
start Compound_042500001_043000000.sdf.gz
prcessed: Compound_083000001_083500000.sdf.gz
start Compound_083500001_084000000.sdf.gz
prcessed: Compound_083500001_084000000.sdf.gz
start Compound_084000001_084500000.sdf.gz
prcessed: Compound_124000001_124500000.sdf.gz
start 
Compound_124500001_125000000.sdf.gzprcessed: Compound_000000001_000500000.sdf.gz
start Compound_000500001_001000000.sdf.gz


Process ForkPoolWorker-3:
Process ForkPoolWorker-5:
Process ForkPoolWorker-4:
Process ForkPoolWorker-2:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/Vsevolod.Vaskin/anaconda3/envs/rdkit-env/lib/python3.10/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/Vsevolod.Vaskin/anaconda3/envs/rdkit-env/lib/python3.10/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/Vsevolod.Vaskin/anaconda3/envs/rdkit-env/lib/python3.10/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/Vsevolod.Vaskin/anaconda3/envs/rdkit-env/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/Vsevolod.Vaskin/anaconda3/envs/rdkit-env/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/Vsevolod.Vaskin/anaconda3/envs/rdkit-en

  File "/home/Vsevolod.Vaskin/anaconda3/envs/rdkit-env/lib/python3.10/site-packages/urllib3/response.py", line 566, in read
    data = self._fp_read(amt) if not fp_closed else b""
  File "/home/Vsevolod.Vaskin/anaconda3/envs/rdkit-env/lib/python3.10/shutil.py", line 195, in copyfileobj
    buf = fsrc_read(length)
  File "/home/Vsevolod.Vaskin/anaconda3/envs/rdkit-env/lib/python3.10/site-packages/urllib3/response.py", line 566, in read
    data = self._fp_read(amt) if not fp_closed else b""
  File "/home/Vsevolod.Vaskin/anaconda3/envs/rdkit-env/lib/python3.10/site-packages/urllib3/response.py", line 532, in _fp_read
    return self._fp.read(amt) if amt is not None else self._fp.read()
  File "/home/Vsevolod.Vaskin/anaconda3/envs/rdkit-env/lib/python3.10/http/client.py", line 465, in read
    s = self.fp.read(amt)
  File "/home/Vsevolod.Vaskin/anaconda3/envs/rdkit-env/lib/python3.10/site-packages/urllib3/response.py", line 532, in _fp_read
    return self._fp.read(amt) if amt is not None

KeyboardInterrupt: 

In [14]:
def check_processed(file_name, processed):
    for processed_name in processed:
        if file_name in processed_name:
            return True
    return False

In [15]:
def remove_unprocessed():
    with processed_file.open('r') as f:
        processed = f.read().split()
    for file in os.listdir(dir_to_download):
        if check_processed(file, processed):
            continue
        os.remove(dir_to_download / file)
        print('removed:', dir_to_download / file)


In [16]:
remove_unprocessed()

removed: ../data/pubchem/sdfs/Compound_124500001_125000000.sdf.gz
removed: ../data/pubchem/sdfs/Compound_084000001_084500000.sdf.gz
removed: ../data/pubchem/sdfs/Compound_042500001_043000000.sdf.gz
removed: ../data/pubchem/sdfs/Compound_000500001_001000000.sdf.gz
