In [4]:
import os
import gzip
import shutil
import requests
import pandas as pd
from pathlib import Path
from bs4 import BeautifulSoup

In [5]:
# Base address of the remote directory listing (PubChem Compound, CURRENT-Full, SDF);
# archive file names are appended directly to this string to build download links.
url = "https://ftp.ncbi.nlm.nih.gov/pubchem/Compound/CURRENT-Full/SDF/"

In [6]:
# Folder that receives the downloaded archives; create it together with any
# missing parent folders, and do nothing if it already exists.
dir_to_download = Path('../data/pubchem/sdfs')
os.makedirs(dir_to_download, exist_ok=True)

In [7]:
# Log of archive names that have been fully handled, one name per line.
processed_file = Path("../data/pubchem/processed.txt")
# Append mode creates the log when it is missing and never truncates an existing one.
open(processed_file, 'a').close()

In [8]:
# Snapshot of the names already present in the log when this cell runs;
# the file content is split on whitespace and de-duplicated into a set.
processed_before = set(processed_file.read_text().split())

In [9]:
# Show which archive names the log already contained.
processed_before

{'Compound_000000001_000500000.sdf.gz',
 'Compound_000500001_001000000.sdf.gz',
 'Compound_001000001_001500000.sdf.gz',
 'Compound_001500001_002000000.sdf.gz',
 'Compound_002000001_002500000.sdf.gz',
 'Compound_002500001_003000000.sdf.gz',
 'Compound_003000001_003500000.sdf.gz',
 'Compound_003500001_004000000.sdf.gz',
 'Compound_004000001_004500000.sdf.gz',
 'Compound_004500001_005000000.sdf.gz',
 'Compound_005000001_005500000.sdf.gz',
 'Compound_005500001_006000000.sdf.gz',
 'Compound_006000001_006500000.sdf.gz',
 'Compound_006500001_007000000.sdf.gz',
 'Compound_007000001_007500000.sdf.gz',
 'Compound_007500001_008000000.sdf.gz',
 'Compound_008000001_008500000.sdf.gz',
 'Compound_008500001_009000000.sdf.gz',
 'Compound_009000001_009500000.sdf.gz',
 'Compound_009500001_010000000.sdf.gz',
 'Compound_010000001_010500000.sdf.gz',
 'Compound_010500001_011000000.sdf.gz',
 'Compound_011000001_011500000.sdf.gz',
 'Compound_011500001_012000000.sdf.gz',
 'Compound_012000001_012500000.sdf.gz',


In [10]:
def get_urls(url, ext):
    """Return the link targets on an HTML page that end with ``ext``.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    ext : str
        Suffix a link target must end with to be kept, e.g. ``".sdf.gz"``.

    Returns
    -------
    pandas.Series
        The matching ``href`` values, in the order they appear on the page.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    page = requests.get(url)
    # Fail loudly on an error response instead of parsing the error page and
    # silently returning an empty result.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, features='lxml')
    # Anchors without an href attribute yield None, which has no .endswith();
    # drop them before applying the suffix filter.
    hrefs = (link.get("href") for link in soup("a"))
    return pd.Series(
        [href for href in hrefs if href is not None and href.endswith(ext)]
    )

In [11]:
# Collect every link on the listing page whose target ends in ".sdf.gz".
sdfs = get_urls(url, ".sdf.gz")

In [12]:
# Sort the archive names and keep the first 40 of them.
file_names = pd.Series(sorted(sdfs)).head(40)

In [13]:
# Inspect the selected archive names.
file_names

0     Compound_000000001_000500000.sdf.gz
1     Compound_000500001_001000000.sdf.gz
2     Compound_001000001_001500000.sdf.gz
3     Compound_001500001_002000000.sdf.gz
4     Compound_002000001_002500000.sdf.gz
5     Compound_002500001_003000000.sdf.gz
6     Compound_003000001_003500000.sdf.gz
7     Compound_003500001_004000000.sdf.gz
8     Compound_004000001_004500000.sdf.gz
9     Compound_004500001_005000000.sdf.gz
10    Compound_005000001_005500000.sdf.gz
11    Compound_005500001_006000000.sdf.gz
12    Compound_006000001_006500000.sdf.gz
13    Compound_006500001_007000000.sdf.gz
14    Compound_007000001_007500000.sdf.gz
15    Compound_007500001_008000000.sdf.gz
16    Compound_008000001_008500000.sdf.gz
17    Compound_008500001_009000000.sdf.gz
18    Compound_009000001_009500000.sdf.gz
19    Compound_009500001_010000000.sdf.gz
20    Compound_010000001_010500000.sdf.gz
21    Compound_010500001_011000000.sdf.gz
22    Compound_011000001_011500000.sdf.gz
23    Compound_011500001_012000000

In [14]:
def tag_processed(file_name):
    """Append ``file_name`` as a new line to the processed log and report it.

    The log is opened in append mode on every call, so earlier entries are kept.
    """
    entry = f"{file_name!s}\n"
    with open(processed_file, mode='a') as log:
        log.write(entry)
    print('prcessed:', file_name)

In [15]:
def download_zips(file_name):
    """Download one gzipped archive, unpack it, and record it as processed.

    Names found in ``processed_before`` are skipped. Otherwise the archive is
    streamed from ``url + file_name`` into ``dir_to_download``, decompressed
    next to itself under the same name minus its last three characters
    (the ``.gz`` suffix), the archive is deleted, and the name is appended to
    the processed log via ``tag_processed``.

    Parameters
    ----------
    file_name : str
        Archive name relative to ``url``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status; no file is written then.
    """
    if file_name in processed_before:
        print("skipped", file_name)
        return
    print("start", file_name)
    archive_path = dir_to_download / file_name
    with requests.get(url + file_name, stream=True) as r:
        # Check the status before touching the disk: otherwise an error page
        # is saved as the archive and only fails later, inside gzip, with a
        # misleading "not a gzipped file" error.
        r.raise_for_status()
        with open(archive_path, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
    with gzip.open(archive_path, 'rb') as f_in:
        # file_name[:-3] drops the trailing ".gz" to name the unpacked file.
        with open(dir_to_download / file_name[:-3], 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    # The compressed copy is no longer needed once it has been unpacked.
    os.remove(archive_path)
    # Only log the name after every step above succeeded.
    tag_processed(file_name)

In [14]:
# Run download_zips once for every selected archive name.
# NOTE(review): `parallel_apply` is not a built-in pandas Series method and no
# import in this notebook provides it — presumably pandarallel was imported and
# initialised in a cell that is no longer present. Confirm and restore that
# setup; as written this cell would fail after Restart Kernel -> Run All.
file_names.parallel_apply(download_zips)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=10), Label(value='0 / 10'))), HBox…

startstartstart  Compound_005000001_005500000.sdf.gzCompound_010000001_010500000.sdf.gz start

 Compound_015000001_015500000.sdf.gz
Compound_000000001_000500000.sdf.gz
prcessed: Compound_015000001_015500000.sdf.gz
start Compound_015500001_016000000.sdf.gz
prcessed: Compound_015500001_016000000.sdf.gz
start Compound_016000001_016500000.sdf.gz
prcessed: Compound_000000001_000500000.sdf.gz
start Compound_000500001_001000000.sdf.gz
prcessed: Compound_005000001_005500000.sdf.gz
start Compound_005500001_006000000.sdf.gz
prcessed: Compound_010000001_010500000.sdf.gz
start Compound_010500001_011000000.sdf.gz
prcessed: Compound_005500001_006000000.sdf.gz
start Compound_006000001_006500000.sdf.gz
prcessed: Compound_000500001_001000000.sdf.gz
start Compound_001000001_001500000.sdf.gz
prcessed: Compound_016000001_016500000.sdf.gz
start Compound_016500001_017000000.sdf.gz
prcessed: Compound_006000001_006500000.sdf.gz
start
 Compound_006500001_007000000.sdf.gzprcessed: Compound_001000001_001500000.s

0     None
1     None
2     None
3     None
4     None
5     None
6     None
7     None
8     None
9     None
10    None
11    None
12    None
13    None
14    None
15    None
16    None
17    None
18    None
19    None
20    None
21    None
22    None
23    None
24    None
25    None
26    None
27    None
28    None
29    None
30    None
31    None
32    None
33    None
34    None
35    None
36    None
37    None
38    None
39    None
dtype: object

In [16]:
def check_processed(file_name, processed):
    """Tell whether a file on disk belongs to an archive listed as processed.

    A file counts as processed when its name is exactly a listed archive name,
    or when appending ``.gz`` to it gives a listed archive name (i.e. it is the
    unpacked form of that archive). Arbitrary fragments of a listed name, such
    as ``".sdf"`` or ``"Compound"``, do not count.

    Parameters
    ----------
    file_name : str
        Name of the file to check (no directory part).
    processed : iterable of str
        Archive names recorded as processed.

    Returns
    -------
    bool
        True if ``file_name`` matches one of the entries, False otherwise
        (including when ``processed`` is empty).
    """
    unpacked_from = file_name + '.gz'
    return any(
        processed_name == file_name or processed_name == unpacked_from
        for processed_name in processed
    )

In [17]:
def remove_unprocessed():
    """Delete files in ``dir_to_download`` that the processed log does not cover.

    Reads the whitespace-separated names stored in ``processed_file`` and
    removes every entry of ``dir_to_download`` for which ``check_processed``
    returns False, printing the path of each removed file. Subdirectories are
    left untouched.
    """
    with processed_file.open('r') as f:
        processed = f.read().split()
    for file in os.listdir(dir_to_download):
        path = dir_to_download / file
        # os.remove() raises on a real directory, which would abort the
        # cleanup partway through; skip those (symlinks can still be removed).
        if path.is_dir() and not path.is_symlink():
            continue
        if check_processed(file, processed):
            continue
        os.remove(path)
        print('removed:', path)


In [17]:
# Clean the download directory: delete every file the processed log does not
# account for (presumably leftovers from interrupted downloads).
remove_unprocessed()