In [1]:
!pip install pyquery

Collecting pyquery
  Downloading https://files.pythonhosted.org/packages/78/43/95d42e386c61cb639d1a0b94f0c0b9f0b7d6b981ad3c043a836c8b5bc68b/pyquery-1.4.1-py2.py3-none-any.whl
Collecting cssselect>0.7.9
  Downloading https://files.pythonhosted.org/packages/3b/d4/3b5c17f00cce85b9a1e6f91096e1cc8e8ede2e1be8e96b87ce1ed09e92c5/cssselect-1.1.0-py2.py3-none-any.whl
Installing collected packages: cssselect, pyquery
Successfully installed cssselect-1.1.0 pyquery-1.4.1


In [2]:
from urllib.parse import urljoin
from pyquery import PyQuery
import os
import requests
import csv


class PascalSentenceDataSet():

    DATASET_DIR = 'dataset/'
    SENTENCE_DIR = 'sentence/'
    PASCAL_SENTENCE_DATASET_URL = 'http://vision.cs.uiuc.edu/pascal-sentences/'

    def __init__(self):
        self.url = PascalSentenceDataSet.PASCAL_SENTENCE_DATASET_URL

    def download_images(self):
        dom = PyQuery(self.url)
        for img in dom('img').items():
            img_src = img.attr['src']
            category, img_file_name = os.path.split(img_src)

            output_dir = PascalSentenceDataSet.DATASET_DIR + category
            print(output_dir)
            if not os.path.isdir(output_dir):
                os.makedirs(output_dir)

            output = os.path.join(output_dir, img_file_name)
            print(output)
            if img_src.startswith('http'):
                img_url = img_src
            else:
                img_url = urljoin(self.url, img_src)
            if os.path.isfile(output):
                print("Already downloaded, Skipping: %s" % output)
                continue
            print("Downloading: %s" % output)
            with open(output,'wb') as f:

                while True:
                    result = requests.get(img_url)
                    raw = result.content
                    if result.status_code == 200:
                        f.write(raw)
                        break
                    print("error occurred while fetching img")
                    print("retry...")


    def download_sentences(self):
        dom = PyQuery(self.url)
        for tr in dom('body>table>tr').items():
            img_src = tr('img').attr['src']
            category, img_file_name = os.path.split(img_src)

            output_dir = PascalSentenceDataSet.SENTENCE_DIR + category
            if not os.path.isdir(output_dir):
                os.makedirs(output_dir)

            head, tail = os.path.splitext(img_file_name)
            sentence_file_name = head + "txt"
            output = os.path.join(output_dir, sentence_file_name)
            if os.path.isfile(output):
                print("Already downloaded, Skipping: %s" % output)
                continue
            print("Downloading: %s" % output)
            with open(output,'w') as f:
                for td in tr('table tr td').items():
                    f.write(td.text() + "\n")


if __name__=="__main__":

    dataset = PascalSentenceDataSet()
    dataset.download_images()
    dataset.download_sentences()

dataset/aeroplane
dataset/aeroplane/2008_000716.jpg
Downloading: dataset/aeroplane/2008_000716.jpg
dataset/aeroplane
dataset/aeroplane/2008_001227.jpg
Downloading: dataset/aeroplane/2008_001227.jpg
dataset/aeroplane
dataset/aeroplane/2008_001380.jpg
Downloading: dataset/aeroplane/2008_001380.jpg
dataset/aeroplane
dataset/aeroplane/2008_001448.jpg
Downloading: dataset/aeroplane/2008_001448.jpg
dataset/aeroplane
dataset/aeroplane/2008_001468.jpg
Downloading: dataset/aeroplane/2008_001468.jpg
dataset/aeroplane
dataset/aeroplane/2008_001801.jpg
Downloading: dataset/aeroplane/2008_001801.jpg
dataset/aeroplane
dataset/aeroplane/2008_001971.jpg
Downloading: dataset/aeroplane/2008_001971.jpg
dataset/aeroplane
dataset/aeroplane/2008_001985.jpg
Downloading: dataset/aeroplane/2008_001985.jpg
dataset/aeroplane
dataset/aeroplane/2008_002358.jpg
Downloading: dataset/aeroplane/2008_002358.jpg
dataset/aeroplane
dataset/aeroplane/2008_002454.jpg
Downloading: dataset/aeroplane/2008_002454.jpg
dataset/ae

In [3]:
!zip -r images.zip dataset
!zip -r captions.zip sentence

  adding: dataset/ (stored 0%)
  adding: dataset/sofa/ (stored 0%)
  adding: dataset/sofa/2008_000704.jpg (deflated 0%)
  adding: dataset/sofa/2008_008538.jpg (deflated 0%)
  adding: dataset/sofa/2008_000648.jpg (deflated 0%)
  adding: dataset/sofa/2008_004416.jpg (deflated 1%)
  adding: dataset/sofa/2008_008103.jpg (deflated 0%)
  adding: dataset/sofa/2008_007837.jpg (deflated 1%)
  adding: dataset/sofa/2008_001660.jpg (deflated 0%)
  adding: dataset/sofa/2008_000636.jpg (deflated 0%)
  adding: dataset/sofa/2008_000541.jpg (deflated 0%)
  adding: dataset/sofa/2008_007021.jpg (deflated 0%)
  adding: dataset/sofa/2008_008517.jpg (deflated 0%)
  adding: dataset/sofa/2008_007169.jpg (deflated 0%)
  adding: dataset/sofa/2008_000493.jpg (deflated 0%)
  adding: dataset/sofa/2008_005926.jpg (deflated 0%)
  adding: dataset/sofa/2008_008622.jpg (deflated 0%)
  adding: dataset/sofa/2008_002903.jpg (deflated 0%)
  adding: dataset/sofa/2008_006436.jpg (deflated 0%)
  adding: dataset/sofa/2008_0037