In [74]:
# native python
import concurrent.futures
import datetime
import glob
import io
import logging
import os.path
import os
import shlex
import urllib.parse
import json


# numeric stuff
import numpy as np
import scipy.stats
import pandas # for tables

# converting things
import pathlib
import pydash # for functional stuff
import dateutil.parser # garbage date/times

# image processing
import skimage.io
import skimage.transform
from PIL import Image

# plotting
import matplotlib.pyplot as plt # plotting 
import matplotlib.dates # date axes
import seaborn


# web requests
import ssl
import requests # urls
import mako.template # html formatting

import IPython.display # notebook
import tqdm # progress bar

%matplotlib inline

# replace default logging
#del logging.root.handlers[0]
#logging.basicConfig(level=logging.INFO)

In [75]:
# Create a sparql query to get all the oil paintings from the wikipedia
query = """

SELECT ?item ?itemLabel ?cid ?_image ?_inception ?_creator ?_creatorLabel WHERE {
  # select all paintings
  ?item wdt:P31 wd:Q3305213.
  # made with oil
  ?item wdt:P186 wd:Q296955.
  # written in english
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
  # store the image
  OPTIONAL { ?item wdt:P18 ?_image. }
  # store the creation date
  OPTIONAL { ?item wdt:P571 ?_inception. }
  # store the creator
  OPTIONAL { ?item wdt:P170 ?_creator. }
}
"""

In [174]:
url = "https://query.wikidata.org/bigdata/namespace/wdq/sparql"
resp = requests.get(url, 
                    params=dict(query=query), 
                    headers={'Accept': 'application/json'})
data = resp.json()

INFO:requests.packages.urllib3.connectionpool:Starting new HTTPS connection (1): query.wikidata.org


In [175]:
# take a random sample
sample = False
if sample:
    data['results']['bindings'] = np.random.choice(data['results']['bindings'], replace=False, size=sample)
    len(data['results']['bindings'])

In [176]:
# parse the dates
for row in data['results']['bindings']:
    # try and parse the date
    if '_inception' in row and row['_inception']['type'] == 'literal':
        try:
            date = row['_inception']['value']
            row['date'] = dateutil.parser.parse(date)
        except ValueError as e:
            logging.info("not parsed %s\n%s", date, row)
    # drop first part
    if pydash.has(row, 'item.value'):
        entity = row['item']['value'].split("/")[-1]
        row['entity'] = entity
            

In [177]:
rows = []
for row in data['results']['bindings']:
    creator = pydash.get(row, '_creatorLabel.value')
    name = pydash.get(row, 'itemLabel.value')

    datestr = pydash.get(row, '_inception.value')
    datetype = pydash.get(row, '_inception.type')
    img = pydash.get(row, '_image.value')
    entity = pydash.get(row, 'entity')
    if datestr and datetype == 'literal':
        try:
            date = dateutil.parser.parse(datestr)
        except:
            logging.exception("error %s\n%s", datestr, row)
    else:
        date = None
    rows.append(
        dict(
            creator=creator,
            name=name,
            date=date,
            img=img,
            entity=entity
        )
    )

In [178]:
df = pandas.DataFrame(data=rows)
# drop anonymous paintings
# df.ix[df.creator == 'anonymous'] = None
# drop missings and duplicates
df = df.dropna().drop_duplicates()
# calculate new index
df = df.reset_index()

def url2name(url):
    """convert url to file tag"""
    return urllib.parse.unquote(url.split(':')[-1].replace('FilePath/', 'File:'))
df['wiki_tag'] = df['img'].apply(url2name)
df['chunk'] = df.index // 50

len(df)

18752

In [179]:
df.set_index('entity').head()
df.to_json('paintings.json')

In [180]:
imgs = df.img
imgs.to_csv('urls.txt', index=False)

In [185]:

# get a token from: /w/api.php?action=query&format=json&meta=tokens&type=csrf%7Clogin
rows = []
groups = df.groupby('chunk')

for chunk_id, chunk in tqdm.tqdm_notebook(groups, desc='groups'):

    names = "|".join(chunk.img.apply(url2name))
    url = 'https://commons.wikimedia.org/w/api.php'
    params = {
        "action": "query",
        "titles": names,
        "prop": "imageinfo",
        "iiprop": "timestamp|thumbmime|bitdepth|dimensions|sha1|url|mediatype|metadata|extmetadata",
        "format": "json"

    }
    headers = {'Accept': 'application/json'}
    data = requests.get(url, params=params, headers=headers, allow_redirects=True).json() #.content.decode('utf-8')
    for i, (wiki_id, page) in enumerate(data['query']['pages'].items()):
        row = {}
        assert len(pydash.get(page, 'imageinfo', [])) == 1
        row['wiki_page_id'] = pydash.get(page, 'pageid')
        row['ns'] = pydash.get(page, 'ns')
        row['title'] = pydash.get(page, 'title')
        row['url'] = pydash.get(page, 'imageinfo.0.url')
        row['height'] = pydash.get(page, 'imageinfo.0.height')
        row['width'] = pydash.get(page, 'imageinfo.0.width')    
        row['descriptionurl'] = pydash.get(page, 'imageinfo.0.descriptionurl')
        row['descriptionshorturl'] = pydash.get(page, 'imageinfo.0.descriptionshorturl')
        row['sha1'] = pydash.get(page, 'imageinfo.0.sha1').upper()
        row['metadata'] = pydash.get(page, 'imageinfo.0.metadata')
        rows.append(row)

INFO:requests.packages.urllib3.connectionpool:Starting new HTTPS connection (1): commons.wikimedia.org
INFO:requests.packages.urllib3.connectionpool:Starting new HTTPS connection (1): commons.wikimedia.org
INFO:requests.packages.urllib3.connectionpool:Starting new HTTPS connection (1): commons.wikimedia.org
INFO:requests.packages.urllib3.connectionpool:Starting new HTTPS connection (1): commons.wikimedia.org
INFO:requests.packages.urllib3.connectionpool:Starting new HTTPS connection (1): commons.wikimedia.org
INFO:requests.packages.urllib3.connectionpool:Starting new HTTPS connection (1): commons.wikimedia.org
INFO:requests.packages.urllib3.connectionpool:Starting new HTTPS connection (1): commons.wikimedia.org
INFO:requests.packages.urllib3.connectionpool:Starting new HTTPS connection (1): commons.wikimedia.org
INFO:requests.packages.urllib3.connectionpool:Starting new HTTPS connection (1): commons.wikimedia.org
INFO:requests.packages.urllib3.connectionpool:Starting new HTTPS connecti




ConnectionError: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))

In [184]:

wikimedia_df = pandas.DataFrame.from_dict(rows)
wikimedia_df.to_json('wikimedia.json')
wikimedia_df = pandas.read_json('wikimedia.json')
wikimedia_df = wikimedia_df.reset_index().drop('index', axis=1)

In [126]:
with open('headers.json') as f:
    records = json.load(f)

# headers contain lists of 1 or more items
for record in records:
    for key, item in record.items():
        if isinstance(item, list) and len(item) == 1:
            item = item[0]
            record[key] = item
    
headers_df = pandas.DataFrame.from_records(records)
# convert base36 to base16
headers_df['sha1'] = headers_df['X-Object-Meta-Sha1Base36'].apply(
    lambda x: np.base_repr(int(x, base=36), 16).zfill(40) 
)

In [127]:
'''
rows = []
for fname in tqdm.tqdm_notebook(os.listdir('/Users/fedor/data/highres/'), desc='files'):
    fullpath = os.path.join('/Users/fedor/data/highres/', fname)
    quoted_path = shlex.quote(fullpath)
    sha1 = !shasum $quoted_path | cut -f 1 -d ' '
    row = {
        "filename": fname,
        "sha1": sha1[0].strip().upper(),
        "size": os.stat(fullpath).st_size
        
    }
    rows.append(row)
files_df = pandas.DataFrame(rows)
files_df.to_json('files.json')
'''
files_df = pandas.read_json('files.json')
# drop index
files_df = files_df.reset_index().drop('index', axis=1)

In [154]:
paintings_df = pandas.read_json('paintings.json')
paintings_df = paintings_df.reset_index().drop('level_0', axis=1)

In [155]:
sha1 = wikimedia_df.iloc[0]['sha1']
sha1


'5E60CEE7FC0D60D623CD6952554C9F5B740CB4A7'

In [159]:
W = wikimedia_df.ix[wikimedia_df.sha1 == sha1].iloc[0]
H = headers_df.ix[headers_df.sha1 == sha1].iloc[0]
W.descriptionurl

'https://commons.wikimedia.org/wiki/File:CaravaggioJeromeMeditation.jpg'

In [168]:
P = paintings_df.iloc[0]
P.img

'http://commons.wikimedia.org/wiki/Special:FilePath/Johannes%20Vermeer%20-%20Gezicht%20op%20huizen%20in%20Delft%2C%20bekend%20als%20%27Het%20straatje%27%20-%20Google%20Art%20Project.jpg'

0        https://upload.wikimedia.org/wikipedia/commons...
1        https://upload.wikimedia.org/wikipedia/commons...
2        https://upload.wikimedia.org/wikipedia/commons...
3        https://upload.wikimedia.org/wikipedia/commons...
4        https://upload.wikimedia.org/wikipedia/commons...
5        https://upload.wikimedia.org/wikipedia/commons...
6        https://upload.wikimedia.org/wikipedia/commons...
7        https://upload.wikimedia.org/wikipedia/commons...
8        https://upload.wikimedia.org/wikipedia/commons...
9        https://upload.wikimedia.org/wikipedia/commons...
10       https://upload.wikimedia.org/wikipedia/commons...
11       https://upload.wikimedia.org/wikipedia/commons...
12       https://upload.wikimedia.org/wikipedia/commons...
13       https://upload.wikimedia.org/wikipedia/commons...
14       https://upload.wikimedia.org/wikipedia/commons...
15       https://upload.wikimedia.org/wikipedia/commons...
16       https://upload.wikimedia.org/wikipedia/commons.