In [None]:
import datetime
import logging
import os.path

import numpy as np
import pandas # for tables
import pydash # for functional stuff
import dateutil.parser # garbage date/times
import requests # urls
import matplotlib.pyplot as plt # plotting 
import matplotlib.dates # date axes
import mako.template # html formatting
import IPython.display # notebook
import skimage.io
import skimage.transform
import ssl
import io
from PIL import Image

%matplotlib inline

# Replace the default logging configuration.
# Remove every handler that is currently installed on the root logger. There may
# be none on a fresh kernel (where `del logging.root.handlers[0]` raises
# IndexError) or several (where basicConfig would silently do nothing).
for _handler in list(logging.root.handlers):
    logging.root.removeHandler(_handler)
logging.basicConfig(level=logging.INFO)

In [None]:
# SPARQL query for the Wikidata query service (Wikidata, not Wikipedia):
# selects items that are paintings made with oil, together with their labels and,
# when present, an image, an inception date and a creator. At most 1000 result
# rows are returned. The `#` lines inside the string are SPARQL comments.
query = """

SELECT ?item ?itemLabel ?cid ?_image ?_inception ?_creator ?_creatorLabel WHERE {
  # select all paintings
  ?item wdt:P31 wd:Q3305213.
  # made with oil
  ?item wdt:P186 wd:Q296955.
  # written in english
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
  # store the image
  OPTIONAL { ?item wdt:P18 ?_image. }
  # store the creation date
  OPTIONAL { ?item wdt:P571 ?_inception. }
  # store the creator
  OPTIONAL { ?item wdt:P170 ?_creator. }
}
LIMIT 1000
"""

In [None]:
# Send the query to the Wikidata SPARQL endpoint and decode the JSON reply.
url = "https://query.wikidata.org/bigdata/namespace/wdq/sparql"
resp = requests.get(
    url,
    params=dict(query=query),
    headers={'Accept': 'application/json'},
    # without a timeout a stalled connection would block the kernel indefinitely
    timeout=120,
)
# fail here on an HTTP error status instead of on an opaque JSON decoding error
resp.raise_for_status()
data = resp.json()

In [None]:
# take a random sample of at most 500 bindings
bindings = data['results']['bindings']
# never ask for more rows than the query returned (choice(..., replace=False)
# raises ValueError when size exceeds the population)
sample_size = min(500, len(bindings))
# fixed seed so that Restart & Run All yields the same sample
rng = np.random.RandomState(42)
chosen = rng.choice(len(bindings), size=sample_size, replace=False)
# sample positions rather than the dicts themselves, so the result stays a plain list
data['results']['bindings'] = [bindings[i] for i in chosen]
len(data['results']['bindings'])

In [None]:
# parse the dates and derive the entity id; both are stored back on each row
for row in data['results']['bindings']:
    # only literal inception values are parsed
    if pydash.get(row, '_inception.type') == 'literal':
        date = row['_inception']['value']
        try:
            row['date'] = dateutil.parser.parse(date)
        except (ValueError, OverflowError):
            # dateutil raises OverflowError (not ValueError) for values that do
            # not fit a C integer; such rows are logged and left without a 'date'
            logging.info("not parsed %s\n%s", date, row)
    if pydash.has(row, 'item.value'):
        # the entity id is the last path segment of the item URI
        row['entity'] = row['item']['value'].split("/")[-1]
            

In [None]:
# Gather the parsed dates as matplotlib date numbers;
# rows that carry no 'date' key are left out.
dates = [
    matplotlib.dates.date2num(entry['date'])
    for entry in data['results']['bindings']
    if 'date' in entry
]

In [None]:
# plot it: normalised histogram of the dates with a Gaussian kernel density
# estimate drawn on top
from scipy.stats import gaussian_kde

fig, ax = plt.subplots()
# `normed` was removed from Axes.hist; `density=True` is its replacement and keeps
# the bars on the same scale as the density curve
_ = ax.hist(dates, density=True, bins=30)
start_date = matplotlib.dates.date2num(datetime.datetime(1300, 1, 1))
end_date = matplotlib.dates.date2num(datetime.datetime(2050, 1, 1))
ax.set_xlim(start_date, end_date)
# interpret the numeric x values as dates, one major tick per 100 years
ax.xaxis_date(tz=dateutil.tz.tzutc())
ax.xaxis.set_major_locator(matplotlib.dates.YearLocator(100))
ax.set_xlabel("inception date")
ax.set_ylabel("density")
x_dates = np.linspace(start_date, end_date)
# the smoothing bandwidth can be tuned through gaussian_kde's bw_method argument
density = gaussian_kde(dates)
# assign the result so the list of Line2D objects is not echoed as cell output
_ = ax.plot(x_dates, density(x_dates), linewidth=10, alpha=0.5)


In [None]:
# Flatten the query bindings into one plain dict per result row.
rows = []
for row in data['results']['bindings']:
    datestr = pydash.get(row, '_inception.value')
    datetype = pydash.get(row, '_inception.type')
    # Reset for every row: without this a failed parse would silently reuse the
    # date of the previous row (or raise NameError on the very first row).
    date = None
    if datestr and datetype == 'literal':
        try:
            date = dateutil.parser.parse(datestr)
        except (ValueError, OverflowError):
            # unparseable value: log it and keep date=None for this row
            logging.exception("error %s\n%s", datestr, row)
    rows.append(
        dict(
            creator=pydash.get(row, '_creatorLabel.value'),
            name=pydash.get(row, 'itemLabel.value'),
            date=date,
            img=pydash.get(row, '_image.value'),
            entity=pydash.get(row, 'entity'),
        )
    )

In [None]:
df = pandas.DataFrame(data=rows)
# Discard works by an 'anonymous' creator. `DataFrame.ix` no longer exists in
# pandas; blanking those rows only to drop them afterwards is the same as
# filtering them out directly.
df = df[df.creator != 'anonymous']
# drop missings and duplicates
df = df.dropna().drop_duplicates()
len(df)

In [None]:
# Number of paintings per creator, held in a single column labelled 'creator'.
table = pandas.crosstab(df.creator, columns=['creator'])
# `sort_index(by=...)` no longer exists in pandas. Sort the count column as a
# Series: 'creator' is both the index name and the column label, which
# DataFrame.sort_values(by="creator") rejects as ambiguous.
table['creator'].sort_values(ascending=False).to_frame().head()

In [None]:
# Mako template that renders the paintings as a wrapping row of 50x50 thumbnails.
# It expects `table` to be an iterable of (index, row) pairs in which every row
# has an `img` attribute holding the image URL.
# The URL comes from an external service, so it is HTML-escaped (`| h`) before it
# is placed inside the src attribute.
template = """
<style>

.paintings {
  display: flex;
  flex-flow: row wrap;

}
.painting img {
  width: 50px;
  height: 50px;
}
</style>
<div class="paintings">
% for i, row in table:
<div class="painting">
<img src="${row.img | h}" />
</div>
% endfor
</div>
"""
T = mako.template.Template(template)

In [None]:
# Render the first two paintings through the template and display the HTML.
preview_rows = df.head(n=2).iterrows()
preview_html = T.render(table=preview_rows)
IPython.display.HTML(preview_html)

In [None]:
# Preview the first rows of the painting table.
df.head()

In [None]:
# First row of df, kept as a single trial input for the save_img call further down.
row = df.iloc[0]
def save_img(row):
    """Download one painting and store it as a thumbnail under ``paintings/``.

    Parameters
    ----------
    row : object with the attributes ``img`` (URL of the image) and ``entity``
        (identifier used as the file name), e.g. one row of ``df``.

    Returns
    -------
    None. The result is the file ``paintings/<entity>.jpg``. Nothing is written
    when that file already exists, when the download does not succeed, or when
    the image cannot be decoded or saved (those failures are logged).
    """
    # get url
    url = row.img
    # lookup id
    entity = row.entity
    filename = "paintings/%s.jpg" % (entity, )
    # make sure the target directory exists, otherwise the save below fails
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    # already downloaded earlier: nothing to do
    if os.path.exists(filename):
        return
    # download; the timeout keeps one stalled request from blocking a pool worker
    try:
        resp = requests.get(url, timeout=60)
    except requests.RequestException:
        logging.exception("download failed for %s (%s)", entity, url)
        return
    logging.info("%s %s", resp, entity)
    if resp.status_code != 200:
        return
    try:
        # open the downloaded bytes as an image
        im = Image.open(io.BytesIO(resp.content))
        # scale down (in place, keeps the aspect ratio)
        N = 256
        im.thumbnail((N, N))
        # JPEG cannot store alpha or palette modes (e.g. RGBA, P)
        if im.mode not in ("RGB", "L"):
            im = im.convert("RGB")
        # save
        im.save(filename)
    except OSError:
        # one undecodable image must not abort a whole batch of downloads
        logging.exception("could not process image for %s (%s)", entity, url)
        # a truncated file would be mistaken for a finished download next time
        if os.path.exists(filename):
            os.remove(filename)
# Run the download once for the single row selected above.
save_img(row)

In [None]:
# Use a pool of threads rather than worker processes. The downloads are I/O
# bound, and a function defined interactively (such as save_img) cannot be
# imported by worker processes started with the "spawn" method.
# ThreadPool exposes the same map() API as multiprocessing.Pool.
import multiprocessing.pool
pool = multiprocessing.pool.ThreadPool(processes=8)

In [None]:
# one task per painting: each row of df on its own
rows = [record for _, record in df.iterrows()]
# hand the tasks to the pool and wait until all of them are done
pool.map(save_img, rows)

In [None]:
# Display the painting table.
df