**Importing Libraries**

In [1]:
from IPython.display import clear_output
from bs4 import BeautifulSoup
from numpy.random import rand
from random import randint
from time import sleep
import json, requests
import pandas as pd
import random

**Using Chicago's museum API to get all the artworks**

In [2]:
#Main querry string
url = """https://api.artic.edu/api/v1/artworks?querry[artwork_type_id][1]=True&fields=id,title,image_id,artist_title,style_title,color,date_start,artist_display,material_titles,publication_history,artwork_type_title,artwork_type_id,style_title&limit=100"""
next_url = url
counter = 0

# Getting the total items in the museum collection
result = json.loads(requests.get(url).text)
total_items = result['pagination']['total']

In [3]:
# Scraped data dataframe
cols = ['date_start', 'artwork_type_title', 'artist_display', 'style_title','artist_title',
        'id', 'image_id', 'title', 'publication_history','artwork_type_id', 'h', 's', 'l']

scraped_data = pd.DataFrame(columns = cols)

**Auxiliary functions**

In [182]:
# regex function to get the last element of an URL
def regex_link(x):
    return re.split('/',x)[-1]

In [240]:
# Function to get all artists and details for each movement
def get_detail_movement(movement,url_f):
    try:
        artists = pd.DataFrame(columns=['movement','name','birth','death','country', 'art_type'])
        response = requests.get('http://www.artcyclopedia.com/history/'+url_f)
        soup = BeautifulSoup(response.content, "html.parser")
        soup_result = soup.select("#mainpage > table tr")

        for i in range(3,len(soup_result)):   
            split = re.split(r'\xa0',soup_result[i].get_text())
            date = re.split(r'-',split[1])
            artists.loc[len(artists)] = [movement,split[0],date[0],date[1],re.split(r' ',split[2])[0],re.split(r' ',split[2], 1)[1]]      
    except:
        print('No Artists')
    return artists

In [4]:
#function to flateen columns dictionaries
def flat_col(dt, col):
    l = []
    for i in range(0,len(dt)):
        try:
            l.append(dt[i]['color'][col])
        except:
            l.append(0)
            
    return l

In [5]:
# Function to get the data for each link
def get_data(sub_url):   
    # Respectfull Nap
    sleep(random.randint(1,2))
    print('Napping, iteration', counter)
    
    r = requests.get(sub_url)
    tmp = json.loads(r.text)
    
    data = pd.DataFrame(tmp['data'])
    data['h'] = flat_col(tmp['data'], 'h')
    data['s'] = flat_col(tmp['data'], 's')
    data['l'] = flat_col(tmp['data'], 'l')

    return data.drop(columns = ['color', 'material_titles']), tmp['pagination']['next_url']

**Geting data from the chicago museum**

In [None]:
while next_url:
    df, next_url = get_data(next_url)
    scraped_data = pd.concat([scraped_data, df], axis = 0)
    counter += 1

In [246]:
scraped_data.head(3)

Unnamed: 0,date_start,artwork_type_title,artist_display,style_title,artist_title,id,image_id,title,publication_history,artwork_type_id,h,s,l
0,1898.0,Sculpture,"Auguste Rodin\nFrench, 1840-1917",,Auguste Rodin,8961,0377f007-2251-af1f-e997-ac44217b6651,Head of Arthur Jerome Eddy,,3,52,0,65
1,1880.0,Painting,"Paul Cezanne\nFrench, 1839-1906",Post-Impressionism,Paul Cezanne,16487,d4ca6321-8656-3d3f-a362-2ee297b2b813,"The Bay of Marseille, Seen from L'Estaque","F. Jourdain, Cézanne (Paris, 1914), pl. 42.\n\...",1,194,10,58
2,1873.0,Painting,"Paul Cezanne\nFrench, 1839-1906",Impressionism,Paul Cezanne,14556,90bc0cec-0d4e-9af5-3912-52a082a428e5,"Auvers, Panoramic View","Journal des Arts 43 (July 9, 1921), p. 2.\n\nD...",1,99,8,41


**Getting images**

In [25]:
paintings = scraped_data[scraped_data.artwork_type_title=='Painting']
paintings = paintings.drop_duplicates(subset=['image_id'])
paintings.shape

(2882, 13)

In [14]:
# Get Images from Chicago museum using the url with the painting ID
error_images = []
i = 0

for painting_id in paintings.image_id:
    i = i+1
    clear_output()
    print('Images downloaded: ',i)
    try:
        # I have to use a try condicional here because some id's lead to invalid pictures.
        webs = requests.get('https://www.artic.edu/iiif/2/'+painting_id+'/full/843,/0/default.jpg')
        open('images_chicago/' + painting_id+'.jpg', 'wb').write(webs.content)
    except:
        print(painting_id,'Failed')
        error_images.append(painting_id)

Images downloaded:  2882


In [32]:
# Generated a file in the comand line with the valid files downloaded and now i'm comparing to the dataset
# so the dataset just has the valid paintings in it

valid_images = pd.read_csv('valid_files.csv')
paintings = paintings[paintings.image_id.isin(valid_images.ID)]
paintings.to_csv('artic_edu_Valid_paintings.csv')

**Scraping movements and artists from art encyclopedia**

In [78]:
url = "http://www.artcyclopedia.com/history/"

In [79]:
response = requests.get(url)
response.status_code

200

In [160]:
# 1st iteration geting a list of movements from the main page
soup = BeautifulSoup(response.content, "html.parser")
movements = soup.select("#mainpage > table a")
lst_links = pd.DataFrame(columns = ['Movement','Link'])

for i in range(len(movements)):
    lst_links.loc[len(lst_links)] = [movements[i].get_text(), movements[i]['href']]
    
lst_links.Link = lst_links.Link.apply(regex_link)

In [172]:
# dropping duplicate links for movements and the first empty line
lst_links = lst_links.drop_duplicates(subset=['Link'])[1:]

In [243]:
# 2nd iteration Creating a dataset to store artists by artistic movement
artist_by_movement = pd.DataFrame(columns=['movement','name','birth','death','country', 'art_type'])

for name in zip(lst_links['Movement'], lst_links['Link']):
# Respectfull Nap
    sleep(random.randint(1,2))
    print('Napping and downloading', name[0])
    # scraping all artists from a artistic movement
    artist_by_movement = pd.concat([artist_by_movement, get_detail_movement(name[0],name[1])], axis=0)           

Napping and downloading Gothic Art
No Artists
Napping and downloading Byzantine Art
No Artists
Napping and downloading The Early Renaissance
No Artists
Napping and downloading The High Renaissance
Napping and downloading The Northern Renaissance
No Artists
Napping and downloading Mannerism
No Artists
Napping and downloading Baroque Art
No Artists
Napping and downloading The Rococo Style
Napping and downloading Neoclassicism
Napping and downloading Academic Art
Napping and downloading Japanese Ukiyo-e
Napping and downloading Romanticism
Napping and downloading The Hudson River School
Napping and downloading The Pre-Raphaelite Brotherhood
Napping and downloading Victorian Classicism
No Artists
Napping and downloading The Arts and Crafts Movement
Napping and downloading Symbolism
No Artists
Napping and downloading Realism
No Artists
Napping and downloading The Barbizon School
Napping and downloading Impressionism
No Artists
Napping and downloading Tonalism
Napping and downloading Post-Imp

Unnamed: 0,movement,name,birth,death,country,art_type
0,Gothic Art,Benedetto Antelami,1150,1230,Italian,Sculptor
1,Gothic Art,Mastro Guglielmo,1158,1165,Italian,Sculptor
0,The Early Renaissance,Giotto di Bondone,1267,1337,Italian,Painter
1,The Early Renaissance,Bernardo Daddi,1280,1348,Italian,Painter
2,The Early Renaissance,Taddeo Gaddi,1300,1366,Italian,Painter
...,...,...,...,...,...,...
1,Minimalism,Barnett Newman,1905,1970,American,Painter
2,Minimalism,Agnes Martin,1912,2004,Canadian/American,Painter
3,Minimalism,Tony Smith,1912,1980,American,Sculptor
4,Minimalism,Ad Reinhardt,1913,1967,American,Painter


In [245]:
# Saving results to csv
artist_by_movement.to_csv('artists_by_movement.csv')