In [1]:
!pip install sqlmodel==0.0.22 -q
!pip install beautifulsoup4==4.12.0 -q

In [1]:
import requests
import sqlmodel
from bs4 import BeautifulSoup as bs
from typing import List
from concurrent.futures import ThreadPoolExecutor
from sqlmodel import select

In [2]:
class ArtistArtMovement(sqlmodel.SQLModel, table=True):
    """One row per (artist, art movement) pair."""

    # None until the row is inserted; the database then assigns the primary key.
    id: int | None = sqlmodel.Field(default=None, primary_key=True)
    artist_id: int = sqlmodel.Field(foreign_key="artist.id")
    art_movement: str
    # Paired with Artist.art_movements.
    artist: "Artist" = sqlmodel.Relationship(back_populates="art_movements")


class ArtistSchool(sqlmodel.SQLModel, table=True):
    """One row per (artist, school) pair."""

    # None until the row is inserted; the database then assigns the primary key.
    id: int | None = sqlmodel.Field(default=None, primary_key=True)
    artist_id: int = sqlmodel.Field(foreign_key="artist.id")
    school: str
    # Paired with Artist.schools.
    artist: "Artist" = sqlmodel.Relationship(back_populates="schools")


class ArtistGenre(sqlmodel.SQLModel, table=True):
    """One row per (artist, genre) pair."""

    # None until the row is inserted; the database then assigns the primary key.
    id: int | None = sqlmodel.Field(default=None, primary_key=True)
    artist_id: int = sqlmodel.Field(foreign_key="artist.id")
    genre: str
    # Paired with Artist.genres.
    artist: "Artist" = sqlmodel.Relationship(back_populates="genres")


class ArtistField(sqlmodel.SQLModel, table=True):
    """One row per (artist, field) pair."""

    # None until the row is inserted; the database then assigns the primary key.
    id: int | None = sqlmodel.Field(default=None, primary_key=True)
    artist_id: int = sqlmodel.Field(foreign_key="artist.id")
    field: str
    # Paired with Artist.fields.
    artist: "Artist" = sqlmodel.Relationship(back_populates="fields")


class ArtistNationality(sqlmodel.SQLModel, table=True):
    """One row per (artist, nationality) pair."""

    # None until the row is inserted; the database then assigns the primary key.
    id: int | None = sqlmodel.Field(default=None, primary_key=True)
    artist_id: int = sqlmodel.Field(foreign_key="artist.id")
    nationality: str
    # Paired with Artist.nationalities.
    artist: "Artist" = sqlmodel.Relationship(back_populates="nationalities")


class ArtistInstitution(sqlmodel.SQLModel, table=True):
    """One row per (artist, institution) pair."""

    # None until the row is inserted; the database then assigns the primary key.
    id: int | None = sqlmodel.Field(default=None, primary_key=True)
    artist_id: int = sqlmodel.Field(foreign_key="artist.id")
    institution: str
    # Paired with Artist.institutions.
    artist: "Artist" = sqlmodel.Relationship(back_populates="institutions")


class ArtistBase(sqlmodel.SQLModel):
    """
    Column definitions shared by the Artist table.

    Every optional column defaults to None so an artist can be created from its
    slug alone and filled in later, once its page has been scraped.
    """

    # None until the row is inserted; the database then assigns the primary key.
    id: int | None = sqlmodel.Field(default=None, primary_key=True)
    name: str | None = None
    slug: str = sqlmodel.Field(index=True)  # /en/claude-monet
    wikipedia_article: str | None = None
    wikipedia_url: str | None = None
    article: str | None = None


class Artist(ArtistBase, table=True):
    """Artist table: the ArtistBase columns plus relationships to the artist's artworks and attribute rows."""

    # Each relationship is paired, through back_populates, with the `artist`
    # attribute declared on the related model.
    artworks: list["Artwork"] = sqlmodel.Relationship(back_populates='artist')
    art_movements: list["ArtistArtMovement"] = sqlmodel.Relationship(back_populates='artist')
    schools: list["ArtistSchool"] = sqlmodel.Relationship(back_populates='artist')
    genres: list["ArtistGenre"] = sqlmodel.Relationship(back_populates='artist')
    fields: list["ArtistField"] = sqlmodel.Relationship(back_populates='artist')
    nationalities: list["ArtistNationality"] = sqlmodel.Relationship(back_populates='artist')
    institutions: list["ArtistInstitution"] = sqlmodel.Relationship(back_populates='artist')

In [3]:
class ArtworkStyle(sqlmodel.SQLModel, table=True):
    """One row per (artwork, style) pair."""

    # None until the row is inserted; the database then assigns the primary key.
    id: int | None = sqlmodel.Field(default=None, primary_key=True)
    artwork_id: int = sqlmodel.Field(foreign_key="artwork.id")
    style: str
    # Paired with Artwork.styles.
    artwork: "Artwork" = sqlmodel.Relationship(back_populates="styles")


class ArtworkGenre(sqlmodel.SQLModel, table=True):
    """One row per (artwork, genre) pair."""

    # None until the row is inserted; the database then assigns the primary key.
    id: int | None = sqlmodel.Field(default=None, primary_key=True)
    artwork_id: int = sqlmodel.Field(foreign_key="artwork.id")
    genre: str
    # Paired with Artwork.genres.
    artwork: "Artwork" = sqlmodel.Relationship(back_populates="genres")


class ArtworkMedia(sqlmodel.SQLModel, table=True):
    """One row per (artwork, media) pair."""

    # None until the row is inserted; the database then assigns the primary key.
    id: int | None = sqlmodel.Field(default=None, primary_key=True)
    artwork_id: int = sqlmodel.Field(foreign_key="artwork.id")
    media: str
    # Paired with Artwork.media.
    artwork: "Artwork" = sqlmodel.Relationship(back_populates="media")


class ArtworkBase(sqlmodel.SQLModel):
    """
    Column definitions shared by the Artwork table.

    Only `url` is mandatory; the remaining columns default to None so a row can
    be created from a listing page and completed later.
    """

    # None until the row is inserted; the database then assigns the primary key.
    id: int | None = sqlmodel.Field(default=None, primary_key=True)
    url: str = sqlmodel.Field(
        index=True,
        unique=True,
    )  # Seems like sometimes an artwork is under a collection, and sometimes under an artist.
    name: str | None = None
    artist_id: int | None = sqlmodel.Field(default=None, foreign_key="artist.id")
    wikipedia_article: str | None = None
    wikipedia_url: str | None = None


class Artwork(ArtworkBase, table=True):
    """Artwork table: the ArtworkBase columns plus relationships to its artist and attribute rows."""

    # Paired with Artist.artworks.
    artist: Artist = sqlmodel.Relationship(back_populates="artworks")
    # Each list below is paired with the `artwork` attribute of the related model.
    styles: list[ArtworkStyle] = sqlmodel.Relationship(back_populates="artwork")
    genres: list[ArtworkGenre] = sqlmodel.Relationship(back_populates="artwork")
    media: list[ArtworkMedia] = sqlmodel.Relationship(back_populates="artwork")

In [4]:
# SQLite database file used by every cell of this notebook.
DATABASE_URL = "sqlite:///wikiart.db"

# check_same_thread=False lets the connection be used from threads other than
# the one that opened it; the scraping cells run in a thread pool.
engine = sqlmodel.create_engine(
    DATABASE_URL,
    connect_args={"check_same_thread": False},
)

# Create the tables of all models declared so far.
sqlmodel.SQLModel.metadata.create_all(engine)

In [5]:
def get_movements() -> list[tuple[str, str]]:
    """
    Returns the art movements artists are grouped by
    https://www.wikiart.org/en/artists-by-art-movement

    returns:
    [(list item text, link href)] for every list item that has a link with an href

    raises:
    ValueError when the page does not contain exactly one matching list
    """
    url = "https://www.wikiart.org/en/artists-by-art-movement"
    # Without a timeout a stalled connection would block the caller forever.
    resp = requests.get(url, timeout=30)
    soup = bs(resp.text, 'html.parser')
    bullets = soup.find_all('ul', {"class": "dictionaries-list", "ng-if": "!tabChanged"})
    if len(bullets) != 1:
        # Zero matches end up here as well, so report the actual count.
        raise ValueError(f"Expected exactly one list of movements, found {len(bullets)}")
    return [(li.text.strip(), li.a["href"]) for li in bullets[0].find_all("li") if li.a and "href" in li.a.attrs]


def get_artists_by_movement(movement: str) -> list[str]:
    """
    Returns the artists listed under an art movement
    movement: movement slug (the href returned by get_movements)

    returns:
    [artist_slug]

    raises:
    ValueError when the page has no <main> element or no list inside it
    """
    url = f'https://www.wikiart.org{movement}/text-list'
    # Without a timeout a stalled connection would block the caller forever.
    resp = requests.get(url, timeout=30)
    soup = bs(resp.text, 'html.parser')
    if not (main := soup.find('main')):
        raise ValueError("Main content not found")
    if not (ul := main.find('ul')):
        raise ValueError("List of artists not found")
    # List items without a link (or with a link lacking an href) are skipped instead of crashing.
    return [li.a['href'] for li in ul.find_all('li') if li.a and "href" in li.a.attrs]


def get_school_or_groups() -> list[tuple[str, str]]:
    """
    Returns the painting schools / groups artists are grouped by
    https://www.wikiart.org/en/artists-by-painting-school

    returns:
    [(list item text, link href)] for every list item that has a link with an href

    raises:
    ValueError when the page does not contain exactly one matching list
    """
    url = "https://www.wikiart.org/en/artists-by-painting-school"
    # Without a timeout a stalled connection would block the caller forever.
    resp = requests.get(url, timeout=30)
    soup = bs(resp.text, 'html.parser')
    bullets = soup.find_all('ul', {"class": "dictionaries-list", "ng-if": "!tabChanged"})
    if len(bullets) != 1:
        # Zero matches end up here as well, so report the actual count.
        raise ValueError(f"Expected exactly one list of schools, found {len(bullets)}")
    return [(li.text.strip(), li.a["href"]) for li in bullets[0].find_all("li") if li.a and "href" in li.a.attrs]


def get_artists_by_school_or_group(school_or_group: str) -> list[str]:
    """
    Returns the artists listed under a painting school / group
    school_or_group: school slug (the href returned by get_school_or_groups)

    returns:
    [artist_slug]

    raises:
    ValueError when the page has no <main> element or no list inside it
    """
    url = f'https://www.wikiart.org{school_or_group}/text-list'
    # Without a timeout a stalled connection would block the caller forever.
    resp = requests.get(url, timeout=30)
    soup = bs(resp.text, 'html.parser')
    if not (main := soup.find('main')):
        raise ValueError("Main content not found")
    if not (ul := main.find('ul')):
        raise ValueError("List of artists not found")
    # List items without a link (or with a link lacking an href) are skipped instead of crashing.
    return [li.a['href'] for li in ul.find_all('li') if li.a and "href" in li.a.attrs]


def get_genres() -> list[tuple[str, str]]:
    """
    Returns the genres artists are grouped by
    https://www.wikiart.org/en/artists-by-genre

    returns:
    [(list item text, link href)] for every list item that has a link with an href

    raises:
    ValueError when the page does not contain exactly one matching list
    """
    url = "https://www.wikiart.org/en/artists-by-genre"
    # Without a timeout a stalled connection would block the caller forever.
    resp = requests.get(url, timeout=30)
    soup = bs(resp.text, 'html.parser')
    bullets = soup.find_all('ul', {"class": "dictionaries-list", "ng-if": "!tabChanged"})
    if len(bullets) != 1:
        # Zero matches end up here as well, so report the actual count.
        raise ValueError(f"Expected exactly one list of genres, found {len(bullets)}")
    return [(li.text.strip(), li.a["href"]) for li in bullets[0].find_all("li") if li.a and "href" in li.a.attrs]


def get_artists_by_genre(genre: str) -> list[str]:
    """
    Returns the artists listed under a genre
    genre: genre slug (the href returned by get_genres)

    returns:
    [artist_slug]

    raises:
    ValueError when the page has no <main> element or no list inside it
    """
    url = f'https://www.wikiart.org{genre}/text-list'
    # Without a timeout a stalled connection would block the caller forever.
    resp = requests.get(url, timeout=30)
    soup = bs(resp.text, 'html.parser')
    if not (main := soup.find('main')):
        raise ValueError("Main content not found")
    if not (ul := main.find('ul')):
        raise ValueError("List of artists not found")
    # List items without a link (or with a link lacking an href) are skipped instead of crashing.
    return [li.a['href'] for li in ul.find_all('li') if li.a and "href" in li.a.attrs]


def get_fields() -> list[tuple[str, str]]:
    """
    Returns a list of fields an artist can belong to
    https://www.wikiart.org/en/artists-by-field

    returns:
    [(list item text, link href)] for every list item that has a link with an href

    raises:
    ValueError when the page does not contain exactly one matching list
    """
    url = "https://www.wikiart.org/en/artists-by-field"
    # Without a timeout a stalled connection would block the caller forever.
    resp = requests.get(url, timeout=30)
    soup = bs(resp.text, 'html.parser')
    bullets = soup.find_all('ul', {"class": "dictionaries-list", "ng-if": "!tabChanged"})
    if len(bullets) != 1:
        # Zero matches end up here as well, so report the actual count.
        raise ValueError(f"Expected exactly one list of fields, found {len(bullets)}")

    fields = [(li.text.strip(), li.a["href"]) for li in bullets[0].find_all("li") if li.a and "href" in li.a.attrs]
    return fields


def get_artists_by_field(field: str) -> list[str]:
    """
    Returns the artists listed under a field
    field: field slug (the href returned by get_fields)

    returns:
    [artist_slug]

    raises:
    ValueError when the page has no <main> element or no list inside it
    """
    url = f'https://www.wikiart.org{field}/text-list'
    # Without a timeout a stalled connection would block the caller forever.
    resp = requests.get(url, timeout=30)
    soup = bs(resp.text, 'html.parser')
    main = soup.find('main')
    if not main:
        raise ValueError("Main content not found")
    if not (ul := main.find('ul')):
        raise ValueError("List of artists not found")
    # List items without a link (or with a link lacking an href) are skipped instead of crashing.
    artists = [li.a['href'] for li in ul.find_all('li') if li.a and "href" in li.a.attrs]
    return artists


def get_nationalities() -> list[tuple[str, str]]:
    """
    Returns a list of nationalities
    https://www.wikiart.org/en/artists-by-nation

    returns:
    [(list item text, link href)] for every list item that has a link with an href

    raises:
    ValueError when the page does not contain exactly one matching list
    """
    url = "https://www.wikiart.org/en/artists-by-nation"
    # Without a timeout a stalled connection would block the caller forever.
    resp = requests.get(url, timeout=30)
    soup = bs(resp.text, 'html.parser')
    bullets = soup.find_all('ul', {"class": "dictionaries-list", "ng-if": "!tabChanged"})
    if len(bullets) != 1:
        # Zero matches end up here as well, so report the actual count.
        raise ValueError(f"Expected exactly one list of nationalities, found {len(bullets)}")

    nationalities = [
        (li.text.strip(), li.a["href"]) for li in bullets[0].find_all("li") if li.a and "href" in li.a.attrs
    ]

    return nationalities


def get_artists_by_nationality(nationality_slug: str) -> list[str]:
    """
    Returns the artists listed under a nationality
    nationality_slug: nationality slug (the href returned by get_nationalities)

    returns:
    [artist_slug]

    raises:
    ValueError when the page has no <main> element or no list inside it
    """
    url = f'https://www.wikiart.org{nationality_slug}/text-list'
    # Without a timeout a stalled connection would block the caller forever.
    resp = requests.get(url, timeout=30)

    soup = bs(resp.text, 'html.parser')
    main = soup.find('main')
    if not main:
        raise ValueError("Main content not found")
    ul = main.find('ul')
    if not ul:
        raise ValueError("List of artists not found")
    # List items without a link (or with a link lacking an href) are skipped instead of crashing.
    artists = [li.a['href'] for li in ul.find_all('li') if li.a and "href" in li.a.attrs]
    return artists


def get_institutions() -> list[tuple[str, str]]:
    """
    Returns a list of (institution name, institution slug)
    https://www.wikiart.org/en/artists-by-art-institution

    Only list items that have a link with an href are included.

    raises:
    ValueError when the page does not contain exactly one matching list
    """
    base_url = "https://www.wikiart.org/en/artists-by-art-institution/"
    # Without a timeout a stalled connection would block the caller forever.
    resp = requests.get(base_url, timeout=30)
    soup = bs(resp.text, 'html.parser')
    bullets = soup.find_all('ul', {"class": "dictionaries-list", "ng-if": "!tabChanged"})
    if len(bullets) != 1:
        # Zero matches end up here as well, so report the actual count.
        raise ValueError(f"Expected exactly one list of institutions, found {len(bullets)}")

    institutions = [
        (li.text.strip(), li.a["href"]) for li in bullets[0].find_all("li") if li.a and "href" in li.a.attrs
    ]
    return institutions


def get_artists_by_institution(institution_slug: str) -> list[str]:
    """
    Returns a list of artists that are associated with the institution
    institution_slug: institution slug  /en/artists-by-art-institution/{institute}

    returns:
    [artist_slug]

    raises:
    ValueError when the page has no <main> element or no list inside it
    """
    url = f'https://www.wikiart.org{institution_slug}/text-list'
    # Without a timeout a stalled connection would block the caller forever.
    resp = requests.get(url, timeout=30)

    soup = bs(resp.text, 'html.parser')
    main = soup.find('main')
    if not main:
        raise ValueError("Main content not found")

    ul = main.find('ul')
    if not ul:
        raise ValueError("List of artists not found")

    # List items without a link (or with a link lacking an href) are skipped instead of crashing.
    artists = [li.a['href'] for li in ul.find_all('li') if li.a and "href" in li.a.attrs]
    return artists

In [5]:
def get_artist_and_artworks(artist_slug) -> None:
    """
    Gets an artist and their artwork slugs

    Scrapes the artist page and the artist's "all works" text list, then stores
    the artist details together with one Artwork row per listed work.

    artist_slug: artist slug, e.g. /en/claude-monet; exactly one Artist row with
    this slug must already exist

    Both pages are fetched before anything is written, and everything is
    committed in one transaction: a failure while fetching the artworks leaves
    the artist untouched instead of marking it as done without artworks.

    raises:
    ValueError when either page has no <main> element
    """
    artist_url = f'https://www.wikiart.org{artist_slug}'

    # ---- artist page ----
    # Without a timeout a stalled connection would block this worker forever.
    resp = requests.get(artist_url, timeout=30)
    soup = bs(resp.text, 'html.parser')
    if not (main := soup.find('main')):
        raise ValueError("Main content not found")

    artist_name = "Unknown Name"
    if breadcrumbs_link := main.find('div', {"class": "wiki-breadcrumbs-links"}):
        # The first breadcrumb anchor without an href is used as the artist name.
        if name_tag := breadcrumbs_link.find('a', href=False):
            artist_name = name_tag.text

    article_contents = ""
    if article := main.find('div', {'id': 'info-tab-description'}):
        if paragraph := article.find('p'):
            article_contents = paragraph.text

    wikipedia_article_contents = ""
    wikipedia_link = ""
    if wikipedia_article := main.find('div', {'id': 'info-tab-wikipediaArticle'}):
        if paragraph := wikipedia_article.find('p', class_=False):
            wikipedia_article_contents = paragraph.text
        if link_wrapper := wikipedia_article.find('div', {"class": 'wiki-link-wrapper'}):
            if anchor := link_wrapper.find('a', {'class': 'wiki-link'}):
                wikipedia_link = anchor.get('href', "")

    # ---- artwork list page ----
    resp = requests.get(f'{artist_url}/all-works/text-list', timeout=30)
    soup = bs(resp.text, 'html.parser')
    if not (main := soup.find('main')):
        raise ValueError("Main content not found")

    name_and_href = []
    if painting_list_text := main.find('ul', {"class": "painting-list-text"}):
        name_and_href = [
            (li.a['href'], li.a.text)
            for li in painting_list_text.find_all('li', {"class": "painting-list-text-row"})
            if li.a and "href" in li.a.attrs
        ]

    # ---- persist ----
    with sqlmodel.Session(engine) as session:
        artist = session.exec(select(Artist).where(Artist.slug == artist_slug)).one()

        # Artwork.url is unique: one duplicate would abort the whole commit, so
        # skip urls repeated in the listing or already stored by another artist/run.
        new_artworks = []
        seen_urls = set()
        for artwork_url, artwork_name in name_and_href:
            if artwork_url in seen_urls:
                continue
            seen_urls.add(artwork_url)
            already_stored = session.exec(select(Artwork.id).where(Artwork.url == artwork_url)).first()
            if already_stored is None:
                new_artworks.append(Artwork(url=artwork_url, name=artwork_name, artist_id=artist.id))

        artist.name = artist_name
        artist.article = article_contents
        artist.wikipedia_article = wikipedia_article_contents
        artist.wikipedia_url = wikipedia_link
        session.add(artist)
        session.add_all(new_artworks)
        session.commit()

In [7]:
from concurrent.futures import ThreadPoolExecutor

# Artists whose name is still NULL have not been scraped yet.
with sqlmodel.Session(engine) as session:
    # `== None` is intentional: SQLAlchemy translates it to `IS NULL`.
    artists = session.exec(select(Artist).where(Artist.name == None)).all()  # noqa: E711
    artist_slugs = [artist.slug for artist in artists]

with ThreadPoolExecutor(max_workers=5) as executor:
    # Keep the futures: an exception raised in a worker is stored on its future
    # and would otherwise vanish without a trace.
    futures = {executor.submit(get_artist_and_artworks, artist_slug): artist_slug for artist_slug in artist_slugs}

# Leaving the `with` block waits for every worker, so all futures are done here.
failed_artists = [(slug, future.exception()) for future, slug in futures.items() if future.exception() is not None]
for slug, error in failed_artists:
    print(f"FAILED {slug}: {error!r}")
print(f"{len(futures) - len(failed_artists)} artists scraped, {len(failed_artists)} failed")

In [5]:
def _linked_texts(wiki_layout, label: str) -> list[str]:
    """Returns the stripped link texts of the first <span> after the <s> tag whose text is `label`; [] if any part is missing."""
    if not wiki_layout:
        return []
    heading = wiki_layout.find('s', string=label)
    if not heading:
        return []
    span = heading.find_next("span")
    if not span:
        return []
    return [link.text.strip() for link in span.find_all("a")]


def update_artwork_info(artwork_slug):
    """
    Scrapes an artwork page and stores its styles, genres, media and Wikipedia excerpt

    artwork_slug: artwork url as stored in Artwork.url, e.g. /en/{artist}/{artwork};
    exactly one Artwork row with this url must already exist

    Values already attached to the artwork are not added again, so the function
    can be re-run for the same artwork without creating duplicate rows.
    """
    url = f'https://www.wikiart.org{artwork_slug}'
    # Without a timeout a stalled connection would block this worker forever.
    resp = requests.get(url, timeout=30)
    soup = bs(resp.text, 'html.parser')
    wiki_layout = soup.find('section', {'class': 'wiki-layout-left-menu'})

    style = _linked_texts(wiki_layout, 'Style:')
    genre = _linked_texts(wiki_layout, 'Genre:')
    media = _linked_texts(wiki_layout, 'Media:')

    wikipedia_article = ""
    wikipedia_link = ""
    if wikipedia_div := soup.find('div', {'id': 'info-tab-wikipediadescription'}):
        if paragraph := wikipedia_div.find('p'):
            wikipedia_article = paragraph.text
        # The link is only recorded when there is article text to go with it.
        if wikipedia_article and (link_wrapper := wikipedia_div.find('div', {"class": 'wiki-link-wrapper'})):
            if anchor := link_wrapper.find('a', {'class': 'wiki-link'}):
                wikipedia_link = anchor.get('href', "")

    with sqlmodel.Session(engine) as session:
        artwork = session.exec(select(Artwork).where(Artwork.url == artwork_slug)).one()

        stored_styles = {row.style for row in artwork.styles}
        for s in style:
            if s not in stored_styles:
                stored_styles.add(s)
                artwork.styles.append(ArtworkStyle(style=s))

        stored_genres = {row.genre for row in artwork.genres}
        for g in genre:
            if g not in stored_genres:
                stored_genres.add(g)
                artwork.genres.append(ArtworkGenre(genre=g))

        stored_media = {row.media for row in artwork.media}
        for m in media:
            if m not in stored_media:
                stored_media.add(m)
                artwork.media.append(ArtworkMedia(media=m))

        artwork.wikipedia_article = wikipedia_article or ""
        artwork.wikipedia_url = wikipedia_link or ""
        # Coarse progress marker; the number printed is the artwork id, not a running count.
        if artwork.id % 100 == 0:
            print(f"Updated {artwork.id} artworks")
        session.add(artwork)
        session.commit()

In [None]:
# Manual checkpoint: only artworks with a larger id are processed.
# NOTE(review): presumably the id reached by an earlier, interrupted run - confirm before re-running.
RESUME_AFTER_ARTWORK_ID = 133900

with sqlmodel.Session(engine) as session:
    artworks = session.exec(select(Artwork).where(Artwork.id > RESUME_AFTER_ARTWORK_ID)).all()
    artwork_slugs = [artwork.url for artwork in artworks]

print(len(artwork_slugs))

with ThreadPoolExecutor(max_workers=20) as executor:
    # Keep the futures: an exception raised in a worker is stored on its future
    # and would otherwise vanish without a trace.
    futures = {executor.submit(update_artwork_info, artwork_slug): artwork_slug for artwork_slug in artwork_slugs}

# Leaving the `with` block waits for every worker, so all futures are done here.
failed_artworks = [(slug, future.exception()) for future, slug in futures.items() if future.exception() is not None]
for slug, error in failed_artworks:
    print(f"FAILED {slug}: {error!r}")
print(f"{len(futures) - len(failed_artworks)} artworks updated, {len(failed_artworks)} failed")

108720
Updated 134000 artworks
Updated 134100 artworks
Updated 134200 artworks
Updated 134300 artworks
Updated 134400 artworks
Updated 134500 artworks
Updated 134600 artworks
Updated 134700 artworks
Updated 134800 artworks
Updated 134900 artworks
Updated 135000 artworks
Updated 135100 artworks
Updated 135200 artworks
Updated 135300 artworks
Updated 135400 artworks
Updated 135500 artworks
Updated 135600 artworks
Updated 135700 artworks
Updated 135800 artworks
Updated 135900 artworks
Updated 136000 artworks
Updated 136100 artworks
Updated 136200 artworks
Updated 136300 artworks
Updated 136400 artworks
Updated 136500 artworks
Updated 136600 artworks
Updated 136700 artworks
Updated 136800 artworks
Updated 136900 artworks
Updated 137000 artworks
Updated 137100 artworks
Updated 137200 artworks
Updated 137300 artworks
Updated 137400 artworks
Updated 137500 artworks
Updated 137600 artworks
Updated 137700 artworks
Updated 137800 artworks
Updated 137900 artworks
Updated 138000 artworks
Updated 1

KeyboardInterrupt: 

Updated 197100 artworks
Updated 197200 artworks
Updated 197300 artworks
Updated 197400 artworks
Updated 197500 artworks
Updated 197600 artworks
Updated 197700 artworks
Updated 197800 artworks
Updated 197900 artworks
Updated 198000 artworks
Updated 198100 artworks
Updated 198200 artworks
Updated 198300 artworks
Updated 198400 artworks
Updated 198500 artworks
Updated 198600 artworks
Updated 198700 artworks
