In [1]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd
import re

In [2]:
# Source page: Wikipedia's overview article on art movements.
URL = 'https://en.wikipedia.org/wiki/Art_movement'

# requests has no default timeout, so without one a stalled connection would
# hang this cell forever. raise_for_status() stops here on an HTTP error
# instead of letting an error page be parsed as if it were the article.
response = requests.get(URL, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

print(soup)

<!DOCTYPE html>

<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>Art movement - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"7841eacb-929d-46c3-85cc-3ef5a782f8f6","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Art_movement","wgTitle":"Art movement","wgCurRevisionId":1033755118,"wgRevisionId":1033755118,"wgArticleId":228568,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Webarchive template wayback links","Articles with short description","Short description is different from Wikidata","Art movements","Art history","Style","Visual arts"],"

In [3]:
# Collect every <div class="div-col"> container on the page.
# find_all is the current Beautiful Soup method name; findAll is a deprecated alias.
currents = soup.find_all('div', {"class": 'div-col'})

# The second container is the one used as the list of modern movements.
# Raise the same IndexError a bare currents[1] would, but with a message that
# says what changed if the page layout no longer provides it.
if len(currents) < 2:
    raise IndexError("expected at least two 'div-col' containers on the page, "
                     "found %d" % len(currents))
modern_currents = currents[1]

# Each <li> in that container describes one movement: a link, optionally
# followed by its dates.
genres = modern_currents.find_all('li')
genres

[<li><a href="/wiki/Academic_art" title="Academic art">Academic</a>, c. 1900s (decade)-ongoing</li>,
 <li><a class="mw-redirect" href="/wiki/American_realism" title="American realism">American realism</a>, c. 1890s–1920s</li>,
 <li><a class="mw-redirect" href="/wiki/Analytic_Cubism" title="Analytic Cubism">Analytic Cubism</a>, c. 1909–1912</li>,
 <li><a href="/wiki/Art_Deco" title="Art Deco">Art Deco</a>, c. 1920s–1940s</li>,
 <li><a href="/wiki/Ashcan_School" title="Ashcan School">Ashcan School</a>, c. 1890s–1920s</li>,
 <li><a href="/wiki/Australian_tonalism" title="Australian tonalism">Australian tonalism</a>, c. 1910s–1930s</li>,
 <li><a href="/wiki/Berlin_Secession" title="Berlin Secession">Berliner Sezession</a>, founded 1898</li>,
 <li><a href="/wiki/Bloomsbury_Group" title="Bloomsbury Group">Bloomsbury Group</a>, c. 1900s (decade)–1960s</li>,
 <li><a href="/wiki/Brandywine_School" title="Brandywine School">Brandywine School</a></li>,
 <li><a href="/wiki/Camden_Town_Group" title

In [4]:
# Parallel lists: one entry per movement whose text contains at least one number.
genre_name = []    # page title taken from the movement's link
period = []        # every number found for the movement, as strings
start_period = []  # first number found
end_period = []    # last number found (same as the first when there is only one)

for genre in genres:
    link = genre.a
    # Skip list items that have no titled link instead of crashing on them.
    if link is None or link.get('title') is None:
        continue
    genre_title = link['title']

    # Search the rendered text only. Matching against str(genre) also scans
    # the href/title attributes, so digits inside a URL or a page title would
    # be mistaken for dates. Digits in the visible link text are still matched.
    genre_date = re.findall(r'[0-9]+', genre.get_text())

    # Movements listed without any number are left out of all four lists.
    if genre_date:
        genre_name.append(genre_title)
        period.append(genre_date)
        start_period.append(genre_date[0])
        end_period.append(genre_date[-1])

In [5]:
# Assemble the movement table from the three parallel lists, one column each.
columns = {
    'name': genre_name,
    'start_period': start_period,
    'end_period': end_period,
}
df = pd.DataFrame(data=columns)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,name,start_period,end_period
0,Academic art,1900,1900
1,American realism,1890,1920
2,Analytic Cubism,1909,1912
3,Art Deco,1920,1940
4,Ashcan School,1890,1920


In [6]:
# Start values at or below this threshold are dropped as outliers.
# NOTE(review): presumably these are stray numbers picked up by the scrape
# rather than real years; confirm against the raw data.
MIN_START_YEAR = 100

# Convert both period columns to 32-bit integers in a single call.
df = df.astype({'start_period': 'int32', 'end_period': 'int32'})

# remove outliers
# .copy() makes the filtered result an independent frame, so assigning to its
# columns afterwards does not trigger pandas' SettingWithCopyWarning.
df = df[df['start_period'] > MIN_START_YEAR].copy()

In [7]:
# Movements whose end is recorded as 'ongoing' instead of a year
# (the page lists Academic art as "c. 1900s (decade)-ongoing").
ongoing_current = ['Academic art', 'Neo-Classicism']

# Vectorized replacement for a row-by-row apply: flag the matching rows, then
# overwrite only their end_period. The column is cast to object first because
# it now holds both integers and the 'ongoing' marker string.
is_ongoing = df['name'].isin(ongoing_current)
df['end_period'] = df['end_period'].astype(object).mask(is_ongoing, 'ongoing')

Unnamed: 0,name,start_period,end_period
0,Academic art,1900,ongoing
1,American realism,1890,1920
2,Analytic Cubism,1909,1912
3,Art Deco,1920,1940
4,Ashcan School,1890,1920
5,Australian tonalism,1910,1930
6,Berlin Secession,1898,1898
7,Bloomsbury Group,1900,1960
8,Camden Town Group,1911,1913
9,Constructivism (art),1920,1940


In [10]:
# Write the cleaned table to disk; index=False keeps the row index out of the file.
output_path = '../data/artistic_current.csv'
df.to_csv(output_path, index=False)