In [1]:
import json
from urllib.parse import urljoin

import requests as rq
from bs4 import BeautifulSoup

### Task

Collect a tabular dataset of all battles of WWII from Wikipedia.
For each battle, collect its theater, location, date, outcomes, and as much additional information as is available.

# Collect the page and parse it as a DOM

In [2]:
# Entry point of the scrape: the English Wikipedia list of World War II battles.
base_url = 'https://en.wikipedia.org/wiki/List_of_World_War_II_battles'

In [3]:
def get_dom(url, timeout=30):
    """Download ``url`` and parse the response body into a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Passed straight to ``requests.get``. Requests never times out on its
        own, so without this a stalled connection would hang the notebook.

    Returns
    -------
    bs4.BeautifulSoup
        The document parsed with Python's built-in ``html.parser``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status (via ``raise_for_status``).
    requests.Timeout
        If the server does not respond within ``timeout``.
    """
    response = rq.get(url, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')

## Search for the content

In [13]:
# Download and parse the list page.
soup = get_dom(base_url)
# Keep only the first `div.mw-parser-output` that is a direct child of
# `div#mw-content-text`; the `[0]` raises IndexError if the selector matches nothing.
content = soup.select('div#mw-content-text>div.mw-parser-output', limit=1)[0]

In [15]:
# content

In [5]:
# All <h2> headings that are direct children of the parser-output div, minus the
# last one (dropped by the [:-1] slice; presumably a non-front section — TODO confirm).
# The same exclusion could be written in CSS with :not(:last-of-type).
fronts = content.select('div.mw-parser-output>h2')[:-1]

In [6]:
# Preview the front names. The [:-6] slice trims the last six characters of each
# heading's text — presumably the "[edit]" link label (6 characters); TODO confirm.
for front in fronts:
    print(front.text[:-6])

African Front
Mediterranean Front
Western Front
Atlantic Ocean
Eastern Front
Indian Ocean
Pacific Theatre
China Front
Southeast Asia Front


In [7]:
def _abs_link(link, base='https://en.wikipedia.org'):
    return base + link
        

In [8]:
def _outside_subtree(node, subtree):
    """Return True when ``node`` is not a descendant of ``subtree`` (or ``subtree`` is None)."""
    if subtree is None:
        return True
    return all(ancestor is not subtree for ancestor in node.parents)


def dictify(ul, level=0):
    """Turn a (possibly nested) ``<ul>`` element into a nested dictionary.

    Every direct ``<li>`` child of ``ul`` becomes one entry, keyed by the
    item's first non-empty text fragment.

    Parameters
    ----------
    ul : bs4.element.Tag
        The list element whose direct ``<li>`` children are converted.
    level : int, optional
        Nesting depth stored on every entry of this list; nested lists are
        processed recursively with ``level + 1``.

    Returns
    -------
    dict
        Maps each item's title to a dict with the keys

        * ``'url'``   -- absolute URL of the item's first link, or ``None``;
        * ``'time'``  -- the item's second text fragment with colons removed
          and whitespace stripped, or ``None`` if there is no second fragment;
        * ``'level'`` -- the ``level`` argument;
        * ``'children'`` -- result of ``dictify`` on the first nested ``<ul>``
          (key present only when the item contains one).

    Notes
    -----
    * Text and links inside the nested ``<ul>`` are ignored when determining
      the item's own title, time and URL; otherwise a parent item without a
      date or link would silently inherit them from its first child.
    * Items that contain no text at all are skipped.
    * Items sharing the same title within one list overwrite each other.
    """
    result = {}

    for li in ul.find_all("li", recursive=False):
        sublist = li.find("ul")

        # Non-empty text fragments of this item in document order, leaving out
        # everything that belongs to the nested sub-list.
        stripped = (s.strip() for s in li.strings if _outside_subtree(s, sublist))
        fragments = (text for text in stripped if text)

        key = next(fragments, None)
        if key is None:
            # An empty <li> has nothing to key the entry on.
            continue

        time = next(fragments, None)
        if time is not None:
            time = time.replace(':', '').strip()

        # First link that belongs to the item itself, not to one of its children.
        anchor = next(
            (a for a in li.find_all('a') if _outside_subtree(a, sublist)),
            None,
        )
        href = anchor.get('href') if anchor is not None else None
        # An anchor without an href has no URL to resolve.
        url = _abs_link(href) if href is not None else None

        entry = {'url': url,
                 'time': time,
                 'level': level}

        if sublist is not None:
            entry['children'] = dictify(sublist, level=(level + 1))

        result[key] = entry
    return result

## Try on one

In [16]:
# z = dictify(fronts[0].find_next_siblings("div", "div-col columns column-width")[0].ul)
# z

# Collect for all Fronts

In [17]:
# Build {front name: nested battle dict} for every front heading.
theaters = {}

for front in fronts:
    # Strip the trailing "[edit]" label only when it is actually present, so a
    # heading without it keeps its full name instead of losing six characters.
    title = front.text.strip()
    if title.endswith('[edit]'):
        title = title[:-len('[edit]')].rstrip()

    # First following sibling <div> whose class attribute matches the column layout;
    # find_next_sibling stops at the first match instead of collecting all of them.
    columns = front.find_next_sibling("div", "div-col columns column-width")
    if columns is None or columns.ul is None:
        raise LookupError('no battle list found after heading {!r}'.format(title))

    theaters[title] = dictify(columns.ul)

In [18]:
# Sanity check: show the front names that were collected as top-level keys.
theaters.keys()

dict_keys(['African Front', 'Mediterranean Front', 'Western Front', 'Atlantic Ocean', 'Eastern Front', 'Indian Ocean', 'Pacific Theatre', 'China Front', 'Southeast Asia Front'])

In [12]:
# Persist the nested `theaters` dict as JSON; the relative path resolves against
# the current working directory and any existing file is overwritten ('w' mode).
with open('all_battles.json', 'w') as f:
    json.dump(theaters, f)