In [2]:
import requests as rq
from bs4 import BeautifulSoup
import json

### Task

Collect a tabular dataset of all battles of WWII from wikipedia.
for each battle collect it's theater, location, date, outcomes and as much of additional info as there is

# Collect page and parse as a DOM

In [3]:
base_url = 'https://en.wikipedia.org/wiki/List_of_World_War_II_battles'

In [4]:
def get_dom(url):
    response = rq.get(url)
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')

# Search for the content

In [5]:
soup = get_dom(base_url)
content = soup.find('div', id='mw-content-text').find('div', 'mw-parser-output')

In [6]:
fronts = [el for el in content.find_all('h2', recursive=False)[:-1]]

In [7]:
for front in fronts:
    print(front.text[:-6])

African Front
Mediterranean Front
Western Front
Atlantic Ocean
Eastern Front
Indian Ocean
Pacific Theatre
China Front
Southeast Asia Front


In [8]:
def dictify(ul, level=0):
    result = dict()
    
    for li in ul.find_all("li", recursive=False):
        text = li.stripped_strings
        key = next(text)
        
        try:
            time = next(text).replace(':', '').strip()
        except StopIteration:
            time = None

        ul, link = li.find("ul"), li.find('a')
        if link:
            link = 'https://en.wikipedia.org' + link.get('href')
            
        nextlevel = level + 1

        r ={'url': link,
            'time':time,
            'level': nextlevel} 
        
        if ul:
            r['children'] = dictify(ul, level=nextlevel+1)

        result[key] = r
    return result

In [9]:
# fronts[0].find_next_siblings("div", "div-col columns column-width")[0].ul

In [10]:
theaters = {}
for front in fronts:
    theaters[front.text[:-6]] = dictify(front.find_next_siblings("div", "div-col columns column-width")[0].ul)

In [11]:
with open('all_battles.json', 'w') as f:
    json.dump(theaters, f)

In [12]:
theaters

{'African Front': {'North African Campaign': {'url': 'https://en.wikipedia.org/wiki/North_African_Campaign',
   'time': 'Western Desert Campaign',
   'level': 1,
   'children': {'Western Desert Campaign': {'url': 'https://en.wikipedia.org/wiki/Western_Desert_Campaign',
     'time': 'June 1940 – February 1943',
     'level': 3,
     'children': {'Italian invasion of Egypt': {'url': 'https://en.wikipedia.org/wiki/Italian_invasion_of_Egypt',
       'time': 'September 1940',
       'level': 5},
      'Operation Compass': {'url': 'https://en.wikipedia.org/wiki/Operation_Compass',
       'time': 'December 1940 – February 1941',
       'level': 5,
       'children': {'Battle of Nibeiwa': {'url': 'https://en.wikipedia.org/wiki/Battle_of_Nibeiwa',
         'time': 'December 1940',
         'level': 7},
        'Battle of Sidi Barrani': {'url': 'https://en.wikipedia.org/wiki/Battle_of_Sidi_Barrani',
         'time': 'December 1940',
         'level': 7},
        'Siege of Giarabub': {'url': 'htt