In [1]:
import requests as rq
from bs4 import BeautifulSoup

# Test on one link

In [2]:
# Sample article used to develop the parser in the cells below.
# NOTE(review): the active URL has a doubled slash before `wiki`; the recorded
# output shows the request still succeeded, but the single-slash form used by
# the alternatives below is the conventional one.
url = 'https://en.wikipedia.org//wiki/Operation_Skorpion'
# Alternative pages to try by swapping the assignment above:
# url = 'https://en.wikipedia.org//wiki/Siege_of_Giarabub'
# url = 'https://en.wikipedia.org/wiki/Siege_of_Giarabub'
# url = 'https://en.wikipedia.org/wiki/Battle_of_Dakar'
# url = 'https://en.wikipedia.org/wiki/Vilnius_Offensive'
url

'https://en.wikipedia.org//wiki/Operation_Skorpion'

In [3]:
def get_dom(url, timeout=30):
    """Download a page and parse its body into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        A ``BeautifulSoup`` document built with the ``html.parser`` backend.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = rq.get(url, timeout=timeout)
    # Fail loudly on error pages instead of parsing them as if they were data.
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')

In [4]:
# Fetch and parse the sample article once; the cells below work on this tree.
info = get_dom(url)

## Processing one link

- actors
- date
- geolocation
- who wins
- casualties
- map?

In [6]:
# Take the first <table> carrying both the `infobox` and `vevent` classes.
# Indexing with [0] raises IndexError when the page has no such table.
table = info.select('table.infobox.vevent')[0]

In [8]:
def _table_to_dict(table):
    result = {}
    for row in table.find_all('tr'):
        result[row.th.text] = row.td.get_text().strip()
        
    return result

In [9]:
def _get_main_info(table):
    main = [el for el in table.tbody.find_all('tr', recursive=False) if 'Location' in el.get_text()][0]
    return  {'main': _table_to_dict(main) }


In [10]:
# Try the main-info extraction on the sample infobox.
_get_main_info(table)

{'main': {'Date': '26–27 May 1941',
  'Location': 'Halfaya Pass, Egypt31°30′N 25°11′E\ufeff / \ufeff31.500°N 25.183°E\ufeff / 31.500; 25.183Coordinates: 31°30′N 25°11′E\ufeff / \ufeff31.500°N 25.183°E\ufeff / 31.500; 25.183',
  'Result': 'Axis victory',
  'Territorialchanges': 'Axis re-captured Halfaya Pass'}}

# Additional info

In [23]:
def _parse_row(row, names=("allies", "axis", "third party")):
    """parse secondory info row
    as dict of info points
    """
    cells = row.find_all("td", recursive=False)
    if len(cells) == 1:
        return {"total": cells[0].get_text(separator=" ").strip()}

    return {
        name: cell.get_text(separator=" ").strip() for name, cell in zip(names, cells)
    }

    
def _find_row_by_header(table, string):
    header = table.tbody.find('tr', text=string)
    if header is not None:
        return header.next_sibling
    

def _additional(table):
    """Gather the secondary infobox sections.

    For each known section heading, looks up the row located right after the
    heading row and stores its parsed content under the heading. Headings
    without a following row are left out of the result. Any failure is
    re-raised as ``Exception(heading, original_error)`` so the offending
    section is visible in the traceback.
    """
    sections = {}
    for heading in (
        "Belligerents",
        "Commanders and leaders",
        "Strength",
        "Casualties and losses",
    ):
        try:
            row = _find_row_by_header(table, heading)
            if not row:
                continue
            sections[heading] = _parse_row(row)
        except Exception as err:
            raise Exception(heading, err)

    return sections


In [24]:
# Try the secondary-section extraction on the sample infobox.
_additional(table)

{'Belligerents': {'allies': 'United Kingdom', 'axis': 'Germany'},
 'Commanders and leaders': {'allies': 'William Gott',
  'axis': 'Maximilian von Herff'},
 'Strength': {'allies': 'Infantry battalion and supporting arms',
  'axis': 'Kampfgruppe von Herff'},
 'Casualties and losses': {'total': '173 men 12 guns 5  Infantry tanks'}}

## Test on a few links

In [25]:
# Short label -> article URL for the multi-page trial run.
urls = {
    'Dakar': 'https://en.wikipedia.org/wiki/Battle_of_Dakar',
    'Brest': 'https://en.wikipedia.org/wiki/Battle_for_Brest',
    'Torpedo Alley': 'https://en.wikipedia.org/wiki/Torpedo_Alley',
    'Moravo': 'https://en.wikipedia.org/wiki/Battle_of_Morava%E2%80%93Ivan',
}

In [26]:
def _parse_battle_page(url):
    """Fetch one battle article and return its parsed infobox.

    Args:
        url: Address of the article to scrape.

    Returns:
        dict holding the ``'main'`` summary plus one entry per secondary
        section found, or an empty dict when the page has no battle infobox.
    """
    print(url)  # progress trace while iterating over several pages
    dom = get_dom(url)

    # A CSS selector matches the two classes regardless of their order or of
    # extra classes on the tag; an exact class-string match would miss those.
    tables = dom.select('table.infobox.vevent')
    if not tables:
        return {}
    table = tables[0]

    data = _get_main_info(table)
    data.update(_additional(table))
    return data



In [27]:
# Run the full page parser on the sample article.
_parse_battle_page(url)

https://en.wikipedia.org//wiki/Operation_Skorpion


{'main': {'Date': '26–27 May 1941',
  'Location': 'Halfaya Pass, Egypt31°30′N 25°11′E\ufeff / \ufeff31.500°N 25.183°E\ufeff / 31.500; 25.183Coordinates: 31°30′N 25°11′E\ufeff / \ufeff31.500°N 25.183°E\ufeff / 31.500; 25.183',
  'Result': 'Axis victory',
  'Territorialchanges': 'Axis re-captured Halfaya Pass'},
 'Belligerents': {'allies': 'United Kingdom', 'axis': 'Germany'},
 'Commanders and leaders': {'allies': 'William Gott',
  'axis': 'Maximilian von Herff'},
 'Strength': {'allies': 'Infantry battalion and supporting arms',
  'axis': 'Kampfgruppe von Herff'},
 'Casualties and losses': {'total': '173 men 12 guns 5  Infantry tanks'}}

In [28]:
# Parse every trial page, keyed by its short label.
result = {
    label: _parse_battle_page(link)
    for label, link in urls.items()
}

https://en.wikipedia.org/wiki/Battle_of_Dakar
https://en.wikipedia.org/wiki/Battle_for_Brest
https://en.wikipedia.org/wiki/Torpedo_Alley
https://en.wikipedia.org/wiki/Battle_of_Morava%E2%80%93Ivan


In [29]:
# Inspect the parsed record stored under the 'Moravo' label.
result['Moravo']

{'main': {'Date': '14–23 November 1940',
  'Location': 'Morava mountains, south-east AlbaniaCoordinates: 40°35′N 20°40′E\ufeff / \ufeff40.583°N 20.667°E\ufeff / 40.583; 20.667',
  'Result': 'Greek victory \n\nGreeks capture Korçë and Argyrokastro.'},
 'Belligerents': {'allies': 'Italy', 'axis': 'Greece'},
 'Commanders and leaders': {'allies': 'Gabriele Nasci',
  'axis': 'Ioannis Pitsikas   Georgios Kosmas'},
 'Strength': {'allies': '55,000 200 field guns',
  'axis': '70,000 198 field guns'},
 'Casualties and losses': {'allies': 'unknown killed & wounded 1,000 captured',
  'axis': '624 killed 2,348 wounded'}}

In [30]:
# Inspect the parsed record stored under the 'Brest' label.
result['Brest']

{'main': {'Date': '7 August – 19 September 1944',
  'Location': 'Brittany, France',
  'Result': 'Allied victory'},
 'Belligerents': {'allies': 'United States \xa0 United Kingdom',
  'axis': 'Germany'},
 'Commanders and leaders': {'allies': 'Troy H. Middleton   Walter M. Robertson   Donald A. Stroh   Charles H. Gerhardt',
  'axis': 'Hermann-Bernhard Ramcke \xa0   Hans Kroh \xa0   Erwin Rauch'},
 'Strength': {'allies': 'VIII Corps \n \n 2nd Infantry Division \n 8th Infantry Division \n 29th Infantry Division \n 2nd Ranger Battalion \n 5th Ranger Battalion \n 79th Armoured Division  (elements) \n 6th Armored Division  (elements)',
  'axis': '2nd Fallschirmjäger-Division 266. Infanterie-Division 343. Infanterie-Division'},
 'Casualties and losses': {'allies': '9,831 killed or wounded [1]',
  'axis': '38,000 captured [1]'}}