In [1]:
import requests as rq
from bs4 import BeautifulSoup

# Test on one link

In [9]:
url = 'https://en.wikipedia.org//wiki/Operation_Skorpion'
# url = 'https://en.wikipedia.org//wiki/Siege_of_Giarabub'
# url = 'https://en.wikipedia.org/wiki/Siege_of_Giarabub'
# url = 'https://en.wikipedia.org/wiki/Battle_of_Dakar'
# url = 'https://en.wikipedia.org/wiki/Vilnius_Offensive'
url

'https://en.wikipedia.org//wiki/Operation_Skorpion'

In [10]:
def get_dom(url):
    response = rq.get(url)
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')

In [11]:
info = get_dom(url)

## Processing one link

- actors
- date
- geolocation
- who wins
- casualities
- map?

In [12]:
table = info.find('table','infobox vevent')

In [13]:
def _table_to_dict(table):
    result = {}
    for row in table.find_all('tr'):
        key = next(row.th.stripped_strings)
        value = row.td.get_text().strip()
        
        result[key] = value
        
    return result

In [14]:
def _get_main_info(table):
    main = [el for el in table.tbody.find_all('tr', recursive=False) if 'Location' in el.get_text()][0]
    return  {'main': _table_to_dict(main) }


In [15]:
_get_main_info(table)

{'main': {'Date': '26–27 May 1941',
  'Location': 'Halfaya Pass, Egypt31°30′N 25°11′E\ufeff / \ufeff31.500°N 25.183°E\ufeff / 31.500; 25.183Coordinates: 31°30′N 25°11′E\ufeff / \ufeff31.500°N 25.183°E\ufeff / 31.500; 25.183',
  'Result': 'Axis victory',
  'Territorial': 'Axis re-captured Halfaya Pass'}}

# Additional

In [319]:
def _parse_row(row):
    '''parse secondory info row
    as dict of info points - for allies and axis
    '''
    cells = row.find_all('td', recursive=False)
    return [cell.get_text().strip() for cell in cells]



    
def _find_row_by_header(table, string):
    header = table.tbody.find('tr', text=string)
    if header is not None:
        return header.next_sibling
    

def _additional(table):
    
    keywords = (
        'Belligerents',
        'Commanders and leaders',
        'Strength',
        'Casualties and losses',
    )
    
    result = {}
    for keyword in keywords:
        try:
            data = _find_row_by_header(table, keyword)
            if data:
                result[keyword] = _parse_row(data)
        except Exception as e:
            raise Exception(keyword, e)
        
    return result

In [320]:
_additional(table)

{'Belligerents': ['Soviet Union', 'Germany', 'Polish Home Army'],
 'Commanders and leaders': ['Ivan Chernyakhovsky Pavel Rotmistrov',
  'Walter Model Dietrich von Saucken Rainer Stahel Theodor Tolsdorff',
  'Aleksander Krzyżanowski Antoni Olechnowicz'],
 'Strength': ['~100,000', '7,700', '?'],
 'Casualties and losses': ['50-70 tanks\n\n?',
  '8,000 killed; 5,000 captured in Vilnius alone (Soviet est)',
  '?']}

## Test on  a few 

In [279]:
urls = {
'Dakar': 'https://en.wikipedia.org/wiki/Battle_of_Dakar',
'Brest': 'https://en.wikipedia.org/wiki/Battle_for_Brest',
'Torpedo Alley': 'https://en.wikipedia.org/wiki/Torpedo_Alley',
'Moravo': 'https://en.wikipedia.org/wiki/Battle_of_Morava%E2%80%93Ivan'
}

In [284]:
def _parse_battle_page(url):
    print(url)
    dom = _default_collect(url)
    dom['url'] = url
    
    table = dom.find('table','infobox vevent')
    if table is None:
        return {}
    
    data = _get_main_info(table)
    
    additional = _additional(table)
    data.update(additional)
    return data



In [285]:
_parse_battle_page(url)

https://en.wikipedia.org/wiki/Battle_of_Dakar


{'main': {'Date': '23–25 September 1940',
  'Location': 'Off Dakar, French West Africa',
  'Result': 'Vichy French victory'},
 'Belligerents': {'allies': 'United Kingdom\xa0Free France\xa0Australia',
  'axis': 'Vichy France\n\n French West Africa'},
 'Commanders and leaders': {'allies': 'John Cunningham Charles de Gaulle',
  'axis': 'Pierre François Boisson'},
 'Strength': {'allies': '2 battleships5 cruisers10 destroyers1 aircraft carrier',
  'axis': '1 battleship2 cruisers4 destroyers 3 submarinescoastal emplacements'},
 'Casualties and losses': {'allies': '1 battleship crippled1 battleship damaged 2 cruisers damaged1 armed trawler sunk6 torpedo planes lost',
  'axis': '1 destroyer grounded2 submarines sunk1 battleship damagedDanish freighter MS Tacoma sunk[1][2]'}}

In [286]:
result = {k:_parse_battle_page(v) for k, v in urls.items()}

https://en.wikipedia.org/wiki/Battle_of_Dakar
https://en.wikipedia.org/wiki/Battle_for_Brest
https://en.wikipedia.org/wiki/Torpedo_Alley
https://en.wikipedia.org/wiki/Battle_of_Morava%E2%80%93Ivan


In [287]:
result['Moravo']

{'main': {'Date': '14–23 November 1940',
  'Location': 'Morava mountains, south-east AlbaniaCoordinates: 40°35′N 20°40′E\ufeff / \ufeff40.583°N 20.667°E\ufeff / 40.583; 20.667',
  'Result': 'Greek victory \n\nGreeks capture Korçë and Argyrokastro.'},
 'Belligerents': {'allies': 'Italy', 'axis': 'Greece'},
 'Commanders and leaders': {'allies': 'Gabriele Nasci',
  'axis': 'Ioannis Pitsikas Georgios Kosmas'},
 'Strength': {'allies': '55,000200 field guns',
  'axis': '70,000198 field guns'},
 'Casualties and losses': {'allies': 'unknown killed & wounded1,000 captured',
  'axis': '624 killed2,348 wounded'}}

In [288]:
result['Brest']

{'main': {'Date': '7 August – 19 September 1944',
  'Location': 'Brittany, France',
  'Result': 'Allied victory'},
 'Belligerents': {'allies': 'United States\xa0United Kingdom',
  'axis': 'Germany'},
 'Commanders and leaders': {'allies': 'Troy H. Middleton',
  'axis': 'Hermann-Bernhard Ramcke\xa0 Hans Kroh'},
 'Strength': {'allies': 'VIII Corps\n\n2nd Infantry Division\n8th Infantry Division\n29th Infantry Division\n2nd Ranger Battalion\n5th Ranger Battalion\n79th Armoured Division (elements)\n6th Armored Division (elements)',
  'axis': '2nd Fallschirmjäger-Division266. Infanterie-Division343. Infanterie-Division'},
 'Casualties and losses': {'allies': '9,831 killed or wounded[1]',
  'axis': '38,000 captured[1]'}}