In [1]:
import json
from wiki import parse_battle_page
import time

# Load all links

In [2]:
# Read the campaign/battle link tree. The encoding is pinned to UTF-8 because
# the keys contain non-ASCII characters (e.g. the en dash in
# 'Syria–Lebanon Campaign') and the platform default encoding is not
# guaranteed to be UTF-8.
with open('./_all_battles.json', 'r', encoding='utf-8') as f:
    campaigns = json.load(f)

In [3]:
# Peek at one campaign entry to see which fields are present before parsing.
campaigns['African Front']['Syria–Lebanon Campaign'].keys()

dict_keys(['url', 'time', 'level'])

# Parse all battles

In [4]:
def _parse_in_depth(element, name):
    '''attempts to scrape data for every
    element with url attribute - and all the children
    if there are any'''
    
    if 'children' in element:
        for k, child in element['children'].items():
            parsed = _parse_in_depth(child, k)
            element['children'][k].update(parsed)
    
    if 'url' in element:
        try:
            element.update(parse_battle_page(element['url']))       
        except Exception as e:
            raise Exception(name, e)
            
    time.sleep(.1) # let's be good citizens!
    return element

In [5]:
# Try the parser on a single battle page and display the returned dict.
parse_battle_page('https://en.wikipedia.org/wiki/Operation_Brevity')

{'Date': '15–16 May 1941',
 'Location': 'Egyptian and Libyan border31°34′51″N 25°03′08″E\ufeff / \ufeff31.58083°N 25.05222°E\ufeff / 31.58083; 25.05222Coordinates: 31°34′51″N 25°03′08″E\ufeff / \ufeff31.58083°N 25.05222°E\ufeff / 31.58083; 25.05222',
 'Result': 'Inconclusive',
 'url': 'https://en.wikipedia.org/wiki/Operation_Brevity',
 'Belligerents': {'allies': 'United Kingdom \xa0 Australia',
  'axis': 'Germany \xa0 Italy'},
 'Commanders and leaders': {'allies': 'Archibald Wavell   William Gott',
  'axis': 'Erwin Rommel   Maximilian von Herff'},
 'Strength': {'allies': '3 infantry battalions 53 tanks',
  'axis': 'Elements of several battalions 30–50 tanks'},
 'Casualties and losses': {'allies': '206+ casualties 5 tanks destroyed 6 aircraft destroyed',
  'axis': '605+ casualties 3 tanks destroyed'}}

In [7]:
# Walk every front and every campaign in it, scraping each campaign tree and
# collecting the results under the same front/campaign names. Names are
# printed as a simple progress log.
campaigns_parsed = {}

for fr_name in campaigns:
    print(fr_name)
    front = campaigns[fr_name]
    front_parsed = campaigns_parsed.setdefault(fr_name, {})

    for cp_name in front:
        print(f'    {cp_name}')

        campaign = front[cp_name]
        parsed = _parse_in_depth(campaign, cp_name)
        if parsed is None:
            continue
        front_parsed[cp_name] = parsed

African Front
    North African Campaign
    East African Campaign
    West African Campaign
    Iraq Campaign
    Syria–Lebanon Campaign
    Iran Campaign
    Operation Torch
    Tunisia Campaign
Mediterranean Front
    Battle of the Mediterranean Sea
    Siege of Malta
Western Front
    French offensive into Germany
    Operation Weserübung
    Battle of the Netherlands
    Battle of Belgium
    German invasion of Luxembourg
    Battle of France
    Italian invasion of France
    Battle of Britain
    The Blitz
    Operation Cerberus
    Operation Donnerkeil
    St. Nazaire Raid
    Dieppe Raid
    Battle of Berlin (air)
    Operation Overlord
    Operation Valkyrie
    Operation Dragoon
    Allied advance from Paris to the Rhine
    Clearing the Channel Coast
    Operation Market Garden
    Lorraine Campaign
    Battle of Moerbrugge
    Battle of Aachen
    Battle of Overloon
    Battle of Hürtgen Forest
    Operation Clipper
    Battle of Vianden
    Battle of Kesternich
    Battle



    Soviet Winter counter-offensive
    Operation Kremlin
    Case Blue
    Operation Uranus
    Operation Mars
    Operation Little Saturn
    Operation Winter Storm
    Battle for Velikiye Luki (1943)
    Operation Polyarnaya Zvezda
    Ostrogozhsk-Rossosh Operation
    Battle of Voronezh (1943)
    Third Battle of Kharkov
    Operation Citadel
    Battle of Smolensk (1943)
    Fourth Battle of Kharkov
    Battle of the Dnieper
    Concert (operation)
    Dnieper–Carpathian Offensive
    Leningrad–Novgorod Offensive
    Korsun-Cherkassy Pocket
    Battle of Narva (1944)
    Crimean Offensive (1944)
    Shyaulyay Offensive
    Battle of Someri
    Vyborg–Petrozavodsk Offensive
    Operation Bagration
    Battle of Tali-Ihantala
    Battle of the Bay of Viipuri
    Lvov–Sandomierz Offensive
    Battle of Radzymin (1944)
    Warsaw Uprising
    Battle of Studzianki
    Operation Doppelkopf
    Romanian campaign
    Slovak National Uprising
    Battle of the Dukla Pass
    Baltic Offensi



    Thailand invasion of Laos, Cambodia and French Indochina
    Japanese invasion of Thailand
    Japanese-Thai occupation of Malaya
    Bombing of Bangkok
    Japanese occupation of Singapore
    Burma Campaign
    Burma Campaign 1942–43
    Burma Campaign 1944
    Japanese invasion of India
    Burma Campaign 1944–1945
    Allies bombing of South-East Asia
    Battle of the Malacca Strait
    Operation Tiderace


In [8]:
# Prints a single '!' — presumably a marker that the crawl cell above finished.
print('!')

!


In [10]:
# Spot-check one nested battle entry from the parsed tree.
campaigns_parsed['Eastern Front']['Axis invasion of the Soviet Union']['children']['Defense of Brest Fortress']

{'url': 'https://en.wikipedia.org/wiki/Defense_of_Brest_Fortress',
 'time': 'June 1941',
 'level': 3,
 'Date': '22–29 June 1941',
 'Location': 'Brest, Belarusian SSR, Soviet Union',
 'Result': 'German victory',
 'Belligerents': {'allies': 'Germany', 'axis': 'Soviet Union'},
 'Commanders and leaders': {'allies': 'Fritz Schlieper',
  'axis': 'Pyotr Gavrilov \xa0 ( POW )   Ivan Zubachyov \xa0 ( POW )   Andrej Kižavataŭ \xa0 †   Yefim Fomin \xa0 [1] [2]'},
 'Strength': {'allies': 'about 17,000, 2 Panzer Divisions',
  'axis': 'over 9,000, 2  T-26  Cavalry Tanks,  BA-20  Bobik [3]'},
 'Casualties and losses': {'allies': '429 dead, 668 wounded [4]',
  'axis': 'more than 2,000 dead [5] about 6,800 captured [6]'}}

## Quality Control

In [11]:
# Counters filled in by qa(): flat counts for the whole tree, plus two
# per-field tallies ('total' and 'none'), all starting at zero.
STATISTICS = {
    **dict.fromkeys(
        ('battles_checked', 'location_null', 'result_null', 'territorial_null'),
        0,
    ),
    # Each tally gets its own dict, so the two are counted independently.
    'total': dict.fromkeys(
        ('Casualties and losses', 'Commanders and leaders', 'Strength'), 0
    ),
    'none': dict.fromkeys(
        ('Casualties and losses', 'Commanders and leaders', 'Strength'), 0
    ),
}

def qa(battle, name='Unknown', stats=None):
    """Validate one battle dict and tally data-quality counters, recursively.

    For ``battle`` and every entry under its ``children``:
      * require a non-None ``level``;
      * count missing/None ``Location``, ``Result`` and ``Territorial``
        under ``<field>_null``;
      * for ``Casualties and losses``, ``Commanders and leaders`` and
        ``Strength``: count the field under ``none`` when it is missing or
        None, and under ``total`` when it is a dict with a ``'total'`` key.

    Args:
        battle: dict describing a battle/campaign.
        name: label used in the error when a required field is missing.
        stats: counters dict to update, with the same layout as the
            module-level ``STATISTICS``. Defaults to ``STATISTICS``.

    Raises:
        AssertionError: with ``(name, field)`` as its argument when a
            required field is missing or None.
    """
    if stats is None:
        stats = STATISTICS

    required = ('level',)

    for el in required:
        # Explicit raise instead of `assert`, so the check also runs under -O.
        if battle.get(el) is None:
            raise AssertionError((name, el))

    stats['battles_checked'] += 1

    for el in 'Location', 'Result', 'Territorial':
        if battle.get(el) is None:
            stats[f'{el.lower()}_null'] += 1

    for el in 'Casualties and losses', 'Commanders and leaders', 'Strength':
        value = battle.get(el)
        if value is None:
            # A present-but-None value carries no data; treat it as missing
            # rather than failing on the membership test below.
            stats['none'][el] += 1
        elif isinstance(value, dict) and 'total' in value:
            # Only a real 'total' key counts; a substring hit inside a
            # plain string does not.
            stats['total'][el] += 1

    # A separate loop variable keeps the `name` argument intact.
    for child_name, child in (battle.get('children') or {}).items():
        qa(child, child_name, stats)

In [12]:
# Run the QA pass over every campaign of every front. Iterating over
# .values() avoids binding `_`, which IPython uses for the last cell output.
# NOTE: qa() accumulates into STATISTICS, so re-running this cell without
# re-creating STATISTICS first will double the counts.
for front in campaigns_parsed.values():
    for name, campaign in front.items():
        qa(campaign, name)

In [13]:
# Display the counters accumulated by the QA pass.
STATISTICS

{'battles_checked': 624,
 'location_null': 37,
 'result_null': 40,
 'territorial_null': 553,
 'total': {'Casualties and losses': 7,
  'Commanders and leaders': 3,
  'Strength': 2},
 'none': {'Casualties and losses': 83,
  'Commanders and leaders': 44,
  'Strength': 109}}

# Store all data

In [14]:
# Serialise the parsed tree to a JSON string and write it out in one go.
with open('_all_battles_parsed.json', 'w') as out_file:
    out_file.write(json.dumps(campaigns_parsed))