In [3]:
%matplotlib inline
import pylab as plt
plt.style.use('fivethirtyeight')

import pandas as pd
import json
from copy import copy
import re

In [4]:
data = pd.read_csv('./data/African Front.csv')

In [5]:
data.head(3)

Unnamed: 0,Belligerents.allies,Belligerents.axis,Casualties and losses.allies,Casualties and losses.axis,Casualties and losses.total,Commanders and leaders.allies,Commanders and leaders.axis,Date,Location,Result,Strength.allies,Strength.axis,Territorial,level,name,time,url,front
0,United Kingdom Free French,Italy\n\n Italian Libya,40 killed,120 killed410 wounded,,Archibald Wavell William Gott John Campbell,Rodolfo Graziani Mario Berti Italo Gariboldi P...,9–16 September 1940,Egypt26°N 30°E﻿ / ﻿26°N 30°E﻿ / 26; 30Coordina...,Italian victory,1 reinforced brigade205 aircraftnaval support,about 4 divisions300 aircraft,,100,Italian invasion of Egypt,September 1940,https://en.wikipedia.org/wiki/Italian_invasion...,African Front
1,United Kingdom\n\n British India,Italy,56 killed27 tanks disabled/broken down,"819 killed1,338 wounded2,000 prisoners28 tanks",,Richard O'Connor Reginald Savory,Pietro Maletti †,9 December 1940,‘Alam Nibeiwa (Nibeiwa Hill)31°23′00″N 25°53′0...,British victory,"5,000 soldiers47 tanks","4,100 soldiers23 tanks",,100,Battle of Nibeiwa,December 1940,https://en.wikipedia.org/wiki/Battle_of_Nibeiwa,African Front
2,United Kingdom\n\n British India\n Australia F...,Italy,624 soldiers,"2,194 killed2,286 wounded38,300 prisoners237 g...",,Archibald Wavell Henry Maitland Wilson Richard...,Rodolfo Graziani Giuseppe Tellera Annibale Ber...,10–11 December 1940,"Sidi Barrani, Egypt31°36′39″N 25°55′32″E﻿ / ﻿3...",British victory,"36,000 soldiers120 guns275 tanks142 aircraft","60,000 soldiers250 guns120 tanks331 aircraft",,100,Battle of Sidi Barrani,December 1940,https://en.wikipedia.org/wiki/Battle_of_Sidi_B...,African Front


In [6]:
data.dtypes

Belligerents.allies              object
Belligerents.axis                object
Casualties and losses.allies     object
Casualties and losses.axis       object
Casualties and losses.total      object
Commanders and leaders.allies    object
Commanders and leaders.axis      object
Date                             object
Location                         object
Result                           object
Strength.allies                  object
Strength.axis                    object
Territorial                      object
level                             int64
name                             object
time                             object
url                              object
front                            object
dtype: object

# Cleaning

* level
* Date[s] -> datetime (or period?)
* Location

* Belligerents -> set
* Casualties and losses - type/number
* Commanders and leaders - > set 
* etc

## I. Dates

In [7]:
d = ('January', 'February', 'March', 'April', 'May', 
     'June', "July",' August', 'September', 'October', 'November', 'December')
len(d)

12

In [8]:
month_pattern = re.compile(f'({"|".join(d)})')

In [9]:
year_pattern = re.compile('19\d{2}')

In [10]:
from itertools import zip_longest

def _parse_month_year(string):
    if pd.isnull(string):
        return (None, None)
    months = month_pattern.findall(string)
    years = year_pattern.findall(string)
    if len(years)==0 or years[0] is None:
        raise ValueError(string)
    
    
    if len(months) == 0:
        return (None, None)
    
    result = [f'{v[0]} {v[1]}' for v in zip_longest(months, years, fillvalue=years[-1])]
    if len(result) == 1:
        return result*2
    else:
        return result
    
    
    
    

In [11]:
list(_parse_month_year('4–12 September 1945'))

['September 1945', 'September 1945']

In [12]:
se = data['Date'].apply(_parse_month_year)

In [13]:
data['start'] = se.str.get(0)
data['end'] = se.str.get(1)

In [14]:
data.loc[9, ['Date', 'start', 'end']]

Date     6 February – 25 May 1941
start               February 1941
end                      May 1941
Name: 9, dtype: object

In [15]:
for col in 'start', 'end':
    data[col] = pd.to_datetime(data[col])

## Belligerents

In [36]:
data['Belligerents.allies'] = data['Belligerents.allies'].str.replace('\n', ' ').str.replace('\xa0', ' ')

In [50]:
multi_word_t = ['Free France', 'Free French', 'United Kingdom', 'Brittish India', 'New Zealand', 
'Anglo-Egyptian Sudan', 'Soviet Union', 'ItalyRegio Corpo Truppe Coloniali', 'Assyrian Levies',
'French Congo', 'British Empire', 'United States', 'British India', 'British Commonwealth', 'Southern Rhodesia',
'South Africa']

In [51]:
for name in multi_word_t:
    n2 = name.replace(' ', '_')
    print(n2)
    data['Belligerents.allies'] = data['Belligerents.allies'].str.replace(name, n2)

Free_France
Free_French
United_Kingdom
Brittish_India
New_Zealand
Anglo-Egyptian_Sudan
Soviet_Union
ItalyRegio_Corpo_Truppe_Coloniali
Assyrian_Levies
French_Congo
British_Empire
United_States
British_India
British_Commonwealth
Southern_Rhodesia
South_Africa


In [95]:
data['Belligerents.allies_processed'] = data['Belligerents.allies'].fillna('').apply(lambda x: {el for el in x.split(' ') if el != ''})

## Level

In [53]:
levels = {
    1: 'Large Campaign',
    3: 'Campaign',
    5: 'Operation',
    7: 'Battle',
    9: 'Battle'
    
}

In [57]:
taxonomy = {
    ('Battle',): 'Battle',
    ('Invasion', 'Operation'): 'Operation',
    ('Campaign',): 'Campaign'
}

In [58]:
data.loc[data.level == 3, 'name'].tail(10)

26                   Western Desert Campaign
30    Italian conquest of British Somaliland
Name: name, dtype: object

## Location

In [74]:
data.Location.head(5).tolist()

['Egypt26°N 30°E\ufeff / \ufeff26°N 30°E\ufeff / 26; 30Coordinates: 26°N 30°E\ufeff / \ufeff26°N 30°E\ufeff / 26; 30',
 '‘Alam Nibeiwa (Nibeiwa Hill)31°23′00″N 25°53′00″E\ufeff / \ufeff31.38333°N 25.88333°E\ufeff / 31.38333; 25.88333Coordinates: 31°23′00″N 25°53′00″E\ufeff / \ufeff31.38333°N 25.88333°E\ufeff / 31.38333; 25.88333',
 'Sidi Barrani, Egypt31°36′39″N 25°55′32″E\ufeff / \ufeff31.61083°N 25.92556°E\ufeff / 31.61083; 25.92556Coordinates: 31°36′39″N 25°55′32″E\ufeff / \ufeff31.61083°N 25.92556°E\ufeff / 31.61083; 25.92556',
 'Giarabub, LibyaCoordinates: 29°44′33″N 24°31′01″E\ufeff / \ufeff29.74250°N 24.51694°E\ufeff / 29.74250; 24.51694',
 'Bardia, Libya31°46′N 25°06′E\ufeff / \ufeff31.767°N 25.100°E\ufeff / 31.767; 25.100Coordinates: 31°46′N 25°06′E\ufeff / \ufeff31.767°N 25.100°E\ufeff / 31.767; 25.100']

In [88]:
pattern = r'(\d+?°\d*′?[\d|\.]*″?[N|S]) (\d+°\d*′?[\d|\.]*″?[W|E])'

In [93]:
data.Location.loc[51]

'El Guettar, Tunisia'

In [96]:
data[['Lattitude', 'Longitude']] = data.Location.str.extract(pattern)

# CASUALTIES

In [97]:
cols = ['Casualties and losses.allies', 'Casualties and losses.axis']

In [None]:
819 killed1,338 wounded2,000 prisoners28 tanks

In [134]:
c_patterns = { 'killed': r'([\d|,]+) ?[killed|casualties]',
               'prisioners': r'([\d|,]+) ?prisoners',
               'tanks': r'([\d+|,]+) ?tanks',
               'guns': r'([\d+|,]+) ?guns',
               'aircrafts': r'([\d+|,]+) ?aircraft',
             }

In [135]:
e = pd.DataFrame(index=data.index)

In [139]:
data.loc[27, 'Casualties and losses.allies'].head(10)

'British CommonwealthEstimated 220,000 dead, wounded, missing andcaptured,[1] including35,478 confirmed dead.[2] Free French 16,000 killed, woundedand missing.[3]\n United States 2,715 killed;8,978 wounded;6,528 missing.[4][5]Principal material losses1,400 aircraft destroyed;2,000 tanks destroyed.'

In [137]:
for name, c  in c_patterns.items():
    for col in cols:
        side = col.split('.')[-1]
        e[ side + '.' + name] = data[col].str.extract(c)

In [138]:
e

Unnamed: 0,allies.killed,axis.killed,allies.prisioners,axis.prisioners,allies.tanks,axis.tanks,allies.guns,axis.guns,allies.aircrafts,axis.aircrafts
0,40.0,120.0,,,,,,,,
1,56.0,819.0,,2000.0,27,28,,,,
2,624.0,2194.0,,38300.0,,73,,237,,
3,17.0,250.0,,1300.0,,,,,,
4,130.0,1703.0,,,,,,,,
5,7.0,9.0,,,7,9,,,,
6,4.0,3.0,,,,,,,,3
7,,,,,,,,,,
8,500.0,133298.0,,,,420,,845,26.0,564
9,107.0,,,,107,,,,,
