In [77]:
%matplotlib inline
import pylab as plt
plt.style.use('fivethirtyeight')

import pandas as pd
import json

In [78]:
data = pd.read_csv('./data/Eastern Front.csv')

In [79]:
data.shape

(186, 23)

In [80]:
data.dtypes

Belligerents.allies                   object
Belligerents.axis                     object
Belligerents.third party              object
Casualties and losses.allies          object
Casualties and losses.axis            object
Casualties and losses.third party     object
Casualties and losses.total           object
Commanders and leaders.allies         object
Commanders and leaders.axis           object
Commanders and leaders.third party    object
Date                                  object
Location                              object
Result                                object
Strength.allies                       object
Strength.axis                         object
Strength.third party                  object
Strength.total                        object
Territorial                           object
level                                  int64
name                                  object
parent                                object
time                                  object
url       

# Battles only

In [81]:
battles = data[data.level == 100]
battles.shape

(147, 23)

In [82]:
columns = ['Location', 'name', 'Date', 'Result', 'Belligerents.allies',
           'Belligerents.axis', 'Casualties and losses.allies', 'Casualties and losses.axis']

In [83]:
battles['Location'].iloc[15]

'Wilno, Second Polish Republic (now Vilnius, Lithuania)54°40′N 25°19′E\ufeff / \ufeff54.667°N 25.317°E\ufeff / 54.667; 25.317Coordinates: 54°40′N 25°19′E\ufeff / \ufeff54.667°N 25.317°E\ufeff / 54.667; 25.317'

In [84]:
battles[columns].head()

Unnamed: 0,Location,name,Date,Result,Belligerents.allies,Belligerents.axis,Casualties and losses.allies,Casualties and losses.axis
0,"Westerplatte, harbor of Free City of Danzig54°...",Battle of Westerplatte,1–7 September 1939,German victory,Poland,Germany Danzig,15 deadat least 40 woundedRemainder captured,50 deadat least 150 wounded
1,"Mokra, Kielce Voivodeship, Poland",Battle of Mokra,"September 1, 1939",Polish victory,Germany,Poland,"800 killed, missing, captured, or wounded50 ta...","500 killed, missing or wounded300 horsessevera..."
2,"Near Mława, Warsaw Voivodeship, Poland",Battle of Mlawa,1–3 September 1939,German victory,Germany,Poland,"1,800 killed3,000 wounded1,000 missing72 tanks...","1,200 killed1,500 wounded"
3,"Near Tuchola Forest, Pomeranian Voivodeship, P...",Battle of Tuchola Forest,1–5 September 1939,German victory,Germany,Poland,506 killed \n\n743 wounded,1600 killed\n750 wounded\n\nUnknown number cap...
4,"Jordanów, Kraków Voivodeship, Poland",Battle of Jordanów,1–3 September 1939,Pyrrhic German victory,Poland,Germany,3+ tanks,70+ tanks and AFVs


# Location

In [9]:
pattern = r'/ ([\d|\.]+); ([\d|\.]+)'

In [10]:
# battles.iloc[9]['Location']

In [11]:
battles[['Lattitude', 'Longitude']] = battles.Location.str.extract(pattern)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


In [12]:
for col in  'Lattitude', 'Longitude':
    battles[col] =  battles[col].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


# Geocode

In [66]:
f"{(battles['Lattitude'].isnull().sum() / len(battles)):.1%}"

'78.2%'

So, we actually don't have coordinates for 78% of our battles, which is too hight. As an alternative, let's use the geocoding function we wrote in chapter 6! Of course, some addesses are obsolete, but most will, probably, get us to the coordinates

In [14]:
from geocode import nominatim_geocode # copied the file from folder "Chapter06"

In [15]:
from tqdm import tqdm, tqdm_notebook
tqdm().pandas()

0it [00:00, ?it/s]


In [16]:
geo_mask = battles['Lattitude'].isnull()

In [47]:
battles.loc[geo_mask, 'Location'].sample(15, random_state=2019)

46                                                   NaN
49     Southern shore of Lake Ladoga, near present-da...
54                  Kharkov, Ukrainian SSR, Soviet Union
99     175 km sector of the front between Uman and Ki...
27     Brest, Belarusian SSR, Soviet Union (nominally...
133                                near Radzymin, Poland
160                       Poznań and nearby area, Poland
25                                      Petsamo, Finland
132                       Western Ukraine/Eastern Poland
148    West Estonian archipelago (Moonsund archipelag...
126                       Eastern Poland/Western Belarus
56         Crimean Peninsula, Russian SFSR, Soviet Union
154                         Budapest, Kingdom of Hungary
7                           Piotrków Trybunalski, Poland
105       Leningrad region, Soviet Union; Narva, Estonia
Name: Location, dtype: object

In [49]:
location = battles['Location'].str.lower().str.replace('near ', '')

In [62]:
replacements = {
    'Ukrainian SSR, Soviet Union': 'Ukraine',
    'Russian SFSR, Soviet Union': 'Russia',
    'Russian SFSR': 'Russia',
    'Belorussian SSR': 'Belorus',
    'Soviet Union': '',
    'USSR': '',
    ', Poland (now Ukraine)': 'Ukraine',
    'east prussia (now kaliningrad oblast)': 'Kaliningrad Oblast, Russia',
    ', czechoslovakia': ', czech republic',
    'königsberg, germany (now: kaliningrad, russia)': 'Kaliningrad Oblast, Russia',
    'lwów, lwów voivodeship, poland': 'Lvov, Ukraine',
    'leningrad region, ; narva, estonia': 'Narva, Estonia',
    'Kingdom of Hungary': 'Hungary',
    'odessa region, ukraine': 'Odessa, Ukraine'
}

In [51]:
for k, v in replacements.items():
    location = location.str.replace(k.lower(), v.lower(), regex=False)

In [52]:
def vectorized_geocode(x):
    result = nominatim_geocode(x)
    if len(result) == 0:
        return dict()
    return {k:result[0][k] for k in ('lat', 'lon', 'importance', 'display_name')}

In [53]:
response = location[geo_mask].progress_apply(vectorized_geocode)

100%|██████████| 115/115 [02:42<00:00,  1.44s/it]


In [54]:
response.iloc[0]

{'lat': '50.85983',
 'lon': '20.576839',
 'importance': 0.5,
 'display_name': 'Mokra, Biesaki, Kielce, Świętokrzyskie Voivodeship, 25-707, Poland'}

In [55]:
battles.loc[geo_mask, 'Location'].iloc[0]

'Mokra, Kielce Voivodeship, Poland'

In [56]:
geo_df = pd.DataFrame(response.tolist(), index = response.index)
geo_df.rename(columns={'lat': 'Lattitude', 'lon': ' Longitude'}, inplace=True)

In [57]:
rmask = geo_df['importance'].isnull()

In [63]:
rmask.sum() / len(battles)


0.2585034013605442

In [61]:
location[geo_mask].loc[rmask]

2                      mława, warsaw voivodeship, poland
4                   jordanów, kraków voivodeship, poland
6                                   wizna, łomża, poland
8          hel peninsula, pomeranian voivodeship, poland
18                                          szackukraine
19                             wytyczno, włodawa, poland
27     brest, belarusian ssr,  (nominally part of pol...
28              byelorussian soviet socialist republic, 
30                                    baltic countries, 
33                   finland, karelia, and murmansk area
35               petsamo, litsa river, rybachy peninsula
36              salla, kestenga, alakurtti, verman river
38                                 uman, western ukraine
40                                odessa region, ukraine
49     southern shore of lake ladoga, present-day sin...
58                                rostov-on-don region, 
64                               demyansk, russian ssr, 
65        izium/barvenkovo area

In [88]:
location.loc[178]

'sambia, east prussia (now kaliningrad oblast)'

## Time

In [161]:
# pd.to_datetime('September 1939')

Timestamp('1939-09-01 00:00:00')

In [170]:
battles.loc[94, 'Date']

'3 November 1943 – 13 November 1943(Offensive operation) 13 November 1943 – 22 December 1943(Defensive operation)'

In [173]:
d = ('January', 'February', 'March', 'April', 'May', 
     'June', "July",' August', 'September', 'October', 'November', 'December')

month_pattern = r'(' + "|".join(d) + ')'

In [174]:
year_pattern = r'(19\d\d)'

In [196]:
year_extracted = battles['Date'].str.extractall(year_pattern).unstack()

In [198]:
# len(d.iloc[:, -1].notnull())
year_extracted[year_extracted.iloc[:, -1].notnull()]

Unnamed: 0_level_0,0,0,0,0
match,0,1,2,3
94,1943,1943,1943,1943


In [195]:
# d[d.iloc[:, -1].notnull()]

In [199]:
battles.loc[94, 'Date']

'3 November 1943 – 13 November 1943(Offensive operation) 13 November 1943 – 22 December 1943(Defensive operation)'

In [200]:
year_extracted = year_extracted.iloc[:, :2]

In [212]:
year_extracted

   match
0  0        1943
   1        1943
Name: 94, dtype: object

In [209]:
year_extracted.iloc[:, 1] = year_extracted.iloc[:, 1].fillna(year_extracted.iloc[:, 0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value


In [214]:
month_extracted = battles['Date'].str.extractall(month_pattern).unstack()

In [219]:
for i in range(2, month_extracted.shape[1]+1):
    month_extracted.iloc[:, -1].fillna(month_extracted.iloc[:, -i], inplace=True)

In [221]:
month_extracted = month_extracted.iloc[:, [0, -1]]

In [235]:
a = b = 2

In [251]:
year_extracted.columns = month_extracted.columns = ['start', 'end']
I = battles.index
cols = 'start', 'end'

for col in cols:
    battles[col] = pd.to_datetime(month_extracted.loc[I, col] + ' ' + year_extracted.loc[I, col])

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


## Casualties

In [88]:
battles['Casualties and losses.allies'].iloc[0]

'15 deadat least 40 woundedRemainder captured'

In [89]:
# all_matched = battles['Casualties and losses.allies'].str.extractall(r'([\d|,]+)\s*killed').unstack()
# for i in range(all_matched.shape[1]):
#     all_matched.iloc[:, i] = all_matched.iloc[:, i].fillna('').str.replace(',', '').replace('', pd.np.nan).astype(float)

In [90]:
# all_matched.sum(1)

In [112]:
digit_pattern = '([\d|,]+)\s*[{words}]'

keywords = { 'killed': ['dead', 'killed', 'men'], 
             'wounded': ['wounded', 'sick'], 
             'captured': ['captured', 'prisoners'],
             'tanks': ['tank'],
             'airplane': ['airplane'],
             'guns': ['artillery', 'gun'],
             'ships': ['warships', 'boats'],
             'submarines': ['submarines']
}


In [113]:
R = column.str.extractall(pattern).unstack()

In [114]:
def _shy_convert_numeric(v):
    if pd.isnull(v) or v == ',':
        return 0
    
    return int(v.replace(',', ''))
    

In [115]:
# R.applymap(_shy_convert_numeric).sum(1)

In [116]:
results = {
'allies' : pd.DataFrame(index=battles.index, columns=keywords.keys()),  # empty dataframes with the same index
'axis' : pd.DataFrame(index=battles.index, columns=keywords.keys())
}

for name, edf in results.items():
    column = battles[f'Casualties and losses.{name}']
    for tp, keys in keywords.items():
        pattern = digit_pattern.format(words="|".join(keys))
        print(pattern)
        extracted = column.str.extractall(pattern).unstack()
        edf[tp] = extracted.applymap(_shy_convert_numeric).sum(1)
    results[name] = edf.fillna(0).astype(int)

([\d|,]+)\s*[dead|killed|men]
([\d|,]+)\s*[wounded|sick]
([\d|,]+)\s*[captured|prisoners]
([\d|,]+)\s*[tank]
([\d|,]+)\s*[airplane]
([\d|,]+)\s*[artillery|gun]
([\d|,]+)\s*[warships|boats]
([\d|,]+)\s*[submarines]
([\d|,]+)\s*[dead|killed|men]
([\d|,]+)\s*[wounded|sick]
([\d|,]+)\s*[captured|prisoners]
([\d|,]+)\s*[tank]
([\d|,]+)\s*[airplane]
([\d|,]+)\s*[artillery|gun]
([\d|,]+)\s*[warships|boats]
([\d|,]+)\s*[submarines]


In [117]:
results['axis'].head(5)

Unnamed: 0,killed,wounded,captured,tanks,airplane,guns,ships,submarines
0,50,200,50,0,0,0,150,0
1,500,500,1,501,0,1,301,0
2,1200,2700,0,1200,0,0,1500,0
3,1600,2350,0,1600,0,0,750,0
4,0,0,0,0,0,0,0,0


In [120]:
battles.loc[[1], 'Casualties and losses.axis'].iloc[0]

'500 killed, missing or wounded300 horsesseveral guns1 tankette'

In [123]:
battles.loc[[1], 'Casualties and losses.axis'].str.extractall('([\d|\,]+)\s[tank]')

Unnamed: 0_level_0,Unnamed: 1_level_0,0
Unnamed: 0_level_1,match,Unnamed: 2_level_1
1,0,500
1,1,1


In [344]:
results['old_metrics'] = battles
new_dataset = pd.concat(results, axis=1)

In [346]:
new_dataset.to_csv('./data/EF_battles.csv')