## Data aggregation

In [1]:
import requests
from bs4 import BeautifulSoup
from pandas import DataFrame
import pandas as pd
import arrow
from collections import Counter
from datetime import datetime
from bokeh.plotting import figure, output_notebook, show

In [2]:
deactivation_soup = BeautifulSoup(requests.get('https://wikimediafoundation.org/w/index.php?title=Special:Log&offset=&limit=500&type=block&user=&page=&tagfilter=&hide_thanks_log=1&hide_tag_log=1').text, 'html.parser')

In [3]:
# for logline in [li.text for li in deactivation_soup.find_all('li') if 'class' in li.attrs  and li.get('class')[0] == 'mw-logline-block']:
#     print(logline)

In [4]:
activation_soup = BeautifulSoup(requests.get('https://wikimediafoundation.org/w/index.php?title=Special:Log&offset=&limit=500&type=newusers&user=&page=&tagfilter=').text, 'html.parser')

In [5]:
# for logline in [li.text for li in activation_soup.find_all('li') if 'class' in li.attrs  and li.get('class')[0] == 'mw-logline-newusers']:
#     print(logline)

In [6]:
# Create activation staff DataFrame from activation log.

dates = []
names = []
for logline in [li.text for li in activation_soup.find_all('li') if 'class' in li.attrs  and li.get('class')[0] == 'mw-logline-newusers' and 'was created' in li.text]:
    date = ' '.join(logline.split(' ')[2:5])
#     print(date)
    date = arrow.get(date, 'D MMMM YYYY')
    dates.append(date)
    name = logline[logline.find("User account") + 13:logline.find("(talk") - 1]
#     print(name)
    names.append(name)

In [7]:
staff_activations = DataFrame(data={'Joined': dates}, index=names)
# staff_activations = DataFrame(data={'Name': names, 'Joined': dates})

In [8]:
# staff_activations

In [9]:
# Create deactivation staff DataFrame from activation log.

dates = []
names = []

for logline in [li.text for li in deactivation_soup.find_all('li') if 'class' in li.attrs  and li.get('class')[0] == 'mw-logline-block' and 'deactivated' in li.text]:
#     print(logline[logline.find(",") + 2:], "SPACE", logline[logline.find(",") + 2:logline.rfind("(talk")])
    date = ' '.join(logline.split(' ')[2:5])
#     print(date)
    date = arrow.get(date, 'D MMMM YYYY')
    dates.append(date)
    name = logline[logline.find('deactivated') + 12:]
    name = name[:name.find("(talk") - 1]
#     print(name)
    names.append(name)

In [10]:
staff_deactivations = DataFrame(index=names, data={'Left': dates})
# staff_deactivations = DataFrame(data={'Name': names, 'Left': dates})

In [11]:
# staff_deactivations

In [12]:
# staff = pd.concat([staff_activations, staff_deactivations])
# staff = staff_activations.append(staff_deactivations)
# staff = pd.merge(staff_activations, staff_deactivations, how='inner')
staff = staff_activations.join(staff_deactivations)

In [13]:
staff.sort_values(by = "Left", ascending=False)

Unnamed: 0,Joined,Left
AKoval (WMF),2013-07-17T00:00:00+00:00,2016-02-16T00:00:00+00:00
ACella (WMF),2015-07-06T00:00:00+00:00,2016-02-10T00:00:00+00:00
HZhang (WMF),2015-06-08T00:00:00+00:00,2016-02-10T00:00:00+00:00
AKoval (WMF),2013-07-17T00:00:00+00:00,2016-02-10T00:00:00+00:00
JChojnacki (WMF),2015-12-02T00:00:00+00:00,2016-02-10T00:00:00+00:00
JLiong (WMF),2015-05-27T00:00:00+00:00,2016-02-10T00:00:00+00:00
DHamilton (WMF),2015-06-10T00:00:00+00:00,2016-02-10T00:00:00+00:00
JUnikowski (WMF),2015-08-24T00:00:00+00:00,2016-02-10T00:00:00+00:00
LMartinez (WMF),2013-07-10T00:00:00+00:00,2016-02-10T00:00:00+00:00
BSher (WMF),2015-05-15T00:00:00+00:00,2016-02-10T00:00:00+00:00


Data is still dirtied by the presence of users who are not WMF staffers in the staff creation listings. There is no easy way to automatically check whether or not an account without a (WMF) at the end is a staff account or not (which is why that policy was eventually instituted in the first place).

Obviously accounts that later on got closed are those of WMF staffers (volunteer accounts are never closed). The remaining trouble spots still contain a number of extremely senior WMF staffers, but unfortunately they cannot really be accounted for. Overall experiental data is therefore probably a <5% undercount.

In [14]:
# [name for name in staff.index if 'WMF' not in name and str(staff.ix[name, 'Left']) == 'nan']

In [15]:
bad_names = [name for name in staff.index if 'WMF' not in name and str(staff.ix[name, 'Left']) == 'nan']
filtered_staff = staff.drop(bad_names)

In [16]:
filtered_staff[filtered_staff['Joined'] < arrow.Arrow(2011, 9, 1)]

Unnamed: 0,Joined,Left
Akapoor,2011-08-26T00:00:00+00:00,2013-08-15T00:00:00+00:00
Akapoor,2011-08-26T00:00:00+00:00,2013-01-24T00:00:00+00:00
GByrd (WMF),2011-08-02T00:00:00+00:00,2015-10-05T00:00:00+00:00
Gmeijssen,2011-08-18T00:00:00+00:00,2012-07-11T00:00:00+00:00
Ntandon,2011-08-23T00:00:00+00:00,2013-01-08T00:00:00+00:00
Pgehres (WMF),2011-08-08T00:00:00+00:00,2013-06-03T00:00:00+00:00
Smazeland-WMF,2011-08-25T00:00:00+00:00,
Sthottingal-WMF,2011-08-26T00:00:00+00:00,
Tfleming,2011-08-18T00:00:00+00:00,2012-07-24T00:00:00+00:00


## Data munging

With the data in place let's start processing it into something informative.

### Overall counts

In [17]:
joined_count_by_month = Counter()
for date in filtered_staff['Joined']:
    joined_count_by_month.update([date.floor('month')])

In [18]:
joined_count_by_month.most_common()

[(<Arrow [2015-05-01T00:00:00+00:00]>, 24),
 (<Arrow [2014-09-01T00:00:00+00:00]>, 16),
 (<Arrow [2013-07-01T00:00:00+00:00]>, 14),
 (<Arrow [2015-10-01T00:00:00+00:00]>, 12),
 (<Arrow [2016-01-01T00:00:00+00:00]>, 12),
 (<Arrow [2013-01-01T00:00:00+00:00]>, 11),
 (<Arrow [2015-06-01T00:00:00+00:00]>, 11),
 (<Arrow [2013-05-01T00:00:00+00:00]>, 11),
 (<Arrow [2015-04-01T00:00:00+00:00]>, 10),
 (<Arrow [2015-08-01T00:00:00+00:00]>, 10),
 (<Arrow [2012-05-01T00:00:00+00:00]>, 10),
 (<Arrow [2014-05-01T00:00:00+00:00]>, 10),
 (<Arrow [2011-11-01T00:00:00+00:00]>, 9),
 (<Arrow [2014-03-01T00:00:00+00:00]>, 9),
 (<Arrow [2014-01-01T00:00:00+00:00]>, 9),
 (<Arrow [2011-08-01T00:00:00+00:00]>, 9),
 (<Arrow [2012-10-01T00:00:00+00:00]>, 9),
 (<Arrow [2012-11-01T00:00:00+00:00]>, 8),
 (<Arrow [2014-10-01T00:00:00+00:00]>, 8),
 (<Arrow [2015-01-01T00:00:00+00:00]>, 8),
 (<Arrow [2013-06-01T00:00:00+00:00]>, 8),
 (<Arrow [2013-08-01T00:00:00+00:00]>, 7),
 (<Arrow [2015-07-01T00:00:00+00:00]>, 7),

In [19]:
left_count_by_month = Counter()
for date in filtered_staff['Left']:
    if str(date) != 'nan':
        left_count_by_month.update([date.floor('month')])

In [20]:
left_count_by_month.most_common()

[(<Arrow [2015-08-01T00:00:00+00:00]>, 37),
 (<Arrow [2015-09-01T00:00:00+00:00]>, 16),
 (<Arrow [2013-01-01T00:00:00+00:00]>, 11),
 (<Arrow [2012-07-01T00:00:00+00:00]>, 11),
 (<Arrow [2016-02-01T00:00:00+00:00]>, 10),
 (<Arrow [2013-04-01T00:00:00+00:00]>, 7),
 (<Arrow [2014-08-01T00:00:00+00:00]>, 6),
 (<Arrow [2013-03-01T00:00:00+00:00]>, 6),
 (<Arrow [2014-01-01T00:00:00+00:00]>, 6),
 (<Arrow [2014-03-01T00:00:00+00:00]>, 5),
 (<Arrow [2015-10-01T00:00:00+00:00]>, 5),
 (<Arrow [2014-10-01T00:00:00+00:00]>, 5),
 (<Arrow [2014-05-01T00:00:00+00:00]>, 5),
 (<Arrow [2015-03-01T00:00:00+00:00]>, 5),
 (<Arrow [2012-08-01T00:00:00+00:00]>, 4),
 (<Arrow [2013-07-01T00:00:00+00:00]>, 4),
 (<Arrow [2015-12-01T00:00:00+00:00]>, 4),
 (<Arrow [2013-06-01T00:00:00+00:00]>, 4),
 (<Arrow [2013-08-01T00:00:00+00:00]>, 3),
 (<Arrow [2012-12-01T00:00:00+00:00]>, 3),
 (<Arrow [2015-02-01T00:00:00+00:00]>, 3),
 (<Arrow [2014-09-01T00:00:00+00:00]>, 3),
 (<Arrow [2014-04-01T00:00:00+00:00]>, 3),
 (<Arr

In [21]:
time_index = [date.datetime for date in arrow.Arrow.range('month', arrow.get(2011, 8, 1), arrow.Arrow.now().floor('month'))]
# time_index

In [22]:
join_df = DataFrame(index=time_index, data={'Joined': joined_count_by_month})
left_df = DataFrame(index=time_index, data={'Left': left_count_by_month})
left_df = left_df.fillna(0)

In [35]:
output_notebook(hide_banner=True)

p = figure(plot_width=960,
           plot_height=500,
           title='Turnover at the Wikimedia Foundation',
           title_text_font_size="18px",
           x_axis_type="datetime"
)

p.line(
    join_df.index,
    join_df['Joined'],
    line_width=2,
    line_color='green'
)

p.line(
    left_df.index,
    left_df['Left'],
    line_width=2,
    line_color='red'
)

p.line(
    [arrow.Arrow(2014,6,1).datetime, arrow.Arrow(2014,6,1).datetime],
    [0, 20],
    line_width=2,
    line_color='black'
)

show(p)

<bokeh.io._CommsHandle at 0x892cef0>

The large spike the WMF HR people catching up with a number of accounts which were not closed properly at the time of their leaving the Foundation. These all occured during Tretikov's time, however. To percieve the before and after better we can average the before and after, as this graph is rather uninformative taken on its own.

In [40]:
sue, tretikov = left_df[left_df.index <= arrow.Arrow(2014,6,1).datetime], left_df[left_df.index > arrow.Arrow(2014,6,1).datetime]

In [48]:
sue['Left'].mean(), tretikov['Left'].mean()

(2.657142857142857, 5.0)

In [53]:
293/(12*5)

4.883333333333334

### Executive leadership