# Count CAP Beneficiaries (2014-2017)

Can we count the distinct beneficiaries in the 2014-2017 CAP data?

In [None]:
import pandas as pd

In [None]:
# Read one pickled frame per year (2014 to 2017 inclusive) and stack them.
# NOTE: read_pickle can execute arbitrary code, so only load trusted files.
# NOTE(review): each file keeps its own index, so labels may repeat across
# years - confirm nothing downstream relies on unique index labels.
cap_paths = ['output/cap_{}.pkl.gz'.format(year) for year in range(2014, 2018)]
cap = pd.concat([pd.read_pickle(path) for path in cap_paths])
cap.shape

In [None]:
# Summary statistics for the combined frame.
cap.describe()

In [None]:
# Derive the postcode area: the leading one or two capital letters of the
# district (e.g. 'AB10' -> 'AB'). Values that do not match are left unchanged.
# regex=True must be explicit: since pandas 2.0 str.replace treats the pattern
# as a literal string by default, which would leave every district untouched.
cap['postcode_area'] = cap['postcode_district'].str.replace(
    r'^([A-Z]{1,2}).+$', r'\1', regex=True
)
cap.head()

## Matching Beneficiaries

We seem to have a beneficiary code or beneficiary name for every row each year. Can we match them up between years?

In [None]:
# How many raw names differ from their stripped form, and from their
# upper-cased form? Missing names count in both, since NaN never compares equal.
raw_names = cap['BeneficiaryName_F201']
[
    raw_names.ne(raw_names.str.strip()).sum(),
    raw_names.ne(raw_names.str.upper()).sum(),
]

In [None]:
# Normalised matching key: the name with surrounding whitespace removed,
# then upper-cased.
normalised_names = cap['BeneficiaryName_F201'].str.strip().str.upper()
cap['beneficiary'] = normalised_names
# Count rows whose name is the '*******' placeholder.
cap['beneficiary'].eq('*******').sum()

In [None]:
# Rows whose name is the '*******' placeholder get a key built from their
# BeneficiaryCode instead, prefixed with 'BC_'.
masked_name = cap['beneficiary'] == '*******'
cap.loc[masked_name, 'beneficiary'] = (
    'BC_' + cap.loc[masked_name, 'BeneficiaryCode'].astype('str')
)
# Re-count the placeholder rows; none should remain after the replacement.
(cap['beneficiary'] == '*******').sum()

In [None]:
# Show the rows that still have no beneficiary key.
cap.loc[cap['beneficiary'].isna()]

In [None]:
# Keep only the rows that have a beneficiary key.
cap = cap[cap['beneficiary'].notna()]

In [None]:
# Total the numeric columns per beneficiary, postcode district/area and year.
# numeric_only=True is explicit because pandas 2.0 changed the default to
# False, which makes sum() try to add up text columns such as
# BeneficiaryName_F201 (string concatenation or a TypeError).
# NOTE(review): groupby drops rows with a missing key by default, so rows
# without a postcode_district are not represented here - confirm intended.
cap_beneficiaries = (
    cap
    .groupby(['beneficiary', 'postcode_district', 'postcode_area', 'Year'])
    .sum(numeric_only=True)
)
cap_beneficiaries

In [None]:
# Spot check: rows whose beneficiary key mentions CLUNY ESTATE.
cap.loc[cap['beneficiary'].str.match(r'.*CLUNY ESTATE.*')]

In [None]:
# Spot check: rows whose beneficiary key mentions ZIGGUS.
cap.loc[cap['beneficiary'].str.match(r'.*ZIGGUS.*')]

In [None]:
# Spot check: rows whose beneficiary key mentions ZURICH ASSURANCE.
cap.loc[cap['beneficiary'].str.match(r'.*ZURICH ASSURANCE.*')]

In [None]:
# Spot check: rows whose beneficiary key mentions ZYOX.
cap.loc[cap['beneficiary'].str.match(r'.*ZYOX.*')]

In [None]:
# Turn the group keys back into ordinary columns.
# NOTE: run this cell once per kernel session; a second run would reset the
# index again and add a spurious column.
cap_beneficiaries = cap_beneficiaries.reset_index()

In [None]:
# Rows whose key was built from a beneficiary code rather than a name.
cap_beneficiaries.loc[cap_beneficiaries['beneficiary'].str.startswith('BC_')]

In [None]:
# Number of distinct beneficiary keys across all years.
len(cap_beneficiaries['beneficiary'].unique())

In [None]:
# Distinct beneficiary keys per postcode area, as a two-column frame.
cap_beneficiaries_by_area = (
    cap_beneficiaries
    .groupby('postcode_area')['beneficiary']
    .nunique()
    .reset_index()
)
cap_beneficiaries_by_area

### Alternative Estimate: Average number of beneficiaries per year

In [None]:
# Non-null name entries per postcode area and year.
# NOTE(review): this counts rows, not distinct beneficiaries - if a
# beneficiary can appear on several rows in one year the figure is inflated.
area_year_groups = cap.groupby(['postcode_area', 'Year'])
cap_by_area_by_year = area_year_groups['BeneficiaryName_F201'].count()
cap_by_area_by_year

In [None]:
# Average the yearly counts within each postcode area. Only (area, year)
# pairs present in the counts contribute, so an area absent in some year is
# averaged over fewer years.
cap_mean_beneficiaries_by_area = (
    cap_by_area_by_year
    .groupby(level='postcode_area')
    .mean()
)
cap_mean_beneficiaries_by_area

In [None]:
# Put the two per-area estimates side by side: distinct keys and mean yearly count.
cap_beneficiaries_by_area.merge(
    cap_mean_beneficiaries_by_area.reset_index(),
    on='postcode_area',
)