# NHS Staffing Data

In [None]:
import numpy as np
import pandas as pd

## Staffing Data
### Load Spreadsheet

In [None]:
# Read the staffing workbook, skipping the 4 rows above the table and the
# 23 rows below it.
raw_staff = pd.read_excel(
    io='input/HCHS_staff_with_an_EU_or_UK_nationality__by_staff_group_and_organisation.xls',
    skiprows=4,
    skipfooter=23,
)
raw_staff.head()

In [None]:
# Show the last rows of the loaded sheet (useful for checking that the
# skipfooter setting used when loading cut the sheet at the right place).
raw_staff.tail()

In [None]:
# Replace the spreadsheet headers with short snake_case names: four
# identifying columns, then the same four staff measures for EU and for UK.
raw_staff.columns = [
    'region', 'region_name', 'organisation_name', 'organisation',
    # *_eu measures
    'all_staff_eu', 'hchs_doctors_eu', 'nurses_visitors_eu', 'other_eu',
    # *_uk measures
    'all_staff_uk', 'hchs_doctors_uk', 'nurses_visitors_uk', 'other_uk',
]

In [None]:
# Preview the first rows again, now with the renamed columns.
raw_staff.head()

In [None]:
# The final row read from the sheet must be this trust; if it is not, the
# header/footer offsets used when loading no longer fit the file.
assert raw_staff.organisation_name.iloc[-1] == 'Yeovil District Hospital NHS Foundation Trust'

In [None]:
# Keep the row(s) labelled 'England' as the national totals, then discard
# the first three rows (by position) from the working frame.
total_staff = raw_staff.loc[raw_staff.region.eq('England')]
raw_staff = raw_staff.iloc[3:]
raw_staff.head()

In [None]:
# A row is treated as a region row when its 'region' cell is populated.
row_is_region = raw_staff.region.notna()
raw_staff.region.loc[row_is_region]

In [None]:
# Number of region codes that change when stripped, i.e. carry leading or
# trailing whitespace.
raw_staff.loc[row_is_region, 'region'].pipe(lambda s: s.str.strip().ne(s)).sum()

In [None]:
# Number of region names that change when stripped, i.e. carry leading or
# trailing whitespace.
raw_staff.loc[row_is_region, 'region_name'].pipe(lambda s: s.str.strip().ne(s)).sum()

In [None]:
# Number of rows flagged as region rows.
row_is_region.sum()

In [None]:
# Region-level rows only; the organisation columns are dropped from this
# view.
region_staff = raw_staff.loc[row_is_region].drop(
    columns=['organisation_name', 'organisation'])
region_staff.head()

In [None]:
# Forward-fill the region code and name down onto the rows beneath each
# populated region cell, so every organisation row carries its region.
# `.ffill()` replaces the deprecated `fillna(method='ffill')`, and `assign`
# builds a new frame rather than writing into the slice of the original
# sheet (which raises SettingWithCopyWarning).
raw_staff = raw_staff.assign(
    region=raw_staff.region.ffill(),
    region_name=raw_staff.region_name.ffill(),
)
# row_is_region was computed before the fill and shares raw_staff's index,
# so it still identifies the original region rows; keep everything else.
staff = raw_staff[~row_is_region]
staff.head()

In [None]:
# Sanity check: the number of region rows is exactly one more than the number
# of remaining rows that have no organisation code.
# NOTE(review): the reason for the off-by-one is not visible here — confirm
# against the spreadsheet layout.
assert row_is_region.sum() == staff.organisation.isna().sum() + 1

In [None]:
# Discard rows that have no organisation code.
staff = staff.dropna(subset=['organisation'])

### Check Staff Data  

#### Organisation Code and Name

In [None]:
# Row count, distinct codes, distinct names, then missing codes and names.
[
    len(staff),
    staff['organisation'].nunique(),
    staff['organisation_name'].nunique(),
    staff['organisation'].isnull().sum(),
    staff['organisation_name'].isnull().sum(),
]

In [None]:
# Does any organisation code / name change when surrounding whitespace is
# stripped?
[
    staff.organisation.str.strip().ne(staff.organisation).any(),
    staff.organisation_name.str.strip().ne(staff.organisation_name).any(),
]

#### Totals

The totals reported in the spreadsheet don't quite match the sums of the organisation-level rows, even at region level, but they are pretty close. Maybe there is some rounding going on?

In [None]:
# Summary statistics for the organisation-level rows.
staff.describe()

In [None]:
# Column sums over the organisation-level rows.
# NOTE(review): no numeric_only argument is passed, so text columns may be
# summed (concatenated) as well — confirm whether that is wanted.
staff.sum()

In [None]:
# The 'England' row(s) set aside earlier, shown next to the sums above.
total_staff

In [None]:
# Sum the staff counts per region. numeric_only=True keeps the text columns
# (organisation name/code) out of the result; recent pandas versions would
# otherwise concatenate those strings within each group.
staff.groupby(['region', 'region_name']).sum(numeric_only=True)

In [None]:
# Region rows taken directly from the spreadsheet, shown next to the
# per-region sums above.
region_staff

### Trusts and CCGs

Trusts have IDs that seem to start with R or T.

In [None]:
# Counts of: codes starting with 'R', codes starting with 'T', and names
# containing the word 'Trust'.
[
    staff.organisation.str.startswith('R').sum(),
    staff.organisation.str.startswith('T').sum(),
    staff.organisation_name.str.contains('Trust').sum()
]

In [None]:
# Rows whose name contains 'Trust' but whose code does not start with 'R'.
staff[staff.organisation_name.str.contains('Trust') & ~staff.organisation.str.startswith('R')]

In [None]:
# Counts of: codes starting with R or T, codes starting with anything else,
# and names containing 'CCG'.
[
    staff.organisation.str.match(r'^[RT]').sum(),
    (~staff.organisation.str.match(r'^[RT]')).sum(),
    staff.organisation_name.str.contains('CCG').sum()
]

In [None]:
# First few rows whose code does not start with R or T.
staff[~staff.organisation.str.match('^[RT]')].head()

## Postcodes

In [None]:
# Column names supplied for the etrust CSV (passed as `names`, so the file
# is read without a header row). The 'null_NN' names are placeholders
# numbered by column position.
etrust_column_names = [
    'organisation_code',
    'name',
    'national_grouping',
    'high_level_health_geography',
    'address_1', 'address_2', 'address_3', 'address_4', 'address_5',
    'raw_postcode',
    'open_date', 'close_date',
    'null_13', 'null_14', 'null_15', 'null_16', 'null_17',
    'contact_telephone_number',
    'null_19', 'null_20', 'null_21',
    'amended_record_indicator',
    'null_23',
    'gor_code',
    'null_25', 'null_26', 'null_27',
]
# Read both date columns as plain objects rather than letting pandas infer
# a type for them.
etrust = pd.read_csv(
    'input/etrust.csv.gz',
    names=etrust_column_names,
    dtype=dict.fromkeys(['open_date', 'close_date'], object),
    low_memory=False,
)
etrust.head()

In [None]:
# (rows, columns) of the etrust table as loaded.
etrust.shape

In [None]:
# Placeholder columns are the ones whose name begins with 'null_'.
etrust_null_columns = [
    column for column in etrust_column_names if column.startswith('null_')
]
# Missing-value count for each placeholder column.
etrust_null_counts = [
    etrust[name].isna().sum() for name in etrust_null_columns
]
# Every placeholder column must have the same count, and that count must be
# the full row count, i.e. the columns are entirely empty.
assert 1 == len(set(etrust_null_counts))
assert etrust_null_counts[0] == etrust.shape[0]

In [None]:
# Remove the placeholder columns from the table.
etrust = etrust.drop(columns=etrust_null_columns)
etrust.head()

### Check Organisation Codes

In [None]:
# Counts of organisation codes that are missing, that carry surrounding
# whitespace, and that are not already upper case.
[
    etrust.organisation_code.isnull().sum(),
    etrust.organisation_code.str.strip().ne(etrust.organisation_code).sum(),
    etrust.organisation_code.str.upper().ne(etrust.organisation_code).sum(),
]

In [None]:
# Distinct organisation codes, followed by: table shape, number of distinct
# codes, the set of code lengths that occur, and how many distinct codes
# start with 'R'.
etrust_organisation_codes = set(etrust.organisation_code)
[
    etrust.shape,
    len(etrust_organisation_codes),
    {len(code) for code in etrust_organisation_codes},
    sum(1 for code in etrust_organisation_codes if code.startswith('R')),
]

In [None]:
# The subset of codes that are exactly three characters long, with its size
# and a sample of up to ten members.
etrust_organisation_codes_3 = {
    code for code in etrust_organisation_codes if len(code) == 3
}
[len(etrust_organisation_codes_3), list(etrust_organisation_codes_3)[:10]]

In [None]:
# Counts of: staff organisation codes that appear among the etrust codes, and
# staff codes starting with 'R' that do not.
[
    staff.organisation.isin(etrust_organisation_codes).sum(),
    (staff.organisation.str.startswith('R') & ~staff.organisation.isin(etrust_organisation_codes)).sum()
]

The code prefixes should also match up. 

In [None]:
# Derive an 'organisation' column from the first three characters of each
# etrust code.
etrust['organisation'] = etrust.organisation_code.str.slice(0, 3)
etrust['organisation'].head()

In [None]:
# Distinct three-character prefixes, with the size of this set and a sample
# of up to ten members. The size reported is that of the set built here
# (the earlier version repeated the size of etrust_organisation_codes_3).
etrust_organisations = set(etrust.organisation)
[len(etrust_organisations), list(etrust_organisations)[0:10]]

In [None]:
# Prefixes with no matching three-character code, and three-character codes
# that never occur as a prefix.
[
    etrust_organisations.difference(etrust_organisation_codes_3),
    etrust_organisation_codes_3.difference(etrust_organisations),
]

In [None]:
# All etrust rows whose three-character prefix is 'RER'.
etrust[etrust.organisation == 'RER']

### Check Postcodes

In [None]:
# Load the postcode reference table from the sibling 'postcodes' directory
# and show its (rows, columns).
ukpostcodes = pd.read_csv('../postcodes/input/ukpostcodes.csv.gz')
ukpostcodes.shape

In [None]:
# Number of etrust rows whose raw postcode appears verbatim in the reference
# table.
etrust.raw_postcode.isin(ukpostcodes.postcode).sum()

In [None]:
# Normalise the raw postcodes: upper-case, trim, remove every character that
# is not A-Z or 0-9, then insert one space before the final
# digit-letter-letter group.
# regex=True is required: recent pandas versions treat the pattern passed to
# Series.str.replace as a literal string by default, which would make both
# replacements silently do nothing.
etrust['postcode'] = (
    etrust.raw_postcode
    .str.upper()
    .str.strip()
    .str.replace(r'[^A-Z0-9]', '', regex=True)
    .str.replace(r'^(\S+)([0-9][A-Z]{2})$', r'\1 \2', regex=True)
)

In [None]:
# Number of etrust rows whose normalised postcode appears in the reference
# table.
etrust.postcode.isin(ukpostcodes.postcode).sum()

In [None]:
# Distinct raw postcodes whose normalised form is still not in the reference
# table.
etrust.raw_postcode[~etrust.postcode.isin(ukpostcodes.postcode)].unique()

Spot checks: all the ones I checked were discontinued postcodes.

### Closed Facilities

In [None]:
# Table shape, and the number of rows with no close date.
[
    etrust.shape,
    etrust.close_date.isna().sum()
]

In [None]:
# Keep rows that have no close date, have a postcode, have 'HOSPITAL' in
# their name, and whose full code differs from its three-character prefix.
# NOTE(review): the postcode condition only requires a non-missing value; it
# does not check the postcode against the reference table — confirm that is
# intended.
clean_etrust = etrust.loc[
    etrust['close_date'].isna()
    & etrust['postcode'].notna()
    & etrust['name'].str.contains('HOSPITAL')
    & (etrust['organisation_code'] != etrust['organisation'])
].drop(columns=['raw_postcode'])
clean_etrust.shape

## Merge Postcodes

Take non-closed facilities with valid postcodes.

In [None]:
# Inner-join the staff organisation codes to the filtered etrust rows on
# 'organisation' (the only column the two inputs share). validate='1:m'
# makes pandas raise if a code is duplicated on the staff side.
hospital_postcodes = pd.merge(
    left=staff[['organisation']],
    right=clean_etrust[['organisation', 'postcode']],
    how='inner',
    on='organisation',
    validate='1:m',
)
hospital_postcodes.shape

In [None]:
# Preview the organisation-to-postcode pairs produced by the merge.
hospital_postcodes.head()

## Save Data

In [None]:
# Write the cleaned organisation-level staff table as a pickle under
# 'output/'.
staff.to_pickle('output/staff.pkl.gz')

In [None]:
# Write the organisation-to-postcode table as a pickle under 'output/'.
hospital_postcodes.to_pickle('output/hospital_postcodes.pkl.gz')