In [None]:
%load_ext lab_black

In [2]:
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
import json
import re
from typing import Optional, List

import altair as alt
from dataclasses_json import dataclass_json
import numpy as np
import pandas as pd

In [3]:
output_dir = "/home/pvh/Documents/code/pvh-forks/presidential-employment-stimulus/data"

### Data structure

Each department has a total budget and total opportunities target. 

The overall programme has outcome targets
1. Jobs created
2. Jobs retained
3. Livelihoods supported

Each department has a "blurb" describing their programme.

Within each department there are multiple programmes that can contribute to each of these targets.

Each programme has a demographic split of outcomes, with gender and youth percentages.

Each programme has a per-province split of outcomes.

Files:

`Consolidated data (Dec) - Presidential Employment Stimulus.xlsx` - December sheet

`Consolidated Presidential Employment Stimulus Reporting Template.xlsx` - January sheet

In [4]:
december_excel = "Consolidated data (Dec) - Presidential Employment Stimulus.xlsx"
january_excel = "Consolidated Presidential Employment Stimulus Reporting Template.xlsx"

december_sheets = pd.ExcelFile(december_excel).sheet_names
january_sheets = pd.ExcelFile(january_excel).sheet_names

provinces = [
    "Eastern Cape",
    "Free State",
    "Gauteng",
    "KwaZulu-Natal",
    "Limpopo",
    "Mpumalanga",
    "North West",
    "Northern Cape",
    "Western Cape",
]
province_abbreviations = ["EC", "FS", "GP", "KZN", "LP", "NW", "NC", "WC"]

```
interface DepartmentMonth {
  month: number // 202101
  name: string // Basic Education
  lead: string // Strengthening the learning environment in schools
  paragraph: string
  sections: Array<{
    name: string // Budget allocated to date
    matrics: Array<{
      name: string // Educational and general assistants
      type: 'currency' | 'count'
      value: number
      valueTarget?: number
      time?: {
        name: string // spend
        values: Array<{
          month: number // 202101
          value: number
        }>
      }
      gender?: {
        name: string // opportunities
        values: Array<{
          gender: 'female' | 'male'
          value: number
        }>
      }
      age?: {
        name: string // opportunities
        values: Array<{
          age: string // 18-35
          value: number
        }>
      }
      province?: {
        name: string // opportunities
        values: Array<{
          province: 'EC' | 'FS' | 'GP' | 'KZN' | 'LP' | 'MP' | 'NC' | 'NW' | 'WC'
        }>
      }
    }>
  }>
}
```

## 

In [5]:
SectionEnum = Enum(
    "Section", "targets budget_allocated job_opportunities jobs_retain livelihoods"
)

MetricTypeEnum = Enum("MetricType", "currency count")

ProvinceEnum = Enum("Province", "EC FS GP KZN LP MP NC NW WC")

province_to_abbrev = {
    "Free State": "FS",
    "Gauteng": "GP",
    "KwaZulu-Natal": "KZN",
    "Limpopo": "LP",
    "Mpumalanga": "MP",
    "North West": "NW",
    "Northern Cape": "NC",
    "Western Cape": "WC",
    "Eastern Cape": "EC",
}

GenderEnum = Enum("Gender", "Male Female")


@dataclass_json
@dataclass
class TimeValue:
    month: int  # encoding month as in 202101
    name: str  # human readable time period name
    value: int


@dataclass_json
@dataclass
class AgeValue:
    age_category: str  # 18-35 or youth?
    value: int


@dataclass_json
@dataclass
class GenderValue:
    gender: str  # enum: 'female' or 'male'
    value: int


@dataclass_json
@dataclass
class ProvinceValue:
    province: str  # enum: 'EC' | 'FS' | 'GP' | 'KZN' | 'LP' | 'MP' | 'NC' | 'NW' | 'WC'
    value: int


@dataclass_json
@dataclass
class TimeValues:
    name: str
    values: List["TimeValue"]


@dataclass_json
@dataclass
class ProvinceValues:
    name: str
    values: List["ProvinceValue"]


@dataclass_json
@dataclass
class AgeValues:
    name: str
    values: List["AgeValue"]


@dataclass_json
@dataclass
class GenderValues:
    name: str
    values: List["GenderValue"]


@dataclass_json
@dataclass
class Metric:
    name: str
    metric_type: str  # enum of 'currency', 'count'
    value: int
    time: Optional[TimeValues]
    gender: Optional[GenderValues]
    age: Optional[AgeValues]
    province: Optional[ProvinceValues]
    value_target: int = -1


@dataclass_json
@dataclass
class Section:
    name: str
    section_type: str  # enum of 'targets', 'budget_allocated', 'job_opportunities', 'jobs_retain', 'livelihoods'
    metrics: List["Metric"]


@dataclass_json
@dataclass
class Department:
    month: int  # the month of latest data
    name: str
    sheet_name: str
    lead: str
    paragraph: str
    sections: List["Section"]


@dataclass_json
@dataclass
class DepartmentValue:
    department: str  # TODO: should be enum
    value: int


@dataclass_json
@dataclass
class DepartmentValues:
    name: str
    values: List["DepartmentValue"]


@dataclass_json
@dataclass
class OverviewMetric:
    name: str
    metric_type: str  # enum of 'currency', 'count'
    value: int
    time: Optional[TimeValues]
    department: DepartmentValues
    value_target: int = -1


@dataclass_json
@dataclass
class OverviewSection(Section):
    metrics: List["OverviewMetric"]


@dataclass_json
@dataclass
class Overview:
    month: int
    name: str  # Would normally be "Programme Overview"
    lead: str
    paragraph: str
    overview_sections: List["OverviewSection"]


@dataclass_json
@dataclass
class Everything:
    overview: Overview
    departments: List["Department"]

# Top level structure

In [6]:
all_data = Everything(
    overview=Overview(
        month=202103,
        name="Programme overview",
        lead="About the programme",
        paragraph="nothing yet",
        overview_sections=[],
    ),
    departments=[],
)

## DTIC

In [7]:
department_name = "Trade, Industry and Competition"
sheet_name = "DTIC"

department = Department(
    month=202012,
    name=department_name,
    sheet_name=sheet_name,
    lead="Piloting new models for re-shoring and expanding global business services",
    paragraph="The Global Business Services Sector has an impressive track record. Established in 2006/7 to provide offshore customer service delivery, the sector has built from a low base to achieve an average year-on-year export revenue growth of at least 20% since 2014.",
    sections=[],
)

budget_target = 120_000 * 1000
opportunities_target = 8_000

# Programme targets for this department
section = Section(
    name="Programme targets for this department",
    section_type=SectionEnum.targets.name,
    metrics=[
        Metric(
            name="Budget",
            metric_type=MetricTypeEnum.currency.name,
            value=budget_target,
            time=None,
            gender=None,
            age=None,
            province=None,
        ),
        Metric(
            name="Job opportunities",
            metric_type=MetricTypeEnum.count.name,
            value=opportunities_target,
            time=None,
            gender=None,
            age=None,
            province=None,
        ),
    ],
)

department.sections.append(section)

december_df = pd.read_excel(december_excel, sheet_name=sheet_name, header=None)

oct_nov_spend = december_df.iloc[48, 1]
dec_spend = december_df.iloc[49, 1]

programme_name = "Global Business Services Sector expansion"

# budget allocated to date
section = Section(
    name="Budget allocated to date",
    section_type=SectionEnum.budget_allocated.name,
    metrics=[
        Metric(
            name=programme_name,
            metric_type=MetricTypeEnum.currency.name,
            value=dec_spend,
            value_target=budget_target,
            time=TimeValues(
                name="Spent over time",
                values=[
                    TimeValue(month=202011, name="Nov '20", value=oct_nov_spend),
                    TimeValue(month=202012, name="Dec '20", value=dec_spend),
                ],
            ),
            gender=None,
            age=None,
            province=None,
        )
    ],
)

department.sections.append(section)

# Job opportunities created to date

total_programme_jobs = december_df.iloc[28:37, 1].sum()
percentage_male = december_df.iloc[41, 1] * 100
percentage_female = december_df.iloc[42, 1] * 100
percentage_youth = december_df.iloc[43, 1] * 100


by_province_df = december_df.iloc[29:37, :5]
by_province_df = by_province_df.set_index(0).sum(axis=1)

by_province_df = december_df.iloc[29:37, :5]
by_province_df = by_province_df.set_index(0).sum(axis=1)

province_values = ProvinceValues(name="Opportunities by Province", values=[])
for province in list(by_province_df.index):
    pv = ProvinceValue(
        province=province_to_abbrev[province], value=int(by_province_df.loc[province])
    )
    province_values.values.append(pv)

section = Section(
    name="Job opportunities created to date",
    section_type=SectionEnum.job_opportunities.name,
    metrics=[
        Metric(
            name=programme_name,
            metric_type=MetricTypeEnum.count.name,
            value=total_programme_jobs,
            time=TimeValues(
                name="Employed over time",
                values=[
                    TimeValue(month=202012, name="Dec '20", value=total_programme_jobs)
                ],
            ),
            gender=GenderValues(
                name="Jobs by Gender",
                values=[
                    GenderValue(gender=GenderEnum.Male.name, value=percentage_male),
                    GenderValue(gender=GenderEnum.Female.name, value=percentage_female),
                ],
            ),
            age=AgeValues(
                name="Opportunities for 18-35 year olds",
                values=[
                    AgeValue(
                        age_category="18-35",
                        value=int(percentage_youth / 100 * total_programme_jobs),
                    )
                ],
            ),
            province=province_values,
        )
    ],
)

department.sections.append(section)

all_data.departments.append(department)
print(
    department.to_json(indent=2),
    file=open(output_dir + "/" + sheet_name + ".json", "w"),
)
print(department.to_json(indent=2))

{
  "month": 202012,
  "name": "Trade, Industry and Competition",
  "sheet_name": "DTIC",
  "lead": "Piloting new models for re-shoring and expanding global business services",
  "paragraph": "The Global Business Services Sector has an impressive track record. Established in 2006/7 to provide offshore customer service delivery, the sector has built from a low base to achieve an average year-on-year export revenue growth of at least 20% since 2014.",
  "sections": [
    {
      "name": "Programme targets for this department",
      "section_type": "targets",
      "metrics": [
        {
          "name": "Budget",
          "metric_type": "currency",
          "value": 120000000,
          "time": null,
          "gender": null,
          "age": null,
          "province": null,
          "value_target": -1
        },
        {
          "name": "Job opportunities",
          "metric_type": "count",
          "value": 8000,
          "time": null,
          "gender": null,
          "ag

In [8]:
by_province_df = december_df.iloc[29:37, :5]
by_province_df = by_province_df.set_index(0).sum(axis=1)

In [9]:
province_values = ProvinceValues(name="Opportunities by Province", values=[])
for province in list(by_province_df.index):
    pv = ProvinceValue(
        province=province_to_abbrev[province], value=int(by_province_df.loc[province])
    )
    province_values.values.append(pv)
province_values.to_dict()

{'name': 'Opportunities by Province',
 'values': [{'province': 'FS', 'value': 0},
  {'province': 'GP', 'value': 872},
  {'province': 'KZN', 'value': 7166},
  {'province': 'LP', 'value': 0},
  {'province': 'MP', 'value': 0},
  {'province': 'NW', 'value': 0},
  {'province': 'NC', 'value': 0},
  {'province': 'WC', 'value': 7106}]}

## DBE

In [10]:
department_name = "Basic Education"
sheet_name = "DBE"

department = Department(
    month=202103,
    name=department_name,
    sheet_name=sheet_name,
    lead="Teachers assistants and other support for schools",
    paragraph="A key priority identified in the National Development Plan is the improvement of quality education, skills development, and innovation. One intervention that has seen some experimentation in South Africa, with significant potential to scale nationally, is the use of school assistants to strengthen the learning environment. An important rationale for school assistants is the need to support teachers in the classroom, freeing up time for teaching and providing additional support to learners to improve education outcomes.",
    sections=[],
)

budget_target = 7_000_000 * 1000
opportunities_target = 344_933


december_df = pd.read_excel(
    december_excel,
    sheet_name=sheet_name,
    header=None,
)
january_df = pd.read_excel(
    january_excel,
    sheet_name=sheet_name,
    header=None,
)

assistants_budget = january_df.iloc[42, 1]
post_saving_budget = january_df.iloc[42, 2]

# Programme targets for this department
section = Section(
    name="Programme targets for this department",
    section_type=SectionEnum.targets.name,
    metrics=[
        Metric(
            name="Total budget",
            metric_type=MetricTypeEnum.currency.name,
            value=budget_target,
            time=None,
            gender=None,
            age=None,
            province=None,
        ),
        Metric(
            name="Budget for Education and General Assistant Posts",
            metric_type=MetricTypeEnum.currency.name,
            value=assistants_budget,
            time=None,
            gender=None,
            age=None,
            province=None,
        ),
        Metric(
            name="Budget for Saving Vulnerable Posts",
            metric_type=MetricTypeEnum.currency.name,
            value=post_saving_budget,
            time=None,
            gender=None,
            age=None,
            province=None,
        ),
        Metric(
            name="Job opportunities",
            metric_type=MetricTypeEnum.count.name,
            value=opportunities_target,
            time=None,
            gender=None,
            age=None,
            province=None,
        ),
    ],
)

department.sections.append(section)


programme_spends = [
    dict(
        name="Education and general assistants",
        amounts=[december_df.iloc[44, 1], january_df.iloc[44, 1]],
    ),
    dict(
        name="Vulnerable  posts saved",
        amounts=[december_df.iloc[44, 2], january_df.iloc[44, 2]],
    ),
]

metrics = []
months = [202101, 202103]
month_names = ["Jan '21", "Mar '21"]

for programme in programme_spends:
    values = []
    total_value = 0
    for i, value in enumerate(programme["amounts"]):
        values.append(TimeValue(month=months[i], name=month_names[i], value=value))
        total_value += value
    metric = Metric(
        name=programme["name"],
        metric_type=MetricTypeEnum.currency.name,
        value=total_value,
        value_target=budget_target,
        time=TimeValues(name="Spent over time", values=values),
        gender=None,
        age=None,
        province=None,
    )
    metrics.append(metric)

programme_name = "Global Business Services Sector expansion"

# budget allocated to date
section = Section(
    name="Budget allocated to date",
    section_type=SectionEnum.budget_allocated.name,
    metrics=metrics,
)

department.sections.append(section)

# Job opportunities created to date

ed_assistants_by_prov_df = january_df.iloc[10:19, :2].set_index(0)
gen_assistants_by_prov_df = january_df.iloc[10:19, :3].drop(1, axis=1).set_index(0)

job_opportunity_programmes = [
    dict(
        name="Education assistants",
        totals=[december_df.iloc[19, 1], january_df.iloc[19, 1]],
        province=ProvinceValues(
            name="Opportunities by Province",
            values=[
                ProvinceValue(
                    province=province_to_abbrev[province],
                    value=ed_assistants_by_prov_df.loc[province],
                )
                for province in list(ed_assistants_by_prov_df.index)
            ],
        ),
    ),
    dict(
        name="General assistants",
        totals=[december_df.iloc[19, 2], january_df.iloc[19, 2]],
        province=ProvinceValues(
            name="Opportunities by Province",
            values=[
                ProvinceValue(
                    province=province_to_abbrev[province],
                    value=gen_assistants_by_prov_df.loc[province],
                )
                for province in list(gen_assistants_by_prov_df.index)
            ],
        ),
    ),
]

metrics = []
for programme in job_opportunity_programmes:
    metric = Metric(
        name=programme["name"],
        metric_type=MetricTypeEnum.count.name,
        time=TimeValues(
            name="Employed over time",
            values=[
                TimeValue(month=months[i], name=month_names[i], value=value)
                for i, value in enumerate(programme["totals"])
            ],
        ),
        gender=None,
        age=None,
        province=programme["province"],
        value=programme["totals"][-1],
    )
    metrics.append(metric)

section = Section(
    name="Job opportunities created to date",
    section_type=SectionEnum.job_opportunities.name,
    metrics=metrics,
)

department.sections.append(section)

# Jobs retained

jobs_retained_over_time = [december_df.iloc[19, 3], january_df.iloc[19, 3]]
jobs_retained_over_time_df = (
    january_df.iloc[10:19, :4].drop([1, 2], axis=1).set_index(0)
)
section = Section(
    name="Jobs Retained",
    section_type=SectionEnum.jobs_retain.name,
    metrics=[
        Metric(
            name="Vulnerable posts saved",
            metric_type=MetricTypeEnum.count.name,
            time=TimeValues(
                name="Jobs saved over time",
                values=[
                    TimeValue(
                        month=months[i],
                        name=month_names[i],
                        value=[
                            value for i, value in enumerate(jobs_retained_over_time)
                        ],
                    )
                ],
            ),
            value=jobs_retained_over_time[-1],
            gender=None,
            age=None,
            province=ProvinceValues(
                name="Posts saved by Province",
                values=[
                    ProvinceValue(
                        province=province_to_abbrev[province],
                        value=gen_assistants_by_prov_df.loc[province],
                    )
                    for province in list(gen_assistants_by_prov_df.index)
                ],
            ),
        )
    ],
)

department.sections.append(section)

all_data.departments.append(department)
print(
    department.to_json(indent=2),
    file=open(output_dir + "/" + sheet_name + ".json", "w"),
)
print(department.to_json(indent=2))

{
  "month": 202103,
  "name": "Basic Education",
  "sheet_name": "DBE",
  "lead": "Teachers assistants and other support for schools",
  "paragraph": "A key priority identified in the National Development Plan is the improvement of quality education, skills development, and innovation. One intervention that has seen some experimentation in South Africa, with significant potential to scale nationally, is the use of school assistants to strengthen the learning environment. An important rationale for school assistants is the need to support teachers in the classroom, freeing up time for teaching and providing additional support to learners to improve education outcomes.",
  "sections": [
    {
      "name": "Programme targets for this department",
      "section_type": "targets",
      "metrics": [
        {
          "name": "Total budget",
          "metric_type": "currency",
          "value": 7000000000,
          "time": null,
          "gender": null,
          "age": null,
       

In [11]:
post_saving_budget = january_df.iloc[42, 2]

In [12]:
assistants_budget + post_saving_budget

9430615000

In [13]:
budget_target

7000000000

In [14]:
december_df.iloc[44, 2]

151913

In [15]:
january_df.iloc[44, 2]

379998000

In [16]:
january_df.iloc[44]

0    Total amount spent this month
1                       2828913000
2                        379998000
3                              NaN
Name: 44, dtype: object

In [17]:
ed_assistants = december_df.iloc[19, 1]
gen_assistants = december_df.iloc[19, 2]
print(ed_assistants, gen_assistants)
ed_assistants = january_df.iloc[19, 1]
gen_assistants = january_df.iloc[19, 2]
print(ed_assistants, gen_assistants)

158437 120735
155341 128894


In [18]:
ed_assistants_by_prov_df = january_df.iloc[10:19, :2].set_index(0)

In [19]:
january_df.iloc[10:19, :3].drop(1, axis=1)

Unnamed: 0,0,2
10,Eastern Cape,25175
11,Free State,6989
12,Gauteng,15724
13,KwaZulu-Natal,30888
14,Limpopo,21408
15,Mpumalanga,10082
16,North West,8659
17,Northern Cape,4198
18,Western Cape,5771


In [20]:
ed_assistants_by_prov_df.loc["Eastern Cape"]

1    30628
Name: Eastern Cape, dtype: object

In [21]:
december_df.iloc[19, 3], january_df.iloc[19, 3]

(21988, 26779)

In [22]:
january_df.iloc[10:19, :4].drop([1, 2], axis=1).set_index(0)

Unnamed: 0_level_0,3
0,Unnamed: 1_level_1
Eastern Cape,5118
Free State,1313
Gauteng,3884
KwaZulu-Natal,2725
Limpopo,1743
Mpumalanga,1229
North West,919
Northern Cape,3821
Western Cape,6027


## DSD

In [23]:
department_name = "Social Development"
sheet_name = "DSD"

department = Department(
    month=202103,
    name=department_name,
    sheet_name=sheet_name,
    lead="Income support to practitioners and to the implementation of Covid compliance measures",
    paragraph="Livelihoods from the provision of Early Childhood Development services were severely disrupted by the pandemic, with providers facing challenges with re-opening. There are costs associated with doing so safely, and some parents can no longer afford to pay fees as a result of job losses.",
    sections=[],
)


budget_target = 588_728 * 1000
beneficiaries_target = 111_142


december_df = pd.read_excel(
    december_excel,
    sheet_name=sheet_name,
    header=None,
)
january_df = pd.read_excel(
    january_excel,
    sheet_name=sheet_name,
    header=None,
)

social_workers_budget = january_df.iloc[42, 1]
registration_support_budget = int(
    float(re.sub(r"[^\d]+([\d.]+).*", r"\1", december_df.iloc[42, 2])) * 1_000_000
)  # pull number out of R 16.5 million
# Programme targets for this department
section = Section(
    name="Programme targets for this department",
    section_type=SectionEnum.targets.name,
    metrics=[
        Metric(
            name="Total budget",
            metric_type=MetricTypeEnum.currency.name,
            value=budget_target,
            time=None,
            gender=None,
            age=None,
            province=None,
        ),
        Metric(
            name="Budget for social workers",
            metric_type=MetricTypeEnum.currency.name,
            value=social_workers_budget,
            time=None,
            gender=None,
            age=None,
            province=None,
        ),
        Metric(
            name="Budget for registration support officers",
            metric_type=MetricTypeEnum.currency.name,
            value=registration_support_budget,
            time=None,
            gender=None,
            age=None,
            province=None,
        ),
        Metric(
            name="Programme beneficiaries",
            metric_type=MetricTypeEnum.count.name,
            value=beneficiaries_target,
            time=None,
            gender=None,
            age=None,
            province=None,
        ),
    ],
)

department.sections.append(section)

# budget allocated to date

social_worker_spend_dec = int(
    float(december_df.iloc[44, 1].replace("R", "").replace(",", ""))
)
social_worker_spend_jan = january_df.iloc[44, 1]
social_worker_spends = [social_worker_spend_dec, social_worker_spend_jan]

months = [202012, 202101]
month_names = ["Dec '20", "Jan '21"]
programme_spends = [
    Metric(
        name="Budget for social workers",
        metric_type=MetricTypeEnum.currency.name,
        value=sum(social_worker_spends),
        value_target=social_workers_budget,
        time=TimeValues(
            name="Spend this month",
            values=[
                TimeValue(month=months[i], name=month_names[i], value=value)
                for i, value in enumerate(social_worker_spends)
            ],
        ),
        gender=None,
        age=None,
        province=None,
    ),
]

budget_allocation_section = Section(
    name="Budget allocated to date",
    section_type=SectionEnum.budget_allocated.name,
    metrics=programme_spends,
)

department.sections.append(budget_allocation_section)

# Jobs added

# TODO: no info yet

# Jobs retained

dec_soc_worker_jobs = december_df.iloc[19, 1]
jan_soc_worker_jobs = january_df.iloc[19, 1]
social_worker_jobs = [dec_soc_worker_jobs, jan_soc_worker_jobs]
social_worker_job_target = 1809
soc_worker_dec_df = december_df.iloc[10:19, :2].set_index(0)
soc_worker_jan_df = january_df.iloc[10:19, :2].set_index(0)
jobs_retained = [
    Metric(
        name="Social workers retained",
        metric_type=MetricTypeEnum.count.name,
        value=social_worker_jobs[-1],
        value_target=social_worker_job_target,
        gender=GenderValues(
            name="Jobs retained by gender",
            values=[
                GenderValue(gender=GenderEnum.Male.name, value=january_df.iloc[36, 1]),
                GenderValue(
                    gender=GenderEnum.Female.name, value=january_df.iloc[37, 1]
                ),
            ],
        ),
        age=AgeValues(
            name="Jobs retained by age",
            values=[AgeValue(age_category="18-35", value=january_df.iloc[38, 1])],
        ),
        time=TimeValues(
            name="Jobs retained over time",
            values=[
                TimeValue(month=months[i], name=month_names[i], value=value)
                for i, value in enumerate(social_worker_jobs)
            ],
        ),
        province=ProvinceValues(
            name="Jobs retained by province",
            values=[
                ProvinceValue(
                    province=province_to_abbrev[province],
                    value=int(soc_worker_jan_df.loc[province]),
                )
                for province in list(soc_worker_jan_df.index)
            ],
        ),
    )
]

jobs_retained_section = Section(
    name="Jobs retained",
    section_type=SectionEnum.jobs_retain.name,
    metrics=jobs_retained,
)

department.sections.append(jobs_retained_section)

# Livelihoods section

# TODO: no data yet

# save all the data

all_data.departments.append(department)
print(
    department.to_json(indent=2),
    file=open(output_dir + "/" + sheet_name + ".json", "w"),
)

print(department.to_json(indent=2))

{
  "month": 202103,
  "name": "Social Development",
  "sheet_name": "DSD",
  "lead": "Income support to practitioners and to the implementation of Covid compliance measures",
  "paragraph": "Livelihoods from the provision of Early Childhood Development services were severely disrupted by the pandemic, with providers facing challenges with re-opening. There are costs associated with doing so safely, and some parents can no longer afford to pay fees as a result of job losses.",
  "sections": [
    {
      "name": "Programme targets for this department",
      "section_type": "targets",
      "metrics": [
        {
          "name": "Total budget",
          "metric_type": "currency",
          "value": 588728000,
          "time": null,
          "gender": null,
          "age": null,
          "province": null,
          "value_target": -1
        },
        {
          "name": "Budget for social workers",
          "metric_type": "currency",
          "value": 75978000,
          "tim

In [24]:
re.sub(r"[^\d]+([\d.]+).*", r"\1", december_df.iloc[42, 2])

'16.5'

In [25]:
december_df.iloc[42, 2]

'R16.5 million '

In [26]:
int(float(december_df.iloc[44, 1].replace("R", "").replace(",", "")))

4494000

In [27]:
december_df.iloc[10:19, :2]

Unnamed: 0,0,1
10,Eastern Cape,254
11,Free State,0
12,Gauteng,70
13,KwaZulu-Natal,232
14,Limpopo,100
15,Mpumalanga,86
16,North West,0
17,Northern Cape,7
18,Western Cape,0


In [28]:
int(december_df.iloc[10:19, :2].set_index(0).loc["Eastern Cape"])

254

In [29]:
january_df.iloc[10:20]

Unnamed: 0,0,1,2,3,4,5,6,7
10,Eastern Cape,282,,,,,,
11,Free State,0,,,,,,
12,Gauteng,305,,,,,,
13,KwaZulu-Natal,235,,,,,,
14,Limpopo,99,,,,,,
15,Mpumalanga,116,,,,,,
16,North West,43,,,,,,
17,Northern Cape,6,,,,,,
18,Western Cape,0,,,,,,
19,Total,1086,0.0,,,,,


In [30]:
december_df.iloc[19, 1]

749

In [31]:
soc_work_percent_male = january_df.iloc[36, 1]
soc_work_percent_male

0.13

In [32]:
january_df.iloc[37, 1]

0.87

# Save final data

In [33]:
output_filename = output_dir + "/all_data.json"
open(output_filename, "w").write(all_data.to_json())
print(all_data.to_json(indent=2))

{
  "overview": {
    "month": 202103,
    "name": "Programme overview",
    "lead": "About the programme",
    "paragraph": "nothing yet",
    "overview_sections": []
  },
  "departments": [
    {
      "month": 202012,
      "name": "Trade, Industry and Competition",
      "sheet_name": "DTIC",
      "lead": "Piloting new models for re-shoring and expanding global business services",
      "paragraph": "The Global Business Services Sector has an impressive track record. Established in 2006/7 to provide offshore customer service delivery, the sector has built from a low base to achieve an average year-on-year export revenue growth of at least 20% since 2014.",
      "sections": [
        {
          "name": "Programme targets for this department",
          "section_type": "targets",
          "metrics": [
            {
              "name": "Budget",
              "metric_type": "currency",
              "value": 120000000,
              "time": null,
              "gender": null,
  