In [1]:
import functools
import json
import math
import operator
import re
import sys
from datetime import datetime

import numpy as np
import pandas as pd

sys.path.append("../python-src")
from presidential_employment import *

In [2]:
# Diagnostic: show the first process argument, i.e. the script that launched this kernel.
print(sys.argv[0])

/home/pvh/miniconda3/envs/altair/lib/python3.9/site-packages/ipykernel_launcher.py


In [3]:
# Directory the generated JSON files are written into (used as a string prefix below).
# NOTE(review): absolute, machine-specific path - adjust before running on another machine.
output_dir = "/home/pvh/Documents/code/pvh-forks/presidential-employment-stimulus/data"

### Data structure

Each department has a total budget and total opportunities target. 

The overall programme has outcome targets
1. Jobs created
2. Jobs retained
3. Livelihoods supported

Each department has a "blurb" describing their programme.

Within each department there are multiple programmes that can contribute to each of these targets.

Each programme has a demographic split of outcomes, with gender and youth percentages.

Each programme has a per-province split of outcomes.

Files:

`Consolidated data (Dec) - Presidential Employment Stimulus.xlsx` - December sheet

`Consolidated Presidential Employment Stimulus Reporting Template.xlsx` - January sheet

In [4]:
# Dump the metric titles (metric_titles comes from the presidential_employment
# star import) into metric_titles.json inside output_dir.
# The context manager closes the file handle, so the JSON is flushed to disk
# as soon as the cell finishes instead of whenever the handle is collected.
with open(output_dir + "/metric_titles.json", "w") as metric_titles_file:
    json.dump(metric_titles, metric_titles_file, indent=2)

In [5]:
# This is where we define the input excel.
# Earlier deliveries are kept for reference; only the workbook assigned to
# consolidated_dashboard_excel is actually read.

july_excel = "Dashboard input_PES targets and opportunities per month 130721 Final.xlsx"

august_excel = 'Dashboard input_PES targets and opportunities per month 030821 Final.xlsx'

august_excel2 = 'Dashboard input_PES targets and opportunities per month 090821 Final.xlsx'

august_excel3 = 'Dashboard input_PES targets and opportunities per month 100821 Final.xlsx'

august_excel4 = 'Dashboard input_PES targets and opportunities per month 100821_2 Final.xlsx'

september_excel = 'Dashboard input_PES targets and opportunities September data final 07.10.2021.xlsx'

consolidated_dashboard_excel = september_excel

# Raw, header-less copies of the "Targets" and "Trends" sheets with blanks
# replaced by 0; later cells address these frames by absolute row/column position.
opportunity_targets_df = pd.read_excel(
    consolidated_dashboard_excel, sheet_name="Targets", header=None
).fillna(0)
opportunity_achievements_df = pd.read_excel(
    consolidated_dashboard_excel, sheet_name="Trends", header=None
).fillna(0)

# First four columns of the "Implementation status" sheet, one row per programme.
implementation_status_df = pd.read_excel(
    consolidated_dashboard_excel,
    sheet_name="Implementation status",
    skiprows=2,
    usecols=range(4),
    names=["department", "programme", "status", "detail"],
)
# Blank department cells inherit the closest department above them.
# .ffill() replaces fillna(method='pad'), which pandas has deprecated.
implementation_status_df.department = implementation_status_df.department.ffill()

# Column 4 of Targets rows 2-55, indexed by column 1
# (presumably programme name -> section code; verify against the sheet layout).
opportunity_type_df = pd.concat(
    [opportunity_targets_df.iloc[2:56, 1], opportunity_targets_df.iloc[2:56, 4]], axis=1
).set_index(1)

# Descriptive texts, indexed by the first column of "Department Descriptions".
description_df = pd.read_excel(
    consolidated_dashboard_excel,
    sheet_name="Department Descriptions",
    index_col=0,
    usecols=range(4),
)

In [6]:
# Department names come from the "Department" column of the Targets sheet:
# blank cells are dropped and the last remaining entry is left out.
targets_sheet_df = pd.read_excel(
    consolidated_dashboard_excel, sheet_name="Targets", skiprows=1
)
department_names = targets_sheet_df["Department"].dropna().iloc[:-1]

# programmes_names = (
#     pd.read_excel(consolidated_dashboard_excel, sheet_name="Targets", skiprows=1)
#     .loc[:, "Programme"]
#     .dropna()
#     .iloc[:-1]
# )

# programmes_by_department = {}
# for row in (
#     pd.read_excel(consolidated_dashboard_excel, sheet_name="Targets", skiprows=1)
#     .loc[:, "Department":"Programme"]
#     .iloc[
#         :-1,
#     ]
#     .iterrows()
# ):
#     if not pd.isna(row[1][0]):
#         department_name = row[1][0]
#     programme_name = row[1][1]
#     programme_names = programmes_by_department.get(department_name, [])
#     programme_names.append(programme_name)
#     programmes_by_department[department_name] = programme_names
# list(department_names)

In [7]:
# "lead" and "paragraph" texts as plain dicts keyed by the description sheet's index.
leads = description_df["lead"].to_dict()
paragraphs = description_df["paragraph"].to_dict()

In [8]:
# Per-programme targets: first six columns of the Targets sheet, minus the unused one.
targets_df = pd.read_excel(
    consolidated_dashboard_excel,
    sheet_name="Targets",
    skiprows=1,
    usecols=list(range(6)),
    names=["department", "programme", "target", "unk", "section", "display_name"],
).drop("unk", axis=1)
# .ffill() replaces fillna(method="pad"), which pandas has deprecated.
targets_df.department = targets_df.department.ffill()
# this is a hack to deal with the fact that DPWI section identifier is missing for most of their programmes
targets_df.section = targets_df.section.ffill()

# the line below was removed because it is better to special-case those programmes
# targets_df = targets_df.fillna(-1)  # this is to ensure that targets are -1 when no value is available

# Per-programme achievements up to and including the total column
# (total_achievement_column is not defined in this notebook; presumably it
# comes from the presidential_employment star import).
trends_df = pd.read_excel(
    consolidated_dashboard_excel,
    sheet_name="Trends",
    skiprows=5,
    usecols=list(range(total_achievement_column+1)),
)
trends_df.columns = [c.lower() for c in trends_df.columns]
trends_df.department = trends_df.department.ffill()
trends_df = trends_df.fillna(0)

# Per-province beneficiary counts; headers are normalised to snake_case.
provincial_df = pd.read_excel(
    consolidated_dashboard_excel,
    sheet_name="Provincial (beneficiaries)",
    skiprows=4,
    usecols=list(range(12)),
)
provincial_df.columns = [
    c.lower().replace(" ", "_").replace("-", "_") for c in provincial_df.columns
]
provincial_df.department = provincial_df.department.ffill()
provincial_df = provincial_df.fillna(0)

# Demographic split per programme; "%" becomes "perc" and "no." becomes "no" in the headers.
# Unlike the frames above, missing values are left as NaN here.
demographic_df = pd.read_excel(
    consolidated_dashboard_excel,
    sheet_name="Demographic data",
    skiprows=8,
    usecols=list(range(9)),
)
demographic_df.columns = [
    c.lower().replace(" ", "_").replace("%", "perc").replace('no.', 'no') for c in demographic_df.columns
]
demographic_df.department = demographic_df.department.ffill()
# demographic_df = demographic_df.fillna(0)

```
interface DepartmentMonth {
  month: number // 202101
  name: string // Basic Education
  lead: string // Strengthening the learning environment in schools
  paragraph: string
  sections: Array<{
    name: string // Budget allocated to date
    metrics: Array<{
      name: string // Educational and general assistants
      type: 'currency' | 'count'
      value: number
      valueTarget?: number
      time?: {
        name: string // spend
        values: Array<{
          month: number // 202101
          value: number
        }>
      }
      gender?: {
        name: string // opportunities
        values: Array<{
          gender: 'female' | 'male'
          value: number
        }>
      }
      age?: {
        name: string // opportunities
        values: Array<{
          age: string // 18-35
          value: number
        }>
      }
      province?: {
        name: string // opportunities
        values: Array<{
          province: 'EC' | 'FS' | 'GP' | 'KZN' | 'LP' | 'MP' | 'NC' | 'NW' | 'WC'
          value: number
        }>
      }
    }>
  }>
}
```

## 

# Top level structure

In [9]:
# Top-level container. It starts with an overview that has no sections and an
# empty footer, and with no departments; the department list is filled in below.
placeholder_overview = Overview(
    month=202102,
    name="Programme overview",
    lead=leads["overview"],
    paragraph=paragraphs["overview"],
    footer_header="",
    footer_paragraph="",
    sections=[],
)
all_data = Everything(overview=placeholder_overview, departments=[])


def add_or_replace(departments, department):
    """Insert ``department`` into ``departments``, keyed by ``sheet_name``.

    The first entry whose ``sheet_name`` equals that of ``department`` is
    overwritten in place; when there is no such entry the department is
    appended. The list is modified in place and also returned.
    """
    match_index = next(
        (
            position
            for position, existing in enumerate(departments)
            if existing.sheet_name == department.sheet_name
        ),
        None,
    )
    if match_index is None:
        departments.append(department)
    else:
        departments[match_index] = department
    return departments

In [10]:
# Start from an empty list: the end of this cell appends every department, so
# without the reset a re-run of the cell would append them a second time.
all_data.departments=[]
def make_dim(dim_type, lookup_type, df, col_start, col_end, key_lookup):
    """Build a Dimension for the current programme from its row in ``df``.

    The row and the title are selected through the notebook globals
    ``department_name``, ``programme_name`` and ``section``, which the
    enclosing loops set before calling this function.

    Args:
        dim_type: lookup name stored in the Dimension's ``lookup`` field; also
            the suffix of the ``metric_titles`` key used for the name.
        lookup_type: visualisation name stored in the Dimension's ``viz`` field.
        df: frame with ``department`` and ``programme`` columns plus one
            column per value.
        col_start: first column position (slice start) of the value columns.
        col_end: slice end of the value columns (``None`` for "to the end").
        key_lookup: callable turning a column name into the MetricValue key.

    Returns:
        A Dimension holding one MetricValue per selected column. When there is
        no matching row, or no value is greater than zero, the Dimension has
        no values and ``data_missing=True``.
    """
    row = df.loc[(df.department == department_name) & (df.programme == programme_name)]
    values = []
    nonzero = False
    # The selection is computed once and reused for the emptiness check.
    if len(row) > 0:
        for key in list(row)[col_start:col_end]:
            # Read the scalar explicitly: int() on a Series is deprecated in
            # pandas and fails outright when more than one row matches.
            value = int(row[key].iloc[0])
            if value > 0:
                nonzero = True
            values.append(MetricValue(key=key_lookup(key), value=value))

    # No row at all, or only zeros: report the dimension as missing.
    data_missing = not nonzero
    if data_missing:
        values = []

    dim = Dimension(
        name=metric_titles[section_abbrev_to_name[section]][
            MetricTypeEnum.count.name + "_" + dim_type
        ],
        lookup=dim_type,
        viz=lookup_type,
        values=values,
        data_missing=data_missing,
    )
    return dim


# Build one Department per department name: a targets section followed by one
# section per opportunity type (CRE, LIV, RET), each holding one Metric per
# programme with its province, time, gender, age and optional
# disability / veteran dimensions.
desc_abbrevs = {"DoH": "DOH"}  # deal with special cases in description lookup
departments = {}
# Missing targets count as 0; fill the frame once here instead of once per programme.
targets_filled_df = targets_df.fillna(0)
for department_name in department_names:
    # Implementation details of programmes that are reported without a metric of their own.
    department_implementation_details = []
    target_section = Section(
        name=section_titles[SectionEnum.targets.name],
        section_type=SectionEnum.targets.name,
        metrics=[
            Metric(
                name=metric_titles[SectionEnum.targets.name][
                    MetricTypeEnum.currency.name
                ],
                metric_type=MetricTypeEnum.currency.name,
                value_target=department_budget_targets[department_name],
                value=-1,
                dimensions=[],
            ),
            Metric(
                name=metric_titles[SectionEnum.targets.name][MetricTypeEnum.count.name],
                metric_type=MetricTypeEnum.count.name,
                value_target=targets_df.loc[
                    targets_df.department == department_name
                ].target.sum(),  # overall target of beneficiaries
                value=trends_df.loc[trends_df.department == department_name]
                .iloc[:, -1]
                .sum(),  # get the achievement by summing the last column in trends
                dimensions=[],
            ),
        ],
    )
    sections = [target_section]
    for section in ["CRE", "LIV", "RET"]:
        programme_names = list(
            targets_df.loc[
                (targets_df.section == section)
                & (targets_df.department == department_name)
            ].programme
        )
        if section == 'CRE' and department_name == 'Agriculture, Land Reform and Rural Development':
            # this does not have a target so needs to be added manually
            programme_names += ['Graduate verifiers']
        metrics = []

        for programme_name in programme_names:
            if department_name == 'Public Works and Infrastructure' and programme_name == 'Project Administrators':
                # this programme is mentioned in Targets and has a line in Implementation Status but has no other data
                continue
            imp_status_row = implementation_status_df.loc[
                (implementation_status_df.department == department_name)
                & (implementation_status_df.programme == programme_name)
            ]
            if len(imp_status_row) == 0 or pd.isna(imp_status_row.status.iloc[0]):
                imp_detail = None
            else:
                imp_detail = ImplementationDetail(
                    programme_name=programme_name,
                    status=implementation_status_to_enum[imp_status_row.status.iloc[0].strip()],
                    detail=imp_status_row.detail.iloc[0].strip(),
                )

            if (
                department_name == "Public Works and Infrastructure"
                and programme_name
                == "Graduate programmes (Property Management Trading Entity)"
            ) or (
                department_name == "Agriculture, Land Reform and Rural Development"
                and programme_name == "Subsistence producer relief fund"
            ):
                department_implementation_details.append(imp_detail)
                continue  # these programmes have no detailed metrics
            else:
                try:
                    # collect detailed metrics for programme
                    dimensions = []
                    time_dimension_row = trends_df.loc[
                        (trends_df.department == department_name)
                        & (trends_df.programme == programme_name)
                    ]

                    dimensions.append(make_dim(LookupTypeEnum.province.name, VizTypeEnum.bar.name, provincial_df, 2, -1, lambda key: province_header_to_abbrev[key]))
                    dimensions.append(make_dim(LookupTypeEnum.time.name, VizTypeEnum.line.name, trends_df, 2, None, lambda key: month_lookup[key]))

                    demographic_row = demographic_df.loc[
                        (demographic_df.department == department_name)
                        & (demographic_df.programme == programme_name)
                    ]

                    # Gender split: stored as fractions, missing when there is
                    # no demographic row or both fractions are zero.
                    values = []
                    if len(demographic_row) == 0:
                        data_missing = True
                    else:
                        male_perc = demographic_row.loc[:, "perc_male"].iloc[0]
                        female_perc = demographic_row.loc[:, "perc_female"].iloc[0]
                        if male_perc + female_perc == 0:
                            data_missing = True
                        else:
                            values=[
                                MetricValue(
                                    key=GenderEnum.Male.name,
                                    value=male_perc,
                                ),
                                MetricValue(
                                    key=GenderEnum.Female.name,
                                    value=female_perc,
                                ),
                            ]
                            # Report splits that do not add up to 100%. isclose()
                            # avoids false alarms from float rounding.
                            if not math.isclose(male_perc + female_perc, 1.0):
                                print(programme_name, male_perc, female_perc, male_perc + female_perc)
                            data_missing=False

                    gender_dim = Dimension(
                        name=metric_titles[section_abbrev_to_name[section]][
                            MetricTypeEnum.count.name + "_gender"
                        ],
                        lookup=LookupTypeEnum.gender.name,
                        viz=VizTypeEnum.two_value.name,
                        values=values,
                        data_missing=data_missing
                    )
                    dimensions.append(gender_dim)

                    # Youth share: missing when there is no demographic row or the share is zero.
                    values = []
                    if len(demographic_row) == 0:
                        data_missing = True
                    else:
                        age_perc = demographic_row.loc[:, "perc_youth"].iloc[0]
                        if age_perc == 0:
                            data_missing = True
                            values = []
                        else:
                            values=[
                                MetricValue(
                                    key="18-35",
                                    value=age_perc,
                                )
                            ]
                            data_missing = False
                    youth_dim = Dimension(
                        name=metric_titles[section_abbrev_to_name[section]][
                            MetricTypeEnum.count.name + "_age"
                        ],
                        lookup=LookupTypeEnum.age.name,
                        viz=VizTypeEnum.percentile.name,
                        values=values,
                        data_missing=data_missing
                    )
                    dimensions.append(youth_dim)

                    # TODO: Rationalise this - disabled and military vets share a lot of code
                    # Only read the counts when a demographic row exists. Indexing an
                    # empty selection raises IndexError, which the handler below treats
                    # as "skip the whole programme", even though the gender and age
                    # dimensions above already cope with a missing row.
                    if len(demographic_row) > 0:
                        disabled = demographic_row.no_disability.iloc[0]
                        if disabled > 0:
                            disabled_dim = Dimension(
                                name=metric_titles[section_abbrev_to_name[section]][MetricTypeEnum.count.name + '_disabled'],
                                lookup=LookupTypeEnum.disabled.name,
                                viz=VizTypeEnum.count.name,
                                values=[MetricValue(key='disabled', value=disabled)]
                            )
                            dimensions.append(disabled_dim)

                        military_vets = demographic_row.no_military_veterans.iloc[0]
                        if military_vets > 0:
                            mv_dim = Dimension(
                                name=metric_titles[section_abbrev_to_name[section]][MetricTypeEnum.count.name + '_vets'],
                                lookup=LookupTypeEnum.vets.name,
                                viz=VizTypeEnum.count.name,
                                values=[MetricValue(key='vets', value=military_vets)]
                            )
                            dimensions.append(mv_dim)

                    # Achievement = last column of the programme's Trends row.
                    # A programme without a Trends row raises IndexError here and is skipped below.
                    total_value = int(time_dimension_row.iloc[:,-1].iloc[0])
                    target_row = targets_filled_df.loc[
                            (targets_df.department == department_name)
                            & (targets_df.programme == programme_name)
                        ].target
                    if len(target_row) == 0:
                        # e.g. Graduate verifiers programme doesn't have a target
                        target = -1
                    else:
                        target = target_row.iloc[0]
                    programme_metric = Metric(
                        name=programme_name,
                        metric_type=MetricTypeEnum.count.name,
                        value=total_value,
                        value_target=target,
                        dimensions=dimensions,
                        implementation_detail=imp_detail,
                    )
                    metrics.append(programme_metric)
                except IndexError as e:
                    # Best effort: report the programme and carry on with the next one.
                    print("IndexError on", section, department_name, programme_name, str(e))

        sections.append(
            Section(
                name=section_titles[section_abbrev_to_name[section]],
                section_type=section_abbrev_to_name[section],
                metrics=metrics,
            )
        )
    abbrev = department_name_to_abbreviation[department_name]
    # Reporting month as YYYYMM, taken from the department's description row.
    month = description_df.loc[
        desc_abbrevs.get(abbrev, abbrev), "Data captured until"
    ].strftime("%Y%m")
    departments[department_name] = Department(
            month=month,
            name=department_name,
            sheet_name=abbrev,
            lead=leads[desc_abbrevs.get(abbrev, abbrev)],
            paragraph=paragraphs[desc_abbrevs.get(abbrev, abbrev)],
            sections=sections,
            target_lines=[],
            achievement_lines=[],
            implementation_details=department_implementation_details
        )

# Departments are published in alphabetical order of their names.
for name in sorted(departments.keys()):
    all_data.departments.append(departments[name])
    
# print(all_data.to_json(indent=2))

Education Assistants 196949         department             programme  oct       nov       dec       jan  \
0  Basic Education  Education Assistants  0.0  126055.0  158437.0  175221.0   

        feb       mar       apr       may      june      july    august  \
0  196034.0  196949.0  196949.0  196949.0  196949.0  196949.0  196949.0   

   september  
0   196949.0  
General Assistants 122533         department           programme  oct      nov       dec       jan  \
1  Basic Education  General Assistants  0.0  82054.0  120735.0  109014.0   

        feb       mar       apr       may      june      july    august  \
1  121964.0  122533.0  122533.0  122533.0  122533.0  122533.0  122533.0   

   september  
1   122533.0  
Retain vulnerable teaching posts 31114         department                         programme  oct      nov      dec  \
2  Basic Education  Retain vulnerable teaching posts  0.0  21729.0  21988.0   

       jan      feb      mar      apr      may     june     july   august 

In [11]:
# Programme-wide demographic totals.
# Gender and youth totals are weighted by each metric's value; provincial
# totals are summed directly. Metrics with a positive value but no usable
# gender / age / province dimension are counted as "unknown" for that split.
total_male = total_female = total_unknown_gender = total_beneficiaries = 0
total_youth = total_unknown_youth = 0
total_unknown_province = 0
total_provincial = dict.fromkeys(province_abbreviations, 0)

for department in all_data.departments:
    department_male = department_female = department_beneficiaries = 0
    for section in department.sections:
        in_targets_section = section.section_type == SectionEnum.targets.name
        for metric in section.metrics:
            if in_targets_section and metric.name == "Beneficiaries":
                # The department-level beneficiary count feeds the grand total only.
                department_beneficiaries = metric.value
                total_beneficiaries += department_beneficiaries
                continue
            if metric.value == -1:
                continue

            total_value = metric.value
            gender_found = age_found = province_found = False
            for dimension in metric.dimensions:
                if dimension.data_missing:
                    continue
                lookup = dimension.lookup
                if lookup == LookupTypeEnum.gender.name:
                    gender_found = True
                    for value in dimension.values:
                        weighted = total_value * value.value
                        if value.key == 'Male':
                            department_male += weighted
                            total_male += weighted
                        elif value.key == 'Female':
                            department_female += weighted
                            total_female += weighted
                elif lookup == LookupTypeEnum.age.name:
                    age_found = True
                    youth_value = dimension.values[0].value
                    total_youth += youth_value * total_value
                elif lookup == LookupTypeEnum.province.name:
                    province_found = True
                    for value in dimension.values:
                        total_provincial[value.key] += value.value

            # Only metrics with a positive value contribute to the "unknown" counts.
            if metric.value > 0:
                if not gender_found:
                    total_unknown_gender += metric.value
                if not age_found:
                    total_unknown_youth += metric.value
                if not province_found:
                    total_unknown_province += metric.value

print(
    total_beneficiaries,
    total_unknown_gender,
    round(total_unknown_gender / total_beneficiaries, 2),
    total_unknown_youth,
    round(total_unknown_youth / total_beneficiaries, 2),
    total_unknown_province,
    round(total_unknown_province / total_beneficiaries, 2),
)

552838.0 83711 0.15 83711 0.15 19253 0.03


## Overview picture

In [12]:
# Value / target totals per opportunity type, per department, plus a "Total" entry.
programmes_by_type = {
    section_enum.name: {}
    for section_enum in (
        SectionEnum.job_opportunities,
        SectionEnum.livelihoods,
        SectionEnum.jobs_retain,
    )
}

# Monthly achievement totals per section type, starting at zero for every month.
achievements_by_type_by_month = {
    e.name: {month_key: 0 for month_key in months}
    for e in SectionEnum
    if e.name not in ("targets", "budget_allocated")
}

achievements_df = opportunity_achievements_df.iloc[3:, 1:].set_index(1)
for department in all_data.departments:
    section_value = 0
    section_target_value = 0
    for section in department.sections:
        if section.section_type == SectionEnum.targets.name:
            continue

        total_value = 0
        total_target_value = 0
        for metric in section.metrics:
            if metric.name not in achievements_df.index:
                print(
                    "Metric not found in achievements_df", department.name, metric.name
                )
            total_value += metric.value
            # Targets that are not positive (e.g. the -1 placeholder) are left out.
            if metric.value_target > 0:
                total_target_value += metric.value_target
            for dimension in metric.dimensions:
                if dimension.lookup != LookupTypeEnum.time.name:
                    continue
                for metric_value in dimension.values:
                    month = metric_value.key
                    value = metric_value.value
                    achievements_by_type_by_month[section.section_type][month] += value

        # For two department/section combinations the target is read straight
        # from a fixed cell of the Targets sheet instead of summing programme targets.
        dalrrd_livelihoods = (
            department.name == "Agriculture, Land Reform and Rural Development"
            and section.section_type == SectionEnum.livelihoods.name
        )
        dpwi_job_opportunities = (
            department.name == "Public Works and Infrastructure"
            and section.section_type == SectionEnum.job_opportunities.name
        )
        if dalrrd_livelihoods:
            total_target_value = int(opportunity_targets_df.iloc[8, 2])
        elif dpwi_job_opportunities:
            total_target_value = int(opportunity_targets_df.iloc[47, 2])

        type_totals = programmes_by_type[section.section_type]
        type_totals[department.sheet_name] = {
            "value": total_value,
            "value_target": total_target_value,
        }
        grand_total = type_totals.setdefault("Total", dict(value=0, value_target=0))
        grand_total["value"] += total_value
        grand_total["value_target"] += total_target_value

        section_value += total_value
        section_target_value += total_target_value

# Achievement totals per opportunity type as reported in the first rows of the
# "Demographic data" sheet, indexed by the label in its first column.
achievement_totals_df = pd.read_excel(
    consolidated_dashboard_excel,
    sheet_name='Demographic data',
    skiprows=2,
    usecols=range(2),
    nrows=3,
    names=['section', 'total'],
    index_col=0,
)

# Cross-check the computed totals against the totals reported in the workbook.
# Each failure message names the section that was actually checked and shows
# the two scalar values being compared.
for checked_section, targets_row, totals_label in (
    (SectionEnum.job_opportunities.name, 6, "Jobs created"),
    (SectionEnum.livelihoods.name, 7, "Livelihoods supported"),
    (SectionEnum.jobs_retain.name, 8, "Jobs retained"),
):
    computed_totals = programmes_by_type[checked_section]["Total"]

    reported_target = opportunity_targets_df.iloc[targets_row, 7]
    assert (
        computed_totals["value_target"] == reported_target
    ), f'{checked_section} total mismatch: {computed_totals["value_target"]} vs {reported_target}'

    reported_value = achievement_totals_df.loc[totals_label, "total"]
    assert (
        computed_totals["value"] == reported_value
    ), f'{checked_section} total mismatch {computed_totals["value"]} vs {reported_value}'

def _department_breakdown(department_info):
    """Return one MetricValue per department, largest value first.

    Entries keyed "Total" or starting with "value" are left out.
    """
    ranked = sorted(
        department_info.items(),
        key=lambda e: e[1]["value"],
        reverse=True,
    )
    return [
        MetricValue(
            key=department_name,
            value=outputs["value"],
            value_target=outputs["value_target"],
        )
        for department_name, outputs in ranked
        if not department_name.startswith("value") and department_name != "Total"
    ]


def _overview_metric(section_name, department_info):
    """Build the overview Metric of one opportunity type with its two dimensions."""
    by_department = Dimension(
        name="by department",
        viz=VizTypeEnum.bar.name,
        lookup=LookupTypeEnum.department.name,
        values=_department_breakdown(department_info),
    )
    over_time = Dimension(
        name="over time",
        viz=VizTypeEnum.line.name,
        lookup=LookupTypeEnum.time.name,
        values=[
            MetricValue(key=key, value=value)
            for key, value in achievements_by_type_by_month[section_name].items()
        ],
    )
    section_total = programmes_by_type[section_name]["Total"]
    return Metric(
        name=section_titles[section_name],
        metric_type=section_name,
        value=section_total["value"],
        value_target=section_total["value_target"],
        dimensions=[by_department, over_time],
    )


# One overview metric per opportunity type, in the order of programmes_by_type.
overview_metrics = [
    _overview_metric(section_name, department_info)
    for section_name, department_info in programmes_by_type.items()
    if not section_name.startswith("value")
]

# Sums over the overview metrics; non-positive entries count as 0.
# NOTE(review): the names read swapped relative to their contents -
# current_target holds the summed achieved values and current_achievement the
# summed targets. The "Current status" Section below receives current_target
# as its value and current_achievement as its value_target, so the Section
# itself gets values and targets the right way round. The names are kept
# because code outside this cell may refer to them.
current_target = sum(
    metric.value if metric.value > 0 else 0 for metric in overview_metrics
)
current_achievement = sum(
    metric.value_target if metric.value_target > 0 else 0
    for metric in overview_metrics
)

def _breakdown_metric(title, viz, lookup, values):
    """Wrap one demographic breakdown in a value-less Metric with a single Dimension."""
    return Metric(
        name=title,
        metric_type="targets_count",
        value=-1,
        value_target=-1,
        dimensions=[
            Dimension(
                name=title,
                viz=viz,
                lookup=lookup,
                values=values,
                data_missing=False,
            ),
        ],
    )


# Shares are taken over the beneficiaries whose gender / age is known.
male_share = total_male / (total_beneficiaries - total_unknown_gender)
female_share = total_female / (total_beneficiaries - total_unknown_gender)
youth_share = total_youth / (total_beneficiaries - total_unknown_youth)

overview_metrics.extend(
    [
        _breakdown_metric(
            "Beneficiaries by Gender",
            VizTypeEnum.two_value.name,
            LookupTypeEnum.gender.name,
            [
                MetricValue(key=GenderEnum.Male.name, value=male_share),
                MetricValue(key=GenderEnum.Female.name, value=female_share),
            ],
        ),
        _breakdown_metric(
            "Beneficiaries that are Youth",
            VizTypeEnum.two_value.name,
            LookupTypeEnum.age.name,
            [
                MetricValue(key="18-35", value=youth_share),
                MetricValue(key="36+", value=1 - youth_share),
            ],
        ),
        _breakdown_metric(
            "Beneficiaries by province",
            VizTypeEnum.bar.name,
            LookupTypeEnum.province.name,
            [
                MetricValue(key=abbrev, value=total_provincial[abbrev])
                for abbrev in province_abbreviations
            ],
        ),
    ]
)
# Assemble the programme-wide Overview object.
# `month` is the "Data captured until" date of the overview row, formatted as YYYYMM.
overview_month = description_df.loc["overview", "Data captured until"].strftime("%Y%m")

# NOTE(review): `value` receives current_target and `value_target` receives
# current_achievement, which reads as the reverse of what the names suggest.
# Confirm against the dashboard before changing; kept as-is here.
current_status_section = Section(
    name="Current status",
    section_type=SectionEnum.overview.name,
    metrics=overview_metrics,
    value=current_target,
    value_target=current_achievement,
)

overview = Overview(
    month=overview_month,
    name="Programme overview",
    lead=leads["overview"],
    paragraph=paragraphs["overview"],
    footer_header=leads["Disclaimer"],
    footer_paragraph=paragraphs["Disclaimer"],
    sections=[current_status_section],
)

# Consistency checks on the input sheets: the summed rows must equal the
# value in the row the assertion messages call the "reported total".
achievements_summed = opportunity_achievements_df.iloc[
    6:59, total_achievement_column
].sum()
achievements_reported = opportunity_achievements_df.iloc[59, total_achievement_column]
assert (
    achievements_summed == achievements_reported
), "Sum of achievements does not add up to reported total"

targets_summed = opportunity_targets_df.iloc[2:56, 2].sum()
targets_reported = opportunity_targets_df.iloc[56, 2]
assert (
    targets_summed == targets_reported
), "Sum of targets does not add up to reported total"
# Build the programme-wide "targets" section, put it at the front of the
# overview's section list, and attach the overview to all_data.
targets_section_key = SectionEnum.targets.name
targets_section_title = section_titles[targets_section_key + "_overview"]

currency_metric = Metric(
    name=metric_titles[targets_section_key][MetricTypeEnum.currency.name],
    metric_type=MetricTypeEnum.currency.name,
    dimensions=[],
    # value=int(opportunity_targets_df.iloc[2, 7] * 1000),
    value=0,
    # Scaled by 1000 -- presumably the sheet stores thousands; TODO confirm.
    value_target=(opportunity_targets_df.iloc[2, 6] * 1000),
)

# Achieved vs. targeted opportunity counts, read from the same cells that the
# "reported total" assertions check.
count_metric = Metric(
    name=metric_titles[targets_section_key][MetricTypeEnum.count.name],
    metric_type=MetricTypeEnum.count.name,
    dimensions=[],
    value=int(opportunity_achievements_df.iloc[59, total_achievement_column]),
    value_target=int(opportunity_targets_df.iloc[56, 2]),
)

# The figure is carried in value_target while value is fixed at 0.
in_process_metric = Metric(
    name="Opportunities in process",
    metric_type=MetricTypeEnum.count.name,
    dimensions=[],
    value_target=int(opportunity_achievements_df.iloc[2, 1]),
    value=0,
)

targets_overview_section = Section(
    name=targets_section_title,
    section_type=targets_section_key,
    metrics=[currency_metric, count_metric, in_process_metric],
    value=None,
    value_target=None,
)
overview.sections.insert(0, targets_overview_section)

all_data.overview = overview
# print(overview.to_json(indent=2))

Agriculture, Land Reform and Rural Development Vegetables and Fruits 19877 19877
Agriculture, Land Reform and Rural Development Maize/soya/sugar/other production 11071 30948
Agriculture, Land Reform and Rural Development Poultry: Layers and Boilers 12835 43783
Agriculture, Land Reform and Rural Development Small livestock 8493 52276
Agriculture, Land Reform and Rural Development Large livestock 3873 56149
Social Development ECD income and compliance support 32340 32340
Sports, Arts and Culture Support to art and culture practitioners - National Arts Council 23989 23989
Sports, Arts and Culture Support to art and culture practitioners - National Film and Video Foundation 8156 32145


311

In [14]:
# Names of every metric held in all_data's departments, skipping sections
# whose type is "targets".
ad_set = {
    metric.name
    for department in all_data.departments
    for section in department.sections
    if section.section_type != SectionEnum.targets.name
    for metric in section.metrics
}

# Column-1 values taken from the targets sheet and the implementation-status sheet.
ot_set = set(opportunity_targets_df.iloc[2:55, 1])
# imp_set is built but not used again within this cell.
imp_set = set(implementation_status_df.iloc[:53, 1])

# Targets-sheet entries with no matching metric name; shown as the cell output.
ot_set - ad_set

{'Graduate programmes (Property Management Trading Entity)',
 'Subsistence producer relief fund'}

# Save final data

In [15]:
# Write the complete data set to <output_dir>/all_data.json.
output_filename = output_dir + "/all_data.json"

# Sort departments in place by sheet_name before serialising.
all_data.departments.sort(key=operator.attrgetter("sheet_name"))

# A context manager guarantees the file is flushed and closed, even if
# serialisation raises; the previous bare open(...).write(...) never closed
# the handle explicitly.
with open(output_filename, "w") as output_file:
    output_file.write(all_data.to_json(indent=2))
# print(all_data.to_json(indent=2))
print("DONE")

DONE


In [16]:
# programme_status_df = pd.read_excel(
#     mar_opportunities_excel, sheet_name="Implementation status", header=None
# )

# to_camel_case = lambda match: match.group(1) + match.group(2).upper() + match.group(3)

# [
#     re.sub(r"(\S*) (\w)(.*)", to_camel_case, status)
#     for status in implementation_status_df.iloc[3:, 2].dropna().unique()
# ]

In [17]:
# for dept in all_data.departments:
#     print(f"\t'{dept.name}': '{dept.sheet_name}',")

In [18]:
# json.dump(metric_titles, open(output_dir + "/metric_titles.json", "w"), indent=2)