# Input file definitions: Edit this as data is updated

In [1]:
# Input workbooks. Change these two file names whenever a newer version of the
# data arrives; both names are handed to load_sheets() further down.

# Final phase 1 workbook.
final_phase_1_excel = "Phase 1 Dashboard input_PES targets and opportunities per month 09022022 Updated with Implementation 2.xlsx"

# Current phase 2 workbook.
phase2_excel = "New version dashboard edited KP 12.09.2022 RM 01.10.2022 PVH.xlsx"

# For routine data updates, nothing below this point needs to be edited.

In [2]:
import functools
import json
import math
import operator
import os
import re
import sys
from datetime import datetime
from pprint import PrettyPrinter

import numpy as np
import pandas as pd

sys.path.append("../python-src")
from presidential_employment import *

In [3]:
# Pretty-printer with 2-space indentation (no other cell visible here references it).
pp = PrettyPrinter(indent=2)

In [4]:
# Directory that receives the generated JSON files (metric_titles.json and all_data.json).
# The location can be overridden through the PES_OUTPUT_DIR environment variable so the
# notebook is not tied to one machine; when the variable is unset the original path is used.
output_dir = os.environ.get(
    "PES_OUTPUT_DIR",
    "/home/pvh/Documents/code/pvh-forks/presidential-employment-stimulus/data",
)

### Data structure

Each department has a total budget and total opportunities target. 

The overall programme has three outcome targets:
1. Jobs created
2. Jobs retained
3. Livelihoods supported

Each department has a "blurb" describing their programme.

Within each department there are multiple programmes (grouped into sections) that can contribute to each of these targets.

Each programme has a demographic split of outcomes, with gender and youth percentages.

Each programme has a per-province split of outcomes.

Files:

- `Phase 1 Dashboard input_PES targets and opportunities per month 09022022 Updated with Implementation 2.xlsx` - final phase 1 Excel (`final_phase_1_excel`)
- `New version dashboard edited KP 12.09.2022 RM 01.10.2022 PVH.xlsx` - latest phase 2 Excel (`phase2_excel`)


In [5]:
# Dump the metric titles (defined in python-src/presidential_employment.py) into metric_titles.json.
# The context manager closes the file, so the JSON is flushed to disk before the cell finishes.
with open(output_dir + "/metric_titles.json", "w") as metric_titles_file:
    json.dump(metric_titles, metric_titles_file, indent=2)

In [6]:
# Read both workbooks. load_sheets() hands back thirteen objects, unpacked here
# in the order it returns them. Several of them are indexed by phase in later
# cells (index 0 for phase 1, index 1 for phase 2).
(
    opportunity_targets_df,
    opportunity_achievements_df,
    implementation_status_df,
    description_df,
    phase1_departments,
    phase2_departments,
    targets_df,
    trends_df,
    provincial_df,
    cities_df,
    universities_df,
    demographic_df,
    achievement_totals_df,
) = load_sheets(final_phase_1_excel, phase2_excel)

In [7]:
# Every department that appears in either phase, de-duplicated through a set.
# NOTE(review): a set has no defined order, so the order of department_names can
# vary between runs — confirm nothing downstream relies on it.
department_names = list(set(phase1_departments).union(phase2_departments))

# Dictionaries built from the `lead` and `paragraph` attributes of description_df
# (presumably keyed by the frame's index — verify against load_sheets).
leads = description_df.lead.to_dict()
paragraphs = description_df.paragraph.to_dict()

In [8]:
# Preview only the first ten rows instead of rendering the whole frame.
# Index 1 is the phase 2 entry in the other per-phase objects of this notebook
# (presumably the same applies to targets_df — TODO confirm).
targets_df[1].head(10)

Unnamed: 0,department,programme,target,unk,section,display_name
0,Basic Education,Education Assistants,191392.0,,CRE,Education Assistants
1,Basic Education,General Assistants,95608.0,,CRE,General Assistants
2,Social Development,ECD Employment Stimulus: Complete payments ove...,42718.0,,LIV,ECD Employment Stimulus: Complete payments ove...
3,Social Development,Appointment of social workers to assist SASSA ...,2000.0,,CRE,Appointment of social workers to assist SASSA ...
4,Social Development,NDA Volunteer Programme,1880.0,,CRE,NDA Volunteer Programme
5,"Agriculture, Land Reform and Rural Development",Subsistence Producer Relief Fund,67378.0,HAS_SP,LIV,Subsistence producer relief fund
6,"Agriculture, Land Reform and Rural Development",Vegetables and Fruits,,,LIV,Vegetables and Fruits
7,"Agriculture, Land Reform and Rural Development",Maize/soya/sugar/other production,,,LIV,Maize/soya/sugar/other production
8,"Agriculture, Land Reform and Rural Development",Poultry: Layers and Boilers,,,LIV,Poultry: Layers and Boilers
9,"Agriculture, Land Reform and Rural Development",Small livestock,,,LIV,Small livestock


## Helper functions

In [9]:
def add_or_replace(departments, department):
    """Insert `department` into `departments`, keyed by `sheet_name`.

    The first element whose `sheet_name` equals `department.sheet_name` is
    overwritten in place; when no element matches, `department` is appended.
    The list is modified in place and the same list object is returned.
    """
    # Position of the first element sharing the sheet name, or None when absent.
    match_position = next(
        (
            position
            for position, existing in enumerate(departments)
            if existing.sheet_name == department.sheet_name
        ),
        None,
    )
    if match_position is None:
        departments.append(department)
    else:
        departments[match_position] = department
    return departments

## Compute per department data structures

In [10]:
# Build the per-department data structures from the loaded sheets.
# The call prints diagnostics while it runs (see the recorded output below,
# e.g. programmes whose implementation status is missing).
all_data_departments = compute_all_data_departments(
    phase1_departments,
    phase2_departments,
    implementation_status_df,
    demographic_df,
    description_df,
    targets_df,
    trends_df,
    department_names,
    provincial_df,
    cities_df,
    universities_df,
    leads,
    paragraphs,
)

M/F PERC PROBLEM: Sports, Arts and Culture Job retention in cultural and creative institutions - National Film and Video Foundation 0 0.636 0.36 0.996
Implementation status missing for:  0 : Public Works and Infrastructure : Water and Energy Efficiency
Implementation status missing for:  0 : Public Works and Infrastructure : Water and Sanitation Facilities Management
Implementation status missing for:  0 : Public Works and Infrastructure : Welisizwe Rural Bridges Programme
Implementation status missing for:  0 : Public Works and Infrastructure : Facilities Management
Implementation status missing for:  0 : Public Works and Infrastructure : Real Estate
Implementation status missing for:  0 : Public Works and Infrastructure : In-House Construction projects
Implementation status missing for:  0 : Public Works and Infrastructure : Public Private Collaborations
Implementation status missing for:  0 : Agriculture, Land Reform and Rural Development : Graduate verifiers
Implementation status m

## Compute breakdown of all programmes by demographic dimensions

In [11]:
# Demographic totals across all departments, unpacked in the order
# compute_breakdowns() returns them.
(
    total_male,
    total_female,
    total_unknown_gender,
    total_beneficiaries,
    total_youth,
    total_unknown_youth,
    total_provincial,
    total_unknown_province,
) = compute_breakdowns(all_data_departments)

## Overview picture

In [12]:
# Per-type programme data for the overview; the cells that follow validate
# programmes_by_type against the totals read from the spreadsheets.
(
    programmes_by_type,
    programmes_by_type_summarised,
    achievements_by_type_by_month,
    provincial_breakdown,
) = compute_programmes_by_type(
    all_data_departments,
    opportunity_achievements_df,
    opportunity_targets_df,
)

### Check that the computed totals add up to the totals listed in the spreadsheets

In [13]:
# Cross-check the computed job-opportunity targets against the totals read from
# the spreadsheets. Index 0 is phase 1, index 1 is phase 2.
job_opportunity_phases = programmes_by_type[SectionEnum.job_opportunities.name]

# Phase 1: the expected total is read from iloc position [6, 7] of the phase 1 targets sheet.
phase1_job_target = job_opportunity_phases[0]["Total"]["value_target"]
phase1_job_target_expected = opportunity_targets_df[0].iloc[6, 7]
assert phase1_job_target == phase1_job_target_expected, (
    f'{SectionEnum.job_opportunities.name} total mismatch: {programmes_by_type[SectionEnum.job_opportunities.name][0]["Total"]["value_target"]} vs {opportunity_targets_df[0].iloc[6, 7]}'
)

# Phase 2: the expected total is read from iloc position [5, 7] of the phase 2 targets sheet.
phase2_job_target = job_opportunity_phases[1]["Total"]["value_target"]
phase2_job_target_expected = opportunity_targets_df[1].iloc[5, 7]
assert phase2_job_target == phase2_job_target_expected, (
    f'{SectionEnum.job_opportunities.name} total mismatch: {programmes_by_type[SectionEnum.job_opportunities.name][1]["Total"]["value_target"]} vs {opportunity_targets_df[1].iloc[5, 7]}'
)


In [14]:
# Cross-check the computed job-opportunity achievements against the "total"
# value of the "Jobs created" row in the achievement totals. Index 0 is
# phase 1, index 1 is phase 2.
jobs_created_phases = programmes_by_type[SectionEnum.job_opportunities.name]

# Phase 1 achievements.
phase1_jobs_achieved = jobs_created_phases[0]["Total"]["value"]
phase1_jobs_achieved_expected = achievement_totals_df[0].loc["Jobs created", "total"]
assert phase1_jobs_achieved == phase1_jobs_achieved_expected, (
    f'{SectionEnum.job_opportunities.name} total mismatch {programmes_by_type[SectionEnum.job_opportunities.name][0]["Total"]["value"]} vs {achievement_totals_df[0].loc["Jobs created"]}'
)

# Phase 2 achievements.
phase2_jobs_achieved = jobs_created_phases[1]["Total"]["value"]
phase2_jobs_achieved_expected = achievement_totals_df[1].loc["Jobs created", "total"]
assert phase2_jobs_achieved == phase2_jobs_achieved_expected, (
    f'{SectionEnum.job_opportunities.name} total mismatch {programmes_by_type[SectionEnum.job_opportunities.name][1]["Total"]["value"]} vs {achievement_totals_df[1].loc["Jobs created"]}'
)


In [15]:
# Cross-check the computed livelihoods totals (targets and achievements) against
# the totals read from the spreadsheets. Index 0 is phase 1, index 1 is phase 2.
# The achievement failure messages are labelled with the livelihoods section so a
# failure is not mistaken for a job-opportunities mismatch.

# check targets for phase 1 - livelihoods support
assert (
    programmes_by_type[SectionEnum.livelihoods.name][0]["Total"]["value_target"]
    == opportunity_targets_df[0].iloc[7, 7]
), f'{SectionEnum.livelihoods.name} total mismatch: phase 1 target {programmes_by_type[SectionEnum.livelihoods.name][0]["Total"]["value_target"]} vs {opportunity_targets_df[0].iloc[7, 7]}'

# check targets for phase 2 - livelihoods support
assert (
    programmes_by_type[SectionEnum.livelihoods.name][1]["Total"]["value_target"]
    == opportunity_targets_df[1].iloc[6, 7]
), f'{SectionEnum.livelihoods.name} total mismatch: phase 2 target {programmes_by_type[SectionEnum.livelihoods.name][1]["Total"]["value_target"]} vs {opportunity_targets_df[1].iloc[6, 7]}'


# check achievements for phase 1 - livelihoods support
assert (
    programmes_by_type[SectionEnum.livelihoods.name][0]["Total"]["value"] == achievement_totals_df[0].loc["Livelihoods supported","total"]
), f'{SectionEnum.livelihoods.name} total mismatch - phase 1 {programmes_by_type[SectionEnum.livelihoods.name][0]["Total"]["value"]} vs {achievement_totals_df[0].loc["Livelihoods supported"]}'

# check achievements for phase 2 - livelihoods support
assert (
    programmes_by_type[SectionEnum.livelihoods.name][1]["Total"]["value"] == achievement_totals_df[1].loc["Livelihoods supported","total"]
), f'{SectionEnum.livelihoods.name} total mismatch - phase 2 {programmes_by_type[SectionEnum.livelihoods.name][1]["Total"]["value"]} vs {achievement_totals_df[1].loc["Livelihoods supported"]}'


In [16]:
# Cross-check the computed jobs-retained totals against the spreadsheet totals.
# Both checks read index 0, i.e. phase 1.
# NOTE(review): there is no phase 2 check for jobs retained — presumably phase 2
# has no job-retention programmes; confirm.

# check targets for phase 1 - jobs retained
assert (
    programmes_by_type[SectionEnum.jobs_retain.name][0]["Total"]["value_target"]
    == opportunity_targets_df[0].iloc[8, 7]
), f'{SectionEnum.jobs_retain.name} total mismatch: {programmes_by_type[SectionEnum.jobs_retain.name][0]["Total"]["value_target"]} vs {opportunity_targets_df[0].iloc[8, 7]}'

# check achievements for phase 1 - jobs retained
assert (
    programmes_by_type[SectionEnum.jobs_retain.name][0]["Total"]["value"] == achievement_totals_df[0].loc["Jobs retained","total"]
), f'{SectionEnum.jobs_retain.name} total mismatch {programmes_by_type[SectionEnum.jobs_retain.name][0]["Total"]["value"]} vs {achievement_totals_df[0].loc["Jobs retained"]}'


### Compute breakdowns used in overview and metrics used in overview

In [17]:
# Breakdown metrics plus the current targets and achievements for the overview.
breakdown_metrics, current_targets, current_achievements = compute_overview_breakdown(
    programmes_by_type_summarised,
    achievements_by_type_by_month,
    provincial_breakdown,
)

# Headline overview metrics, computed from the demographic totals and the targets.
overview_metrics = compute_overview_metrics(
    total_female,
    total_beneficiaries,
    total_unknown_gender,
    opportunity_targets_df,
    programmes_by_type,
    total_youth,
    total_unknown_youth,
)


### Assemble overview and put together final combined data

In [18]:
# Assemble the overview, then bundle it with the per-department data into the
# single object that is serialised to all_data.json later in the notebook.
overview = compute_overview(
    description_df,
    leads,
    paragraphs,
    overview_metrics,
    current_targets,
    current_achievements,
    breakdown_metrics,
)
all_data = Everything(overview=overview, departments=all_data_departments)


In [19]:
# Disabled scratch check: gathers the metric names found in all_data (skipping
# sections of type `targets`) and diffs them against the values in column 1 of
# the opportunity targets sheet.
# NOTE(review): opportunity_targets_df and implementation_status_df are indexed
# by phase elsewhere in this notebook, so this snippet looks stale — confirm
# before re-enabling.
# ad_set = set()
# for department in all_data.departments:
#     for section in department.sections:
#         if section.section_type == SectionEnum.targets.name:
#             continue
#         for metric in section.metrics:
#             ad_set.add(metric.name)

# ot_set = set(list(opportunity_targets_df.iloc[2:55, 1]))
# imp_set = set(list(implementation_status_df.iloc[:53, 1]))
# ot_set.difference(ad_set)

In [20]:
# Disabled debugging snippet: walks the phase at index 1 of every department,
# printing department, section and metric names and calling metric.to_json().
# The trailing lines inspect which module the type of one value_target comes
# from (comparing it with numpy).
# for department in all_data.departments:
#     print(department.name)
#     for section in department.phases[1].sections:
#         print(section.name)
#         for metric in section.metrics:
#             print(metric.name)
#             metric.to_json()
# import inspect
# print("mod", inspect.getmodule(type(all_data.departments[0].phases[1].sections[0].metrics[1].value_target)))
# inspect.getmodule(type(all_data.departments[0].phases[1].sections[0].metrics[1].value_target)) == np

# Save final data

In [21]:
# To work around the fact that data from pandas sometimes appears as numpy types, this uses a
# version of dataclasses-json core.py (https://github.com/pvanheus/dataclasses-json/blob/master/dataclasses_json/core.py)
# see this PR: https://github.com/lidatong/dataclasses-json/pull/352
output_filename = output_dir + "/all_data.json"
# all_data.departments.sort(key=operator.attrgetter("sheet_name"))
# Write through a context manager so the file is flushed and closed before "DONE" is printed.
with open(output_filename, "w") as output_file:
    output_file.write(all_data.to_json(indent=2))
# print(all_data.to_json(indent=2))
print("DONE")

DONE


In [22]:
# Disabled snippet: re-reads the "Implementation status" sheet and rewrites the
# unique status strings with a regex that joins the first word to the rest and
# upper-cases the letter that followed the space.
# NOTE(review): mar_opportunities_excel is not defined in any cell visible here.
# programme_status_df = pd.read_excel(
#     mar_opportunities_excel, sheet_name="Implementation status", header=None
# )

# to_camel_case = lambda match: match.group(1) + match.group(2).upper() + match.group(3)

# [
#     re.sub(r"(\S*) (\w)(.*)", to_camel_case, status)
#     for status in implementation_status_df.iloc[3:, 2].dropna().unique()
# ]

In [23]:
# Disabled helper: prints one `'name': 'sheet_name',` line per department.
# for dept in all_data.departments:
#     print(f"\t'{dept.name}': '{dept.sheet_name}',")

In [24]:
# Disabled duplicate of the metric_titles.json export performed near the top of the notebook.
# json.dump(metric_titles, open(output_dir + "/metric_titles.json", "w"), indent=2)

In [25]:
# Preview only the first ten rows instead of rendering the whole frame.
# Index 1 is the phase 2 entry in the other per-phase objects of this notebook
# (presumably the same applies to implementation_status_df — TODO confirm).
implementation_status_df[1].head(10)

Unnamed: 0,department,programme,status,detail
0,Basic Education,Education Assistants,On track,Most of the second cohort of school assistants...
1,Basic Education,General Assistants,On track,Much effort is going into supporting the schoo...
2,Social Development,ECD Employment Stimulus: Complete payments ove...,Critical challenges,Additional payments made since October 2021 we...
3,Social Development,Appointment of social workers to assist SASSA ...,On track,The programme exceeded its target.
4,Social Development,NDA Volunteer Programme,On track,The NDA is partnering with Civil Society Organ...
5,"Agriculture, Land Reform and Rural Development",Subsistence Producer Relief Fund,On track,"51,559 beneficiaries of production input vouch..."
6,"Agriculture, Land Reform and Rural Development",Vegetables and Fruits,,
7,"Agriculture, Land Reform and Rural Development",Maize/soya/sugar/other production,,
8,"Agriculture, Land Reform and Rural Development",Poultry: Layers and Boilers,,
9,"Agriculture, Land Reform and Rural Development",Small livestock,,
