In [9]:
#| code-summary: "Show python imports"

import sys
import os
from pathlib import Path

# Add root directory as python path.
# The root is taken as the directory three levels above the interpreter
# binary (e.g. <root>/<env>/bin/python -> <root>).
root_dir = os.path.abspath(Path(sys.executable).parents[2])
# Guard the append so re-running this cell does not pile up duplicate entries.
if root_dir not in sys.path:
    sys.path.append(root_dir)

%reload_ext autoreload
%autoreload 2

# Other imports
import pandas as pd
from pyppeteer.page import Page
from python_utils.web_screenshot import web_screenshot_async
import io
from urllib.request import urlopen
from bs4 import BeautifulSoup
import numpy as np
from datetime import datetime
from calendar import monthrange, month_abbr
from matplotlib import pyplot as plt
import matplotlib.dates as mdates

In [None]:
async def page_action_fn(page: Page):
    """Wait until the statistics tab list is present on ``page``.

    Args:
        page: The pyppeteer page to wait on.

    Returns:
        Whatever ``page.waitForSelector`` resolves to for the tab-list selector.
    """
    tab_list = await page.waitForSelector('.elementor-widget-container > [role="tablist"]')
    return tab_list

# Take a screenshot of the BRS companies-registry statistics page.
# `page_action_fn` is handed to the helper as `action`; presumably the helper
# runs it before capturing, so the tab list is rendered first — confirm in
# python_utils.web_screenshot.
# The viewport is 1200x1200 and `fullPage` is False, i.e. no full-page capture.
await web_screenshot_async(
    "https://brs.go.ke/companies-registry-statistics/",
    action = page_action_fn,
    width = 1200,
    height = 1200,
    screenshot_options = {'fullPage': False })

In [3]:
# Download the statistics page. The context manager closes the HTTP response
# as soon as the body has been read.
with urlopen("https://brs.go.ke/companies-registry-statistics/") as response:
    html: bytes = response.read()  # raw bytes; BeautifulSoup handles decoding

# Despite its name, `html_parser` is the matched tab-list element, not a
# parser object; `select_one` yields None when the selector matches nothing.
html_parser = BeautifulSoup(html, "html.parser").select_one('.elementor-widget-container > [role="tablist"]')

In [4]:
# Tab id -> tab title text (later split on '/' as the financial-year label).
years = {
    title.attrs['data-tab']: title.get_text(separator='', strip=True)
    for title in html_parser.select(".ha-tabs__nav .ha-tab__title")
}
# Tab id -> first <table> found inside the matching tab panel.
records = {
    panel.attrs['data-tab']: panel.find('table')
    for panel in html_parser.select('.ha-tabs__content [role="tabpanel"]')
}
# Pair each title with its table, following the order of the tab titles.
years_records = [(label, records[tab_id]) for tab_id, label in years.items()]

In [None]:
def get_date(month, year):
    """Return the last day of ``month`` in ``year`` as a ``datetime.date``.

    Args:
        month: Month name. It is matched case-insensitively on its first three
            letters against ``calendar.month_abbr`` (locale dependent), so
            "Jul", "july" and " Sept " are all accepted.
        year: The year, as an int or a numeric string.

    Returns:
        A ``datetime.date`` for the final calendar day of that month.

    Raises:
        ValueError: If ``month`` does not match any month abbreviation or
            ``year`` cannot be converted to an int.
    """
    # Convert month name to number
    key = str(month).strip().lower()[:3]
    abbreviations = [name.lower() for name in month_abbr]
    # month_abbr[0] is an empty placeholder, so an empty key must not resolve
    # to "month 0".
    if not key or key not in abbreviations:
        raise ValueError(f"Unrecognised month name: {month!r}")
    month_num = abbreviations.index(key)
    # Get the last day of the month
    _, last_day = monthrange(int(year), month_num)
    # `datetime` here is the class (from datetime import datetime), so
    # `datetime.date` is an instance method, not the date constructor; build a
    # datetime and take its date part instead.
    return datetime(int(year), month_num, last_day).date()

def get_table(index: int):
    """Parse one financial year's table into a date-indexed DataFrame.

    Args:
        index: Position of the (financial year, table) pair in ``years_records``.

    Returns:
        A DataFrame with one row per month column of the source table, indexed
        by ``get_date(month, year)``, and one column per entry of the table's
        first column (lower-cased and stripped). "-" cells become NaN.

    Note:
        The first six month columns are dated with the first year of the
        "YYYY/YYYY" label and the rest with the second. This assumes the
        columns run e.g. Jul-Dec then Jan-Jun — TODO confirm against the site.
    """
    fiscal_label, table_tag = years_records[index]
    start_year, end_year = fiscal_label.split('/')

    parsed = pd.read_html(io.StringIO(str(table_tag)))[0]
    label_col = parsed.columns[0]

    # Drop the `Total Entities Registered` row and the `Grand Total` column
    keep_rows = parsed[label_col] != "Total Entities Registered"
    parsed = parsed[keep_rows].drop(columns="Grand Total")

    # "-" marks a missing value
    parsed = parsed.replace("-", np.nan)
    parsed[label_col] = parsed[label_col].astype(str)
    for name in parsed.columns[1:]:
        parsed[name] = parsed[name].astype(float)

    # One date per month column: first six in the start year, rest in the end year
    month_ends = []
    for position, month in enumerate(parsed.columns[1:]):
        calendar_year = start_year if position < 6 else end_year
        month_ends.append(get_date(month, calendar_year))

    # Months become rows, entries of the first column become columns
    result = parsed.set_index(label_col).T
    result.index = month_ends
    result.columns = [name.lower().strip() for name in result.columns]
    return result

# Preview the table at position 9 of `years_records`; as the cell's last
# expression, the returned DataFrame is rendered by the notebook.
get_table(9)

In [None]:
# Stack the table of every financial year and order the rows by their date index.
all_registrations = pd.concat(
    list(map(get_table, range(len(years_records))))
).sort_index()
all_registrations

In [None]:
fig, ax = plt.subplots(figsize=(20, 12))
ax.set_title(
    f'Summary of Registered Entities in Kenya, between {min(all_registrations.index)} and {max(all_registrations.index)} ',
    fontsize=26,
    pad=20)
# Plot a 9-row rolling mean of every column to smooth the monthly series
all_registrations.rolling(9).mean().plot(ax=ax)
# Add a vertical line at the split date
# NOTE(review): 2022-09-13 is the presidential inauguration date; the general
# election itself was held on 2022-08-09 — confirm which boundary is intended.
election_date = datetime(2022, 9, 13)
ax.axvline(x=election_date, color='green', linestyle='--', linewidth=2, zorder=4)
# Shade the regions on either side of the split date.
# Convert dates to matplotlib date format
dates_mpl = mdates.date2num(all_registrations.index)
split_date_mpl = mdates.date2num(election_date)
# axvspan covers the full axes height by default (0-1 in axes coordinates) and
# lets the two regions meet exactly at the split line. The previous
# fill_between(where=...) calls left the interval between the last sample
# before the split and the first one after it unshaded, and passed 0-100 as
# the y-range in axes-fraction coordinates.
if dates_mpl.min() < split_date_mpl:
    ax.axvspan(dates_mpl.min(), min(split_date_mpl, dates_mpl.max()),
               facecolor='#f62f3c', alpha=0.3)
if dates_mpl.max() > split_date_mpl:
    ax.axvspan(max(split_date_mpl, dates_mpl.min()), dates_mpl.max(),
               facecolor='#f8c811', alpha=0.3)
ax.legend(loc='upper left')
ax.text(datetime(2017, 1, 1), 1000, 'Uhuru/Jubilee Era', fontsize = 22)
# Trailing semicolon suppresses the Text(...) repr in the cell output
ax.text(datetime(2023, 1, 1), 1000, 'Ruto/UDA Era', fontsize = 22);