In [None]:
import pandas as pd

# Opt in to copy-on-write so derived frames never silently alias their parent.
pd.set_option("mode.copy_on_write", True)
# Infer the dedicated string dtype instead of ``object`` for text columns.
pd.set_option("future.infer_string", True)
# Route ``.plot`` calls through plotly (the package must be installed).
pd.set_option("plotting.backend", "plotly")

In [None]:
# Load the full data set from a Feather/Arrow file; the path is relative to
# the working directory of the kernel.
full = pd.read_feather("PIAAC_selected.arrow")

In [None]:
def set_country(sr):
    """Cast *sr* to an unordered categorical limited to its observed values.

    Categories are listed in order of first appearance. Labels that never
    occur in *sr* (for example categories left over from a larger categorical
    dtype) are not carried over.

    Parameters
    ----------
    sr : pandas.Series
        Series of labels.

    Returns
    -------
    pandas.Series
        Same index and values as *sr*, with an unordered ``CategoricalDtype``.
        Missing values stay missing.
    """
    # Stay on Series methods: ``Series.unique()`` hands back a plain NumPy
    # array for some dtypes (which has no ``.to_numpy()``), and missing values
    # have to be dropped because categories cannot be null.
    observed = sr.dropna().drop_duplicates().to_numpy()
    cats = pd.CategoricalDtype(observed, ordered=False)
    return sr.astype(cats)

In [None]:
def at_least_once_a_month_to_one(sr):
    """Turn frequency answers into a 1.0 / 0.0 indicator.

    Entries equal to one of the three labels listed below become 1.0; every
    other entry becomes 0.0. Missing values also become 0.0, because ``isin``
    reports them as not matching.

    Parameters
    ----------
    sr : pandas.Series
        Series of frequency labels.

    Returns
    -------
    pandas.Series
        Indicator with nullable ``Float64`` dtype and the index of *sr*.
    """
    monthly_or_more = [
        "Every day",
        "At least once a week but not every day",
        "Less than once a week but at least once a month",
    ]
    is_monthly_or_more = sr.isin(monthly_or_more)
    return is_monthly_or_more.astype("Float64")

In [None]:
# Keep four countries and the variables used below; rows in which all four
# variables are missing are dropped.
all_vars = ["country", "age_group", "sk_computer", "sk_programming"]
_tmp = full.query(
    "country in ['United States', 'Netherlands', 'Germany', 'Austria']"
)[all_vars].dropna(how="all", axis="index")

# Carry the selected columns over unchanged, re-encode `country` as a
# categorical restricted to the values present, and add a 1.0 / 0.0 indicator
# derived from `sk_programming`.
four_countries = _tmp.assign(
    country=set_country(_tmp["country"]),
    programs_monthly=at_least_once_a_month_to_one(_tmp["sk_programming"]),
)

### Means of computer usage / programming by country, tiny example 

In [None]:
groupby_cols = ["country"]
cat_cols = ["sk_computer", "programs_monthly"]
_grouped = four_countries[groupby_cols + cat_cols].groupby("country")

# Per country: number of "Yes" answers and number of non-missing answers.
_computer_yes = (
    _grouped["sk_computer"].value_counts().xs(key="Yes", level="sk_computer")
)
_computer_answered = _grouped["sk_computer"].count()

means_by_country = pd.DataFrame(index=_grouped.count().index)
# Share of "Yes" among the non-missing `sk_computer` answers.
means_by_country["sk_computer"] = _computer_yes / _computer_answered
# Mean of the 1.0 / 0.0 indicator, i.e. the share of ones per country.
means_by_country["programs_monthly"] = _grouped["programs_monthly"].mean()

# Persist the small summary table, then show it as the cell output.
means_by_country[["sk_computer", "programs_monthly"]].to_feather(
    "piaac_computer_programming_by_country.arrow",
)
means_by_country

In [None]:
# Need this to remove the other countries from the dtype.

In [None]:
### Age group, gender, weekly working hours, computer usage / programming for a sample of 20 respondents from Germany

In [None]:
# Columns kept in the small example table, in output order.
_de_20_columns = [
    "age_group",
    "gender",
    "hours_per_week",
    "sk_computer",
    "sk_programming",
]
# German respondents from the two selected age groups.
_de_two_age_groups = full.query(
    "country == 'Germany' & age_group in ['Aged 30-34', 'Aged 55-59']"
)
# The fixed `random_state` makes the 20-row draw repeatable across runs.
_de_sampled = _de_two_age_groups.sample(n=20, random_state=4)
de_20 = _de_sampled.reset_index(drop=True)[_de_20_columns]

# Persist the sample, then show it as the cell output.
de_20.to_feather("piaac_de_20.arrow")
de_20