In [None]:
import pandas as pd

# Session-wide pandas configuration; set once, before any data is loaded.
pd.set_option("mode.copy_on_write", True)
pd.set_option("future.infer_string", False)
# Route DataFrame.plot / Series.plot through plotly.
pd.set_option("plotting.backend", "plotly")

In [None]:
# Read the merged PIAAC Stata file, converting value-labelled columns to
# pandas categoricals.
data = pd.read_stata(
    filepath_or_buffer="merged_PIAAC.dta",
    convert_categoricals=True,
)

In [None]:
# Raw column names to keep from the full data set, in the order they should
# appear in the selection.
varlist = [
    # identifiers / demographics
    "cntryid", "gender_r", "ageg5lfs",
    # b_* items
    "b_q01a", "b_q12c", "b_q12e", "b_q12g",
    # c_* items
    "c_q01a", "c_q02a", "c_d05",
    # d_* items
    "d_q06a", "d_q08a", "d_q10", "d_q14", "d_q16a", "d_q16b",
    # g_* items
    "g_q01d", "g_q03f", "g_q04", "g_q05g",
    # derived variable
    "computerexperience",
]
# All rows, only the listed columns.
data_selected = data.loc[:, varlist]

In [None]:
# Mapping from raw questionnaire codes to readable column names. Columns that
# do not appear as keys here keep their original name.
names = {
    "ageg5lfs": "age_g5",
    # b_* items
    "b_q01a": "highest_educ",
    "b_q12c": "job_training",
    "b_q12e": "workshops",
    "b_q12g": "priv_lessons",
    # c_* items
    "c_q01a": "paid_work",
    "c_q02a": "search_paid_work",
    "c_d05": "empl_status",
    # d_* items
    "d_q06a": "empl_size",
    "d_q08a": "supervisor",
    "d_q10": "hours_week",
    "d_q14": "job_satisfaction",
    "d_q16a": "earnings_per_x",
    "d_q16b": "gross_pay",
    # g_* items
    "g_q01d": "sk_read_articles",
    "g_q03f": "sk_prepare_graphs",
    "g_q04": "sk_computer",
    "g_q05g": "sk_programming",
}
data_selected = data_selected.rename(names, axis="columns")

In [None]:
# Preview only: the result is displayed but not assigned, so `data_selected`
# itself still contains rows in which every column is missing.
data_selected.dropna(how="all", axis=0)

In [None]:
# Show the distinct values stored in the `sk_programming` column.
data_selected.loc[:, "sk_programming"].unique()

In [None]:
def select_countries(raw, countries):
    """Return an independent copy of the rows of `raw` for the given countries.

    Parameters
    ----------
    raw : pandas.DataFrame
        Frame with a ``cntryid`` column.
    countries : list-like
        Values of ``cntryid`` to keep. Any list-like accepted by
        ``Series.isin`` works (list, tuple, set, Index, ...).

    Returns
    -------
    pandas.DataFrame
        The matching rows with their original index labels. Rows whose
        ``cntryid`` is missing are never selected.

    Raises
    ------
    TypeError
        If `countries` is not list-like (e.g. a single string).
    """
    # A boolean mask avoids pasting repr(countries) into a query string, which
    # only parses for containers whose repr is a valid expression literal.
    keep = raw["cntryid"].isin(countries)
    return raw.loc[keep].copy()


def set_country(sr):
    """Convert `sr` to an unordered categorical limited to the values it contains.

    Parameters
    ----------
    sr : pandas.Series
        Country identifiers; may already be categorical and may contain
        missing values.

    Returns
    -------
    pandas.Series
        Same index and values as `sr`, with an unordered categorical dtype
        whose categories are the distinct non-missing values of `sr` in order
        of first appearance. Missing values stay missing.
    """
    # Build the categories from plain Python values: missing entries are
    # dropped first (categories may not be null), and going through a list
    # detaches the values from any category set attached to a categorical
    # input, so categories that no longer occur are not carried over.
    observed = list(sr.dropna().unique())
    cats = pd.CategoricalDtype(observed, ordered=False)
    return sr.astype(cats)

In [None]:
def yes_to_one(sr):
    """Encode a yes/no answer column as a nullable-float 0/1 indicator.

    Entries equal to the string "Yes" become 1.0; every other entry becomes
    0.0. Missing entries do not compare equal to "Yes", so they are encoded
    as 0.0 as well rather than as a missing value.
    """
    is_yes = sr.eq("Yes")
    return is_yes.astype("Float64")


def at_least_once_a_month_to_one(sr):
    """Encode a frequency column as a nullable-float 0/1 indicator.

    Entries matching one of the three answer labels listed below become 1.0;
    all other entries, including missing ones, become 0.0.
    """
    monthly_or_more = [
        "Less than once a week but at least once a month",
        "At least once a week but not every day",
        "Every day",
    ]
    is_frequent = sr.isin(monthly_or_more)
    return is_frequent.astype("Float64")

In [None]:
# Columns needed for the country / age-group example.
all_vars = ["cntryid", "age_g5", "sk_computer", "sk_programming"]

# Restrict to the four countries of interest, then discard rows in which all
# of the selected columns are missing.
_tmp = select_countries(
    raw=data_selected,
    countries=["United States", "Netherlands", "Germany", "Austria"],
)
_tmp = _tmp.loc[:, all_vars].dropna(axis="index", how="all")

# Assemble the analysis frame on the same index: cleaned country, the raw
# answers, and the two 0/1 indicators derived from them.
df = pd.DataFrame(index=_tmp.index).assign(
    country=set_country(_tmp["cntryid"]),
    age_group=_tmp["age_g5"],
    sk_computer=_tmp["sk_computer"],
    sk_programming=_tmp["sk_programming"],
    uses_computer=yes_to_one(_tmp["sk_computer"]),
    programs_monthly=at_least_once_a_month_to_one(_tmp["sk_programming"]),
)

In [None]:
# Per-country means of the two indicator columns.
tiny_example = (
    df[["country", "uses_computer", "programs_monthly"]]
    # `country` is categorical: state `observed=True` explicitly so that only
    # countries present in the data form groups, instead of relying on the
    # deprecated implicit default, which can add all-NaN rows for categories
    # that never occur.
    .groupby("country", observed=True)
    .mean()
)
# Persist the aggregated table; `country` is stored as the index.
tiny_example[["uses_computer", "programs_monthly"]].to_pickle(
    "piaac_computer_programming_by_country.pkl",
)
# Display the column dtypes once `country` is turned back into a column.
tiny_example.reset_index().dtypes

In [None]:
# Display the age-group column of the analysis frame.
df.loc[:, "age_group"]

In [None]:
# Pick four rows by their index labels and renumber them 0..3.
excerpt_country_age = df.loc[
    [150099, 152382, 133944, 136042], :
].reset_index(drop=True)
# Store only the four columns used in the excerpt.
excerpt_country_age.loc[
    :, ["country", "age_group", "uses_computer", "programs_monthly"]
].to_pickle("piaac_exerpt_country_age.pkl")
# Display the same four columns.
excerpt_country_age.loc[
    :, ["country", "age_group", "uses_computer", "programs_monthly"]
]

In [None]:
# Persist the selected and renamed columns for use outside this notebook.
data_selected.to_pickle(path="PIAAC_selected.pkl")