In [None]:
import numpy as np
import pandas as pd

# Notebook-wide pandas options.
# Opt in to copy-on-write semantics.
pd.options.mode.copy_on_write = True
# Opt in to the future string dtype inference.
pd.options.future.infer_string = True
# Use plotly as the backend for the pandas plotting API.
pd.options.plotting.backend = "plotly"

In [None]:
# Load the PIAAC extract; the datasets built in the cells below are derived from `full`.
full = pd.read_feather("PIAAC_selected.arrow")

In [None]:
# Inspect the dtype of every column in the loaded frame.
full.dtypes

In [None]:
def set_cats(sr, ordered):
    """Convert ``sr`` to a categorical whose categories are its sorted observed values.

    Parameters
    ----------
    sr : pd.Series
        Values to convert; may already be categorical.
    ordered : bool
        Whether the resulting categorical is ordered (following the sorted
        order of the categories).

    Returns
    -------
    pd.Series
        ``sr`` cast to a ``CategoricalDtype`` that contains only the values
        present in ``sr``. Missing values stay missing.
    """
    # Missing values must not become categories, and mixing NaN with strings
    # makes ``sorted`` fail, so they are dropped first. Iterating over the
    # result of ``unique()`` works for NumPy arrays and pandas extension
    # arrays alike, so no ``.to_numpy()`` call is needed.
    observed = sorted(sr.dropna().unique())
    cats = pd.CategoricalDtype(observed, ordered=ordered)
    return sr.astype(cats)

In [None]:
def at_least_once_a_month_to_one(sr):
    """Flag answers that report a frequency of at least once a month.

    Returns a nullable ``Float64`` Series aligned with ``sr`` that is 1.0
    where the answer is one of the three frequency labels listed below and
    0.0 everywhere else. Missing answers are not among the labels, so they
    are mapped to 0.0 as well.
    """
    monthly_or_more = [
        "Less than once a week but at least once a month",
        "At least once a week but not every day",
        "Every day",
    ]
    is_monthly_or_more = sr.isin(monthly_or_more)
    return is_monthly_or_more.astype("Float64")

In [None]:
all_vars = [
    "country",
    "age_group",
    "gender",
    "hours_per_week",
    "use_computer_at_work",
    "programming_at_work",
]
# Keep the four example countries, restrict to the variables listed above and
# drop rows in which every one of those variables is missing.
_tmp = full.query(
    "country in ['United States', 'Netherlands', 'Germany', 'Austria']"
)[all_vars].dropna(how="all", axis="index")

four_countries = pd.DataFrame(index=_tmp.index)
# Country gets a fresh categorical dtype limited to the countries that remain.
four_countries["country"] = set_cats(_tmp["country"], ordered=False)
# All other variables are carried over unchanged, in the order of `all_vars`.
for _col in all_vars[1:]:
    four_countries[_col] = _tmp[_col]
# 1.0 / 0.0 indicator derived from the programming frequency answer.
four_countries["programs_monthly"] = at_least_once_a_month_to_one(
    _tmp["programming_at_work"],
)

In [None]:
# Subset to Germany and the Netherlands and rebuild the country categories so
# that only the countries still present remain.
two_countries = four_countries.query("country in ['Germany', 'Netherlands']").assign(
    country=lambda frame: set_cats(frame["country"], ordered=False),
)
two_countries

### Means of computer usage / programming by country, tiny example 

- used in bird's eye view of Pandas
- used in first statistical steps part

In [None]:
groupby_cols = ["country"]
cat_cols = ["use_computer_at_work", "programs_monthly"]
_grouped = four_countries[groupby_cols + cat_cols].groupby(groupby_cols)

# Share of "Yes" answers among the non-missing answers in each country.
_computer_answers = _grouped["use_computer_at_work"]
_yes_counts = _computer_answers.value_counts().xs(
    key="Yes",
    level="use_computer_at_work",
)
_answer_counts = _computer_answers.count()

means_by_country = pd.DataFrame(index=_grouped.count().index)
means_by_country["use_computer_at_work"] = _yes_counts / _answer_counts
# `programs_monthly` is already a 1.0 / 0.0 indicator, so its mean is a share.
means_by_country["programs_monthly"] = _grouped["programs_monthly"].mean()

means_by_country[cat_cols].to_feather("piaac_computer_programming_by_country.arrow")
means_by_country

### Means of some variables by country and age group

- used in groupby example

In [None]:
groupby_cols = ["country", "age_group"]
cat_cols = ["use_computer_at_work", "programs_monthly"]

# Keep two age groups only and rebuild `age_group` as an ordered categorical
# containing just those two groups.
selected = two_countries.query("age_group in ['Aged 30-34', 'Aged 55-59']").assign(
    age_group=lambda frame: set_cats(frame["age_group"], ordered=True),
)

_grouped = selected[groupby_cols + cat_cols].groupby(groupby_cols)

# Share of "Yes" answers among the non-missing answers in each cell.
_computer_by_cell = _grouped["use_computer_at_work"]
_n_yes = _computer_by_cell.value_counts().xs(key="Yes", level="use_computer_at_work")
_n_answered = _computer_by_cell.count()

means_by_country_and_age = pd.DataFrame(index=_grouped.count().index)
means_by_country_and_age["use_computer_at_work"] = _n_yes / _n_answered
means_by_country_and_age["programs_monthly"] = _grouped["programs_monthly"].mean()

means_by_country_and_age[cat_cols].to_feather(
    "piaac_computer_programming_by_country_and_age_group.arrow",
)
# Emit the table as an HTML snippet.
print(means_by_country_and_age.to_html(border=0))

### Country, gender, hours of work, and programming at work for 5 people

- Used as example in pandas datatypes lecture

In [None]:
_some_5_cols = [
    "country",
    "gender",
    "hours_per_week",
    "programming_at_work",
]

# Stage 1: one random respondent per (gender, programming frequency) cell,
# among respondents with a non-missing programming answer.
_one_per_cell = (
    four_countries.query("programming_at_work.notna()")
    .groupby(["gender", "programming_at_work"])
    .sample(1, random_state=109783751)
)

# Stage 2: keep complete rows only and draw five of them.
some_5 = (
    _one_per_cell[_some_5_cols]
    .dropna(how="any", axis="index")
    .sample(5, random_state=495)
    .reset_index(drop=True)
)

# NOTE(review): the file is written before the country categories are reduced
# to the sampled countries (the de_20 cell does it the other way round);
# confirm this order is intended.
some_5.to_feather("piaac_some_5.arrow")
some_5["country"] = set_cats(some_5["country"], ordered=False)
some_5

### Age group, gender, hours of work, computer usage / programming for 20 Germans

- Used as example in first metrics lecture

In [None]:
_de_20_cols = [
    "age_group",
    "gender",
    "hours_per_week",
    "use_computer_at_work",
    "programming_at_work",
]

# German respondents in the two age groups of interest.
_germany_two_age_groups = full.query(
    "country == 'Germany' & age_group in ['Aged 30-34', 'Aged 55-59']"
)

# Fixed-seed sample of 20 rows with a fresh 0..19 index.
de_20 = (
    _germany_two_age_groups.sample(20, random_state=4)
    .reset_index(drop=True)
    .loc[:, _de_20_cols]
)
# Ordered categorical restricted to the age groups present in the sample.
de_20["age_group"] = set_cats(de_20["age_group"], ordered=True)
de_20.to_feather("piaac_de_20.arrow")
de_20

### Tertiary education, computer experience, and computer usage at work (cross-country)

- Used as example for first statsmodels regression

In [None]:
# List the distinct values of `highest_educ`; the next cell selects some of
# these labels by name.
full["highest_educ"].unique().tolist()

In [None]:
piaac_education_and_computer = full.copy()

# Labels of `highest_educ` that count as tertiary education.
tertiary = [
    "ISCED 5B",
    "ISCED 5A, master degree",
    "ISCED 5A, bachelor degree",
    "ISCED 6",
    "ISCED 5A bachelor degree, 5A master degree, and 6 (without distinction)",
]
# Values of `highest_educ` that are treated as missing.
nan = [np.nan, "Foreign qualification"]


def _yes_no_to_indicator(answers):
    """Return 1 for "Yes", 0 for "No" and NaN for anything else."""
    return np.where(answers == "Yes", 1, np.where(answers == "No", 0, np.nan))


# 1 for tertiary education, NaN for the values listed in `nan`, 0 otherwise.
_highest_educ = piaac_education_and_computer["highest_educ"]
piaac_education_and_computer["fraction_with_tertiary_education"] = np.where(
    _highest_educ.isin(tertiary),
    1,
    np.where(_highest_educ.isin(nan), np.nan, 0),
)

# The two Yes/No questions share the same recoding.
_yes_no_sources = {
    "fraction_with_general_computer_experience": "computer_experience_in_general",
    "fraction_using_computer_at_work": "use_computer_at_work",
}
for _indicator, _source in _yes_no_sources.items():
    piaac_education_and_computer[_indicator] = _yes_no_to_indicator(
        piaac_education_and_computer[_source],
    )

# Country means of the 0/1 indicators, i.e. shares per country.
_fraction_cols = [
    "fraction_with_tertiary_education",
    "fraction_with_general_computer_experience",
    "fraction_using_computer_at_work",
]
by_country = piaac_education_and_computer.groupby("country")[_fraction_cols].mean()

by_country.to_feather(
    "piaac_education_and_computer_usage_by_country.arrow",
)

In [None]:
# Five-country excerpt of the country-level shares, in this row order.
_example_countries = [
    "Slovak Republic",
    "Austria",
    "Germany",
    "United Kingdom",
    "Norway",
]
by_country_5 = by_country.loc[_example_countries]
by_country_5.to_feather("piaac_education_and_computer_usage_by_country_5.arrow")

### France: Earnings, hours per week, and job training

- used in an exercise during the lecture
- regression with continuous and binary independent variable

In [None]:
# Variables that must be non-missing for a row to be kept.
_france_required = [
    "hourly_earnings_incl_bonus",
    "hours_per_week",
    "took_job_training_binary",
]

france_labor = (
    full.query("country=='France'")
    .copy()
    .assign(
        # 1 for "Yes", 0 for "No", NaN for anything else.
        took_job_training_binary=lambda frame: np.where(
            frame["took_job_training"] == "Yes",
            1,
            np.where(frame["took_job_training"] == "No", 0, np.nan),
        ),
    )
    .dropna(subset=_france_required)
)

france_labor.to_feather("france_labor.arrow")