In [3]:
import pandas as pd
import numpy as np
import world_bank_data as wb
import polars as pl
from scipy.stats import gmean
import os 
# Move up to the project root only when the raw-data folder is not already
# reachable from the working directory. An unconditional chdir('..') climbs
# one more level every time this cell is re-run, which breaks the relative
# 'data/raw/' paths used by the cells below.
if not os.path.isdir('data/raw'):
    os.chdir('..')

In [1]:
    def adjust(df):
        """
        Compute the Atkinson-based adjustment coefficient of an index.

        The Atkinson measure used here is ``1 - geometric_mean / arithmetic_mean``
        and the adjustment coefficient is its complement.

        Parameters
        ----------
        df : <pd.DataFrame>
            values of the index; passed to ``gmean`` as-is and expected to
            provide a ``.mean()`` method

        Returns
        -------
        <float>
            adjustment coefficient (1 - Atkinson measure)
        <float>
            arithmetic mean of the index
        <float>
            geometric mean of the index
        <float>
            Atkinson measure of the index
        """
        geometric_mean = gmean(df)
        arithmetic_mean = df.mean()
        # Inequality is measured as the relative gap between the two means
        inequality = 1 - geometric_mean / arithmetic_mean
        return 1 - inequality, arithmetic_mean, geometric_mean, inequality

    def to_category(value):
        """
        Convert a ``SCHL`` attainment code into years of schooling.

        Parameters
        ----------
        value : <int>
            ``SCHL`` code of one record

        Returns
        -------
        <int or float>
            years of schooling: the mapped value for codes 4 to 21, 0 for any
            other code up to 21 (including codes below 4), and 18 otherwise
        """
        # Code 12 maps to 9 years and code 20 to 14 years, matching the
        # when/then chain used for the polars version in this notebook; the
        # previous table had these two entries mistyped as `2: 9` and `20: 24`.
        mapping = {
            4: 1, 5: 2, 6: 3, 7: 4, 8: 5,
            9: 6, 10: 7, 11: 8, 12: 9, 13: 10,
            14: 11, 15: 11, 16: 12, 17: 12,
            18: 12.5, 19: 13, 20: 14, 21: 16,
        }
        return mapping.get(value, 0) if value <= 21 else 18

In [4]:
# Schema of the (initially empty) results frame: one row per year
empty_df = [
    pl.Series("Year", [], dtype=pl.Int64),
    pl.Series("edu_index", [], dtype=pl.Float64),
    pl.Series("edu_index_adjusted", [], dtype=pl.Float64),
]
edu_index = pl.DataFrame(empty_df).clear()

# Walk the raw-data folder and load every 'data_ppr*' file, keeping only the
# age and schooling columns
for file in os.listdir('data/raw/'):
    if not file.startswith('data_ppr'):
        continue
    df = pl.read_csv(f"data/raw/{file}", ignore_errors=True).select(
        pl.col("AGEP", "SCH", "SCHL")
    )

In [7]:
# Load the 2020 file, keeping only the age and schooling columns
df = pl.read_csv("data/raw/data_ppr_2020_raw.csv", ignore_errors=True).select(
    pl.col("AGEP", "SCH", "SCHL")
)

# Years of schooling assigned to each SCHL code from 4 to 21
schl_years = {
    4: 1, 5: 2, 6: 3, 7: 4, 8: 5, 9: 6,
    10: 7, 11: 8, 12: 9, 13: 10, 14: 11, 15: 11,
    16: 12, 17: 12, 18: 12.5, 19: 13, 20: 14, 21: 16,
}

# Codes below 4 count as 0 years; each mapped code adds one when/then branch
schooling_expr = pl.when(pl.col("SCHL") < 4).then(0)
for schl_code, years in schl_years.items():
    schooling_expr = schooling_expr.when(pl.col("SCHL") == schl_code).then(years)

# calculate the years of schooling for people aged 25 and over; rows that
# match no branch fall back to 18
edu_sch = df.filter(pl.col("AGEP") >= 25).with_columns(
    schooling_expr.otherwise(18).alias("schooling")
)
edu_sch


AGEP,SCH,SCHL,schooling
i64,i64,i64,f64
32,1,14,11.0
62,1,6,3.0
67,1,9,6.0
49,1,19,13.0
51,1,16,12.0
…,…,…,…
33,1,20,14.0
72,1,11,8.0
75,1,8,5.0
57,1,13,10.0


In [13]:
# Flag rows with more than one year of schooling (1) versus the rest (0)
enrolled_flag = pl.when(pl.col("schooling") > 1).then(1).otherwise(0)
edu_sch = edu_sch.with_columns(enrolled_flag.alias("enroled"))

# mean years of schooling, returned as a single-row frame
mean = edu_sch.select("schooling").mean()
mean

schooling
f64
12.227718


In [None]:
# NOTE(review): this cell uses the pandas API (item assignment, reset_index,
# apply), so it presumably expects `edu_sch` to be a pandas DataFrame —
# confirm before running it right after the polars cells.
edu_sch['scholing'] = edu_sch['SCHL']
edu_sch.reset_index(inplace=True)
# `to_category` is a notebook-level function; there is no `self` in a cell,
# so calling `self.to_category` raised a NameError.
edu_sch['scholing'] = edu_sch['scholing'].apply(to_category)
# 1 when the person has more than one year of schooling, 0 otherwise
edu_sch['enroled'] = np.where(edu_sch['scholing'] > 1, 1, 0)
mean_sch = edu_sch['scholing'].mean()

In [None]:
def calculate_edu_index(folder_path='data/raw/', debug=False):
    """
    Build the yearly education index from the raw ``data_ppr*`` files.

    For every file in ``folder_path`` whose name starts with ``data_ppr``:
    mean years of schooling is computed for people aged 25 and over, expected
    years of schooling is the sum of the per-age enrollment rates of people
    under 25, and the index is ``(mean_sch / 15 + exp_sch / 18) / 2``. The
    adjusted index multiplies it by the coefficient returned by ``adjust``.

    Parameters
    ----------
    folder_path : <str>
        folder holding the raw csv files
    debug : <bool>
        when True the result is only returned; when False it is also written
        to ``data/processed/edu_index.csv``

    Returns
    -------
    <pd.DataFrame>
        one row per file, sorted by 'Year', including the growth rates (in
        percent) of the index and of the adjusted index
    """
    columns = ['Year', 'edu_index', 'edu_index_ajusted', 'atkinson',
               "Mean years of schooling", "Expected years of schooling"]
    # Collect one record per file and build the frame once at the end,
    # instead of concatenating and re-sorting inside the loop.
    rows = []
    for file in os.listdir(folder_path):
        if not file.startswith('data_ppr'):
            continue
        # os.path.join works whether or not folder_path ends with a separator
        df = pd.read_csv(os.path.join(folder_path, file), engine="pyarrow")
        df = df[['AGEP', 'SCH', 'SCHL']]

        # calculate the mean of years of schooling
        edu_sch = df[df['AGEP'] >= 25].copy()
        edu_sch['scholing'] = edu_sch['SCHL'].apply(to_category)
        mean_sch = edu_sch['scholing'].mean()

        # get the coefficient of adjustment; shifting by 1 keeps zero years
        # of schooling from collapsing the geometric mean to zero
        coef, _, _, atkinson = adjust(1 + edu_sch['scholing'])

        # calculate the expected years of schooling: per-age share of
        # records whose SCH value is greater than 1
        edu_exp = df[df['AGEP'] < 25].copy()
        edu_exp['enrolled'] = np.where(edu_exp['SCH'] > 1, 1, 0)
        edu_age = (
            edu_exp.groupby('AGEP')['enrolled']
            .agg(['count', 'sum'])
            .rename(columns={'sum': 'enrolled'})
            .reset_index()
        )
        edu_age['enrollment_rate'] = edu_age['enrolled'] / edu_age['count']
        # Drops the first five age rows — presumably ages 0 to 4; TODO confirm
        # that every file contains all of those ages.
        edu_age = edu_age.drop([0, 1, 2, 3, 4])
        exp_sch = edu_age['enrollment_rate'].sum()

        # calculate index
        # NOTE(review): 15 and 18 look like the maximum values used to
        # normalise mean and expected years of schooling — confirm.
        edu_value = (mean_sch / 15 + exp_sch / 18) / 2
        edu_value_ajusted = coef * edu_value
        year = file.split('_')[2]
        rows.append([year, edu_value, edu_value_ajusted, atkinson, mean_sch, exp_sch])

    edu_index = pd.DataFrame(rows, columns=columns).sort_values(
        by='Year', ascending=True, ignore_index=True
    )

    # growth rate for edu index & edu index adjusted
    edu_index['growth_rate'] = edu_index['edu_index'].pct_change() * 100
    edu_index['growth_rate_ajusted'] = edu_index['edu_index_ajusted'].pct_change() * 100

    if not debug:
        edu_index.to_csv('data/processed/edu_index.csv', index=False)
    return edu_index


edu_index = calculate_edu_index()
edu_index