> In this notebook, we import the three formatted data frames: UIS, PISA, and World Bank. We scale each feature to have a minimum of 0.9 and a maximum of 1.1, then multiply the scaled features together to create our three indexes: school climate, school resources, and learning outcomes. Finally, we rescale the three indexes and multiply them together to create our overall education quality index.

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

In [2]:
# Load the three pre-formatted source tables. Each file is read with its
# 'Year_Country' column as the row index so the tables can be joined on it.
uis, pisa, world_bank = (
    pd.read_csv(csv_path, index_col='Year_Country')
    for csv_path in (
        '../data/UIS/uis.csv',
        '../data/pisa/pisa.csv',
        '../data/world_bank/world_bank.csv',
    )
)

In [3]:
# Join the three tables on their shared 'Year_Country' index. merge() defaults
# to an inner join, so only rows present in all three sources are kept.
# Column order is UIS, then PISA, then World Bank.
df = (
    uis
    .merge(pisa, left_index=True, right_index=True)
    .merge(world_bank, left_index=True, right_index=True)
)

In [4]:
# Persist the merged, still-unscaled table for later reference.
# In DataFrame.to_csv, `index` is a boolean switch (write the index or not);
# the header text for the index column is supplied through `index_label`.
df.to_csv('../data/modeling/original_stats.csv', index_label='Year_Country')

In [5]:
# Sanity check: dimensions of the merged frame as (rows, columns).
df.shape

(408, 57)

In [6]:
# Columns where a larger value is the worse outcome (drop-out, out-of-school,
# repetition and repeater rates). They are collected here so they can be
# flipped before scaling.
#
# The patterns are spelled out in full on purpose: the bare substring 'out'
# also matches 'youth' (e.g. the youth literacy-rate column), so matching on it
# needs a position-dependent slice to discard the false positive.
# 'petition' and 'peaters' match 'Repetition'/'repetition' and 'repeaters'
# regardless of capitalisation of the first letter.
negative_features = (
    [feature for feature in df.columns
     if 'drop-out' in feature or 'out-of-school' in feature]
    + [feature for feature in df.columns if 'petition' in feature]
    + [feature for feature in df.columns if 'peaters' in feature]
)

In [7]:
# Display the selected column names so the substring matching can be checked by eye.
negative_features

['Cumulative drop-out rate to the last grade of lower secondary general education, both sexes (%)',
 'Rate of out-of-school adolescents of lower secondary school age, both sexes (%)',
 'Rate of out-of-school children of primary school age, both sexes (%)',
 'Rate of out-of-school youth of upper secondary school age, both sexes (%)',
 'Rate of out-of-school children, adolescents and youth of primary and secondary school age, both sexes (%)',
 'Rate of out-of-school adolescents and youth of secondary school age, both sexes (%)',
 'Repetition rate in lower secondary general education (all grades), both sexes (%)',
 'Repetition rate in primary education (all grades), both sexes (%)',
 'Percentage of repeaters in primary education, all grades, both sexes (%)',
 'Percentage of repeaters in lower secondary general education, all grades, both sexes (%)']

In [8]:
# Reverse the direction of every "higher is worse" column so that, after
# scaling, a larger value always means a better outcome. Only the reversal of
# ordering matters here, because the columns are min-max scaled afterwards.
# NOTE(review): this overwrites `df` in place, so running the cell a second
# time flips the columns back -- run it exactly once per kernel session.
df[negative_features] = 1 - df[negative_features]

In [9]:
# Min-max scale every column independently into [0.9, 1.1]. Keeping the range
# centred on 1 means features can later be multiplied together, with 1 acting
# as the neutral factor. Missing values remain NaN in the scaled frame (see the
# preview below).
mm_scaler = preprocessing.MinMaxScaler(feature_range=(0.9, 1.1))
scaled_values = mm_scaler.fit_transform(df)
mm_df = pd.DataFrame(data=scaled_values, columns=df.columns, index=df.index)
mm_df.head()

Unnamed: 0_level_0,"Gross intake ratio to Grade 1 of primary education, both sexes (%)","Gross enrolment ratio, primary, both sexes (%)","Gross enrolment ratio, primary to tertiary, both sexes (%)","Gross enrolment ratio, primary and secondary, both sexes (%)","Gross enrolment ratio, upper secondary, both sexes (%)","Gross enrolment ratio, lower secondary, both sexes (%)","Gross enrolment ratio, tertiary, both sexes (%)","Net enrolment rate, primary, both sexes (%)","Net enrolment rate, secondary, both sexes (%)","Net enrolment rate, lower secondary, both sexes (%)",...,Expenditure on secondary education (% of government expenditure on education),"Government expenditure per student, secondary (% of GDP per capita)",Expenditure on primary education (% of government expenditure on education),"Government expenditure per student, primary (% of GDP per capita)","Current education expenditure, total (% of total expenditure in public institutions)","Current education expenditure, tertiary (% of total expenditure in tertiary public institutions)","Current education expenditure, secondary (% of total expenditure in secondary public institutions)","Current education expenditure, primary (% of total expenditure in primary public institutions)","Literacy rate, adult total (% of people ages 15 and above)","Literacy rate, youth total (% of people ages 15-24)"
Year_Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000_Albania,0.94266,0.965837,0.924936,0.925246,0.906309,0.952408,0.916788,1.054992,0.989347,,...,,,,,,,,,,
2000_Algeria,0.931875,0.966554,,0.931706,0.90344,0.938056,,0.981459,,,...,,0.95489,,0.922813,,,,,,
2000_Argentina,0.968821,1.013195,0.990274,1.009598,0.937758,0.993139,0.982925,1.090874,1.029752,1.05025,...,0.973779,0.948253,1.004107,0.93629,,,,,,
2000_Australia,,0.94972,,,,,,1.035708,,,...,0.983932,,0.99672,0.960785,1.059723,1.064629,1.0662,1.068148,,
2000_Austria,,0.961927,,0.985765,0.962058,0.963278,,,1.061749,,...,1.007935,0.994732,0.931337,0.992694,1.071869,1.088068,1.070098,1.068489,,


In [10]:
# Features feeding the school-climate index: every UIS column except the last two.
# NOTE(review): the two excluded UIS columns are not visible here -- confirm
# that dropping them is intentional.
school_climate = uis.columns[:-2].tolist()
school_climate

['Gross intake ratio to Grade 1 of primary education, both sexes (%)',
 'Gross enrolment ratio, primary, both sexes (%)',
 'Gross enrolment ratio, primary to tertiary, both sexes (%)',
 'Gross enrolment ratio, primary and secondary, both sexes (%)',
 'Gross enrolment ratio, upper secondary, both sexes (%)',
 'Gross enrolment ratio, lower secondary, both sexes (%)',
 'Gross enrolment ratio, tertiary, both sexes (%)',
 'Net enrolment rate, primary, both sexes (%)',
 'Net enrolment rate, secondary, both sexes (%)',
 'Net enrolment rate, lower secondary, both sexes (%)',
 'Adjusted net intake rate to Grade 1 of primary education, both sexes (%)',
 'Adjusted net enrolment rate, lower secondary, both sexes (%)',
 'Adjusted net enrolment rate, primary, both sexes (%)',
 'Total net enrolment rate, primary, both sexes (%)',
 'Total net enrolment rate, lower secondary, both sexes (%)',
 'Net enrolment rate, upper secondary, both sexes (%)',
 'Gross intake ratio to Grade 1 of lower secondary educ

In [11]:
# Features feeding the school-resources index: every World Bank column except
# the last two (the literacy-rate columns, which are used as learning outcomes).
school_resources = world_bank.columns[:-2].tolist()
school_resources

['Government expenditure on education, total (% of GDP)',
 'Government expenditure on education, total (% of government expenditure)',
 'Expenditure on tertiary education (% of government expenditure on education)',
 'Government expenditure per student, tertiary (% of GDP per capita)',
 'Expenditure on secondary education (% of government expenditure on education)',
 'Government expenditure per student, secondary (% of GDP per capita)',
 'Expenditure on primary education (% of government expenditure on education)',
 'Government expenditure per student, primary (% of GDP per capita)',
 'Current education expenditure, total (% of total expenditure in public institutions)',
 'Current education expenditure, tertiary (% of total expenditure in tertiary public institutions)',
 'Current education expenditure, secondary (% of total expenditure in secondary public institutions)',
 'Current education expenditure, primary (% of total expenditure in primary public institutions)']

In [12]:
# Features feeding the learning-outcomes index: all PISA columns followed by
# the last two World Bank columns (adult and youth literacy rates).
learning_outcomes = [*pisa.columns, *world_bank.columns[-2:]]
learning_outcomes

['pisa_math',
 'pisa_reading',
 'pisa_science',
 'Literacy rate, adult total (% of people ages 15 and above)',
 'Literacy rate, youth total (% of people ages 15-24)']

In [13]:
# Climate index = row-wise product of the scaled school-climate features.
# prod() skips NaN by default, i.e. a missing feature contributes the neutral
# factor 1, and a row with no observed feature at all gets an index of 1.
mm_df['climate_index'] = mm_df[school_climate].prod(axis=1)

In [14]:
# Resources index = row-wise product of the scaled school-resources features.
# prod() skips NaN by default, i.e. a missing feature contributes the neutral
# factor 1, and a row with no observed feature at all gets an index of 1.
mm_df['resources_index'] = mm_df[school_resources].prod(axis=1)

In [15]:
# Learning index = row-wise product of the scaled learning-outcome features.
# prod() skips NaN by default, i.e. a missing feature contributes the neutral
# factor 1, and a row with no observed feature at all gets an index of 1.
mm_df['learning_index'] = mm_df[learning_outcomes].prod(axis=1)

In [16]:
# Names of the three composite index columns added to mm_df.
# NOTE(review): this list is not referenced again in the visible cells.
df_indices = ['climate_index', 'resources_index', 'learning_index']

In [18]:
# Second min-max pass: bring the three composite indexes into [0.9, 1.1] so
# they can be multiplied into one overall index.
# NOTE(review): the scaler is fitted on every column of mm_df, not only the
# three index columns; the feature columns already span [0.9, 1.1], so they
# come out essentially unchanged (compare the two previews).
# This also rebinds `mm_scaler`, so the scaler fitted on the raw features is
# no longer available after this cell.
mm_scaler = preprocessing.MinMaxScaler(feature_range=(0.9, 1.1))
rescaled_values = mm_scaler.fit_transform(mm_df)
index_df = pd.DataFrame(data=rescaled_values, columns=mm_df.columns, index=df.index)

In [19]:
# Overall education-quality index: product of the three rescaled composite
# indexes. It is computed after the second scaling pass, so it is not itself
# rescaled into [0.9, 1.1].
index_df['ed_qual_index'] = (
    index_df['climate_index']
    .mul(index_df['resources_index'])
    .mul(index_df['learning_index'])
)
index_df.head()

Unnamed: 0_level_0,"Gross intake ratio to Grade 1 of primary education, both sexes (%)","Gross enrolment ratio, primary, both sexes (%)","Gross enrolment ratio, primary to tertiary, both sexes (%)","Gross enrolment ratio, primary and secondary, both sexes (%)","Gross enrolment ratio, upper secondary, both sexes (%)","Gross enrolment ratio, lower secondary, both sexes (%)","Gross enrolment ratio, tertiary, both sexes (%)","Net enrolment rate, primary, both sexes (%)","Net enrolment rate, secondary, both sexes (%)","Net enrolment rate, lower secondary, both sexes (%)",...,"Current education expenditure, total (% of total expenditure in public institutions)","Current education expenditure, tertiary (% of total expenditure in tertiary public institutions)","Current education expenditure, secondary (% of total expenditure in secondary public institutions)","Current education expenditure, primary (% of total expenditure in primary public institutions)","Literacy rate, adult total (% of people ages 15 and above)","Literacy rate, youth total (% of people ages 15-24)",climate_index,resources_index,learning_index,ed_qual_index
Year_Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000_Albania,0.94266,0.965837,0.924936,0.925246,0.906309,0.952408,0.916788,1.054992,0.989347,,...,,,,,,,0.910092,0.952688,0.939724,0.814772
2000_Algeria,0.931875,0.966554,,0.931706,0.90344,0.938056,,0.981459,,,...,,,,,,,0.904644,0.952158,0.962724,0.829256
2000_Argentina,0.968821,1.013195,0.990274,1.009598,0.937758,0.993139,0.982925,1.090874,1.029752,1.05025,...,,,,,,,0.952198,0.926799,0.962724,0.849601
2000_Australia,,0.94972,,,,,,1.035708,,,...,1.059723,1.064629,1.0662,1.068148,,,0.913879,1.020629,0.988161,0.921688
2000_Austria,,0.961927,,0.985765,0.962058,0.963278,,,1.061749,,...,1.071869,1.088068,1.070098,1.068489,,,0.922336,1.027773,0.978419,0.927494


In [20]:
# Persist the scaled features together with the four index columns.
# In DataFrame.to_csv, `index` is a boolean switch (write the index or not);
# the header text for the index column is supplied through `index_label`.
index_df.to_csv('../data/modeling/scaled_stats.csv', index_label='Year_Country')