> In this notebook, we import and format the PISA assessment data for modeling. Features from this data help to measure "learning outcomes".

In [405]:
import pandas as pd
import numpy as np

In [406]:
# Read the three PISA subject reports (math, reading, science) into separate frames.
math, reading, science = (
    pd.read_csv(report_path)
    for report_path in (
        '../data/pisa/math_report-Table 1.csv',
        '../data/pisa/reading_report-Table 1.csv',
        '../data/pisa/science_report-Table 1.csv',
    )
)

In [407]:
# Assessment years covered by the output table: every third year from 2000 through 2015.
years = list(range(2000, 2016, 3))
years

[2000, 2003, 2006, 2009, 2012, 2015]

In [408]:
# Countries included in the output table. Each name is paired with every entry of
# `years` to form the row labels of `pisa`, and is matched verbatim against the
# 'Jurisdiction' column of the subject reports (after renaming via `rename_dict`).
countries = ['Albania', 'Algeria', 'Argentina', 'Australia', 'Austria', 'Belgium', 'Brazil',
             'Bulgaria', 'Canada', 'Chile', 'China', 'Colombia', 'Costa Rica', 'Croatia',
             'Cyprus', 'Czechia', 'Denmark', 'Dominican Republic', 'Estonia', 'Finland',
             'France', 'Georgia', 'Germany', 'Greece', 'Hungary', 'Iceland', 'Indonesia',
             'Ireland', 'Israel', 'Italy', 'Japan', 'Jordan', 'Kazakhstan', 'Latvia',
             'Lebanon', 'Lithuania', 'Luxembourg', 'Malaysia', 'Malta', 'Mexico',
             'Montenegro', 'Netherlands', 'New Zealand', 'North Macedonia', 'Norway',
             'Peru', 'Poland', 'Portugal', 'Qatar', 'Republic of Korea', 'Republic of Moldova',
             'Romania', 'Russian Federation', 'Singapore', 'Slovakia', 'Slovenia', 'Spain',
             'Sweden', 'Switzerland', 'Thailand', 'Trinidad and Tobago', 'Tunisia', 'Turkey',
             'United Arab Emirates', 'United Kingdom of Great Britain and Northern Ireland',
             'United States of America', 'Uruguay', 'Viet Nam']

In [409]:
# Maps jurisdiction labels as they appear in the reports to the country names
# used in `countries`.
# Note: four source labels collapse onto 'China', so after renaming a single
# year may hold more than one 'China' row.
rename_dict = {
    'Argentina (2015)': 'Argentina',
    'B-S-J-G (China)': 'China',
    'Hong Kong (China)': 'China',
    'Macao (China)': 'China',
    'Chinese Taipei': 'China',
    'Czech Republic': 'Czechia',
    'FYROM': 'North Macedonia',
    'Korea': 'Republic of Korea',
    'Moldova': 'Republic of Moldova',
    'Russia': 'Russian Federation',
    'Slovak Republic': 'Slovakia',
    'United Kingdom': 'United Kingdom of Great Britain and Northern Ireland',
    'United States': 'United States of America',
}

In [410]:
# Normalize the jurisdiction labels of the math report to the names used in `countries`.
# Series.replace maps every exact match of a `rename_dict` key in one vectorized pass
# and leaves all other labels untouched; this avoids re-scanning the whole column once
# per row while mutating the very column being iterated.
math['Jurisdiction'] = math['Jurisdiction'].replace(rename_dict)

In [411]:
# Normalize the jurisdiction labels of the reading report to the names used in `countries`.
# Series.replace maps every exact match of a `rename_dict` key in one vectorized pass
# and leaves all other labels untouched; this avoids re-scanning the whole column once
# per row while mutating the very column being iterated.
reading['Jurisdiction'] = reading['Jurisdiction'].replace(rename_dict)

In [412]:
# Normalize the jurisdiction labels of the science report to the names used in `countries`.
# Series.replace maps every exact match of a `rename_dict` key in one vectorized pass
# and leaves all other labels untouched; this avoids re-scanning the whole column once
# per row while mutating the very column being iterated.
science['Jurisdiction'] = science['Jurisdiction'].replace(rename_dict)

In [421]:
# Row labels take the form "<year>_<country>", with the year varying slowest.
year_country_index = [
    f"{year}_{country}"
    for year in years
    for country in countries
]

# One score column per assessed subject.
columns = [f"pisa_{subject}" for subject in ('math', 'reading', 'science')]

# Empty table (all cells missing) to be filled in subject by subject.
pisa = pd.DataFrame(columns=columns, index=year_country_index)

In [414]:
# Sanity check: one row per (year, country) pair — 6 years x 68 countries = 408 —
# and one column per subject.
pisa.shape

(408, 3)

In [415]:
# Confirm the subject column names before filling them in.
pisa.columns

Index(['pisa_math', 'pisa_reading', 'pisa_science'], dtype='object')

In [416]:
# Fill the 'pisa_math' column of `pisa`, one (year, country) cell at a time.
#
# A cell receives the report's 'Average' only when exactly one row of `math`
# matches both the year and the country. Every other case leaves the cell NaN:
#   * no matching row (country not assessed / not reported that year),
#   * more than one matching row (e.g. several jurisdictions renamed to the
#     same country), where no single value can be chosen,
#   * an 'Average' that cannot be converted to float.
# The lookup uses the math frame only, so masks and data always come from the
# same table.
for year in years:
    year_filter = (math['Year'] == year)

    for country in countries:
        country_filter = (math['Jurisdiction'] == country)
        value_list = math.loc[year_filter & country_filter, 'Average'].values

        mean_math = np.nan
        if len(value_list) == 1:
            try:
                mean_math = float(value_list[0])
            except (TypeError, ValueError):
                # Presumably a non-numeric placeholder in the report; treat as missing.
                mean_math = np.nan

        pisa.loc[f"{year}_{country}", 'pisa_math'] = mean_math

In [417]:
# Fill the 'pisa_reading' column of `pisa`, one (year, country) cell at a time.
#
# A cell receives the report's 'Average' only when exactly one row of `reading`
# matches both the year and the country. Every other case leaves the cell NaN:
#   * no matching row (country not assessed / not reported that year),
#   * more than one matching row (e.g. several jurisdictions renamed to the
#     same country), where no single value can be chosen,
#   * an 'Average' that cannot be converted to float.
# Only 'pisa_reading' is ever written here, so scores already stored in the
# other subject columns are never disturbed.
for year in years:
    year_filter = (reading['Year'] == year)

    for country in countries:
        country_filter = (reading['Jurisdiction'] == country)
        value_list = reading.loc[year_filter & country_filter, 'Average'].values

        mean_reading = np.nan
        if len(value_list) == 1:
            try:
                mean_reading = float(value_list[0])
            except (TypeError, ValueError):
                # Presumably a non-numeric placeholder in the report; treat as missing.
                mean_reading = np.nan

        pisa.loc[f"{year}_{country}", 'pisa_reading'] = mean_reading

In [418]:
# Fill the 'pisa_science' column of `pisa`, one (year, country) cell at a time.
#
# A cell receives the report's 'Average' only when exactly one row of `science`
# matches both the year and the country. Every other case leaves the cell NaN:
#   * no matching row (country not assessed / not reported that year),
#   * more than one matching row (e.g. several jurisdictions renamed to the
#     same country), where no single value can be chosen,
#   * an 'Average' that cannot be converted to float.
# Only 'pisa_science' is ever written here, so scores already stored in the
# other subject columns are never disturbed.
for year in years:
    year_filter = (science['Year'] == year)

    for country in countries:
        country_filter = (science['Jurisdiction'] == country)
        value_list = science.loc[year_filter & country_filter, 'Average'].values

        mean_science = np.nan
        if len(value_list) == 1:
            try:
                mean_science = float(value_list[0])
            except (TypeError, ValueError):
                # Presumably a non-numeric placeholder in the report; treat as missing.
                mean_science = np.nan

        pisa.loc[f"{year}_{country}", 'pisa_science'] = mean_science

In [420]:
# Persist the assembled table; the "<year>_<country>" index is written as a
# column headed 'Year_Country'.
pisa.to_csv('../data/pisa/pisa.csv', index_label='Year_Country')